In [None]:

# 

import requests
from bs4 import BeautifulSoup
import pandas as pd
import time
import re
from datetime import datetime
import os

## Configuration
# Create the output directories up front. 'data/raw' receives the per-month and
# combined CSV files written further down; 'data/processed' is only created
# here and is not written to by the code visible in this cell.
os.makedirs('data/raw', exist_ok=True)
os.makedirs('data/processed', exist_ok=True)

# (month label, HackerNews item id) pairs, one per month from 2023-10 to 2024-10.
# The month label is used in output file names and in the 'month' column.
# NOTE(review): the run output recorded with this cell shows non-job comments
# extracted for 2023-10 and no postings for several other months — confirm that
# each id really is that month's hiring thread.
HIRING_THREADS = [
    ('2023-10', 37739795),
    ('2023-11', 38099577),
    ('2023-12', 38477631),
    ('2024-01', 38845878),
    ('2024-02', 39217462),
    ('2024-03', 39563824),
    ('2024-04', 39895401),
    ('2024-05', 40224213),
    ('2024-06', 40563768),
    ('2024-07', 40940936),
    ('2024-08', 41286547),
    ('2024-09', 41425540),
    ('2024-10', 41709301),
]

# An item page URL is BASE_URL followed by the item id.
BASE_URL = 'https://news.ycombinator.com/item?id='
# Headers sent with every page request.
HEADERS = {'User-Agent': 'Mozilla/5.0 (Educational Research Project)'}

## Helper Functions

def fetch_thread(thread_id):
    """Fetch the HTML of a HackerNews item page.

    Args:
        thread_id: Item id; appended to ``BASE_URL`` to build the page URL.

    Returns:
        The response body as text, or ``None`` when the request fails
        (connection problem, timeout, or a non-success HTTP status). The
        failure is reported on stdout.
    """
    url = f'{BASE_URL}{thread_id}'
    try:
        response = requests.get(url, headers=HEADERS, timeout=15)
        response.raise_for_status()
    except requests.RequestException as e:
        # Only request-level failures are treated as "thread unavailable";
        # unrelated programming errors are allowed to surface.
        print(f" Error fetching thread {thread_id}: {e}")
        return None
    return response.text

def extract_job_postings(html):
    """Extract top-level comments that look like job postings from a thread page.

    A comment row is kept when all of the following hold:
      * its indent image has width 0 (rows with a wider indent are replies);
      * it contains a ``div`` with class ``comment``;
      * its text is at least 150 characters long;
      * the first 100 characters contain no question indicator
        (including a literal ``?``).

    Args:
        html: HTML source of a thread page.

    Returns:
        A list of dicts as produced by ``parse_job_posting``; empty when the
        comment table is missing or no row qualifies.
    """
    soup = BeautifulSoup(html, 'html.parser')

    # Find the main comment table
    comment_table = soup.find('table', class_='comment-tree')
    if not comment_table:
        print("     Could not find comment table")
        return []

    # Phrases that mark a comment as a question/reply rather than a posting.
    # Built once here instead of once per row.
    question_indicators = (
        'does anyone know',
        'can someone explain',
        'what do you think',
        'has anyone tried',
        '?',  # Questions usually have question marks
    )

    jobs = []
    for row in comment_table.find_all('tr', class_='athing comtr'):
        # The first <img> carrying a width attribute is read as the indent
        # spacer; a non-zero width means the row is a reply.
        # NOTE(review): relies on the page markup — verify against current HN HTML.
        indent_img = row.find('img', attrs={'width': True})
        if indent_img and int(indent_img.get('width', 0)) > 0:
            continue

        comment_div = row.find('div', class_='comment')
        if not comment_div:
            continue

        text = comment_div.get_text(separator='\n', strip=True)

        # Must be substantial (job postings are detailed)
        if len(text) < 150:
            continue

        # Skip if the opening of the comment looks like a question
        first_part = text[:100].lower()
        if any(indicator in first_part for indicator in question_indicators):
            continue

        job_data = parse_job_posting(text, row.get('id', ''))
        if job_data:
            jobs.append(job_data)

    return jobs

# AI/ML terms looked for in a posting. 'chatgpt' is listed explicitly because
# the leading-boundary rule below would otherwise stop 'gpt' from matching it.
_AI_KEYWORDS = (
    'machine learning', 'ml engineer', 'ai engineer', 'artificial intelligence',
    'deep learning', 'nlp', 'natural language', 'computer vision', 'llm',
    'data scientist', 'pytorch', 'tensorflow', 'generative ai', 'gpt', 'chatgpt',
    'neural network', 'transformers', 'reinforcement learning', 'ml ops',
)

# A term only counts when it is not glued to a preceding letter, so 'llm' no
# longer fires inside "fulfillment"/"enrollment" and 'ml engineer' no longer
# fires inside "html engineer". Suffixes stay allowed ("llms", "gpt-4", "gpt4").
# The pattern is applied to lower-cased text.
_AI_KEYWORD_RE = re.compile(
    r'(?<![a-z])(?:' + '|'.join(re.escape(kw) for kw in _AI_KEYWORDS) + r')'
)


def parse_job_posting(text, comment_id):
    """Turn the raw text of one posting into a flat record.

    Args:
        text: Full comment text; lines are separated by ``\\n``.
        comment_id: Identifier of the comment, stored unchanged.

    Returns:
        A dict with the keys ``comment_id``, ``company``, ``role``,
        ``description`` (first 1500 characters of ``text``),
        ``has_ai_keywords``, ``is_remote``, ``location``, ``salary``,
        ``requires_python``, ``requires_js`` and ``text_length``.
    """
    lines = text.split('\n')
    first_line = lines[0] if lines else ''

    # Company and role are taken primarily from the first line.
    company = extract_company(first_line, text)
    role = extract_role(first_line, text)

    # All keyword checks are case-insensitive via a lower-cased copy.
    text_lower = text.lower()

    has_ai_keywords = _AI_KEYWORD_RE.search(text_lower) is not None

    # Remote work (plain substring checks)
    remote_keywords = ['remote', 'work from home', 'wfh', 'distributed', 'anywhere']
    is_remote = any(kw in text_lower for kw in remote_keywords)

    location = extract_location(text)
    salary = extract_salary(text)

    # Technology mentions (plain substring checks)
    requires_python = 'python' in text_lower
    requires_js = any(x in text_lower for x in ['javascript', 'typescript', 'react', 'node.js'])

    return {
        'comment_id': comment_id,
        'company': company,
        'role': role,
        'description': text[:1500],  # First 1500 chars
        'has_ai_keywords': has_ai_keywords,
        'is_remote': is_remote,
        'location': location,
        'salary': salary,
        'requires_python': requires_python,
        'requires_js': requires_js,
        'text_length': len(text)
    }

def extract_company(first_line, full_text):
    """Guess the company name from the first line of a posting.

    The text before the first ``|``, ``-`` or ``–`` (tried in that order) is
    used when, after removing parenthesised parts, it is between 3 and 99
    characters long. Otherwise the first five whitespace-separated words of
    the line are used, with parenthesised parts removed and the result capped
    at 100 characters.

    ``full_text`` is accepted but not consulted.
    """
    parenthesised = re.compile(r'\([^)]*\)')

    for delimiter in ('|', '-', '–'):
        if delimiter not in first_line:
            continue
        head = first_line.split(delimiter, 1)[0].strip()
        candidate = parenthesised.sub('', head).strip()
        if 2 < len(candidate) < 100:
            return candidate

    # No usable separator: fall back to the leading words of the line.
    leading_words = ' '.join(first_line.split()[:5])
    return parenthesised.sub('', leading_words).strip()[:100]

def extract_role(first_line, full_text):
    """Guess the job role of a posting.

    First choice is the second field of the first line when split on ``|``,
    ``-`` or ``–`` (tried in that order). Failing that, the rest of the line
    following "seeking" / "hiring" / "looking for", then "position" / "role",
    anywhere in the full text. A candidate is accepted only when it is between
    6 and 149 characters long. Returns ``"Not specified"`` when nothing fits.
    """
    def acceptable(candidate):
        return 5 < len(candidate) < 150

    for delimiter in ('|', '-', '–'):
        fields = first_line.split(delimiter)
        if len(fields) < 2:
            # Separator not present in the first line.
            continue
        second_field = fields[1].strip()
        if acceptable(second_field):
            return second_field

    # Keyword-introduced phrases in the body of the posting.
    keyword_patterns = (
        r'(?:seeking|hiring|looking for)[:\s]+([^\n]+?)(?:\n|$)',
        r'(?:position|role)[:\s]+([^\n]+?)(?:\n|$)',
    )
    for regex in keyword_patterns:
        found = re.search(regex, full_text, re.IGNORECASE)
        if found is None:
            continue
        phrase = found.group(1).strip()
        if acceptable(phrase):
            return phrase

    return "Not specified"

# Patterns tried in order by extract_location; group 1 is the location.
_LOCATION_PATTERNS = (
    # Labelled location ("Location: ...", "based in ...", "office ...").
    # The leading \b keeps "relocation" from matching, and the value ends at a
    # pipe as well as at end of line so "Location: NYC | Remote" yields "NYC".
    re.compile(
        r'\b(?:location|based in|office)[:\s]+([^\n|]+?)\s*(?:[|\n]|$)',
        re.IGNORECASE,
    ),
    # "City, ST". Deliberately case-sensitive: with IGNORECASE this matched
    # ordinary prose such as "engineers, we".
    re.compile(r'\b([A-Z][a-z]+,\s*[A-Z]{2})\b'),
    # A few well-known cities, in any letter case.
    re.compile(
        r'\b(San Francisco|New York|Seattle|Austin|Boston|London|Berlin|Toronto)\b',
        re.IGNORECASE,
    ),
)


def extract_location(text):
    """Extract a location string from posting text.

    Args:
        text: Full posting text.

    Returns:
        The first non-empty match among the location patterns, stripped and
        capped at 100 characters, or ``"Not specified"`` when none matches.
    """
    for pattern in _LOCATION_PATTERNS:
        match = pattern.search(text)
        if match:
            location = match.group(1).strip()[:100]
            if location:
                return location

    return "Not specified"

# One amount: digits with optional internal comma groups. A comma is consumed
# only when digits follow it, so sentence punctuation is not captured.
_SALARY_NUMBER = r'\d+(?:,\d+)*'

# Patterns tried in order by extract_salary. Matching is case-insensitive so
# the thousands suffix is recognised as both "k" and "K".
_SALARY_PATTERNS = tuple(
    re.compile(pattern, re.IGNORECASE)
    for pattern in (
        # "$150k", "$150,000", "$150K - $200K", "$150k-200k"
        rf'\${_SALARY_NUMBER}k?(?:\s*[-–]\s*\$?{_SALARY_NUMBER}k?)?',
        # "120k-150k" (no currency symbol)
        rf'{_SALARY_NUMBER}k\s*[-–]\s*{_SALARY_NUMBER}k',
        # "€80k", "€80k–€100k"
        rf'€{_SALARY_NUMBER}k?(?:\s*[-–]\s*€?{_SALARY_NUMBER}k?)?',
    )
)


def extract_salary(text):
    """Extract the first salary figure or range mentioned in the text.

    Args:
        text: Full posting text.

    Returns:
        The matched substring exactly as written in ``text`` (for example
        ``"$150K - $200K"``), or ``"Not specified"`` when no pattern matches.
        Dollar amounts take precedence over bare "k" ranges, which take
        precedence over euro amounts.
    """
    for pattern in _SALARY_PATTERNS:
        match = pattern.search(text)
        if match:
            return match.group(0)

    return "Not specified"

## Main Scraping Function

def scrape_hiring_thread(month, thread_id):
    """Fetch one thread, extract its postings, save them and print a report.

    Args:
        month: Label stored in the ``month`` column and used in the CSV name.
        thread_id: HackerNews item id of the thread.

    Returns:
        A DataFrame of the extracted postings with ``month``, ``thread_id``
        and ``scraped_date`` columns added, or ``None`` when the page could
        not be fetched or yielded no postings. On success the frame is also
        written to ``data/raw/hn_jobs_<month>.csv``.
    """
    banner = '=' * 60
    print(f"\n{banner}")
    print(f" Scraping {month} (Thread: {thread_id})")
    print(banner)

    page = fetch_thread(thread_id)
    if not page:
        return None

    postings = extract_job_postings(page)
    if not postings:
        print("  No job postings found")
        return None

    print(f" Extracted {len(postings)} job postings")

    # Tabulate the postings and tag every row with its origin.
    frame = pd.DataFrame(postings)
    frame['month'] = month
    frame['thread_id'] = thread_id
    frame['scraped_date'] = datetime.now().strftime('%Y-%m-%d')

    # Persist this month's postings.
    out_path = f'data/raw/hn_jobs_{month}.csv'
    frame.to_csv(out_path, index=False)
    print(f" Saved to {out_path}")

    # Preview the first three postings.
    print("\n Sample postings:")
    for _, posting in frame.head(3).iterrows():
        # NOTE(review): both alternatives of each flag are empty strings, so the
        # flags print nothing — TODO confirm whether marker characters were lost.
        ai_flag = "" if posting['has_ai_keywords'] else ""
        remote_flag = "" if posting['is_remote'] else ""
        company_preview = posting['company'][:30]
        role_preview = posting['role'][:40]
        print(f"   {ai_flag}{remote_flag} {company_preview} | {role_preview}")

    # Per-thread counts and shares.
    total = len(frame)
    ai_total = frame['has_ai_keywords'].sum()
    remote_total = frame['is_remote'].sum()
    print("\n Stats:")
    print(f"   AI/ML roles: {ai_total} ({ai_total/total*100:.1f}%)")
    print(f"   Remote: {remote_total} ({remote_total/total*100:.1f}%)")
    print(f"   Python: {frame['requires_python'].sum()}")
    print(f"   JavaScript: {frame['requires_js'].sum()}")

    return frame

## Execute Scraping

print(" HackerNews Job Scraper - FIXED VERSION")
print("   Only extracts TOP-LEVEL job postings (not replies)")
print(f"\n Target: {len(HIRING_THREADS)} months")
print(f" Range: {HIRING_THREADS[0][0]} → {HIRING_THREADS[-1][0]}")

# Fixed 3-second delay before the first request. No confirmation is read from
# the user; the run simply starts after the pause.
print("\nStarting in 3 seconds...")
time.sleep(3)

all_data = []   # one DataFrame per month that produced postings
failed = []     # month labels for which nothing was collected

for position, (month, thread_id) in enumerate(HIRING_THREADS, start=1):
    df = scrape_hiring_thread(month, thread_id)
    if df is not None:
        all_data.append(df)
    else:
        failed.append(month)

    # Rate limiting between requests. No request follows the last thread, so
    # there is nothing to wait for after it.
    if position < len(HIRING_THREADS):
        print("⏱  Waiting 5 seconds...")
        time.sleep(5)

## Final Summary

print("\n" + "="*60)
print(" SCRAPING COMPLETE")
print("="*60)

if all_data:
    # Stack the monthly frames and save them as a single file.
    combined_df = pd.concat(all_data, ignore_index=True)
    combined_df.to_csv('data/raw/hn_jobs_combined.csv', index=False)

    print(f"\n FINAL SUMMARY:")
    print(f"   Total postings: {len(combined_df):,}")
    print(f"   Successful months: {len(all_data)}/{len(HIRING_THREADS)}")
    if failed:
        print(f"   Failed months: {', '.join(failed)}")

    print(f"\n Monthly breakdown:")
    monthly = combined_df.groupby('month').size().sort_index()
    for m, count in monthly.items():
        print(f"      {m}: {count:,} postings")

    print(f"\n AI/ML Analysis:")
    ai_count = combined_df['has_ai_keywords'].sum()
    print(f"   AI-related postings: {ai_count:,} ({ai_count/len(combined_df)*100:.1f}%)")

    print(f"\n Remote Work:")
    remote_count = combined_df['is_remote'].sum()
    print(f"   Remote postings: {remote_count:,} ({remote_count/len(combined_df)*100:.1f}%)")

    print(f"\n Output files:")
    print(f"   • data/raw/hn_jobs_YYYY-MM.csv (individual months)")
    print(f"   • data/raw/hn_jobs_combined.csv (all data)")

    print("\n Phase 1 Complete! Ready for Phase 2 (Data Cleaning)")

else:
    # Every month failed or was empty.
    print("\n No data collected")
    print("   Check thread IDs and internet connection")

🚀 HackerNews Job Scraper - FIXED VERSION
   Only extracts TOP-LEVEL job postings (not replies)

📊 Target: 13 months
📅 Range: 2023-10 → 2024-10

Starting in 3 seconds...

📅 Scraping 2023-10 (Thread: 37739795)
✅ Extracted 2 job postings
💾 Saved to data/raw/hn_jobs_2023-10.csv

📋 Sample postings:
   💻🏢 I'd argue you're creating a | Not specified
   💻🏢 If the ones who are | Not specified

📊 Stats:
   AI/ML roles: 0 (0.0%)
   Remote: 0 (0.0%)
   Python: 0
   JavaScript: 0
⏱️  Waiting 5 seconds...

📅 Scraping 2023-11 (Thread: 38099577)
⚠️  No job postings found
⏱️  Waiting 5 seconds...

📅 Scraping 2023-12 (Thread: 38477631)
⚠️  No job postings found
⏱️  Waiting 5 seconds...

📅 Scraping 2024-01 (Thread: 38845878)
❌ Error fetching thread 38845878: 403 Client Error: Forbidden for url: https://news.ycombinator.com/item?id=38845878
⏱️  Waiting 5 seconds...

📅 Scraping 2024-02 (Thread: 39217462)
⚠️  No job postings found
⏱️  Waiting 5 seconds...

📅 Scraping 2024-03 (Thread: 39563824)
✅ Extracted 1