# 02 - Adzuna API Data Collection

## Objective
Collect tech job postings from Adzuna API (2023-2024) to enhance our HackerNews dataset.


## Expected Output
- 2,000-5,000 tech job postings
- Focus: AI, ML, Software Engineering roles
- Time period: 2023-2024


## 1. Setup & Configuration


In [1]:
import requests
import pandas as pd
import time
import json
from datetime import datetime, timedelta
import os
from typing import Dict, List, Optional
import re

print(" Libraries imported successfully")


 Libraries imported successfully


In [2]:
# API Configuration, # I know this is not a good practice, but I don't want to pay for a API key, it's just for a project and a free tier api so no big deal, could use .env file if needed and inhance the project further
APP_ID = '4ad8a509'
APP_KEY = '01485b2cbeba2ea07f3ed7182a919f2f'
BASE_URL = 'https://api.adzuna.com/v1/api/jobs/us/search'

# Rate Limiting (Adzuna limits: 25/min, 250/day)
REQUEST_DELAY = 5  # seconds between requests (safe: 12 requests/min)
RESULTS_PER_PAGE = 50  # max 50 per Adzuna docs
MAX_PAGES = 100  # collect up to 5,000 jobs

# Data directories
os.makedirs('data/raw', exist_ok=True)
os.makedirs('data/processed', exist_ok=True)

# Search keywords for tech jobs
TECH_KEYWORDS = [
    'artificial intelligence',
    'machine learning',
    'data scientist',
    'software engineer',
    'ML engineer',
    'AI engineer',
    'deep learning',
    'Python developer',
    'data engineer',
    'MLOps'
]

print("🔧 Configuration loaded")
print(f"   API Endpoint: {BASE_URL}")
print(f"   Rate limit: {60/REQUEST_DELAY:.0f} requests/min")
print(f"   Target: {MAX_PAGES} pages × {RESULTS_PER_PAGE} = {MAX_PAGES * RESULTS_PER_PAGE:,} jobs")


🔧 Configuration loaded
   API Endpoint: https://api.adzuna.com/v1/api/jobs/us/search
   Rate limit: 12 requests/min
   Target: 100 pages × 50 = 5,000 jobs


## 2. API Client Functions


In [4]:
def fetch_jobs_page(page: int, keyword: str, max_retries: int = 3) -> Optional[Dict]:
    """
    Fetch one page of job listings from Adzuna API.
    
    Args:
        page: Page number (1-indexed)
        keyword: Search keyword
        max_retries: Number of retry attempts on failure
    
    Returns:
        JSON response dict or None on failure
    """
    url = f"{BASE_URL}/{page}"
    
    params = {
        'app_id': APP_ID,
        'app_key': APP_KEY,
        'results_per_page': RESULTS_PER_PAGE,
        'what': keyword,
        'content-type': 'application/json',
        'sort_by': 'date'
    }
    
    for attempt in range(max_retries):
        try:
            response = requests.get(url, params=params, timeout=15)
            
            if response.status_code == 200:
                return response.json()
            
            elif response.status_code == 429:
                wait_time = (attempt + 1) * 10
                print(f"     Rate limit hit. Waiting {wait_time}s...")
                time.sleep(wait_time)
                continue
            
            elif response.status_code == 403:
                print(f"    Authentication error (403). Check API credentials.")
                return None
            
            else:
                print(f"     HTTP {response.status_code}: {response.text[:100]}")
                return None
                
        except requests.exceptions.Timeout:
            print(f"     Timeout (attempt {attempt + 1}/{max_retries})")
            time.sleep(5)
            
        except requests.exceptions.RequestException as e:
            print(f"    Request error: {e}")
            return None
    
    print(f"    Failed after {max_retries} attempts")
    return None


def parse_job_data(job: Dict) -> Dict:
    """
    Parse Adzuna job JSON into structured format.
    Aligns schema with HackerNews data for consistency.
    """
    description = job.get('description', '')
    title = job.get('title', '')
    
    full_text = f"{title} {description}".lower()
    
    ai_keywords = [
        'machine learning', 'ml engineer', 'ai engineer', 'artificial intelligence',
        'deep learning', 'nlp', 'natural language', 'computer vision', 'llm',
        'data scientist', 'pytorch', 'tensorflow', 'generative ai', 'gpt',
        'neural network', 'transformers', 'reinforcement learning', 'mlops'
    ]
    has_ai_keywords = any(kw in full_text for kw in ai_keywords)
    
    remote_keywords = ['remote', 'work from home', 'wfh', 'distributed', 'anywhere', 'telecommute']
    is_remote = any(kw in full_text for kw in remote_keywords)
    
    requires_python = 'python' in full_text
    requires_js = any(x in full_text for x in ['javascript', 'typescript', 'react', 'node.js', 'angular', 'vue'])
    
    salary_min = job.get('salary_min', None)
    salary_max = job.get('salary_max', None)
    
    if salary_min and salary_max:
        salary = f"${salary_min/1000:.0f}k-${salary_max/1000:.0f}k"
    elif salary_min:
        salary = f"${salary_min/1000:.0f}k+"
    elif salary_max:
        salary = f"up to ${salary_max/1000:.0f}k"
    else:
        salary = "Not specified"
    
    location_data = job.get('location', {})
    if isinstance(location_data, dict):
        area = location_data.get('display_name', '')
        location = area if area else "Not specified"
    else:
        location = str(location_data) if location_data else "Not specified"
    
    return {
        'job_id': job.get('id', ''),
        'company': job.get('company', {}).get('display_name', 'Not specified'),
        'role': title,
        'description': description[:1500],
        'has_ai_keywords': has_ai_keywords,
        'is_remote': is_remote,
        'location': location,
        'salary': salary,
        'salary_min': salary_min,
        'salary_max': salary_max,
        'requires_python': requires_python,
        'requires_js': requires_js,
        'created_date': job.get('created', ''),
        'redirect_url': job.get('redirect_url', ''),
        'category': job.get('category', {}).get('label', '') if isinstance(job.get('category'), dict) else '',
        'contract_type': job.get('contract_type', ''),
        'text_length': len(description),
        'source': 'adzuna',
        'scraped_date': datetime.now().strftime('%Y-%m-%d')
    }


def filter_date_range(job: Dict, start_date: str = '2023-01-01', end_date: str = '2024-12-31') -> bool:
    """
    Check if job posting is within target date range.
    """
    created = job.get('created_date', '')
    if not created:
        return False
    
    try:
        job_date = datetime.fromisoformat(created.replace('Z', '+00:00'))
        start = datetime.fromisoformat(start_date)
        end = datetime.fromisoformat(end_date)
        return start <= job_date <= end
    except:
        return True


print(" API client functions defined")


 API client functions defined


## 3. Data Collection

This will take approximately 8-10 minutes for 100 pages (5,000 jobs).


In [5]:
def collect_jobs(keywords: List[str], max_pages_per_keyword: int = 20) -> pd.DataFrame:
    """
    Collect job postings for multiple keywords.
    
    Args:
        keywords: List of search terms
        max_pages_per_keyword: Max pages to fetch per keyword
    
    Returns:
        DataFrame with all collected jobs
    """
    all_jobs = []
    seen_ids = set()
    
    print("\n" + "="*70)
    print(" Starting Adzuna API Data Collection")
    print("="*70)
    print(f"\n Configuration:")
    print(f"   Keywords: {len(keywords)}")
    print(f"   Pages per keyword: {max_pages_per_keyword}")
    print(f"   Max total jobs: ~{len(keywords) * max_pages_per_keyword * RESULTS_PER_PAGE:,}")
    print(f"   Estimated time: {len(keywords) * max_pages_per_keyword * REQUEST_DELAY / 60:.1f} minutes\n")
    
    start_time = time.time()
    
    for keyword_idx, keyword in enumerate(keywords, 1):
        print(f"\n{'─'*70}")
        print(f" Keyword {keyword_idx}/{len(keywords)}: '{keyword}'")
        print(f"{'─'*70}")
        
        keyword_jobs = 0
        keyword_duplicates = 0
        
        for page in range(1, max_pages_per_keyword + 1):
            print(f"\n    Page {page}/{max_pages_per_keyword}...", end=' ')
            
            response = fetch_jobs_page(page, keyword)
            
            if not response:
                print("Failed. Skipping.")
                continue
            
            results = response.get('results', [])
            
            if not results:
                print("No more results. Moving to next keyword.")
                break
            
            page_jobs = 0
            for job_raw in results:
                job = parse_job_data(job_raw)
                
                if job['job_id'] in seen_ids:
                    keyword_duplicates += 1
                    continue
                
                if not filter_date_range(job):
                    continue
                
                seen_ids.add(job['job_id'])
                all_jobs.append(job)
                page_jobs += 1
                keyword_jobs += 1
            
            print(f" {page_jobs} jobs (+{keyword_duplicates} dupes)")
            
            if page % 10 == 0 and all_jobs:
                checkpoint_df = pd.DataFrame(all_jobs)
                checkpoint_df.to_csv(f'data/raw/adzuna_checkpoint_{len(all_jobs)}.csv', index=False)
                print(f"       Checkpoint saved: {len(all_jobs)} total jobs")
            
            time.sleep(REQUEST_DELAY)
        
        print(f"\n   ✨ Keyword summary: {keyword_jobs} new jobs, {keyword_duplicates} duplicates")
    
    elapsed = time.time() - start_time
    
    print("\n" + "="*70)
    print(" Collection Complete!")
    print("="*70)
    print(f"   Total unique jobs: {len(all_jobs):,}")
    print(f"   Time elapsed: {elapsed/60:.1f} minutes")
    print(f"   Average rate: {len(all_jobs)/elapsed*60:.1f} jobs/minute")
    
    return pd.DataFrame(all_jobs)


df_adzuna = collect_jobs(TECH_KEYWORDS, max_pages_per_keyword=10)



 Starting Adzuna API Data Collection

 Configuration:
   Keywords: 10
   Pages per keyword: 10
   Max total jobs: ~5,000
   Estimated time: 8.3 minutes


──────────────────────────────────────────────────────────────────────
 Keyword 1/10: 'artificial intelligence'
──────────────────────────────────────────────────────────────────────

    Page 1/10...  50 jobs (+0 dupes)

    Page 2/10...  50 jobs (+0 dupes)

    Page 3/10...  50 jobs (+0 dupes)

    Page 4/10...  50 jobs (+0 dupes)

    Page 5/10...  50 jobs (+0 dupes)

    Page 6/10...  50 jobs (+0 dupes)

    Page 7/10...  50 jobs (+0 dupes)

    Page 8/10...  50 jobs (+0 dupes)

    Page 9/10...  50 jobs (+0 dupes)

    Page 10/10...  50 jobs (+0 dupes)
       Checkpoint saved: 500 total jobs

   ✨ Keyword summary: 500 new jobs, 0 duplicates

──────────────────────────────────────────────────────────────────────
 Keyword 2/10: 'machine learning'
──────────────────────────────────────────────────────────────────────

    Page 1/10

## 4. Data Quality & Validation


In [6]:
print("\n DATA QUALITY REPORT")
print("="*70)

print(f"\n1️  Dataset Overview:")
print(f"   Total jobs collected: {len(df_adzuna):,}")
print(f"   Unique companies: {df_adzuna['company'].nunique():,}")
print(f"   Date range: {df_adzuna['created_date'].min()[:10]} to {df_adzuna['created_date'].max()[:10]}")

print(f"\n2️  Data Completeness:")
print(f"   Jobs with salary info: {(df_adzuna['salary_min'].notna().sum() / len(df_adzuna) * 100):.1f}%")
print(f"   Jobs with location: {((df_adzuna['location'] != 'Not specified').sum() / len(df_adzuna) * 100):.1f}%")
print(f"   Jobs with description: {(df_adzuna['text_length'] > 100).sum() / len(df_adzuna) * 100:.1f}%")

print(f"\n3️  AI/ML Content:")
ai_count = df_adzuna['has_ai_keywords'].sum()
print(f"   AI-related jobs: {ai_count:,} ({ai_count/len(df_adzuna)*100:.1f}%)")
print(f"   Remote jobs: {df_adzuna['is_remote'].sum():,} ({df_adzuna['is_remote'].sum()/len(df_adzuna)*100:.1f}%)")

print(f"\n4️  Tech Stack Requirements:")
print(f"   Python: {df_adzuna['requires_python'].sum():,} ({df_adzuna['requires_python'].sum()/len(df_adzuna)*100:.1f}%)")
print(f"   JavaScript: {df_adzuna['requires_js'].sum():,} ({df_adzuna['requires_js'].sum()/len(df_adzuna)*100:.1f}%)")

print(f"\n5️  Salary Statistics:")
if df_adzuna['salary_min'].notna().any():
    print(f"   Median min salary: ${df_adzuna['salary_min'].median()/1000:.0f}k")
    print(f"   Median max salary: ${df_adzuna['salary_max'].median()/1000:.0f}k")
    print(f"   Avg salary range: ${df_adzuna['salary_min'].mean()/1000:.0f}k - ${df_adzuna['salary_max'].mean()/1000:.0f}k")

print(f"\n6️  Top Hiring Companies:")
top_companies = df_adzuna['company'].value_counts().head(10)
for company, count in top_companies.items():
    print(f"   {company[:40]}: {count} jobs")

print(f"\n7️ Top Locations:")
top_locations = df_adzuna['location'].value_counts().head(10)
for location, count in top_locations.items():
    if location != "Not specified":
        print(f"   {location[:40]}: {count} jobs")



 DATA QUALITY REPORT

1️  Dataset Overview:
   Total jobs collected: 3,691
   Unique companies: 1,424
   Date range: 2025-10-21 to 2025-10-24

2️  Data Completeness:
   Jobs with salary info: 100.0%
   Jobs with location: 100.0%
   Jobs with description: 100.0%

3️  AI/ML Content:
   AI-related jobs: 902 (24.4%)
   Remote jobs: 465 (12.6%)

4️  Tech Stack Requirements:
   Python: 97 (2.6%)
   JavaScript: 21 (0.6%)

5️  Salary Statistics:
   Median min salary: $125k
   Median max salary: $134k
   Avg salary range: $134k - $145k

6️  Top Hiring Companies:
   Oracle: 278 jobs
   Meta: 99 jobs
   Deloitte: 92 jobs
   Nelnet: 76 jobs
   DELOITTE: 65 jobs
   Contact Government Services, LLC: 51 jobs
   Launch Potato: 51 jobs
   Highmark Health: 50 jobs
   BOEING: 49 jobs
   Amazon: 48 jobs

7️ Top Locations:
   US: 202 jobs
   New York City, New York: 128 jobs
   San Francisco, California: 95 jobs
   Boston, Suffolk County: 66 jobs
   Atlanta, Fulton County: 59 jobs
   Chicago, Cook County:

## 5. Save Dataset


In [7]:
output_file = 'data/raw/adzuna_jobs_2023_2024.csv'
df_adzuna.to_csv(output_file, index=False)

print("\n DATASET SAVED")
print("="*70)
print(f"   File: {output_file}")
print(f"   Size: {os.path.getsize(output_file) / 1024 / 1024:.2f} MB")
print(f"   Records: {len(df_adzuna):,}")
print(f"   Columns: {len(df_adzuna.columns)}")

print(f"\n Schema:")
print(df_adzuna.dtypes)



 DATASET SAVED
   File: data/raw/adzuna_jobs_2023_2024.csv
   Size: 2.83 MB
   Records: 3,691
   Columns: 19

 Schema:
job_id              object
company             object
role                object
description         object
has_ai_keywords       bool
is_remote             bool
location            object
salary              object
salary_min         float64
salary_max         float64
requires_python       bool
requires_js           bool
created_date        object
redirect_url        object
category            object
contract_type       object
text_length          int64
source              object
scraped_date        object
dtype: object


## 6. Sample Data Preview


In [8]:
print("\n SAMPLE JOB POSTINGS")
print("="*70)

ai_jobs = df_adzuna[df_adzuna['has_ai_keywords']].head(5)

for idx, job in ai_jobs.iterrows():
    remote = " Remote" if job['is_remote'] else " Onsite"
    print(f"\n{'─'*70}")
    print(f" {job['company']}")
    print(f"   Role: {job['role'][:60]}")
    print(f"   Location: {job['location'][:40]}")
    print(f"   Salary: {job['salary']}")
    print(f"   Type: {remote}")
    print(f"   Posted: {job['created_date'][:10]}")
    print(f"   Tech: {'Python' if job['requires_python'] else ''} {'JavaScript' if job['requires_js'] else ''}")

print("\n\n DataFrame Preview:")
df_adzuna[['company', 'role', 'location', 'salary', 'has_ai_keywords', 'is_remote']].head(10)



 SAMPLE JOB POSTINGS

──────────────────────────────────────────────────────────────────────
 GoPuff
   Role: Retail Sales Associate, Ballard
   Location: Seattle, King County
   Salary: $40k-$40k
   Type:  Onsite
   Posted: 2025-10-24
   Tech:  

──────────────────────────────────────────────────────────────────────
 Pryon
   Role: Machine Learning Engineer - Modeling
   Location: Washington, Washington, D.C.
   Salary: $137k-$137k
   Type:  Onsite
   Posted: 2025-10-24
   Tech:  

──────────────────────────────────────────────────────────────────────
 Chef Robotics
   Role: Senior Robotics Release and Test Engineer
   Location: San Francisco, California
   Salary: $164k-$164k
   Type:  Onsite
   Posted: 2025-10-24
   Tech:  

──────────────────────────────────────────────────────────────────────
 SP+
   Role: Hotel Valet Supervisor - Driving
   Location: Roanoke, Roanoke County
   Salary: $37k-$37k
   Type:  Onsite
   Posted: 2025-10-24
   Tech:  

──────────────────────────────────

Unnamed: 0,company,role,location,salary,has_ai_keywords,is_remote
0,Intermountain Health,Director Nursing Peri-Op Services,"Wheat Ridge, Jefferson County",$159k-$159k,False,False
1,Intermountain Health,Angio Interventionalist,"Glendale, Denver",$100k-$100k,False,False
2,Intermountain Health,Radiology Technologist,"Brighton, Adams County",$81k-$81k,False,False
3,Intermountain Health,MRI Technologist,"Wheat Ridge, Jefferson County",$99k-$99k,False,False
4,Intermountain Health,Radiology Technologist Travel Team,"Lafayette, Boulder County",$91k-$91k,False,False
5,Intermountain Health,"Cardiovascular Technologist EP $15,000 Sign On...","Glendale, Denver",$65k-$65k,False,False
6,Intermountain Health,Registered Nurse Telemetry,"Lafayette, Boulder County",$124k-$124k,False,False
7,Heart and Vascular - Denver - Intermountain He...,Cardio-Thoracic Surgery - Physician,"Glendale, Denver",$135k-$135k,False,False
8,Intermountain Health,CT Technologist Travel Team,"Glendale, Denver",$98k-$98k,False,False
9,Intermountain Health,MRI Technologist,"Boise, Ada County",$123k-$123k,False,False


## 7. Comparison with HackerNews Data


In [9]:
try:
    df_hn = pd.read_csv('data/raw/hn_jobs_combined.csv')

    print("\n DATASET COMPARISON: Adzuna vs HackerNews")
    print("="*70)
    
    comparison = pd.DataFrame({
        'Metric': [
            'Total Jobs',
            'AI/ML Jobs',
            'AI/ML Percentage',
            'Remote Jobs',
            'Remote Percentage',
            'Python Required',
            'JavaScript Required'
        ],
        'Adzuna': [
            f"{len(df_adzuna):,}",
            f"{df_adzuna['has_ai_keywords'].sum():,}",
            f"{df_adzuna['has_ai_keywords'].sum()/len(df_adzuna)*100:.1f}%",
            f"{df_adzuna['is_remote'].sum():,}",
            f"{df_adzuna['is_remote'].sum()/len(df_adzuna)*100:.1f}%",
            f"{df_adzuna['requires_python'].sum():,}",
            f"{df_adzuna['requires_js'].sum():,}"
        ],
        'HackerNews': [
            f"{len(df_hn):,}",
            f"{df_hn['has_ai_keywords'].sum():,}",
            f"{df_hn['has_ai_keywords'].sum()/len(df_hn)*100:.1f}%",
            f"{df_hn['is_remote'].sum():,}",
            f"{df_hn['is_remote'].sum()/len(df_hn)*100:.1f}%",
            f"{df_hn['requires_python'].sum():,}",
            f"{df_hn['requires_js'].sum():,}"
        ]
    })
    
    print(comparison.to_string(index=False))
    
    print("\n Key Insights:")
    print(f"   • Adzuna provides {len(df_adzuna)/len(df_hn):.1f}x more data than HN")
    print(f"   • Adzuna has more structured salary data")
    print(f"   • Combined dataset: {len(df_adzuna) + len(df_hn):,} total job postings")
    print(f"   • Broader validation dataset for trend analysis")
    
except FileNotFoundError:
    print("\n  HN data not found for comparison")



 DATASET COMPARISON: Adzuna vs HackerNews
             Metric Adzuna HackerNews
         Total Jobs  3,691        711
         AI/ML Jobs    902        161
   AI/ML Percentage  24.4%      22.6%
        Remote Jobs    465        430
  Remote Percentage  12.6%      60.5%
    Python Required     97        151
JavaScript Required     21        241

 Key Insights:
   • Adzuna provides 5.2x more data than HN
   • Adzuna has more structured salary data
   • Combined dataset: 4,402 total job postings
   • Broader validation dataset for trend analysis
