# 03 - Data Cleaning & Integration

## Objective
Combine and clean job posting data from HackerNews and Adzuna sources.

## Tasks
- Load all raw CSV files
- Validate data quality
- Concatenate into single DataFrame
- Remove duplicate postings
- Handle missing values
- Standardize date formats

## Expected Output
- Clean, unified dataset: `data/processed/jobs_cleaned.csv`
- Data quality report
- 3,500-4,000 validated job postings


## 1. Environment Setup


In [1]:
import pandas as pd
import numpy as np
from datetime import datetime
import re
import os
from difflib import SequenceMatcher
import warnings
warnings.filterwarnings('ignore')

# Notebook-wide pandas display settings: no cap on the number of columns shown,
# no fixed display width, and cell text limited to 100 characters.
for option_name, option_value in (
    ('display.max_columns', None),
    ('display.width', None),
    ('display.max_colwidth', 100),
):
    pd.set_option(option_name, option_value)

# Record the library versions this run used.
print(" Libraries imported successfully")
for library_label, library in (("Pandas", pd), ("NumPy", np)):
    print(f"   {library_label} version: {library.__version__}")


✅ Libraries imported successfully
   Pandas version: 2.3.3
   NumPy version: 2.3.4


## 2. Load Raw Data


In [2]:
print(" Loading raw data files...\n")

# One CSV per source; paths are relative to the notebook's working directory.
df_adzuna = pd.read_csv('data/raw/adzuna_jobs_2023_2024.csv')
df_hn = pd.read_csv('data/raw/hn_jobs_combined.csv')

banner = "=" * 70
print(banner)
print(" RAW DATA LOADED")
print(banner)

# Same three size figures for each source: row count, column count, deep memory use.
for dataset_label, frame in (("Adzuna", df_adzuna), ("HackerNews", df_hn)):
    megabytes = frame.memory_usage(deep=True).sum() / 1024**2
    print(f"\n {dataset_label} Dataset:")
    print(f"   Records: {len(frame):,}")
    print(f"   Columns: {len(frame.columns)}")
    print(f"   Memory: {megabytes:.2f} MB")

print(f"\n Combined (before cleaning): {len(df_adzuna) + len(df_hn):,} records")


 Loading raw data files...

 RAW DATA LOADED

 Adzuna Dataset:
   Records: 3,691
   Columns: 19
   Memory: 6.35 MB

 HackerNews Dataset:
   Records: 711
   Columns: 14
   Memory: 1.22 MB

 Combined (before cleaning): 4,402 records


In [3]:
# Compute the column lists and sets once, then report overlap and differences.
adzuna_columns = df_adzuna.columns.tolist()
hn_columns = df_hn.columns.tolist()
adzuna_set, hn_set = set(adzuna_columns), set(hn_columns)
common = adzuna_set & hn_set

print("\n Schema Comparison:\n")

print("Adzuna columns:")
print(adzuna_columns)

print("\nHackerNews columns:")
print(hn_columns)

print("\nCommon columns:")
print(sorted(common))

print("\nAdzuna-only columns:")
print(sorted(adzuna_set - hn_set))

print("\nHackerNews-only columns:")
print(sorted(hn_set - adzuna_set))



 Schema Comparison:

Adzuna columns:
['job_id', 'company', 'role', 'description', 'has_ai_keywords', 'is_remote', 'location', 'salary', 'salary_min', 'salary_max', 'requires_python', 'requires_js', 'created_date', 'redirect_url', 'category', 'contract_type', 'text_length', 'source', 'scraped_date']

HackerNews columns:
['comment_id', 'company', 'role', 'description', 'has_ai_keywords', 'is_remote', 'location', 'salary', 'requires_python', 'requires_js', 'text_length', 'month', 'thread_id', 'scraped_date']

Common columns:
['company', 'description', 'has_ai_keywords', 'is_remote', 'location', 'requires_js', 'requires_python', 'role', 'salary', 'scraped_date', 'text_length']

Adzuna-only columns:
['category', 'contract_type', 'created_date', 'job_id', 'redirect_url', 'salary_max', 'salary_min', 'source']

HackerNews-only columns:
['comment_id', 'month', 'thread_id']


## 3. Data Quality Assessment


In [7]:
def assess_data_quality(df, name):
    """Print a data quality report for ``df`` and return its missing-value table.

    The report has four sections: missing values per column, a count of columns
    per dtype, fully duplicated rows, and statistics for the ``company``,
    ``role`` and ``description`` columns (each only when the column exists).

    Args:
        df: DataFrame to inspect.
        name: Label printed in the report header.

    Returns:
        DataFrame indexed by column name with ``Missing`` (null count) and
        ``Percentage`` (share of rows, rounded to 2 decimals) columns, limited
        to columns with at least one null and sorted by ``Missing`` descending.
    """
    n_rows = len(df)
    banner = '=' * 70

    def share(count):
        # Percentage of all rows represented by `count`.
        return count / n_rows * 100

    print("\n" + banner)
    print(f" DATA QUALITY REPORT: {name}")
    print(banner)

    # Section 1: per-column null counts, keeping only columns that have nulls.
    print(f"\n1⃣  Missing Values:")
    null_counts = df.isnull().sum()
    report = pd.DataFrame({
        'Missing': null_counts,
        'Percentage': (null_counts / n_rows * 100).round(2),
    })
    report = report.loc[report['Missing'] > 0].sort_values('Missing', ascending=False)

    if report.empty:
        print("    No missing values detected")
    else:
        print(report.to_string())

    # Section 2: how many columns there are of each dtype.
    print(f"\n2️  Data Types:")
    print(df.dtypes.value_counts().to_string())

    # Section 3: rows identical to an earlier row in every column.
    print(f"\n3⃣  Duplicates:")
    duplicates = df.duplicated().sum()
    print(f"   Exact duplicates: {duplicates:,} ({share(duplicates):.2f}%)")

    # Section 4: placeholder and length statistics for the text columns.
    print(f"\n4️  Text Field Statistics:")
    if 'company' in df.columns:
        print(f"   Unique companies: {df['company'].nunique():,}")

    for column, plural in (('company', 'companies'), ('role', 'roles')):
        if column in df.columns:
            placeholder_count = (df[column] == 'Not specified').sum()
            print(f"   'Not specified' {plural}: {placeholder_count:,} ({share(placeholder_count):.2f}%)")

    if 'description' in df.columns:
        lengths = df['description'].str.len()
        print(f"   Avg description length: {lengths.mean():.0f} chars")
        short_desc = (lengths < 150).sum()
        print(f"   Short descriptions (<150 chars): {short_desc:,} ({share(short_desc):.2f}%)")

    return report

# Print the quality report for each raw source. The final call is the cell's last
# expression, so its returned missing-value table is also shown as the cell output.
assess_data_quality(df_adzuna, "Adzuna")
assess_data_quality(df_hn, "HackerNews")



 DATA QUALITY REPORT: Adzuna

1️⃣  Missing Values:
               Missing  Percentage
contract_type     3446       93.36
salary_max           2        0.05

2️⃣  Data Types:
object     11
bool        4
int64       2
float64     2

3️⃣  Duplicates:
   Exact duplicates: 0 (0.00%)

4️⃣  Text Field Statistics:
   Unique companies: 1,424
   'Not specified' companies: 0 (0.00%)
   'Not specified' roles: 0 (0.00%)
   Avg description length: 500 chars
   Short descriptions (<150 chars): 0 (0.00%)

 DATA QUALITY REPORT: HackerNews

1️⃣  Missing Values:
   ✅ No missing values detected

2️⃣  Data Types:
object    7
bool      4
int64     3

3️⃣  Duplicates:
   Exact duplicates: 0 (0.00%)

4️⃣  Text Field Statistics:
   Unique companies: 635
   'Not specified' companies: 0 (0.00%)
   'Not specified' roles: 90 (12.66%)
   Avg description length: 930 chars
   Short descriptions (<150 chars): 0 (0.00%)


Unnamed: 0,Missing,Percentage


## 4. HackerNews Data Filtering

Remove invalid entries that were incorrectly captured during scraping.


In [8]:
def validate_job_posting(row):
    """Return True when ``row`` looks like a real job posting, else False.

    A row is rejected when any of the following holds:
      * ``description`` is missing or shorter than 150 characters;
      * ``role`` or ``company`` is missing or equal to 'Not specified';
      * fewer than two of the job-related keywords occur in the lowercased
        description (substring match);
      * the lowercased description begins with one of the conversational
        openers listed in ``question_starts``.

    Args:
        row: Mapping-like record (e.g. a DataFrame row) with ``description``,
            ``role`` and ``company`` entries.
    """
    raw_description = row['description']
    if pd.isna(raw_description) or len(str(raw_description)) < 150:
        return False

    # Role is checked before company; both must be present and not the placeholder.
    for field in ('role', 'company'):
        value = row[field]
        if value == 'Not specified' or pd.isna(value):
            return False

    text = str(raw_description).lower()

    job_keywords = (
        'hiring', 'looking for', 'seeking', 'position', 'role', 'job',
        'engineer', 'developer', 'designer', 'manager', 'analyst',
        'salary', 'compensation', 'benefits', 'apply', 'resume',
        'full-time', 'part-time', 'contract', 'remote',
    )

    # Count distinct keywords present; a single hit is treated as too weak a signal.
    keyword_hits = 0
    for keyword in job_keywords:
        if keyword in text:
            keyword_hits += 1
    if keyword_hits < 2:
        return False

    question_starts = (
        "i'd argue", "if you", "does anyone", "can someone",
        "what do you", "has anyone", "why do", "how do",
    )

    # Only the opening of the text is inspected for discussion-style openers.
    return not text[:100].startswith(question_starts)

print("🔍 Validating HackerNews job postings...\n")

initial_count = len(df_hn)

# Keep the validity mask as its own Series instead of adding a helper column to
# the raw frame, so df_hn stays exactly as loaded and the cell is safe to re-run.
if initial_count:
    is_valid = df_hn.apply(validate_job_posting, axis=1).astype(bool)
else:
    # apply(axis=1) on an empty frame does not yield a boolean Series.
    is_valid = pd.Series([], index=df_hn.index, dtype=bool)

invalid_count = int((~is_valid).sum())
invalid_pct = invalid_count / initial_count * 100 if initial_count else 0.0
print(f"️  Invalid entries found: {invalid_count:,} ({invalid_pct:.1f}%)\n")

print("Sample invalid entries:")
invalid_sample = df_hn[~is_valid].head(3)
for _, row in invalid_sample.iterrows():
    # Rows can be invalid precisely because company/role are missing, so every
    # field is converted with str() before slicing (NaN is a float, not a string).
    print(f"\n   • Company: {str(row['company'])[:50]}")
    print(f"     Role: {str(row['role'])[:50]}")
    print(f"     Description (first 100 chars): {str(row['description'])[:100]}...")

df_hn_filtered = df_hn[is_valid].copy()

removed_count = initial_count - len(df_hn_filtered)
removed_pct = removed_count / initial_count * 100 if initial_count else 0.0

print(f"\n Filtered HackerNews dataset:")
print(f"   Before: {initial_count:,} records")
print(f"   After: {len(df_hn_filtered):,} records")
print(f"   Removed: {removed_count:,} records ({removed_pct:.1f}%)")


🔍 Validating HackerNews job postings...

️  Invalid entries found: 100 (14.1%)

Sample invalid entries:

   • Company: I'd argue you're creating a
     Role: Not specified
     Description (first 100 chars): I'd argue you're creating a bit of a false dichotomy there. I completely agree that the public is ba...

   • Company: If the ones who are
     Role: Not specified
     Description (first 100 chars): If the ones who are supposed to be informing the masses are deliberately feeding them information th...

   • Company: worse yet, there is a
     Role: very short sighted.
     Description (first 100 chars): worse yet, there is a very vocal set of people (many on HN) that want to force you to develop your l...

 Filtered HackerNews dataset:
   Before: 711 records
   After: 611 records
   Removed: 100 records (14.1%)


## 5. Schema Standardization

Align column names and add missing fields to create unified schema.


In [9]:
print("🔧 Standardizing schemas...\n")

# rename() returns new frames, so the raw/filtered inputs are left untouched.
df_adzuna_std = df_adzuna.rename(columns={'job_id': 'id'})
df_hn_std = df_hn_filtered.rename(columns={'comment_id': 'id'})

# Prefix ids with their source so they cannot collide once the frames are combined.
df_hn_std['id'] = 'hn_' + df_hn_std['id'].astype(str)
df_adzuna_std['id'] = 'adz_' + df_adzuna_std['id'].astype(str)

if 'source' not in df_hn_std.columns:
    df_hn_std['source'] = 'hackernews'

# The unified schema is the union of both column sets, in alphabetical order.
all_columns = sorted(set(df_adzuna_std.columns) | set(df_hn_std.columns))

# reindex() selects the columns in that order and creates any column a frame
# lacks, filled with NaN. This replaces hand-maintained per-source column lists.
df_adzuna_std = df_adzuna_std.reindex(columns=all_columns)
df_hn_std = df_hn_std.reindex(columns=all_columns)

print(" Schema standardization complete\n")
print(f"   Unified columns: {len(all_columns)}")
print(f"   Column order: alphabetical")
print(f"\n   Columns: {all_columns}")


🔧 Standardizing schemas...

 Schema standardization complete

   Unified columns: 21
   Column order: alphabetical

   Columns: ['category', 'company', 'contract_type', 'created_date', 'description', 'has_ai_keywords', 'id', 'is_remote', 'location', 'month', 'redirect_url', 'requires_js', 'requires_python', 'role', 'salary', 'salary_max', 'salary_min', 'scraped_date', 'source', 'text_length', 'thread_id']


## 6. Date Standardization


In [None]:
def standardize_dates(df, source_type):
    """Add unified, timezone-naive posting date columns to a copy of ``df``.

    Args:
        df: Source DataFrame. For ``'adzuna'`` it must have a ``created_date``
            column; for ``'hackernews'`` a string ``month`` column to which
            ``'-01'`` is appended before parsing.
        source_type: Either ``'adzuna'`` or ``'hackernews'``.

    Returns:
        A copy of ``df`` with four added columns: ``posting_date`` (datetime,
        NaT where parsing failed), ``posting_year``, ``posting_month`` and
        ``posting_year_month`` (``'YYYY-MM'`` string, missing where the date
        is missing).

    Raises:
        ValueError: If ``source_type`` is not one of the supported values.
    """
    df = df.copy()

    if source_type == 'adzuna':
        # Parse as UTC so mixed offsets are normalised, then drop the timezone.
        parsed = pd.to_datetime(df['created_date'], errors='coerce', utc=True)
        df['posting_date'] = parsed.dt.tz_localize(None)
    elif source_type == 'hackernews':
        # Only a month is available; pin each posting to the first day of that month.
        df['posting_date'] = pd.to_datetime(df['month'] + '-01', errors='coerce')
    else:
        # Without this, the failure would surface later as KeyError: 'posting_date'.
        raise ValueError(
            f"Unknown source_type {source_type!r}; expected 'adzuna' or 'hackernews'"
        )

    df['posting_year'] = df['posting_date'].dt.year
    df['posting_month'] = df['posting_date'].dt.month
    # strftime leaves missing dates as missing values. Converting a period to str
    # would produce the literal text 'NaT', which null checks do not detect and
    # which would be counted as a distinct month.
    df['posting_year_month'] = df['posting_date'].dt.strftime('%Y-%m')

    return df

print(" Standardizing dates...\n")

df_adzuna_std = standardize_dates(df_adzuna_std, 'adzuna')
df_hn_std = standardize_dates(df_hn_std, 'hackernews')

print(" Date standardization complete\n")

# Same three figures for each source: earliest date, latest date, unparsed count.
for heading, frame in (
    ("Adzuna date range:", df_adzuna_std),
    ("\nHackerNews date range:", df_hn_std),
):
    dates = frame['posting_date']
    print(heading)
    print(f"   Min: {dates.min()}")
    print(f"   Max: {dates.max()}")
    print(f"   Null dates: {dates.isnull().sum()}")

print("\n Records by year-month:")

# Only the first ten months are listed for Adzuna; HackerNews is listed in full.
adzuna_monthly = df_adzuna_std['posting_year_month'].value_counts().sort_index()
print("\nAdzuna:")
print(adzuna_monthly.head(10).to_string())

hn_monthly = df_hn_std['posting_year_month'].value_counts().sort_index()
print("\nHackerNews:")
print(hn_monthly.to_string())


 Standardizing dates...

 Date standardization complete

Adzuna date range:
   Min: 2025-10-21 19:26:32+00:00
   Max: 2025-10-24 13:25:49+00:00
   Null dates: 0

HackerNews date range:
   Min: 2024-05-01 00:00:00
   Max: 2024-10-01 00:00:00
   Null dates: 0

 Records by year-month:

Adzuna:
posting_year_month
2025-10    3691

HackerNews:
posting_year_month
2024-05    331
2024-10    280


## 7. Combine Datasets


In [1]:
print("🔗 Combining datasets...\n")

# Stack the two aligned frames and renumber the rows from zero.
df_combined = pd.concat([df_adzuna_std, df_hn_std], ignore_index=True)

total_records = len(df_combined)
adzuna_records = len(df_adzuna_std)
hn_records = len(df_hn_std)

banner = "=" * 70
print(banner)
print(" COMBINED DATASET")
print(banner)

print(f"\n Total records: {total_records:,}")
print(f"   Adzuna: {adzuna_records:,} ({adzuna_records/total_records*100:.1f}%)")
print(f"   HackerNews: {hn_records:,} ({hn_records/total_records*100:.1f}%)")

earliest = df_combined['posting_date'].min()
latest = df_combined['posting_date'].max()
print(f"\n Date range: {earliest.date()} to {latest.date()}")
print(f"   Total span: {(latest - earliest).days} days")

ai_total = df_combined['has_ai_keywords'].sum()
remote_total = df_combined['is_remote'].sum()
print(f"\n Unique companies: {df_combined['company'].nunique():,}")
print(f" AI/ML jobs: {ai_total:,} ({ai_total/total_records*100:.1f}%)")
print(f" Remote jobs: {remote_total:,} ({remote_total/total_records*100:.1f}%)")


🔗 Combining datasets...



NameError: name 'pd' is not defined

## 8. Deduplication

Remove duplicate job postings using multiple criteria.


In [None]:
def similarity_ratio(str1, str2):
    """Return the case-insensitive similarity of two values as a float in [0, 1].

    Args:
        str1: First value; converted with ``str()`` before comparison.
        str2: Second value; converted with ``str()`` before comparison.

    Returns:
        ``0.0`` if either value is missing (``pd.isna``); otherwise the
        ``difflib.SequenceMatcher`` ratio of the two lowercased strings.
    """
    if pd.isna(str1) or pd.isna(str2):
        return 0.0

    left = str(str1).lower()
    right = str(str2).lower()

    # Identical texts need no alignment.
    if left == right:
        return 1.0

    # autojunk must be off: for sequences of 200+ items SequenceMatcher's default
    # heuristic ignores characters that occur frequently, which in ordinary prose
    # is most of the alphabet. Near-identical long descriptions can then score
    # close to 0 and escape deduplication. Exact matching is slower but correct.
    return SequenceMatcher(None, left, right, autojunk=False).ratio()

print(" Identifying duplicates...\n")

initial_count = len(df_combined)

# Step 1: rows identical in every column.
print("Step 1: Exact duplicates (all fields)")
exact_dupes = df_combined.duplicated().sum()
print(f"   Found: {exact_dupes:,} exact duplicates")
df_combined = df_combined.drop_duplicates()
print(f"   Remaining: {len(df_combined):,}")

# Step 2: rows sharing an id; the first occurrence is kept.
print("\nStep 2: Duplicate IDs")
id_dupes = df_combined.duplicated(subset=['id']).sum()
print(f"   Found: {id_dupes:,} duplicate IDs")
df_combined = df_combined.drop_duplicates(subset=['id'])
print(f"   Remaining: {len(df_combined):,}")

# Step 3: rows sharing company, role and location; the first occurrence is kept.
print("\nStep 3: Duplicate company + role + location")
business_key_dupes = df_combined.duplicated(subset=['company', 'role', 'location']).sum()
print(f"   Found: {business_key_dupes:,} duplicates")
df_combined = df_combined.drop_duplicates(subset=['company', 'role', 'location'])
print(f"   Remaining: {len(df_combined):,}")

# Step 4: within each company, drop postings whose description is more than 90%
# similar to an earlier (by posting_date) posting that is being kept.
print("\nStep 4: Near-duplicate descriptions (company match + high similarity)")
print("   Checking for description similarity within same company...")

df_combined = df_combined.sort_values(['company', 'posting_date']).reset_index(drop=True)

to_remove = set()

# A single groupby pass replaces re-filtering the whole frame once per company.
# Rows with a missing company fall into no group and are never compared.
for _, company_jobs in df_combined.groupby('company', sort=False):
    if len(company_jobs) < 2:
        continue

    indices = company_jobs.index.tolist()
    descriptions = company_jobs['description'].tolist()

    for i, keep_index in enumerate(indices):
        # A row already marked as a duplicate is not used as a reference.
        if keep_index in to_remove:
            continue

        for j in range(i + 1, len(indices)):
            candidate_index = indices[j]
            if candidate_index in to_remove:
                continue

            if similarity_ratio(descriptions[i], descriptions[j]) > 0.90:
                to_remove.add(candidate_index)

fuzzy_dupes = len(to_remove)
print(f"   Found: {fuzzy_dupes:,} near-duplicates (>90% similar)")

df_combined = df_combined[~df_combined.index.isin(to_remove)].reset_index(drop=True)
print(f"   Remaining: {len(df_combined):,}")

total_removed = initial_count - len(df_combined)
# Guard the percentage so an empty input frame does not raise ZeroDivisionError.
removed_pct = total_removed / initial_count * 100 if initial_count else 0.0

print("\n" + "="*70)
print(" DEDUPLICATION SUMMARY")
print("="*70)
print(f"\n   Initial records: {initial_count:,}")
print(f"   Final records: {len(df_combined):,}")
print(f"   Total removed: {total_removed:,} ({removed_pct:.1f}%)")
print(f"\n   Breakdown:")
print(f"     - Exact duplicates: {exact_dupes:,}")
print(f"     - Duplicate IDs: {id_dupes:,}")
print(f"     - Business key duplicates: {business_key_dupes:,}")
print(f"     - Fuzzy description duplicates: {fuzzy_dupes:,}")


## 9. Missing Value Handling


In [None]:
print("🔧 Handling missing values...\n")

print("Missing value counts before handling:")
null_counts_before = df_combined.isnull().sum()
missing_before = null_counts_before[null_counts_before > 0].sort_values(ascending=False)
print(missing_before.to_string())

# Optional text columns get a visible placeholder; free-text columns get an empty string.
text_fields = ['location', 'salary', 'category', 'contract_type']
fill_values = {
    **dict.fromkeys(text_fields, 'Not specified'),
    'description': '',
    'redirect_url': '',
}
for column, replacement in fill_values.items():
    if column in df_combined.columns:
        df_combined[column] = df_combined[column].fillna(replacement)

# Rows lacking any of these fields are dropped rather than filled.
critical_fields = ['company', 'role', 'source', 'posting_date']
for field in critical_fields:
    if field not in df_combined.columns:
        continue
    missing = df_combined[field].isnull().sum()
    if missing == 0:
        continue
    print(f"\n⚠  WARNING: {missing} records missing critical field '{field}'")
    print("   These will be removed.")
    df_combined = df_combined.dropna(subset=[field])

print("\n" + "="*70)
print(" MISSING VALUE HANDLING COMPLETE")
print("="*70)

print("\nMissing value counts after handling:")
null_counts_after = df_combined.isnull().sum()
missing_after = null_counts_after[null_counts_after > 0].sort_values(ascending=False)
if missing_after.empty:
    print("    No critical missing values remaining")
else:
    print(missing_after.to_string())
    print("\n   Note: These are intentionally kept as NaN (e.g., salary data)")

print(f"\n   Final record count: {len(df_combined):,}")


## 10. Data Validation


In [None]:
print(" Running final data validation...\n")

print("="*70)
print(" FINAL VALIDATION CHECKS")
print("="*70)

checks_passed = 0
checks_total = 0

# Check 1: every record has a distinct id.
checks_total += 1
unique_ids = df_combined['id'].nunique()
if unique_ids == len(df_combined):
    print(f"\n Check 1: All IDs unique ({unique_ids:,} records)")
    checks_passed += 1
else:
    print(f"\n Check 1: FAILED - Duplicate IDs found")

# Check 2: no nulls in the fields every record must carry.
checks_total += 1
missing_critical = df_combined[['company', 'role', 'description', 'source']].isnull().any(axis=1).sum()
if missing_critical == 0:
    print(f" Check 2: No missing critical fields")
    checks_passed += 1
else:
    print(f" Check 2: FAILED - {missing_critical} records missing critical fields")

# Check 3: every record has a parsed posting date.
checks_total += 1
valid_dates = df_combined['posting_date'].notna().sum()
if valid_dates == len(df_combined):
    print(f" Check 3: All dates valid ({valid_dates:,} records)")
    checks_passed += 1
else:
    print(f"️  Check 3: {len(df_combined) - valid_dates} invalid dates")

# Check 4: flag columns that are present must have bool dtype.
checks_total += 1
flag_columns = ['has_ai_keywords', 'is_remote', 'requires_python', 'requires_js']
valid_booleans = all(
    df_combined[col].dtype == bool
    for col in flag_columns
    if col in df_combined.columns
)
if valid_booleans:
    print(f" Check 4: Boolean columns have correct type")
    checks_passed += 1
else:
    print(f" Check 4: FAILED - Boolean columns have incorrect type")

# Check 5: every description is non-empty. A string length can never be negative,
# so comparing with ">= 0" would always pass; "> 0" catches the empty strings
# that missing descriptions are filled with during missing-value handling.
checks_total += 1
min_desc_length = df_combined['description'].str.len().min()
if min_desc_length > 0:
    print(f" Check 5: Description lengths valid (min: {min_desc_length} chars)")
    checks_passed += 1
else:
    print(f" Check 5: FAILED - Invalid description lengths")

# Check 6: only the two known sources appear.
checks_total += 1
sources = df_combined['source'].unique()
if set(sources).issubset({'adzuna', 'hackernews'}):
    print(f" Check 6: Valid data sources: {list(sources)}")
    checks_passed += 1
else:
    print(f" Check 6: FAILED - Invalid sources found: {sources}")

print(f"\n{'='*70}")
print(f" VALIDATION RESULT: {checks_passed}/{checks_total} checks passed")
print(f"{'='*70}")

if checks_passed == checks_total:
    print("\n All validation checks passed! Data is ready for analysis.")
else:
    print(f"\n⚠  {checks_total - checks_passed} validation check(s) failed. Review issues above.")


In [None]:
# Column dtypes followed by the overall size of the cleaned frame.
record_count, column_count = df_combined.shape
memory_mb = df_combined.memory_usage(deep=True).sum() / 1024**2

print("\n Final Dataset Schema:\n")
print(df_combined.dtypes.to_string())

print(f"\n\n Dataset Shape: {df_combined.shape}")
print(f"   Records: {record_count:,}")
print(f"   Columns: {column_count}")
print(f"   Memory: {memory_mb:.2f} MB")


## 11. Save Cleaned Data


In [None]:
output_file = 'data/processed/jobs_cleaned.csv'

# Create the destination folder (taken from the output path) if it is absent.
os.makedirs(os.path.dirname(output_file), exist_ok=True)

# Write without the positional index so the CSV holds only data columns.
df_combined.to_csv(output_file, index=False)

size_mb = os.path.getsize(output_file) / 1024 / 1024

print(" CLEANED DATASET SAVED")
print("="*70)
print(f"\n   File: {output_file}")
print(f"   Size: {size_mb:.2f} MB")
print(f"   Records: {len(df_combined):,}")
print(f"   Columns: {len(df_combined.columns)}")

print(f"\n Phase 2.1 Complete: Data Cleaning")
print(f"\n   Next steps: Exploratory Data Analysis (04_eda.ipynb)")


## 12. Summary Statistics & Quality Report


In [None]:
print("\n" + "="*70)
print(" FINAL DATA QUALITY REPORT")
print("="*70)

total_records = len(df_combined)

# Masks of rows whose field holds a real value rather than the placeholder.
has_company = df_combined['company'] != 'Not specified'
has_role = df_combined['role'] != 'Not specified'
has_location = df_combined['location'] != 'Not specified'
has_description = df_combined['description'].str.len() > 0

print(f"\n1️  Dataset Size:")
print(f"   Total records: {total_records:,}")
print(f"   Adzuna records: {(df_combined['source'] == 'adzuna').sum():,}")
print(f"   HackerNews records: {(df_combined['source'] == 'hackernews').sum():,}")

earliest = df_combined['posting_date'].min()
latest = df_combined['posting_date'].max()
print(f"\n️⃣  Date Coverage:")
print(f"   Date range: {earliest.date()} to {latest.date()}")
print(f"   Total span: {(latest - earliest).days} days")
print(f"   Months covered: {df_combined['posting_year_month'].nunique()}")

print(f"\n3️  Companies & Roles:")
print(f"   Unique companies: {df_combined['company'].nunique():,}")
print(f"   Unique roles: {df_combined['role'].nunique():,}")
# Count distinct location values. Summing the mask would instead report how many
# rows have a location, which is already shown under "Complete location".
unique_locations = df_combined.loc[has_location, 'location'].nunique()
print(f"   Unique locations: {unique_locations:,}")

print(f"\n4⃣  Job Characteristics:")
ai_count = df_combined['has_ai_keywords'].sum()
print(f"   AI/ML jobs: {ai_count:,} ({ai_count/total_records*100:.1f}%)")
remote_count = df_combined['is_remote'].sum()
print(f"   Remote jobs: {remote_count:,} ({remote_count/total_records*100:.1f}%)")
python_count = df_combined['requires_python'].sum()
print(f"   Python required: {python_count:,} ({python_count/total_records*100:.1f}%)")
js_count = df_combined['requires_js'].sum()
print(f"   JavaScript required: {js_count:,} ({js_count/total_records*100:.1f}%)")

print(f"\n5️  Salary Information:")
has_salary = df_combined['salary_min'].notna().sum()
print(f"   Jobs with salary data: {has_salary:,} ({has_salary/total_records*100:.1f}%)")
if has_salary > 0:
    print(f"   Median min salary: ${df_combined['salary_min'].median()/1000:.0f}k")
    print(f"   Median max salary: ${df_combined['salary_max'].median()/1000:.0f}k")

print(f"\n6⃣  Data Completeness:")
for label, mask in (
    ('company', has_company),
    ('role', has_role),
    ('location', has_location),
    ('description', has_description),
):
    complete = mask.sum()
    print(f"   Complete {label}: {complete:,} ({complete/total_records*100:.1f}%)")

print(f"\n7️  Top Hiring Companies (Top 10):")
top_companies = df_combined['company'].value_counts().head(10)
for idx, (company, count) in enumerate(top_companies.items(), 1):
    # str() keeps the slice safe if a label is not a string.
    print(f"   {idx:2d}. {str(company)[:45]:<45} {count:>4} jobs")

print(f"\n8️  Top Locations (Top 10):")
top_locations = df_combined.loc[has_location, 'location'].value_counts().head(10)
for idx, (location, count) in enumerate(top_locations.items(), 1):
    print(f"   {idx:2d}. {str(location)[:45]:<45} {count:>4} jobs")

print("\n" + "="*70)
print(" DATA CLEANING PHASE COMPLETE")
print("="*70)


In [None]:
# Preview the first ten cleaned rows, restricted to the headline columns.
sample_cols = ['company', 'role', 'location', 'has_ai_keywords', 'is_remote', 'source', 'posting_date']
print("\n Sample cleaned records:\n")
df_combined.head(10).loc[:, sample_cols]
