In [1]:
# Step 1: Import required libraries
import pandas as pd
import numpy as np

# Step 2: Read the raw postings file into a DataFrame
df = pd.read_csv("postings.csv")

# Step 3: Handle missing data
# thresh=5 keeps only the rows that have at least 5 non-NA values.
df = df.dropna(thresh=5)

# Impute each salary column that is present with that column's own median.
# NOTE(review): the medians are computed before the title/description drop
# and the de-duplication below, so they include rows that are removed
# later -- confirm this ordering is intended.
salary_columns = ['normalized_salary', 'max_salary', 'min_salary', 'med_salary']
for salary_col in salary_columns:
    if salary_col not in df.columns:
        continue
    column_median = df[salary_col].median()
    df[salary_col] = df[salary_col].fillna(column_median)

# A posting is unusable without both a title and a description; after that,
# Step 4: collapse rows that are exact duplicates across every column.
df = df.dropna(subset=['title', 'description']).drop_duplicates()

# Step 5: Edit Metadata (type conversion, then renaming)
# listed_time is parsed as milliseconds since the Unix epoch; values that
# cannot be parsed become NaT because of errors='coerce'.
if 'listed_time' in df.columns:
    raw_listed_time = df['listed_time']
    df['listed_time'] = pd.to_datetime(raw_listed_time, unit='ms', errors='coerce')

# Give the key columns more descriptive names
column_renames = {
    'normalized_salary': 'salary_usd',
    'description': 'job_description',
    'title': 'job_title',
}
df = df.rename(columns=column_renames)

# Step 6: Apply Transformations

# Character count of each description (values are stringified first)
df['desc_length'] = df['job_description'].map(lambda text: len(str(text)))

# Create salary buckets
def salary_bucket(salary):
    """Map a salary figure to a coarse tier label.

    Returns 'Unknown' when the value is missing (as judged by ``pd.isna``),
    'Low' below 40000, 'Medium' from 40000 up to but not including 80000,
    and 'High' for anything at or above 80000.
    """
    if pd.isna(salary):
        return 'Unknown'

    # Exclusive upper bounds, checked in ascending order.
    for upper_bound, label in ((40000, 'Low'), (80000, 'Medium')):
        if salary < upper_bound:
            return label
    return 'High'

# Label every row with its salary tier.
# NOTE(review): normalized_salary is median-filled before it is renamed to
# salary_usd, so the 'Unknown' tier can only appear when the column had no
# non-missing values at all -- confirm that is the intended behaviour.
salary_tiers = df['salary_usd'].map(salary_bucket)
df['salary_bucket'] = salary_tiers

# Step 7: Write the cleaned frame to disk (without the index) for AutoML
output_path = "postings_cleaned_for_automl.csv"
df.to_csv(output_path, index=False)
print("✅ Cleaned dataset saved as 'postings_cleaned_for_automl.csv'")


✅ Cleaned dataset saved as 'postings_cleaned_for_automl.csv'
