In [None]:
# %% [markdown]
# # Strategic Workforce Analysis: AI Integration vs. Structural Risk (2010-2025)
# ## Task 1: Data Preprocessing & Outlier Handling

# %%
import pandas as pd
import numpy as np
import os

# --- Local Path Configuration ---
# The project root is taken to be the parent of the current working directory,
# so every path below depends on where the kernel was started.
BASE_DIR = os.path.split(os.getcwd())[0]
_data_root = os.path.join(BASE_DIR, 'data')

# Input CSV and the output directory for processed artifacts.
RAW_DATA_PATH = os.path.join(_data_root, 'raw', 'ai_impact_jobs_2010_2025.csv')
PROCESSED_DIR = os.path.join(_data_root, 'processed')

# %%
# 1. Load Dataset
# Fail fast when the raw file is missing. Merely printing a message would let
# the following statements run anyway, where they would either crash with an
# unrelated NameError on `df` or silently reuse a stale `df` left in the kernel.
if not os.path.exists(RAW_DATA_PATH):
    raise FileNotFoundError(f"Error: File not found at {RAW_DATA_PATH}")

df = pd.read_csv(RAW_DATA_PATH)
print(f"Raw Data Loaded from local. Shape: {df.shape}")

# 2. Basic Cleaning
# Replace missing values in the AI-related text columns with explicit
# placeholder labels (column name -> placeholder).
for column, placeholder in (('ai_skills', 'Not Specified'), ('ai_keywords', 'None')):
    df[column] = df[column].fillna(placeholder)

# 3. IQR-based Outlier Removal for Salary
# Keeping only salaries inside the 1.5 * IQR fences focuses the analysis on
# mainstream market behavior.
salary = df['salary_usd']
Q1, Q3 = salary.quantile(0.25), salary.quantile(0.75)
IQR = Q3 - Q1
lower_limit, upper_limit = Q1 - 1.5 * IQR, Q3 + 1.5 * IQR

# Both fences are inclusive. Rows whose salary is missing fail the comparison
# and are therefore excluded as well.
within_fences = salary.between(lower_limit, upper_limit)
df_cleaned = df.loc[within_fences].copy()

# %%
# 4. Save the Cleaned Dataset for subsequent tasks
# Writing one shared file keeps every later analysis branch on the same data.
CLEANED_FILE_PATH = os.path.join(PROCESSED_DIR, 'ai_impact_jobs_cleaned.csv')

# Create the output directory on first run; a no-op when it already exists.
os.makedirs(PROCESSED_DIR, exist_ok=True)
df_cleaned.to_csv(CLEANED_FILE_PATH, index=False)  # the row index is not written

print(f"Processed data saved to: {CLEANED_FILE_PATH}")

✅ Raw Data Loaded from local. Shape: (5000, 22)
Processed data saved to: /Users/miraekang/proyectos/eda/data/processed/ai_impact_jobs_cleaned.csv
