# Table of Contents
- [Import Packages](#Packages)
- [Data Cleaning](#Data-Cleaning)
- [Data Exploration](#Data-Exploration)
- [Models](#Models)
    - [OLS](#OLS)


## Packages

In [3]:
import os
import time
import csv
import subprocess
import psutil
from datetime import datetime
from fuzzywuzzy import fuzz,process
import pandas as pd


pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)

import numpy as np
import seaborn as sns
from feature_engine.imputation import CategoricalImputer

from boruta import BorutaPy

from fancyimpute import KNN
from sklearn.impute import SimpleImputer
from sklearn.ensemble import IsolationForest

from sklearn.ensemble import RandomForestClassifier,RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, f1_score, roc_auc_score, recall_score,confusion_matrix, ConfusionMatrixDisplay,roc_curve
from sklearn.inspection import PartialDependenceDisplay
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression


import category_encoders as ce
import zipfile
import sweetviz as sv

import xgboost as xgb
import optuna
import optuna.visualization as vis

import matplotlib.pyplot as plt

# Lookup from three-character education code to approximate years of schooling.
# NOTE(review): the keys look like IPUMS CPS EDUC codes — confirm against the
# extract's codebook. In particular "112" is listed while "122" is not; verify
# that this is not a typo.
edu_mapping = {
    "000": 0,   # NIU or no schooling
    "001": 0,   # NIU or blank
    "002": 0,   # None, preschool, or kindergarten

    # Grades 1–8
    # ("010", "020"/"030" share values with neighbouring codes — presumably
    #  aggregate "grades x–y" categories; verify)
    "010": 2,
    "011": 1,
    "012": 2,
    "013": 3,
    "014": 4,
    "020": 5,
    "021": 5,
    "022": 6,
    "030": 7,
    "031": 7,
    "032": 8,

    # High school
    "040": 9,
    "050": 10,
    "060": 11,
    "070": 12,
    "071": 11,
    "072": 12,
    "073": 12,

    # College
    "080": 13,
    "081": 13,
    "090": 14,
    "091": 14,
    "092": 14,
    "100": 15,
    "110": 16,
    "111": 16,
    "112": 17,
    "120": 17,
    "121": 18,
    "123": 18,
    "124": 19,
    "125": 20,

    # Missing
    "999": np.nan
}


def recode_educ_column(series: pd.Series) -> pd.Series:
    """Translate raw education codes into years of schooling via ``edu_mapping``.

    Args:
        series: Education codes as integers, floats (e.g. a column that
            contains NaN) or strings with or without leading zeros.

    Returns:
        A float Series aligned with ``series``. Codes that are missing,
        non-numeric, or absent from ``edu_mapping`` become NaN.
    """
    # Match on the numeric value of the code rather than on its string form:
    # a float column stringifies as "73.0", which would never equal "073" and
    # would silently turn the whole column into NaN.
    years_by_code = {int(code): years for code, years in edu_mapping.items()}
    numeric_codes = pd.to_numeric(series, errors="coerce")
    return numeric_codes.map(years_by_code)


# Bracket edges for ages; consecutive pairs delimit one bracket each.
# The cleaning code passes these to pd.cut with right=False, i.e. every
# bracket includes its lower edge and excludes its upper edge.
age_bins = [
    0,
    18,
    25,
    35,
    45,
    55,
    65,
    np.inf,
]

# One label per bracket, in the same order as the edges above.
age_labels = [
    "<18",
    "18–24",
    "25–34",
    "35–44",
    "45–54",
    "55–64",
    "65+",
]

def pick_two_parents(row):
    """Return the first two non-missing parent ages found in ``row``.

    The columns are inspected in the order AGE_MOM, AGE_MOM2, AGE_POP,
    AGE_POP2. The result is a two-element Series labelled ``parent1_age`` and
    ``parent2_age``; slots for which no age is available hold NaN.
    """
    parent_columns = ("AGE_MOM", "AGE_MOM2", "AGE_POP", "AGE_POP2")
    candidates = [row[column] for column in parent_columns]

    # Keep the ages that are present, then pad so two slots always exist.
    present = [age for age in candidates if pd.notna(age)]
    present.extend([np.nan, np.nan])

    return pd.Series(present[:2], index=["parent1_age", "parent2_age"])

# Intended usage: apply the function row-by-row, e.g. df.apply(pick_two_parents, axis=1)



## Data-Cleaning

In [18]:
# Load the raw extract from disk.
df = pd.read_csv("cps_00018.csv")

# Replace the raw EDUC codes with the values produced by recode_educ_column.
df["EDUC"] = df["EDUC"].pipe(recode_educ_column)



### Convert Age into buckets
age_cols = ["AGE_MOM", "AGE_MOM2", "AGE_POP", "AGE_POP2"]

# One categorical "<col>_bucket" column per parent-age column. right=False
# makes each bracket include its lower edge and exclude its upper edge.
bucket_columns = {
    f"{col}_bucket": pd.cut(
        df[col],
        bins=age_bins,
        labels=age_labels,
        right=False
    )
    for col in age_cols  # e.g. ["AGE_MOM","AGE_MOM2","AGE_POP","AGE_POP2"]
}

# The modelling code lists "AGE_BUCKET" among its categorical features, so
# bucket the AGE column with the same brackets. An AGE_BUCKET column that is
# already present in the data is left untouched.
if "AGE_BUCKET" not in df.columns:
    bucket_columns["AGE_BUCKET"] = pd.cut(
        df["AGE"],
        bins=age_bins,
        labels=age_labels,
        right=False
    )

df = df.assign(**bucket_columns)


# Extract the four bucketed parent-age columns as an object array
# (N rows x 4 columns) so labels and NaN can live side by side.
bucketed_cols = [f"{col}_bucket" for col in age_cols]

arr = df[bucketed_cols].astype(object).to_numpy()

# Move the missing entries of every row to the right without disturbing the
# order of the present ones: a stable argsort of the boolean "is missing" mask
# lists the present positions first (False < True), in their original order.
# This replaces a Python-level loop over every row.
nan_last = np.argsort(pd.isna(arr), axis=1, kind="stable")
arr_compacted = np.take_along_axis(arr, nan_last, axis=1)

# After compaction, column 0 is the first available bucket and column 1 the
# second; rows with fewer than one/two available buckets hold NaN there.
parent1 = arr_compacted[:, 0]
parent2 = arr_compacted[:, 1]


# Assign the results back to the DataFrame
df["PARENT1_AGE_BUCKET"] = parent1
df["PARENT2_AGE_BUCKET"] = parent2



# List the 4 parent-education columns
# NOTE(review): unlike EDUC, these columns are not passed through
# recode_educ_column in this cell, so PARENT1_EDUC / PARENT2_EDUC carry
# whatever coding the CSV uses — confirm that this is intended.
educ_cols = ['EDUC_MOM', 'EDUC_MOM2', 'EDUC_POP', 'EDUC_POP2']

# Convert these columns to a float NumPy array (N rows x 4 columns) so that
# missing values are represented as NaN.
arr_educ = df[educ_cols].to_numpy(dtype=float)

# Push the NaNs of every row to the right while keeping the remaining values
# in their original column order: a stable argsort of the boolean NaN mask
# lists the non-NaN positions first (False < True). This replaces a
# Python-level loop over every row.
educ_nan_last = np.argsort(np.isnan(arr_educ), axis=1, kind="stable")
arr_educ_compacted = np.take_along_axis(arr_educ, educ_nan_last, axis=1)

# Column 0 now holds the first non-NaN value of each row and column 1 the
# second; rows with fewer valid values hold NaN in those slots.
parent1_educ = arr_educ_compacted[:, 0]
parent2_educ = arr_educ_compacted[:, 1]

# Attach the arrays back to your DataFrame as new columns
df["PARENT1_EDUC"] = parent1_educ
df["PARENT2_EDUC"] = parent2_educ


### Get hourly wage
# Zero hours would turn the ratio into +/-inf (or NaN for 0/0), which poisons
# later summaries; treat zero hours as missing so the result is NaN instead.
# NOTE(review): confirm the units of INCWAGE and UHRSWORKT — if one is annual
# and the other weekly, this ratio is not a per-hour figure. Also check whether
# either column uses sentinel codes for "not in universe".
df['hr_wage'] = df['INCWAGE'] / df['UHRSWORKT'].replace(0, np.nan)


### Get Tenure
# Age minus the (already recoded) EDUC value minus 7; NaN wherever EDUC is NaN.
# NOTE(review): presumably a potential-experience proxy — the result can be
# negative for young respondents; verify that this is acceptable downstream.
df['TENURE'] = df['AGE'] - df['EDUC'] - 7




# Features that the model cell one-hot encodes.
categorical_columns = [
    "SEX",
    "RACE",
    "YEAR",
    "OCC",
    'IND1990',
    'AGE_BUCKET',
    'VETSTAT',
    'PARENT1_AGE_BUCKET',
    'PARENT2_AGE_BUCKET',
]

# Features that the model cell uses as plain numeric columns.
numeric_columns = [
    'EDUC',
    'PARENT1_EDUC',
    'PARENT2_EDUC',
    'TENURE',
]

# Dependent variable
target_variable = "INCWAGE"

# Column holding the per-row sample weights.
# NOTE(review): no column called "weight" is created by the cleaning code in
# this cell — confirm that it exists in the CSV (or point this at the intended
# weight column) before running the model cell.
weight_column = "weight"

# Preview the first rows of the cleaned frame.
df.head()

In [17]:
# Number of rows in df
df.shape[0]

5186113

## Data-Exploration

In [16]:
# Profile the whole DataFrame with sweetviz.
report = sv.analyze(df)
# Write the profile to sweetviz_report.html (the captured output below confirms
# that the file was generated).
report.show_html("sweetviz_report.html")


                                             |          | [  0%]   00:00 -> (? left)

Report sweetviz_report.html was generated! NOTEBOOK/COLAB USERS: the web browser MAY not pop up, regardless, the report IS saved in your notebook/colab files.
---
(likely due to only having a single row, containing non-NaN values for both correlated features)
Affected correlations:['MONTH/ASECWTH', 'MONTH/HHINCOME', 'MONTH/ASECWT', 'MONTH/INCWAGE', 'ASECWTH/MONTH', 'HHINCOME/MONTH', 'ASECWT/MONTH', 'INCWAGE/MONTH']


## Models

### OLS

In [None]:

# -------------------------------------------------------------------
# Measure initial resource usage
# -------------------------------------------------------------------
# NOTE(review): `process` is also the name imported from fuzzywuzzy at the top
# of the file; this assignment rebinds it to the psutil handle for the rest of
# the session.
process = psutil.Process()
start_cpu_times = process.cpu_times()
mem_info_start = process.memory_info().rss  # Resident Set Size in bytes

start_time = time.time()

# -------------------------------------------------------------------
# Prepare features (X) and target (y)
# -------------------------------------------------------------------
# 0. Keep only the columns the model needs, and only the rows that have both a
#    target value and a sample weight (neither can be NaN when fitting).
model_columns = list(
    dict.fromkeys(
        numeric_columns + categorical_columns + [target_variable, weight_column]
    )
)
usable_rows = df[target_variable].notna() & df[weight_column].notna()
model_df = df.loc[usable_rows, model_columns]

# 1. Separate numeric features. LinearRegression rejects NaN inputs, and the
#    parent-education columns (and EDUC) are NaN for many rows, so:
#      - record which values were missing in "<col>_missing" indicator columns
#        (only for columns that actually have missing values), and
#      - fill the gaps with the column median (0 if a column is entirely NaN).
X_numeric = model_df[numeric_columns]
numeric_is_missing = X_numeric.isna()
missing_flags = (
    numeric_is_missing.loc[:, numeric_is_missing.any()]
    .add_suffix("_missing")
    .astype(int)
)
X_numeric = X_numeric.fillna(X_numeric.median()).fillna(0)

# 2. Convert categorical columns to string (to avoid issues with non-string types).
#    Missing categories become the literal string "nan" and get their own dummy.
df_categorical_str = model_df[categorical_columns].astype(str)

# 3. One-hot encode categorical features
df_one_hot = pd.get_dummies(df_categorical_str)

# 4. Concatenate numeric, missing-indicator and one-hot-encoded features
X = pd.concat([X_numeric, missing_flags, df_one_hot], axis=1)

# 5. Define the target variable (y) and sample weights
y = model_df[target_variable]
sample_weight = model_df[weight_column]

# -------------------------------------------------------------------
# Fit the linear regression model
# -------------------------------------------------------------------
model = LinearRegression()
model.fit(X, y, sample_weight=sample_weight)

# -------------------------------------------------------------------
# Evaluate the model (R² score)
# -------------------------------------------------------------------
# Scored on the same rows the model was fitted on, i.e. an in-sample R².
r2 = model.score(X, y, sample_weight=sample_weight)

# -------------------------------------------------------------------
# Measure time and resource usage after model training
# -------------------------------------------------------------------
end_time = time.time()
end_cpu_times = process.cpu_times()
mem_info_end = process.memory_info().rss

# Calculate the differences (these cover feature preparation, fit and scoring)
run_time = end_time - start_time
cpu_time_used = (
    (end_cpu_times.user + end_cpu_times.system)
    - (start_cpu_times.user + start_cpu_times.system)
)
mem_used = (mem_info_end - mem_info_start) / (1024 * 1024)  # Convert bytes to MB

# -------------------------------------------------------------------
# Print out metrics
# -------------------------------------------------------------------
print(f"R² Score: {r2:.4f}")
print(f"Time taken: {run_time:.4f} seconds")
print(f"CPU time used: {cpu_time_used:.4f} seconds (user + system)")
print(f"Approx. additional RAM used: {mem_used:.4f} MB")