In [13]:
# ========================================
# ðŸ““ bias_auditor.ipynb
# Author: Jonah Leichenberg
# Purpose: Train model, detect bias, and save results for Streamlit frontend
# ========================================

import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from fairlearn.metrics import MetricFrame, selection_rate, demographic_parity_difference
from imblearn.over_sampling import SMOTE
import joblib, os
import matplotlib.pyplot as plt

# âœ… Load data
columns = [
    "age","workclass","fnlwgt","education","education-num",
    "marital-status","occupation","relationship","race","sex",
    "capital-gain","capital-loss","hours-per-week","native-country","income"
]

train_df = pd.read_csv("../data/adult_train.csv", names=columns, sep=r'\s*,\s*', engine='python', na_values='?')
test_df  = pd.read_csv("../data/adult_test.csv",  names=columns, sep=r'\s*,\s*', engine='python', skiprows=1, na_values='?')

data = pd.concat([train_df, test_df], ignore_index=True)
data = data.dropna()
data['income'] = data['income'].str.strip().replace({'>50K.':'>50K','<=50K.':'<=50K'})

# âœ… Encode and split
df = data.copy()
le = LabelEncoder()
for col in df.columns:
    if df[col].dtype == 'object':
        df[col] = le.fit_transform(df[col])

X = df.drop('income', axis=1)
y = df['income']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# âœ… Train model
model = LogisticRegression(max_iter=200)
model.fit(X_train, y_train)
y_pred = model.predict(X_test)

# âœ… Bias detection
sex = X_test['sex']
metric = MetricFrame(metrics=selection_rate, y_true=y_test, y_pred=y_pred, sensitive_features=sex)
dpd = demographic_parity_difference(y_test, y_pred, sensitive_features=sex)
print("Demographic Parity Difference (Before):", dpd)

# âœ… Debias using SMOTE
sm = SMOTE()
X_train_res, y_train_res = sm.fit_resample(X_train, y_train)
model.fit(X_train_res, y_train_res)
y_pred_res = model.predict(X_test)
dpd_res = demographic_parity_difference(y_test, y_pred_res, sensitive_features=sex)
print("Demographic Parity Difference (After):", dpd_res)

# âœ… Save results for app
os.makedirs("../data", exist_ok=True)
joblib.dump((X_test, y_test, y_pred, metric), "../data/results.pkl")
unbiased_df = X_test.copy()
unbiased_df['income_pred'] = y_pred_res
unbiased_df.to_csv("../data/unbiased_dataset.csv", index=False)



STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT

Increase the number of iterations to improve the convergence (max_iter=200).
You might also want to scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Demographic Parity Difference (Before): 0.06479042445365643


STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT

Increase the number of iterations to improve the convergence (max_iter=200).
You might also want to scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Demographic Parity Difference (After): 0.18275423997967352
