In [None]:
import os

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler

In [None]:
# Load the census CSV into `df`.
# The path is resolved against the current working directory, so this assumes
# the kernel was started in the folder that contains data/ — TODO confirm.
adultDataSet_filename = os.path.join(os.getcwd(), "data", "censusData.csv")
df = pd.read_csv(adultDataSet_filename)

# Only the last expression of a cell is rendered automatically, so the preview
# is passed to display() (a notebook builtin) instead of being discarded.
display(df.head())
# info() prints its report itself, so it needs no display() wrapper.
df.info()
# Summary statistics for every column, numeric or not; shown as the cell output.
df.describe(include='all')

In [None]:
# Count missing values per column before deciding how to handle them.
# display() is required here: a bare expression in the middle of a cell is
# evaluated but never shown.
display(df.isnull().sum())

# Drop every row that has at least one missing value. Rebinding `df` rather
# than using inplace=True leaves the previously loaded object untouched.
df = df.dropna()
display(df.head())

# Boxplot of weekly hours per income group to inspect outliers (drawn before
# capping, so the plot shows the uncapped distribution).
sns.boxplot(data=df, x="hours-per-week", y="income_binary", orient="h")
plt.title("Outlier Detection: Hours per Week")
plt.show()

# Cap extreme values (optional depending on distribution): anything above the
# limit is set to the limit, everything else is left as is.
HOURS_PER_WEEK_CAP = 80
df = df.assign(
    **{"hours-per-week": df["hours-per-week"].clip(upper=HOURS_PER_WEEK_CAP)}
)

In [None]:
# Build the feature matrix X and the target y.
#
# The categorical columns are one-hot encoded here. Without this step the
# prefix-based selection below finds no 'workclass_*', 'education_*', ...
# columns at all, and the classifier would be handed raw string values.
raw_categorical_cols = ['workclass', 'education', 'marital-status', 'occupation',
                        'relationship', 'race', 'sex_selfID', 'native-country']

# Encode only the columns that are still present in raw form, so the cell does
# not raise a KeyError if the frame was already encoded upstream.
cols_to_encode = [col for col in raw_categorical_cols if col in df.columns]
# dtype=int gives 0/1 indicator columns regardless of the pandas default.
df_encoded = pd.get_dummies(df, columns=cols_to_encode, dtype=int)

print(df_encoded.columns.tolist())

numeric_cols = ['age', 'fnlwgt', 'education-num', 'capital-gain', 'capital-loss', 'hours-per-week']
# get_dummies names each indicator '<column>_<value>'; the trailing underscore
# keeps 'education_' from matching the numeric 'education-num' column.
dummy_prefixes = tuple(f"{col}_" for col in raw_categorical_cols)
categorical_cols = [col for col in df_encoded.columns if col.startswith(dummy_prefixes)]
model_features = numeric_cols + categorical_cols

X = df_encoded[model_features]
y = df_encoded['income_binary']

# Fail early, with the offending names, if any feature is still non-numeric.
non_numeric_features = X.select_dtypes(exclude=["number", "bool"]).columns.tolist()
assert not non_numeric_features, f"non-numeric feature columns: {non_numeric_features}"

In [None]:
# Hold out 20% of the rows for evaluation. stratify=y keeps the class
# proportions of the target the same in the train and test parts, and the
# fixed random_state makes the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# Standardize the features before the logistic regression: the columns are on
# very different scales, which slows or prevents solver convergence. Wrapping
# both steps in one pipeline means the scaler is fitted on the training rows
# only and is re-applied automatically by model.predict / model.predict_proba.
model = make_pipeline(StandardScaler(), LogisticRegression(max_iter=1000))
model.fit(X_train, y_train)

In [None]:
from sklearn.metrics import classification_report, roc_auc_score

# Hard class predictions on the held-out rows feed the per-class
# precision / recall / F1 report.
y_pred = model.predict(X_test)
report_text = classification_report(y_test, y_pred)
print(report_text)

# ROC AUC is computed from scores rather than labels: the second column of
# predict_proba is used as the score for each test row.
test_scores = model.predict_proba(X_test)[:, 1]
auc_value = roc_auc_score(y_test, test_scores)
print("ROC AUC:", auc_value)
