In [1]:
import pandas as pd
import numpy as np
import time
import os
import joblib

In [7]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.metrics import accuracy_score, roc_auc_score, precision_score, recall_score, f1_score, matthews_corrcoef, confusion_matrix, classification_report

from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier

In [8]:
os.makedirs("models", exist_ok=True)
os.makedirs("data", exist_ok=True)

In [9]:
# Dataset
# Load Adult Census Income dataset (classification)

columns = ["age", "workclass", "fnlwgt", "education", "education_num", "marital_status", "occupation", "relationship", "race",
            "sex", "capital_gain", "capital_loss", "hours_per_week", "native_country", "income"]
# Load training data
train_url = "https://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.data"
train_df = pd.read_csv(train_url, header=None, names=columns, na_values=" ?", skipinitialspace=True)

# Load test data
test_url = "https://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.test"
test_df = pd.read_csv(test_url, header=0, names=columns, na_values=" ?", skipinitialspace=True)

# Remove trailing period in test labels
test_df["income"] = test_df["income"].str.replace(".", "", regex=False)

# Combine datasets
df = pd.concat([train_df, test_df], ignore_index=True)

print("Dataset shape:", df.shape)
display(df)

Dataset shape: (48842, 15)


Unnamed: 0,age,workclass,fnlwgt,education,education_num,marital_status,occupation,relationship,race,sex,capital_gain,capital_loss,hours_per_week,native_country,income
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
48837,39,Private,215419,Bachelors,13,Divorced,Prof-specialty,Not-in-family,White,Female,0,0,36,United-States,<=50K
48838,64,?,321403,HS-grad,9,Widowed,?,Other-relative,Black,Male,0,0,40,United-States,<=50K
48839,38,Private,374983,Bachelors,13,Married-civ-spouse,Prof-specialty,Husband,White,Male,0,0,50,United-States,<=50K
48840,44,Private,83891,Bachelors,13,Divorced,Adm-clerical,Own-child,Asian-Pac-Islander,Male,5455,0,40,United-States,<=50K


In [10]:
# 1. Basic cleaning

# Remove rows with missing values
df = df.replace("?", np.nan)
df = df.dropna()
df["income"] = df["income"].apply(lambda x: 1 if x == ">50K" else 0) # Convert target to binary

# 2. Feature / target split
X = df.drop("income", axis=1)
y = df["income"]

# 3. Identify categorical columns
categorical_cols = X.select_dtypes(include="object").columns
X = pd.get_dummies(X, columns=categorical_cols, drop_first=True) # One-hot encoding

# 4. Train/Test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

# 5. Feature scaling
scaler = StandardScaler()
X_train_processed = scaler.fit_transform(X_train)
X_test_processed = scaler.transform(X_test)

print("Train shape:", X_train_processed.shape)
print("Test shape:", X_test_processed.shape)
display(X_train_processed)

Train shape: (36177, 96)
Test shape: (9045, 96)


array([[-1.40736572,  0.64817975, -0.04667295, ...,  0.3092066 ,
        -0.04209768, -0.02292317],
       [-0.95259943, -1.4471909 , -0.04667295, ...,  0.3092066 ,
        -0.04209768, -0.02292317],
       [ 0.41169945, -0.62832292, -1.99913269, ...,  0.3092066 ,
        -0.04209768, -0.02292317],
       ...,
       [ 0.33590506, -0.73609685,  0.343819  , ...,  0.3092066 ,
        -0.04209768, -0.02292317],
       [-0.42203876, -0.05869456,  0.73431095, ...,  0.3092066 ,
        -0.04209768, -0.02292317],
       [-1.17998258,  0.22681976, -0.4371649 , ...,  0.3092066 ,
        -0.04209768, -0.02292317]])

In [11]:
y.value_counts()

Unnamed: 0_level_0,count
income,Unnamed: 1_level_1
0,34014
1,11208


In [12]:
# Classification Models

cmodels = {
    "Logistic Regression": LogisticRegression(max_iter=1000, n_jobs=-1),
    "Decision Tree Classifier": DecisionTreeClassifier(
                                max_depth=15,          # prevents overfitting
                                min_samples_split=20, min_samples_leaf=10,
                                random_state=42),

    "K-Nearest Neighbor Classifier": KNeighborsClassifier(
                                n_neighbors=15,   # good balance
                                weights="distance",
                                n_jobs=-1),

    "Naive Bayes Classifier - Gaussian or Multinomial": GaussianNB(),

    "Ensemble Model - Random Forest": RandomForestClassifier(
                                n_estimators=200, max_depth=None,
                                min_samples_split=5, min_samples_leaf=2,
                                n_jobs=-1, random_state=42),

    "Ensemble Model - XGBoost": XGBClassifier(
                                n_estimators=300, max_depth=6,
                                learning_rate=0.05,
                                subsample=0.8, colsample_bytree=0.8,
                                eval_metric="logloss",
                                n_jobs=-1, random_state=42)
}

In [13]:
def train_model(model, X_train, y_train):
    """
    Train the model

    Args:
        model: initialized sklearn model
        X_train: training features
        y_train: training labels

    Returns:
        trained model and training_time
    """
    start_time = time.time()
    model.fit(X_train, y_train)
    training_time = time.time() - start_time
    return model, training_time

In [14]:
def evaluate_model(model, X_test, y_test):
    """
    Evaluate classification model

    Returns:
        dict of evaluation metrics
    """
    y_pred = model.predict(X_test)

    # Needed for AUC
    if hasattr(model, "predict_proba"):
        y_prob = model.predict_proba(X_test)[:, 1]
    else:
        y_prob = y_pred

    metrics = {
        "Accuracy": accuracy_score(y_test, y_pred),
        "AUC": roc_auc_score(y_test, y_prob),
        "Precision": precision_score(y_test, y_pred),
        "Recall": recall_score(y_test, y_pred),
        "F1": f1_score(y_test, y_pred),
        "MCC": matthews_corrcoef(y_test, y_pred)
    }

    print("\nConfusion Matrix")
    print(confusion_matrix(y_test, y_pred))

    print("\nClassification Report")
    print(classification_report(y_test, y_pred))

    return metrics

In [15]:
# Empty Dictionary to store the model results
results = {}

for model_name, model in cmodels.items():
  print(f"\nProcessing {model_name} model")
  model, train_time = train_model(model, X_train_processed, y_train)
  print("Training Completed and starting evaluation")
  metrics = evaluate_model(model, X_test_processed, y_test)
  metrics["Training Time"] = train_time
  results[model_name] = metrics
  print(f"Saving {model_name}")
  joblib.dump(model, f"models/{model_name.lower().replace(" ", "_")}.pkl")
  print("*"*70)



Processing Logistic Regression model
Training Completed and starting evaluation

Confusion Matrix
[[6323  480]
 [ 927 1315]]

Classification Report
              precision    recall  f1-score   support

           0       0.87      0.93      0.90      6803
           1       0.73      0.59      0.65      2242

    accuracy                           0.84      9045
   macro avg       0.80      0.76      0.78      9045
weighted avg       0.84      0.84      0.84      9045

Saving Logistic Regression
**********************************************************************

Processing Decision Tree Classifier model
Training Completed and starting evaluation

Confusion Matrix
[[6361  442]
 [ 941 1301]]

Classification Report
              precision    recall  f1-score   support

           0       0.87      0.94      0.90      6803
           1       0.75      0.58      0.65      2242

    accuracy                           0.85      9045
   macro avg       0.81      0.76      0.78      9045


In [16]:
# Forming the comparision dataframe
comparison_df = pd.DataFrame(results).T.reset_index()
comparison_df.rename(columns={"index": "Model"}, inplace=True)
display(comparison_df.sort_values(by='MCC', ascending=False))

Unnamed: 0,Model,Accuracy,AUC,Precision,Recall,F1,MCC,Training Time
5,Ensemble Model - XGBoost,0.86932,0.926285,0.788986,0.645406,0.71001,0.632021,3.859909
4,Ensemble Model - Random Forest,0.858043,0.915644,0.777199,0.599019,0.676574,0.595758,6.611809
1,Decision Tree Classifier,0.847098,0.884649,0.746414,0.580285,0.652949,0.564119,0.368762
0,Logistic Regression,0.844444,0.90155,0.732591,0.58653,0.651474,0.558591,1.887932
2,K-Nearest Neighbor Classifier,0.829851,0.876465,0.685097,0.580285,0.628351,0.522008,0.020404
3,Naive Bayes Classifier - Gaussian or Multinomial,0.565616,0.806723,0.357877,0.947368,0.519506,0.351953,0.059103


In [17]:
# Combine X_test and y_test
test_df = pd.DataFrame(X_test)
test_df["target"] = y_test.values

# Save to CSV
test_df.to_csv("data/test_data.csv", index=False)
print("Test data saved to data/test_data.csv")

print("Saving processed data")
test_df_processed = pd.DataFrame(X_test_processed)
test_df_processed["target"] = y_test.values
test_df_processed.to_csv("data/test_data_processed.csv", index=False)

Test data saved to data/test_data.csv
Saving processed data
