# Adult Census Income Dataset - Model Training

This notebook trains several classification models on the Adult Census Income dataset, evaluates each one on a held-out test split, and saves the fitted pipelines together with a table of their metrics.

In [24]:
import os

import pandas as pd
import numpy as np
import joblib

from sklearn.base import clone
from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.pipeline import Pipeline

from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier

from sklearn.metrics import (
    accuracy_score, roc_auc_score,
    precision_score, recall_score,
    f1_score, matthews_corrcoef
)

Load Data

In [25]:
# Load the census extract. A literal "?" cell is treated as a missing value,
# and every row containing one is dropped. The steps are chained (no in-place
# mutation) so re-running this cell always rebuilds `df` from the file.
df = (
    pd.read_csv("../data/adult.csv")
    .replace("?", np.nan)
    .dropna()
)

# Features are every column except the target.
X = df.drop(columns="income")

# Binary target: 1 for the ">50K" class, 0 for "<=50K".
INCOME_LABELS = {"<=50K": 0, ">50K": 1}
y = df["income"].map(INCOME_LABELS)

# .map() silently yields NaN for any label missing from INCOME_LABELS (for
# example a value with stray whitespace or a trailing period). Fail here with
# a clear message instead of passing NaN targets to the split and the models.
unmapped_labels = df.loc[y.isna(), "income"].unique()
if len(unmapped_labels) > 0:
    raise ValueError(
        f"Unexpected values in 'income' column: {list(unmapped_labels)}"
    )

In [26]:
# Hold out 20% of the rows for evaluation. Stratifying on y keeps the class
# proportions the same in both partitions, and the fixed seed makes the
# split repeatable across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=42
)

In [27]:
# Partition the feature columns by dtype: object columns are handled as
# categorical, every other column as numerical.
numerical_cols = X.select_dtypes(exclude="object").columns
categorical_cols = X.select_dtypes(include="object").columns

# Numerical columns are standardized.
scaler = StandardScaler()

# Categorical columns are one-hot encoded into a dense array; a category that
# was not seen during fit is encoded as all zeros rather than raising.
# NOTE(review): dense output is presumably needed because some of the
# estimators trained later (e.g. GaussianNB) do not accept sparse input.
one_hot = OneHotEncoder(handle_unknown="ignore", sparse_output=False)

preprocessor = ColumnTransformer(
    transformers=[
        ("num", scaler, numerical_cols),
        ("cat", one_hot, categorical_cols),
    ]
)

Initialize Models

In [28]:
# Candidate classifiers, keyed by the display name used in the results table
# and (lower-cased, with spaces replaced by underscores) in the saved file
# names. Estimators that take a seed are pinned to 42 for repeatability.
models = {
    # max_iter raised from the library default of 100.
    "Logistic Regression": LogisticRegression(max_iter=1000),
    "Decision Tree": DecisionTreeClassifier(random_state=42),
    "KNN": KNeighborsClassifier(),
    "Naive Bayes": GaussianNB(),
    "Random Forest": RandomForestClassifier(n_estimators=100, random_state=42),
    # `use_label_encoder` was dropped: the installed XGBoost ignores it and
    # only emits a "Parameters: { "use_label_encoder" } are not used." warning.
    "XGBoost": XGBClassifier(
        eval_metric="logloss",
        random_state=42
    )
}

Train Models & Evaluate Metrics

In [29]:
# One metrics dict per model; turned into a table in a later cell.
results = []

# joblib.dump does not create missing directories, so make sure the output
# folder exists before the first pipeline is written.
os.makedirs("../models", exist_ok=True)

for name, model in models.items():
    # Pipeline does not clone its steps, so passing `preprocessor` directly
    # would make every pipeline share (and refit) the same transformer
    # object. Cloning gives each saved pipeline its own fitted preprocessor.
    pipe = Pipeline(
        steps=[
            ("preprocessor", clone(preprocessor)),
            ("model", model)
        ]
    )

    # Preprocessing is fitted on the training split only, inside the pipeline.
    pipe.fit(X_train, y_train)

    # Hard labels for the threshold-based metrics; the probability of class 1
    # (second column of predict_proba) for the ROC AUC.
    y_pred = pipe.predict(X_test)
    y_prob = pipe.predict_proba(X_test)[:, 1]

    results.append({
        "Model": name,
        "Accuracy": accuracy_score(y_test, y_pred),
        "AUC": roc_auc_score(y_test, y_prob),
        "Precision": precision_score(y_test, y_pred),
        "Recall": recall_score(y_test, y_pred),
        "F1 Score": f1_score(y_test, y_pred),
        "MCC": matthews_corrcoef(y_test, y_pred)
    })

    # Persist the whole fitted pipeline (preprocessing + estimator), e.g.
    # "Logistic Regression" -> ../models/logistic_regression.pkl
    joblib.dump(pipe, f"../models/{name.lower().replace(' ', '_')}.pkl")

    print(f"{name} trained and evaluated")

Logistic Regression trained and evaluated
Decision Tree trained and evaluated
KNN trained and evaluated
Naive Bayes trained and evaluated
Random Forest trained and evaluated


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


XGBoost trained and evaluated


Save Metrics Table

In [30]:
# Assemble the per-model metric dicts into a single table, one row per model.
results_df = pd.DataFrame(data=results)

# Write the table to disk without the integer row index.
results_df.to_csv(path_or_buf="../data/model_results.csv", index=False)

# Last expression of the cell: render the table in the notebook.
results_df

Unnamed: 0,Model,Accuracy,AUC,Precision,Recall,F1 Score,MCC
0,Logistic Regression,0.854301,0.913589,0.750201,0.621838,0.680015,0.591088
1,Decision Tree,0.815183,0.751002,0.630303,0.623169,0.626716,0.503925
2,KNN,0.834079,0.867278,0.683248,0.621838,0.651098,0.543609
3,Naive Bayes,0.601028,0.830016,0.379494,0.948735,0.542134,0.387561
4,Random Forest,0.85629,0.910527,0.74902,0.635819,0.687793,0.598636
5,XGBoost,0.872866,0.934065,0.789598,0.667111,0.723205,0.645282
