In [7]:
!pip install xgboost




In [10]:
# ==========================================
# Adult Income Classification - Full Training
# ==========================================

import pandas as pd
import numpy as np
import os
import joblib

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import (
    accuracy_score,
    precision_score,
    recall_score,
    f1_score,
    roc_auc_score,
    matthews_corrcoef
)

from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier

# ==========================================
# 1️⃣ Load Dataset
# ==========================================

print("Loading dataset...")

url = "https://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.data"

columns = [
    "age", "workclass", "fnlwgt", "education", "education-num",
    "marital-status", "occupation", "relationship", "race",
    "sex", "capital-gain", "capital-loss", "hours-per-week",
    "native-country", "income"
]

df = pd.read_csv(url, names=columns, sep=", ", engine="python")

# ==========================================
# 2️⃣ Data Cleaning
# ==========================================

df.replace("?", np.nan, inplace=True)
df.dropna(inplace=True)

print("Dataset shape after cleaning:", df.shape)

# ==========================================
# 3️⃣ Encode Categorical Variables
# ==========================================

label_encoders = {}

for col in df.columns:
    if df[col].dtype == "object":
        le = LabelEncoder()
        df[col] = le.fit_transform(df[col])
        label_encoders[col] = le

# ==========================================
# 4️⃣ Split Dataset
# ==========================================

X = df.drop("income", axis=1)
y = df["income"]

X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y
)

# ==========================================
# 5️⃣ Initialize Models
# ==========================================

models = {
    "Logistic Regression": LogisticRegression(max_iter=1000),
    "Decision Tree": DecisionTreeClassifier(random_state=42),
    "KNN": KNeighborsClassifier(),
    "Naive Bayes": GaussianNB(),
    "Random Forest": RandomForestClassifier(
        n_estimators=50,
        max_depth=10,
        random_state=42
    ),
    "XGBoost": XGBClassifier(
        use_label_encoder=False,
        eval_metric="logloss",
        n_estimators=50,
        max_depth=6,
        random_state=42
    )
}

# ==========================================
# 6️⃣ Train & Evaluate
# ==========================================

results = []

print("\nTraining Models...\n")

for name, model in models.items():

    print(f"Training {name}...")
    model.fit(X_train, y_train)

    y_pred = model.predict(X_test)

    # Probability needed for AUC
    if hasattr(model, "predict_proba"):
        y_prob = model.predict_proba(X_test)[:, 1]
    else:
        y_prob = y_pred

    accuracy = accuracy_score(y_test, y_pred)
    auc = roc_auc_score(y_test, y_prob)
    precision = precision_score(y_test, y_pred)
    recall = recall_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred)
    mcc = matthews_corrcoef(y_test, y_pred)

    results.append([
        name,
        accuracy,
        auc,
        precision,
        recall,
        f1,
        mcc
    ])

    print(f"{name} completed.")

# ==========================================
# 7️⃣ Results Table
# ==========================================

results_df = pd.DataFrame(
    results,
    columns=[
        "Model",
        "Accuracy",
        "AUC Score",
        "Precision",
        "Recall",
        "F1 Score",
        "MCC"
    ]
)

print("\nFinal Model Comparison:\n")
print(results_df)

# ==========================================
# 8️⃣ Save Models
# ==========================================

print("\nSaving models...")

os.makedirs("model", exist_ok=True)

for name, model in models.items():
    filename = name.lower().replace(" ", "_") + ".pkl"
    joblib.dump(model, f"model/{filename}")

joblib.dump(label_encoders, "model/label_encoders.pkl")

Loading dataset...
Dataset shape after cleaning: (30162, 15)

Training Models...

Training Logistic Regression...


STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Logistic Regression completed.
Training Decision Tree...
Decision Tree completed.
Training KNN...
KNN completed.
Training Naive Bayes...
Naive Bayes completed.
Training Random Forest...
Random Forest completed.
Training XGBoost...
XGBoost completed.

Final Model Comparison:

                 Model  Accuracy  AUC Score  Precision    Recall  F1 Score  \
0  Logistic Regression  0.802752   0.810695   0.678899  0.394141  0.498736   
1        Decision Tree  0.806564   0.740368   0.612190  0.608522  0.610351   
2                  KNN  0.770927   0.663644   0.570588  0.322903  0.412415   
3          Naive Bayes  0.786508   0.828922   0.656891  0.298269  0.410256   
4        Random Forest  0.852478   0.912829   0.798828  0.544607  0.647664   
5              XGBoost  0.865407   0.924381   0.774682  0.647803  0.705584   

        MCC  
0  0.408691  
1  0.481704  
2  0.301210  
3  0.336790  
4  0.574951  
5  0.623406  

Saving models...


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


['model/label_encoders.pkl']

In [11]:
import shutil
from google.colab import files

# Zip the model folder
shutil.make_archive("model", 'zip', "model")

# Download the zip file
files.download("model.zip")


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>