In [1]:
!pip install xgboost




In [2]:
# ==========================================
# Adult Income Classification
# ==========================================

import pandas as pd
import numpy as np
import os
import joblib

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import (
    accuracy_score,
    precision_score,
    recall_score,
    f1_score,
    roc_auc_score,
    matthews_corrcoef
)

from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier

# ==========================================
# 1Ô∏è‚É£ Load Dataset
# ==========================================

print("Loading dataset...")

url = "https://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.data"

columns = [
    "age", "workclass", "fnlwgt", "education", "education-num",
    "marital-status", "occupation", "relationship", "race",
    "sex", "capital-gain", "capital-loss", "hours-per-week",
    "native-country", "income"
]

df = pd.read_csv(url, names=columns, sep=", ", engine="python")

# ==========================================
# 2Ô∏è‚É£ Data Cleaning
# ==========================================

df.replace("?", np.nan, inplace=True)
df.dropna(inplace=True)

print("Dataset shape after cleaning:", df.shape)

# ==========================================
# 3Ô∏è‚É£ Encode Categorical Variables
# ==========================================

label_encoders = {}

for col in df.columns:
    if df[col].dtype == "object":
        le = LabelEncoder()
        df[col] = le.fit_transform(df[col])
        label_encoders[col] = le

# ==========================================
# 4Ô∏è‚É£ Split Dataset
# ==========================================

X = df.drop("income", axis=1)
y = df["income"]

X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y
)

# ==========================================
# 5Ô∏è‚É£ Initialize Models (Size Optimized)
# ==========================================

models = {
    "Logistic Regression": LogisticRegression(max_iter=1000),

    "Decision Tree Classifier": DecisionTreeClassifier(
        max_depth=10,   # limit depth ‚Üí smaller file
        random_state=42
    ),

    "K-Nearest Neighbor Classifier": KNeighborsClassifier(
        n_neighbors=5
    ),

    "Naive Bayes Classifier": GaussianNB(),

    "Random Forest (Ensemble)": RandomForestClassifier(
        n_estimators=50,   # reduced trees
        max_depth=12,
        random_state=42
    ),

    "XGBoost (Ensemble)": XGBClassifier(
        n_estimators=50,      # reduced estimators
        max_depth=4,
        learning_rate=0.1,
        subsample=0.8,
        colsample_bytree=0.8,
        use_label_encoder=False,
        eval_metric="logloss",
        random_state=42
    )
}

# ==========================================
# 6Ô∏è‚É£ Train & Evaluate
# ==========================================

results = []

print("\nTraining Models...\n")

for name, model in models.items():

    print(f"Training {name}...")
    model.fit(X_train, y_train)

    y_pred = model.predict(X_test)

    if hasattr(model, "predict_proba"):
        y_prob = model.predict_proba(X_test)[:, 1]
    else:
        y_prob = y_pred

    accuracy = accuracy_score(y_test, y_pred)
    auc = roc_auc_score(y_test, y_prob)
    precision = precision_score(y_test, y_pred)
    recall = recall_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred)
    mcc = matthews_corrcoef(y_test, y_pred)

    results.append([
        name,
        accuracy,
        auc,
        precision,
        recall,
        f1,
        mcc
    ])

    print(f"{name} completed.")

# ==========================================
# 7Ô∏è‚É£ Results Table + Ranking
# ==========================================

results_df = pd.DataFrame(
    results,
    columns=[
        "Model",
        "Accuracy",
        "AUC Score",
        "Precision",
        "Recall",
        "F1 Score",
        "MCC"
    ]
)

# Add Ranking (Based on F1 Score)
results_df["Rank"] = results_df["F1 Score"].rank(
    ascending=False,
    method="dense"
).astype(int)

# Sort by Rank
results_df = results_df.sort_values("Rank")

print("\nFinal Model Comparison (Ranked by F1 Score):\n")
print(results_df)

# ==========================================
# 8Ô∏è‚É£ Save Models (Compressed)
# ==========================================

print("\nSaving models...")

os.makedirs("model", exist_ok=True)

for name, model in models.items():
    filename = name.lower().replace(" ", "_").replace("(", "").replace(")", "") + ".pkl"
    joblib.dump(
        model,
        f"model/{filename}",
        compress=3   # üî• reduces file size significantly
    )

joblib.dump(label_encoders, "model/label_encoders.pkl", compress=3)


Loading dataset...
Dataset shape after cleaning: (30162, 15)

Training Models...

Training Logistic Regression...


STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Logistic Regression completed.
Training Decision Tree Classifier...
Decision Tree Classifier completed.
Training K-Nearest Neighbor Classifier...
K-Nearest Neighbor Classifier completed.
Training Naive Bayes Classifier...
Naive Bayes Classifier completed.
Training Random Forest (Ensemble)...
Random Forest (Ensemble) completed.
Training XGBoost (Ensemble)...


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


XGBoost (Ensemble) completed.

Final Model Comparison (Ranked by F1 Score):

                           Model  Accuracy  AUC Score  Precision    Recall  \
4       Random Forest (Ensemble)  0.857119   0.914436   0.782686  0.589880   
1       Decision Tree Classifier  0.850986   0.884813   0.749380  0.603196   
5             XGBoost (Ensemble)  0.853970   0.912733   0.787766  0.565912   
0            Logistic Regression  0.802752   0.810695   0.678899  0.394141   
2  K-Nearest Neighbor Classifier  0.770927   0.663644   0.570588  0.322903   
3         Naive Bayes Classifier  0.786508   0.828922   0.656891  0.298269   

   F1 Score       MCC  Rank  
4  0.672741  0.593194     1  
1  0.668388  0.579351     2  
5  0.658659  0.581519     3  
0  0.498736  0.408691     4  
2  0.412415  0.301210     5  
3  0.410256  0.336790     6  

Saving models...


['model/label_encoders.pkl']

In [3]:
import shutil
from google.colab import files

# Zip the model folder
shutil.make_archive("model", 'zip', "model")

# Download the zip file
files.download("model.zip")


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>