In [1]:
# =====================================
# STEP 1: DATASET DOWNLOAD & LOADING
# =====================================

import pandas as pd
import numpy as np
import urllib.request
import os

# UCI Adult Income Dataset URL
url = "https://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.data"
file_name = "adult.csv"

# Download the dataset only when no local copy exists.
# The transfer is written to a temporary sibling file and renamed into place
# once it has completed, so an interrupted download can never leave a truncated
# "adult.csv" behind that later runs would mistake for a valid cached copy.
if not os.path.exists(file_name):
    tmp_name = file_name + ".part"
    try:
        urllib.request.urlretrieve(url, tmp_name)
        os.replace(tmp_name, file_name)
    finally:
        # Only present here if the download or the rename failed part-way.
        if os.path.exists(tmp_name):
            os.remove(tmp_name)
    print("Dataset downloaded successfully from UCI Repository.")
else:
    print("Dataset already exists.")

# Column names as per UCI documentation (the raw file ships without a header
# row, so the order below must match the order of fields in each record).
columns = [
    "age",
    "workclass",
    "fnlwgt",
    "education",
    "education-num",
    "marital-status",
    "occupation",
    "relationship",
    "race",
    "sex",
    "capital-gain",
    "capital-loss",
    "hours-per-week",
    "native-country",
    "income",
]

# Load dataset: the file has no header row, so the column names are supplied
# explicitly.
df = pd.read_csv(file_name, header=None, names=columns)

# Quick sanity check of what was loaded: overall shape, then a small preview.
print("Dataset Shape:", df.shape)
print("\nFirst 5 rows:", df.head(), sep="\n")

# Handle missing values: the " ?" placeholder (note the leading space) is
# converted to NaN, then every row containing a NaN is dropped.
df = df.replace(" ?", np.nan).dropna()

print("\nDataset Shape After Removing Missing Values:", df.shape)


Dataset downloaded successfully from UCI Repository.
Dataset Shape: (32561, 15)

First 5 rows:
   age          workclass  fnlwgt   education  education-num  \
0   39          State-gov   77516   Bachelors             13   
1   50   Self-emp-not-inc   83311   Bachelors             13   
2   38            Private  215646     HS-grad              9   
3   53            Private  234721        11th              7   
4   28            Private  338409   Bachelors             13   

        marital-status          occupation    relationship    race      sex  \
0        Never-married        Adm-clerical   Not-in-family   White     Male   
1   Married-civ-spouse     Exec-managerial         Husband   White     Male   
2             Divorced   Handlers-cleaners   Not-in-family   White     Male   
3   Married-civ-spouse   Handlers-cleaners         Husband   Black     Male   
4   Married-civ-spouse      Prof-specialty            Wife   Black   Female   

   capital-gain  capital-loss  hours-per-week

In [2]:
# =====================================
# STEP 2: ML MODELS & EVALUATION
# (SIZE-OPTIMIZED FOR DEPLOYMENT)
# =====================================

import pandas as pd
import numpy as np

from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import (
    accuracy_score, roc_auc_score, precision_score,
    recall_score, f1_score, matthews_corrcoef
)

from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier

import joblib
import os

# -------------------------------
# Encode categorical features
# -------------------------------
# Every object-dtype column of df (the "income" target included) is replaced
# by integer codes. Each column gets its OWN fitted LabelEncoder, stored in
# `label_encoders` under the column name. Re-fitting a single shared encoder
# would discard every mapping except the last one, leaving no way to encode
# raw categorical input at inference time the same way it was encoded here.
label_encoders = {}
for col in df.select_dtypes(include="object").columns:
    le = LabelEncoder()
    df[col] = le.fit_transform(df[col])
    label_encoders[col] = le

# Persist the fitted encoders in the same directory as the other saved
# artefacts. Skipped when nothing was encoded (e.g. this cell is re-run on an
# already-encoded df) so a previously saved mapping is not overwritten with an
# empty one.
if label_encoders:
    os.makedirs("model", exist_ok=True)
    joblib.dump(label_encoders, "model/label_encoders.pkl", compress=3)

# -------------------------------
# Split features and target
# -------------------------------
# "income" is the prediction target; every remaining column is a feature.
y = df["income"]
X = df.drop(columns="income")

# Keep the feature column order so it can be saved alongside the models.
feature_names = list(X.columns)

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split
# reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# -------------------------------
# Feature Scaling
# -------------------------------
# The scaler learns its statistics from the training split only and is then
# applied to both splits, so nothing from the test split influences the fit.
scaler = StandardScaler()
scaler.fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

# -------------------------------
# SIZE-OPTIMIZED MODELS
# -------------------------------
# Display name -> unfitted estimator. Insertion order is the order in which
# the models are trained and reported further down.
models = {}
models["Logistic Regression"] = LogisticRegression(max_iter=1000)
models["Decision Tree"] = DecisionTreeClassifier(max_depth=10, random_state=42)
models["K-Nearest Neighbors"] = KNeighborsClassifier(n_neighbors=7)
models["Naive Bayes (Gaussian)"] = GaussianNB()
models["Random Forest"] = RandomForestClassifier(
    n_estimators=100, max_depth=10, random_state=42
)
models["AdaBoost (Ensemble)"] = AdaBoostClassifier(
    n_estimators=100, random_state=42
)

# -------------------------------
# Model Training & Evaluation
# -------------------------------
# One row per model: [name, accuracy, AUC, precision, recall, F1, MCC].
results = []

for model_name, model in models.items():
    model.fit(X_train, y_train)

    # Hard class predictions feed the label-based metrics; the probability of
    # the second class (column 1 of predict_proba) feeds the AUC.
    y_pred = model.predict(X_test)
    y_prob = model.predict_proba(X_test)[:, 1]

    row = [
        model_name,
        accuracy_score(y_test, y_pred),
        roc_auc_score(y_test, y_prob),
    ]
    row.extend(
        metric(y_test, y_pred)
        for metric in (precision_score, recall_score, f1_score, matthews_corrcoef)
    )
    results.append(row)

# -------------------------------
# Results Table
# -------------------------------
# Column labels, in the same order as the values stored in each results row.
result_columns = [
    "Model",
    "Accuracy",
    "AUC Score",
    "Precision",
    "Recall",
    "F1 Score",
    "MCC Score",
]
results_df = pd.DataFrame(results, columns=result_columns)

print("\nMODEL PERFORMANCE COMPARISON\n")
print(results_df)

# -------------------------------
# SAVE MODELS + SCALER + FEATURES
# -------------------------------
os.makedirs("model", exist_ok=True)

# Destination path -> object to serialise. The feature names (IMPORTANT: they
# record the column order the models were trained on) and the scaler are
# written first, followed by each fitted model.
artifacts = {
    "model/feature_names.pkl": feature_names,
    "model/scaler.pkl": scaler,
    "model/logistic_regression.pkl": models["Logistic Regression"],
    "model/decision_tree.pkl": models["Decision Tree"],
    "model/knn.pkl": models["K-Nearest Neighbors"],
    "model/naive_bayes.pkl": models["Naive Bayes (Gaussian)"],
    "model/random_forest.pkl": models["Random Forest"],
    "model/adaboost.pkl": models["AdaBoost (Ensemble)"],
}

# compress=3 is applied uniformly to every artefact.
for artifact_path, artifact in artifacts.items():
    joblib.dump(artifact, artifact_path, compress=3)

print("\nAll models, scaler, and feature names saved successfully.")



MODEL PERFORMANCE COMPARISON

                    Model  Accuracy  AUC Score  Precision    Recall  F1 Score  \
0     Logistic Regression  0.822974   0.859774   0.744186  0.460131  0.568659   
1           Decision Tree  0.855130   0.890692   0.736311  0.667974  0.700480   
2     K-Nearest Neighbors  0.830433   0.869344   0.691900  0.597386  0.641179   
3  Naive Bayes (Gaussian)  0.798442   0.859477   0.709893  0.347059  0.466198   
4           Random Forest  0.858942   0.917437   0.822412  0.566013  0.670538   
5     AdaBoost (Ensemble)  0.855296   0.907748   0.786399  0.589542  0.673889   

   MCC Score  
0   0.486253  
1   0.606487  
2   0.533399  
3   0.394552  
4   0.601175  
5   0.593336  

All models, scaler, and feature names saved successfully.


In [3]:
# Shell escape: recursively (-r) pack the "model" directory into model.zip.
!zip -r model.zip model


  adding: model/ (stored 0%)
  adding: model/knn.pkl (deflated 2%)
  adding: model/adaboost.pkl (deflated 0%)
  adding: model/naive_bayes.pkl (stored 0%)
  adding: model/scaler.pkl (stored 0%)
  adding: model/random_forest.pkl (deflated 1%)
  adding: model/feature_names.pkl (stored 0%)
  adding: model/logistic_regression.pkl (stored 0%)
  adding: model/decision_tree.pkl (deflated 0%)
