In [1]:
!pip install xgboost




In [2]:
import pandas as pd
import numpy as np
import os
import joblib

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, matthews_corrcoef

from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestClassifier

from xgboost import XGBClassifier


# ==============================
# 1️⃣ Download Dataset from UCI
# ==============================

url = "https://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.data"



columns = [
    "age", "workclass", "fnlwgt", "education", "education-num",
    "marital-status", "occupation", "relationship", "race",
    "sex", "capital-gain", "capital-loss", "hours-per-week",
    "native-country", "income"
]

df = pd.read_csv(url, names=columns, sep=", ", engine="python")
df.to_csv("adult_dataset.csv", index=False)

print("Shape:", df.shape)


# ==============================
# 2️⃣ Data Cleaning
# ==============================

df.replace("?", np.nan, inplace=True)
df.dropna(inplace=True)

print("Shape after removing missing values:", df.shape)


# ==============================
# 3️⃣ Encode Categorical Variables
# ==============================

label_encoders = {}

for col in df.columns:
    if df[col].dtype == "object":
        le = LabelEncoder()
        df[col] = le.fit_transform(df[col])
        label_encoders[col] = le


# ==============================
# 4️⃣ Split Data
# ==============================

X = df.drop("income", axis=1)
y = df["income"]

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)


# ==============================
# 5️⃣ Feature Scaling
# ==============================

scaler = StandardScaler()

X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)


# ==============================
# 6️⃣ Define Models
# ==============================

models = {
    "Logistic Regression": LogisticRegression(max_iter=1000),
    "Decision Tree": DecisionTreeClassifier(),
    "KNN": KNeighborsClassifier(),
    "Naive Bayes": GaussianNB(),
    "Random Forest": RandomForestClassifier(n_estimators=50, max_depth=10, random_state=42),
    "XGBoost": XGBClassifier(use_label_encoder=False, eval_metric='logloss')
}


# ==============================
# 7️⃣ Train & Evaluate
# ==============================

results = []

for name, model in models.items():

    print(f"\nTraining {name}...")

    if name in ["Logistic Regression", "KNN"]:
        model.fit(X_train_scaled, y_train)
        y_pred = model.predict(X_test_scaled)
        y_prob = model.predict_proba(X_test_scaled)[:, 1]
    else:
        model.fit(X_train, y_train)
        y_pred = model.predict(X_test)
        y_prob = model.predict_proba(X_test)[:, 1]

    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred)
    recall = recall_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred)
    auc = roc_auc_score(y_test, y_prob)
    mcc = matthews_corrcoef(y_test, y_pred)

    results.append([name, accuracy, auc, precision, recall, f1, mcc])

    print(f"{name} Completed")


# ==============================
# 8️⃣ Results Table
# ==============================

results_df = pd.DataFrame(results, columns=[
    "Model", "Accuracy", "AUC", "Precision", "Recall", "F1 Score", "MCC"
])

print("\nFinal Results:")
print(results_df)


# ==============================
# 9️⃣ Save Models
# ==============================

os.makedirs("model", exist_ok=True)

for name, model in models.items():
    filename = name.lower().replace(" ", "_") + ".pkl"
    joblib.dump(model, f"model/{filename}")

joblib.dump(rf_model, "model/random_forest.pkl")
# Save scaler
joblib.dump(scaler, "model/scaler.pkl")

# Save label encoders
joblib.dump(label_encoders, "model/label_encoders.pkl")




Shape: (32561, 15)
Shape after removing missing values: (30162, 15)

Training Logistic Regression...
Logistic Regression Completed

Training Decision Tree...
Decision Tree Completed

Training KNN...
KNN Completed

Training Naive Bayes...
Naive Bayes Completed

Training Random Forest...
Random Forest Completed

Training XGBoost...


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


XGBoost Completed

Final Results:
                 Model  Accuracy       AUC  Precision    Recall  F1 Score  \
0  Logistic Regression  0.817504  0.850060   0.713525  0.446072  0.548955   
1        Decision Tree  0.807393  0.742255   0.613333  0.612517  0.612925   
2                  KNN  0.818996  0.849777   0.652985  0.582557  0.615764   
3          Naive Bayes  0.786508  0.828922   0.656891  0.298269  0.410256   
4        Random Forest  0.852478  0.912829   0.798828  0.544607  0.647664   
5              XGBoost  0.861595  0.920449   0.763636  0.643142  0.698229   

        MCC  
0  0.461262  
1  0.484726  
2  0.499267  
3  0.336790  
4  0.574951  
5  0.613069  


['model/label_encoders.pkl']

In [3]:
import shutil
from google.colab import files

# Zip the model folder
shutil.make_archive("model", 'zip', "model")

# Download the zip file
files.download("model.zip")


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>