In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
import pandas as pd
import numpy as np
import os
import joblib

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.metrics import (
    accuracy_score,
    precision_score,
    recall_score,
    f1_score,
    roc_auc_score,
    matthews_corrcoef
)

from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier

In [6]:
# files_and_dirs = os.listdir("/content/drive/MyDrive")

# # Print the list
# for item in files_and_dirs:
#     print(item)

In [5]:
data = pd.read_csv("/content/drive/MyDrive/adult.data.csv")

In [9]:
data.replace("?", np.nan, inplace=True)
data.dropna(inplace=True)

label_encoders = {}
for column in data.columns:
    if data[column].dtype == object:
        le = LabelEncoder()
        data[column] = le.fit_transform(data[column])
        label_encoders[column] = le

In [21]:
# num_columns = data.shape[1]
# print(f"Number of columns in the DataFrame: {num_columns}")

In [20]:
# num_rows = y_test.shape[0]
# print(f"Number of rows in the DataFrame: {num_rows}")

In [10]:
X = data.drop("income", axis=1)
y = data["income"]

In [11]:
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

In [22]:
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)


In [23]:
models = {
    "Logistic Regression": LogisticRegression(max_iter=1000),
    "Decision Tree": DecisionTreeClassifier(),
    "KNN": KNeighborsClassifier(),
    "Naive Bayes": GaussianNB(),
    "Random Forest": RandomForestClassifier(),
    "XGBoost": XGBClassifier(use_label_encoder=False, eval_metric='logloss')
}

In [24]:
results = {}
for name, model in models.items():
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    y_prob = model.predict_proba(X_test)[:, 1]

    results[name] = {
        "Accuracy": accuracy_score(y_test, y_pred),
        "Precision": precision_score(y_test, y_pred),
        "Recall": recall_score(y_test, y_pred),
        "F1 Score": f1_score(y_test, y_pred),
        "AUC": roc_auc_score(y_test, y_prob),
        "MCC": matthews_corrcoef(y_test, y_pred)
    }

    joblib.dump(model, f"{name}.pkl")

print("\nModel Evaluation Results:\n")
for model_name, metrics in results.items():
    print(model_name)
    for metric, value in metrics.items():
        print(f"{metric}: {value:.4f}")
    print("-" * 40)

Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)



Model Evaluation Results:

Logistic Regression
Accuracy: 0.8230
Precision: 0.7442
Recall: 0.4601
F1 Score: 0.5687
AUC: 0.8598
MCC: 0.4863
----------------------------------------
Decision Tree
Accuracy: 0.8087
Precision: 0.6218
Recall: 0.6275
F1 Score: 0.6246
AUC: 0.7489
MCC: 0.4963
----------------------------------------
KNN
Accuracy: 0.8251
Precision: 0.6761
Recall: 0.5961
F1 Score: 0.6336
AUC: 0.8588
MCC: 0.5211
----------------------------------------
Naive Bayes
Accuracy: 0.7984
Precision: 0.7099
Recall: 0.3471
F1 Score: 0.4662
AUC: 0.8595
MCC: 0.3946
----------------------------------------
Random Forest
Accuracy: 0.8558
Precision: 0.7466
Recall: 0.6529
F1 Score: 0.6967
AUC: 0.9075
MCC: 0.6050
----------------------------------------
XGBoost
Accuracy: 0.8684
Precision: 0.7730
Recall: 0.6810
F1 Score: 0.7241
AUC: 0.9261
MCC: 0.6404
----------------------------------------
