## Load Dataset

In [None]:
import pandas as pd

file_path = '/adult.csv'
df = pd.read_csv(file_path)
display(df.head())

Unnamed: 0,age,workclass,fnlwgt,education,education.num,marital.status,occupation,relationship,race,sex,capital.gain,capital.loss,hours.per.week,native.country,income
0,90,?,77053,HS-grad,9,Widowed,?,Not-in-family,White,Female,0,4356,40,United-States,<=50K
1,82,Private,132870,HS-grad,9,Widowed,Exec-managerial,Not-in-family,White,Female,0,4356,18,United-States,<=50K
2,66,?,186061,Some-college,10,Widowed,?,Unmarried,Black,Female,0,4356,40,United-States,<=50K
3,54,Private,140359,7th-8th,4,Divorced,Machine-op-inspct,Unmarried,White,Female,0,3900,40,United-States,<=50K
4,41,Private,264663,Some-college,10,Separated,Prof-specialty,Own-child,White,Female,0,3900,40,United-States,<=50K


In [24]:
print(df.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 32561 entries, 0 to 32560
Data columns (total 15 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   age             32561 non-null  int64 
 1   workclass       32561 non-null  object
 2   fnlwgt          32561 non-null  int64 
 3   education       32561 non-null  object
 4   education.num   32561 non-null  int64 
 5   marital.status  32561 non-null  object
 6   occupation      32561 non-null  object
 7   relationship    32561 non-null  object
 8   race            32561 non-null  object
 9   sex             32561 non-null  object
 10  capital.gain    32561 non-null  int64 
 11  capital.loss    32561 non-null  int64 
 12  hours.per.week  32561 non-null  int64 
 13  native.country  32561 non-null  object
 14  income          32561 non-null  object
dtypes: int64(6), object(9)
memory usage: 3.7+ MB
None


## Initial Data Inspection

Eexamine the unique values and counts for these `object` columns to understand their distribution and identify any '?' values, which often represent missing data in this dataset.

In [25]:
for col in df.select_dtypes(include='object').columns:
    print(f"\nColumn: {col}")
    print(df[col].value_counts())
    print(f"Missing values ('?'): {(df[col] == '?').sum()}")


Column: workclass
workclass
Private             22696
Self-emp-not-inc     2541
Local-gov            2093
?                    1836
State-gov            1298
Self-emp-inc         1116
Federal-gov           960
Without-pay            14
Never-worked            7
Name: count, dtype: int64
Missing values ('?'): 1836

Column: education
education
HS-grad         10501
Some-college     7291
Bachelors        5355
Masters          1723
Assoc-voc        1382
11th             1175
Assoc-acdm       1067
10th              933
7th-8th           646
Prof-school       576
9th               514
12th              433
Doctorate         413
5th-6th           333
1st-4th           168
Preschool          51
Name: count, dtype: int64
Missing values ('?'): 0

Column: marital.status
marital.status
Married-civ-spouse       14976
Never-married            10683
Divorced                  4443
Separated                 1025
Widowed                    993
Married-spouse-absent      418
Married-AF-spouse           

## Handle Missing Values

In [26]:
import numpy as np

# Replace '?' with NaN
for col in ['workclass', 'occupation', 'native.country']:
    df[col] = df[col].replace('?', np.nan)

# Impute missing values with the mode
for col in ['workclass', 'occupation', 'native.country']:
    df[col] = df[col].fillna(df[col].mode()[0])

# Verify that '?' values are gone and NaN values are imputed
for col in df.select_dtypes(include='object').columns:
    print(f"\nColumn: {col}")
    print(df[col].value_counts())
    print(f"Missing values ('?'): {(df[col] == '?').sum()}")
    print(f"NaN values: {df[col].isnull().sum()}")


Column: workclass
workclass
Private             24532
Self-emp-not-inc     2541
Local-gov            2093
State-gov            1298
Self-emp-inc         1116
Federal-gov           960
Without-pay            14
Never-worked            7
Name: count, dtype: int64
Missing values ('?'): 0
NaN values: 0

Column: education
education
HS-grad         10501
Some-college     7291
Bachelors        5355
Masters          1723
Assoc-voc        1382
11th             1175
Assoc-acdm       1067
10th              933
7th-8th           646
Prof-school       576
9th               514
12th              433
Doctorate         413
5th-6th           333
1st-4th           168
Preschool          51
Name: count, dtype: int64
Missing values ('?'): 0
NaN values: 0

Column: marital.status
marital.status
Married-civ-spouse       14976
Never-married            10683
Divorced                  4443
Separated                 1025
Widowed                    993
Married-spouse-absent      418
Married-AF-spouse           2

## Analyze Categorical Features

## One-Hot Encode Categorical Features and Split Data

In [27]:
# One-hot encode categorical features
df_encoded = pd.get_dummies(df, drop_first=True)

# Encode the target variable 'income'
df_encoded['income_>50K'] = df_encoded['income_>50K'].astype(int)

# Display the first few rows of the encoded DataFrame and its info
display(df_encoded.head())
print(df_encoded.info())

Unnamed: 0,age,fnlwgt,education.num,capital.gain,capital.loss,hours.per.week,workclass_Local-gov,workclass_Never-worked,workclass_Private,workclass_Self-emp-inc,...,native.country_Puerto-Rico,native.country_Scotland,native.country_South,native.country_Taiwan,native.country_Thailand,native.country_Trinadad&Tobago,native.country_United-States,native.country_Vietnam,native.country_Yugoslavia,income_>50K
0,90,77053,9,0,4356,40,False,False,True,False,...,False,False,False,False,False,False,True,False,False,0
1,82,132870,9,0,4356,18,False,False,True,False,...,False,False,False,False,False,False,True,False,False,0
2,66,186061,10,0,4356,40,False,False,True,False,...,False,False,False,False,False,False,True,False,False,0
3,54,140359,4,0,3900,40,False,False,True,False,...,False,False,False,False,False,False,True,False,False,0
4,41,264663,10,0,3900,40,False,False,True,False,...,False,False,False,False,False,False,True,False,False,0


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 32561 entries, 0 to 32560
Data columns (total 98 columns):
 #   Column                                     Non-Null Count  Dtype
---  ------                                     --------------  -----
 0   age                                        32561 non-null  int64
 1   fnlwgt                                     32561 non-null  int64
 2   education.num                              32561 non-null  int64
 3   capital.gain                               32561 non-null  int64
 4   capital.loss                               32561 non-null  int64
 5   hours.per.week                             32561 non-null  int64
 6   workclass_Local-gov                        32561 non-null  bool 
 7   workclass_Never-worked                     32561 non-null  bool 
 8   workclass_Private                          32561 non-null  bool 
 9   workclass_Self-emp-inc                     32561 non-null  bool 
 10  workclass_Self-emp-not-inc                 325

In [28]:
from sklearn.model_selection import train_test_split

# Define features (X) and target (y)
X = df_encoded.drop('income_>50K', axis=1)
y = df_encoded['income_>50K']

# Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

print(f"X_train shape: {X_train.shape}")
print(f"X_test shape: {X_test.shape}")
print(f"y_train shape: {y_train.shape}")
print(f"y_test shape: {y_test.shape}")

X_train shape: (26048, 97)
X_test shape: (6513, 97)
y_train shape: (26048,)
y_test shape: (6513,)


## Implement Logistic Regression

In [29]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# Initialize and train the Logistic Regression model
model = LogisticRegression(random_state=42, solver='liblinear') # 'liblinear' solver is good for small datasets and handles L1/L2 regularization
model.fit(X_train, y_train)

# Make predictions on the test set
y_pred = model.predict(X_test)

# Evaluate the model
print(f"Accuracy: {accuracy_score(y_test, y_pred):.4f}")
print("\nClassification Report:\n", classification_report(y_test, y_pred))
print("\nConfusion Matrix:\n", confusion_matrix(y_test, y_pred))

Accuracy: 0.7973

Classification Report:
               precision    recall  f1-score   support

           0       0.80      0.97      0.88      4945
           1       0.72      0.26      0.38      1568

    accuracy                           0.80      6513
   macro avg       0.76      0.61      0.63      6513
weighted avg       0.78      0.80      0.76      6513


Confusion Matrix:
 [[4789  156]
 [1164  404]]


## Implement Decision Tree Classifier

In [30]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# Initialize and train the Decision Tree Classifier model
dtc_model = DecisionTreeClassifier(random_state=42)
dtc_model.fit(X_train, y_train)

# Make predictions on the test set
y_pred_dtc = dtc_model.predict(X_test)

# Evaluate the model
print(f"Decision Tree Classifier Accuracy: {accuracy_score(y_test, y_pred_dtc):.4f}")
print("\nDecision Tree Classifier Classification Report:\n", classification_report(y_test, y_pred_dtc))
print("\nDecision Tree Classifier Confusion Matrix:\n", confusion_matrix(y_test, y_pred_dtc))

Decision Tree Classifier Accuracy: 0.8133

Decision Tree Classifier Classification Report:
               precision    recall  f1-score   support

           0       0.88      0.87      0.88      4945
           1       0.61      0.64      0.62      1568

    accuracy                           0.81      6513
   macro avg       0.74      0.75      0.75      6513
weighted avg       0.82      0.81      0.81      6513


Decision Tree Classifier Confusion Matrix:
 [[4301  644]
 [ 572  996]]


## Implement K-Nearest Neighbor Classifier

In [31]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# Initialize and train the K-Nearest Neighbor Classifier model
# Using n_neighbors=5 as a starting point, this can be tuned later
knn_model = KNeighborsClassifier(n_neighbors=5)
knn_model.fit(X_train, y_train)

# Make predictions on the test set
y_pred_knn = knn_model.predict(X_test)

# Evaluate the model
print(f"K-Nearest Neighbor Classifier Accuracy: {accuracy_score(y_test, y_pred_knn):.4f}")
print("\nK-Nearest Neighbor Classifier Classification Report:\n", classification_report(y_test, y_pred_knn))
print("\nK-Nearest Neighbor Classifier Confusion Matrix:\n", confusion_matrix(y_test, y_pred_knn))

K-Nearest Neighbor Classifier Accuracy: 0.7746

K-Nearest Neighbor Classifier Classification Report:
               precision    recall  f1-score   support

           0       0.81      0.92      0.86      4945
           1       0.56      0.31      0.40      1568

    accuracy                           0.77      6513
   macro avg       0.68      0.62      0.63      6513
weighted avg       0.75      0.77      0.75      6513


K-Nearest Neighbor Classifier Confusion Matrix:
 [[4556  389]
 [1079  489]]


## Implement Gaussian Naive Bayes Classifier

In [32]:
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# Initialize and train the Gaussian Naive Bayes model
gnb_model = GaussianNB()
gnb_model.fit(X_train, y_train)

# Make predictions on the test set
y_pred_gnb = gnb_model.predict(X_test)

# Evaluate the model
print(f"Gaussian Naive Bayes Classifier Accuracy: {accuracy_score(y_test, y_pred_gnb):.4f}")
print("\nGaussian Naive Bayes Classifier Classification Report:\n", classification_report(y_test, y_pred_gnb))
print("\nGaussian Naive Bayes Classifier Confusion Matrix:\n", confusion_matrix(y_test, y_pred_gnb))

Gaussian Naive Bayes Classifier Accuracy: 0.7918

Gaussian Naive Bayes Classifier Classification Report:
               precision    recall  f1-score   support

           0       0.81      0.95      0.87      4945
           1       0.65      0.29      0.40      1568

    accuracy                           0.79      6513
   macro avg       0.73      0.62      0.64      6513
weighted avg       0.77      0.79      0.76      6513


Gaussian Naive Bayes Classifier Confusion Matrix:
 [[4697  248]
 [1108  460]]


## Implement Random Forest Classifier

In [33]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# Initialize and train the Random Forest Classifier model
# Using a reasonable number of estimators (trees) as a starting point
rf_model = RandomForestClassifier(n_estimators=100, random_state=42)
rf_model.fit(X_train, y_train)

# Make predictions on the test set
y_pred_rf = rf_model.predict(X_test)

# Evaluate the model
print(f"Random Forest Classifier Accuracy: {accuracy_score(y_test, y_pred_rf):.4f}")
print("\nRandom Forest Classifier Classification Report:\n", classification_report(y_test, y_pred_rf))
print("\nRandom Forest Classifier Confusion Matrix:\n", confusion_matrix(y_test, y_pred_rf))

Random Forest Classifier Accuracy: 0.8518

Random Forest Classifier Classification Report:
               precision    recall  f1-score   support

           0       0.88      0.93      0.90      4945
           1       0.73      0.61      0.67      1568

    accuracy                           0.85      6513
   macro avg       0.81      0.77      0.78      6513
weighted avg       0.85      0.85      0.85      6513


Random Forest Classifier Confusion Matrix:
 [[4590  355]
 [ 610  958]]


## Implement XGBoost Classifier

In [34]:
import xgboost as xgb
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# Initialize and train the XGBoost Classifier model
# Using a reasonable set of parameters as a starting point
xgb_model = xgb.XGBClassifier(objective='binary:logistic', eval_metric='logloss', use_label_encoder=False, random_state=42)
xgb_model.fit(X_train, y_train)

# Make predictions on the test set
y_pred_xgb = xgb_model.predict(X_test)

# Evaluate the model
print(f"XGBoost Classifier Accuracy: {accuracy_score(y_test, y_pred_xgb):.4f}")
print("\nXGBoost Classifier Classification Report:\n", classification_report(y_test, y_pred_xgb))
print("\nXGBoost Classifier Confusion Matrix:\n", confusion_matrix(y_test, y_pred_xgb))

Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


XGBoost Classifier Accuracy: 0.8689

XGBoost Classifier Classification Report:
               precision    recall  f1-score   support

           0       0.89      0.94      0.92      4945
           1       0.77      0.65      0.71      1568

    accuracy                           0.87      6513
   macro avg       0.83      0.79      0.81      6513
weighted avg       0.86      0.87      0.87      6513


XGBoost Classifier Confusion Matrix:
 [[4637  308]
 [ 546 1022]]


## Compare All Model Evaluation Metrics

In [35]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, matthews_corrcoef
import pandas as pd

# Initialize a dictionary to store results
results = {}

# --- Logistic Regression ---
# y_pred for LR is already available as 'y_pred'
# y_pred_proba for AUC
y_pred_proba_lr = model.predict_proba(X_test)[:, 1]
results['Logistic Regression'] = {
    'Accuracy': accuracy_score(y_test, y_pred),
    'AUC Score': roc_auc_score(y_test, y_pred_proba_lr),
    'Precision': precision_score(y_test, y_pred),
    'Recall': recall_score(y_test, y_pred),
    'F1 Score': f1_score(y_test, y_pred),
    'MCC Score': matthews_corrcoef(y_test, y_pred)
}

# --- Decision Tree Classifier ---
y_pred_proba_dtc = dtc_model.predict_proba(X_test)[:, 1]
results['Decision Tree Classifier'] = {
    'Accuracy': accuracy_score(y_test, y_pred_dtc),
    'AUC Score': roc_auc_score(y_test, y_pred_proba_dtc),
    'Precision': precision_score(y_test, y_pred_dtc),
    'Recall': recall_score(y_test, y_pred_dtc),
    'F1 Score': f1_score(y_test, y_pred_dtc),
    'MCC Score': matthews_corrcoef(y_test, y_pred_dtc)
}

# --- K-Nearest Neighbor Classifier ---
y_pred_proba_knn = knn_model.predict_proba(X_test)[:, 1]
results['K-Nearest Neighbor'] = {
    'Accuracy': accuracy_score(y_test, y_pred_knn),
    'AUC Score': roc_auc_score(y_test, y_pred_proba_knn),
    'Precision': precision_score(y_test, y_pred_knn),
    'Recall': recall_score(y_test, y_pred_knn),
    'F1 Score': f1_score(y_test, y_pred_knn),
    'MCC Score': matthews_corrcoef(y_test, y_pred_knn)
}

# --- Gaussian Naive Bayes Classifier ---
y_pred_proba_gnb = gnb_model.predict_proba(X_test)[:, 1]
results['Gaussian Naive Bayes'] = {
    'Accuracy': accuracy_score(y_test, y_pred_gnb),
    'AUC Score': roc_auc_score(y_test, y_pred_proba_gnb),
    'Precision': precision_score(y_test, y_pred_gnb),
    'Recall': recall_score(y_test, y_pred_gnb),
    'F1 Score': f1_score(y_test, y_pred_gnb),
    'MCC Score': matthews_corrcoef(y_test, y_pred_gnb)
}

# --- Random Forest Classifier ---
y_pred_proba_rf = rf_model.predict_proba(X_test)[:, 1]
results['Random Forest'] = {
    'Accuracy': accuracy_score(y_test, y_pred_rf),
    'AUC Score': roc_auc_score(y_test, y_pred_proba_rf),
    'Precision': precision_score(y_test, y_pred_rf),
    'Recall': recall_score(y_test, y_pred_rf),
    'F1 Score': f1_score(y_test, y_pred_rf),
    'MCC Score': matthews_corrcoef(y_test, y_pred_rf)
}

# --- XGBoost Classifier ---
y_pred_proba_xgb = xgb_model.predict_proba(X_test)[:, 1]
results['XGBoost'] = {
    'Accuracy': accuracy_score(y_test, y_pred_xgb),
    'AUC Score': roc_auc_score(y_test, y_pred_proba_xgb),
    'Precision': precision_score(y_test, y_pred_xgb),
    'Recall': recall_score(y_test, y_pred_xgb),
    'F1 Score': f1_score(y_test, y_pred_xgb),
    'MCC Score': matthews_corrcoef(y_test, y_pred_xgb)
}

# Convert results to a DataFrame for better visualization
results_df = pd.DataFrame(results).T.round(4)
display(results_df)

Unnamed: 0,Accuracy,AUC Score,Precision,Recall,F1 Score,MCC Score
Logistic Regression,0.7973,0.5949,0.7214,0.2577,0.3797,0.3448
Decision Tree Classifier,0.8133,0.7525,0.6073,0.6352,0.6209,0.4974
K-Nearest Neighbor,0.7746,0.6757,0.5569,0.3119,0.3998,0.2919
Gaussian Naive Bayes,0.7918,0.8268,0.6497,0.2934,0.4042,0.3341
Random Forest,0.8518,0.8999,0.7296,0.611,0.665,0.5746
XGBoost,0.8689,0.9236,0.7684,0.6518,0.7053,0.6252
