# Dataset Preparation


## Data Collection


In [43]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Disabling warnings
import warnings
warnings.filterwarnings("ignore")

# Load the data
path = './exercises/sklearn/classification-income/income_dataset.csv'
data = pd.read_csv(path)

data.head()

Unnamed: 0,age,workclass,fnlwgt,education,education_num,marital_status,occupation,relationship,race,sex,capital_gain,capital_loss,hours_per_week,native_country,income_greater_than_50k
0,49,Private,153536,Some-college,10,Divorced,Prof-specialty,Not-in-family,White,Male,14084,0,44,United-States,1
1,19,Private,417657,HS-grad,9,Never-married,Handlers-cleaners,Own-child,White,Male,0,0,50,United-States,-1
2,41,Private,143046,Masters,14,Never-married,Adm-clerical,Not-in-family,White,Female,0,0,40,United-States,-1
3,46,Private,377401,HS-grad,9,Married-civ-spouse,Transport-moving,Husband,White,Male,0,1902,70,Canada,1
4,40,Private,224232,Bachelors,13,Married-civ-spouse,Sales,Husband,White,Male,0,0,40,United-States,1


## Handling missing values


In [44]:
from pandas.api.types import is_numeric_dtype

data = data.apply(lambda x: x.fillna(x.median()) if is_numeric_dtype(x) else x.fillna(x.mode().iloc[0]))
data.describe()


Unnamed: 0,fnlwgt,education_num,capital_gain,capital_loss,hours_per_week,income_greater_than_50k
count,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0
mean,188394.2,10.108,1119.81,86.988,40.733,-0.496
std,105866.1,2.558688,6156.576929,396.191304,11.746987,0.868757
min,19302.0,1.0,0.0,0.0,5.0,-1.0
25%,122060.8,9.0,0.0,0.0,40.0,-1.0
50%,176733.5,10.0,0.0,0.0,40.0,-1.0
75%,231289.8,12.0,0.0,0.0,45.0,1.0
max,1484705.0,16.0,99999.0,2415.0,99.0,1.0


## Encoding categorical features with One-Hot Encoding


In [45]:
categorical_columns = data.select_dtypes(include=['object']).columns

data = pd.get_dummies(data, columns=categorical_columns)

data.head()

Unnamed: 0,fnlwgt,education_num,capital_gain,capital_loss,hours_per_week,income_greater_than_50k,age_17,age_18,age_19,age_20,...,native_country_Jamaica,native_country_Laos,native_country_Mexico,native_country_Philippines,native_country_Poland,native_country_Puerto-Rico,native_country_South,native_country_Taiwan,native_country_United-States,native_country_Vietnam
0,153536,10,14084,0,44,1,False,False,False,False,...,False,False,False,False,False,False,False,False,True,False
1,417657,9,0,0,50,-1,False,False,True,False,...,False,False,False,False,False,False,False,False,True,False
2,143046,14,0,0,40,-1,False,False,False,False,...,False,False,False,False,False,False,False,False,True,False
3,377401,9,0,1902,70,1,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
4,224232,13,0,0,40,1,False,False,False,False,...,False,False,False,False,False,False,False,False,True,False


## Encoding binary class label


In [46]:
# Alredy encoded in the datasate (column income_greater_than_50k)

# Model Building


In [47]:
from sklearn.model_selection import *
from sklearn.metrics import *
from sklearn.ensemble import *

## Splitting the dataset into training and testing sets


In [48]:
seed = 69

x = data.drop('income_greater_than_50k', axis=1)
y = data['income_greater_than_50k']

x_test, x_train, y_test, y_train = train_test_split(x, y, test_size=0.2, random_state=seed)

## Evaluation function


In [49]:
def evaluate(expected, predicted):
    print('Accuracy:', accuracy_score(expected, predicted))
    print('Precision:', precision_score(expected, predicted))
    print('Recall:', recall_score(expected, predicted))
    print('F1:', f1_score(expected, predicted))
    print('ROC AUC:', roc_auc_score(expected, predicted))

## Model Building


In [50]:
model = RandomForestClassifier(random_state=seed)

## Model Evaluation


In [51]:
cross_validation_result = cross_validate(model, x_train, y_train, cv=10, scoring=('accuracy', 'precision', 'recall', 'f1', 'roc_auc'))
pd.DataFrame(cross_validation_result)

print(f"Cross validation results:")
print(f"Accuracy mean: {cross_validation_result['test_accuracy'].mean():.2f}")
print(f"Precision mean: {cross_validation_result['test_precision'].mean():.2f}")
print(f"Recall mean: {cross_validation_result['test_recall'].mean():.2f}")
print(f"F1 mean: {cross_validation_result['test_f1'].mean():.2f}")
print(f"ROC AUC mean: {cross_validation_result['test_roc_auc'].mean():.2f}")

Cross validation results:
Accuracy mean: 0.78
Precision mean: 0.71
Recall mean: 0.48
F1 mean: 0.53
ROC AUC mean: 0.86


In [52]:
model = RandomForestClassifier(random_state=seed)
k_fold = KFold(n_splits=10, random_state=seed, shuffle=True)
cross_validation_result = cross_validate(model, x_train, y_train, cv=k_fold, scoring=('accuracy', 'precision', 'recall', 'f1', 'roc_auc'))
pd.DataFrame(cross_validation_result)

print(f"Cross validation with K-Fold results:")
print(f"Accuracy mean: {cross_validation_result['test_accuracy'].mean():.2f}")
print(f"Precision mean: {cross_validation_result['test_precision'].mean():.2f}")
print(f"Recall mean: {cross_validation_result['test_recall'].mean():.2f}")
print(f"F1 mean: {cross_validation_result['test_f1'].mean():.2f}")
print(f"ROC AUC mean: {cross_validation_result['test_roc_auc'].mean():.2f}")

Cross validation with K-Fold results:
Accuracy mean: 0.78
Precision mean: 0.68
Recall mean: 0.46
F1 mean: 0.54
ROC AUC mean: 0.83


In [53]:
model = RandomForestClassifier(random_state=seed)
k_fold = StratifiedKFold(n_splits=10, random_state=seed, shuffle=True)
cross_validation_result = cross_validate(model, x_train, y_train, cv=k_fold, scoring=('accuracy', 'precision', 'recall', 'f1', 'roc_auc'))
pd.DataFrame(cross_validation_result)

print(f"Cross validation with K-Fold results:")
print(f"Accuracy mean: {cross_validation_result['test_accuracy'].mean():.2f}")
print(f"Precision mean: {cross_validation_result['test_precision'].mean():.2f}")
print(f"Recall mean: {cross_validation_result['test_recall'].mean():.2f}")
print(f"F1 mean: {cross_validation_result['test_f1'].mean():.2f}")
print(f"ROC AUC mean: {cross_validation_result['test_roc_auc'].mean():.2f}")

Cross validation with K-Fold results:
Accuracy mean: 0.78
Precision mean: 0.58
Recall mean: 0.46
F1 mean: 0.50
ROC AUC mean: 0.86


## Model Building with Hyperparameter Tuning




In [56]:
x_test, x_train, y_test, y_train = train_test_split(x, y, test_size=0.2, random_state=seed)


k_fold = StratifiedKFold(n_splits=10, random_state=seed, shuffle=True)

model = RandomForestClassifier(random_state=seed)
models_parameters = {
    'n_estimators': [10, 50, 100, 200],
    'max_depth': [None, 5, 10, 20],
}
grid_search = GridSearchCV(model, models_parameters, cv=k_fold, scoring='accuracy')

grid_search.fit(x_train, y_train)

pd.DataFrame(grid_search.cv_results_).sort_values('rank_test_score').head()


Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_max_depth,param_n_estimators,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,split5_test_score,split6_test_score,split7_test_score,split8_test_score,split9_test_score,mean_test_score,std_test_score,rank_test_score
11,0.548844,0.010945,0.020346,0.000474,10.0,200,"{'max_depth': 10, 'n_estimators': 200}",0.8,0.85,0.95,0.9,0.75,0.7,0.75,0.8,0.75,0.9,0.815,0.077621,1
9,0.14456,0.00323,0.009189,0.000299,10.0,50,"{'max_depth': 10, 'n_estimators': 50}",0.8,0.8,0.95,0.9,0.75,0.7,0.75,0.8,0.7,0.9,0.805,0.082006,2
10,0.274584,0.005366,0.012802,0.000264,10.0,100,"{'max_depth': 10, 'n_estimators': 100}",0.8,0.8,0.95,0.9,0.75,0.7,0.75,0.75,0.75,0.9,0.805,0.078899,2
3,0.566792,0.015491,0.021055,0.000775,,200,"{'max_depth': None, 'n_estimators': 200}",0.8,0.85,0.9,0.85,0.7,0.7,0.75,0.8,0.65,0.9,0.79,0.083066,4
15,0.561524,0.029618,0.02087,0.000857,20.0,200,"{'max_depth': 20, 'n_estimators': 200}",0.8,0.85,0.9,0.85,0.7,0.7,0.75,0.75,0.65,0.9,0.785,0.083815,5



## Model with Hyperparameter Tuning Evaluation


In [61]:
best_params = grid_search.best_params_
print(f"Best parameters: {best_params}")

best_model = grid_search.best_estimator_

best_model.fit(x_train, y_train)

y_pred = best_model.predict(x_test)

evaluate(y_test, y_pred)

Best parameters: {'max_depth': 10, 'n_estimators': 200}
Accuracy: 0.8375
Precision: 0.722972972972973
Recall: 0.5459183673469388
F1: 0.622093023255814
ROC AUC: 0.739018786322476


# Comparing different Model's Performance

In [80]:
from sklearn.tree import *
from sklearn.neighbors import *
from sklearn.svm import *
from sklearn.linear_model import *

x_test, x_train, y_test, y_train = train_test_split(x, y, test_size=0.2, random_state=seed)
k_fold = StratifiedKFold(n_splits=10, random_state=seed, shuffle=True)

models = {
  'RandomForestClassifier': RandomForestClassifier(random_state=seed),
  'DecisionTreeClassifier': DecisionTreeClassifier(random_state=seed),
  'KNeighborsClassifier': KNeighborsClassifier(),
  'LogisticRegression': LogisticRegression(),
  'GradientBoostingClassifier': GradientBoostingClassifier(),
}

cross_validation_result = {}
for model_name, model in models.items():
    cross_validation_result[model_name] = cross_val_score(model, x_train, y_train, cv=k_fold, scoring='precision')

cross_validation_result = pd.DataFrame(cross_validation_result).transpose()

cross_validation_result['mean'] = cross_validation_result.mean(axis = 1)
cross_validation_result['std'] = cross_validation_result.std(axis = 1)

cross_validation_result = cross_validation_result.sort_values(['mean', 'std'], ascending=False)

cross_validation_result

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,mean,std
GradientBoostingClassifier,0.6,0.666667,0.8,1.0,0.5,1.0,0.666667,0.666667,0.5,1.0,0.74,0.189033
DecisionTreeClassifier,0.6,0.5,0.8,0.666667,0.6,0.666667,0.6,0.666667,0.5,0.5,0.61,0.090738
RandomForestClassifier,0.5,0.6,0.8,0.8,0.5,0.5,0.0,0.666667,0.4,1.0,0.576667,0.25865
LogisticRegression,0.25,0.333333,0.666667,1.0,0.666667,0.666667,0.2,0.0,0.4,1.0,0.518333,0.319414
KNeighborsClassifier,1.0,0.0,0.333333,0.4,0.333333,0.5,0.666667,0.5,0.0,0.0,0.373333,0.305432


In [None]:

cross_validation_result['mean'] = cross_validation_result.mean(axis=1)
cross_validation_result['std'] = cross_validation_result.std(axis=1)
cross_validation_result = cross_validation_result.sort_values(['mean', 'std'], ascending=False)

cross_validation_result