# Dataset Preparation


In [115]:
import seaborn as sns
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import warnings

warnings.filterwarnings("ignore")

seed = 1855

## Data Collection


In [116]:
dataset_name = "penguins"

dataset = pd.DataFrame(sns.load_dataset(dataset_name)) 
dataset.head()

Unnamed: 0,species,island,bill_length_mm,bill_depth_mm,flipper_length_mm,body_mass_g,sex
0,Adelie,Torgersen,39.1,18.7,181.0,3750.0,Male
1,Adelie,Torgersen,39.5,17.4,186.0,3800.0,Female
2,Adelie,Torgersen,40.3,18.0,195.0,3250.0,Female
3,Adelie,Torgersen,,,,,
4,Adelie,Torgersen,36.7,19.3,193.0,3450.0,Female


## Handling missing values


In [117]:
from pandas.api.types import is_numeric_dtype

dataset = dataset.apply(lambda x: 
  x.fillna(x.median()) if is_numeric_dtype(x) 
  else x.fillna(x.mode().iloc[0]))


dataset.head()

Unnamed: 0,species,island,bill_length_mm,bill_depth_mm,flipper_length_mm,body_mass_g,sex
0,Adelie,Torgersen,39.1,18.7,181.0,3750.0,Male
1,Adelie,Torgersen,39.5,17.4,186.0,3800.0,Female
2,Adelie,Torgersen,40.3,18.0,195.0,3250.0,Female
3,Adelie,Torgersen,44.45,17.3,197.0,4050.0,Male
4,Adelie,Torgersen,36.7,19.3,193.0,3450.0,Female


## Encoding categorical features with One-Hot Encoding


In [118]:
categorical_features = [col for col in dataset.columns if not is_numeric_dtype(dataset[col]) and col != "sex"]
dataset = pd.get_dummies(dataset, columns=categorical_features)

dataset.head()

Unnamed: 0,bill_length_mm,bill_depth_mm,flipper_length_mm,body_mass_g,sex,species_Adelie,species_Chinstrap,species_Gentoo,island_Biscoe,island_Dream,island_Torgersen
0,39.1,18.7,181.0,3750.0,Male,True,False,False,False,False,True
1,39.5,17.4,186.0,3800.0,Female,True,False,False,False,False,True
2,40.3,18.0,195.0,3250.0,Female,True,False,False,False,False,True
3,44.45,17.3,197.0,4050.0,Male,True,False,False,False,False,True
4,36.7,19.3,193.0,3450.0,Female,True,False,False,False,False,True


## Encoding binary class label


In [119]:
dataset.sex = dataset.sex.map(lambda x: 1 if x == 'Male' else -1)

# for convenience, I will move the target column to the end of the dataframe
column_names = dataset.columns.tolist()
column_names.insert(len(column_names), column_names.pop(column_names.index('sex')))

dataset = dataset[column_names]
dataset.head()

Unnamed: 0,bill_length_mm,bill_depth_mm,flipper_length_mm,body_mass_g,species_Adelie,species_Chinstrap,species_Gentoo,island_Biscoe,island_Dream,island_Torgersen,sex
0,39.1,18.7,181.0,3750.0,True,False,False,False,False,True,1
1,39.5,17.4,186.0,3800.0,True,False,False,False,False,True,-1
2,40.3,18.0,195.0,3250.0,True,False,False,False,False,True,-1
3,44.45,17.3,197.0,4050.0,True,False,False,False,False,True,1
4,36.7,19.3,193.0,3450.0,True,False,False,False,False,True,-1


# Model Building


In [120]:
from sklearn.model_selection import *
from sklearn.metrics import *
from sklearn.ensemble import *
from sklearn.linear_model import *
from sklearn.tree import *
from sklearn.neighbors import *

## Splitting the dataset into training and testing sets


In [121]:
x = dataset.drop('sex', axis=1)

y = dataset.sex

x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=seed)

## Evaluation function


In [122]:
def evaluate(expected, predicted):
    accuracy = accuracy_score(expected, predicted)
    precision = precision_score(expected, predicted)
    recall = recall_score(expected, predicted)
    f1 = f1_score(expected, predicted)
    
    print(f'Accuracy: {accuracy:.2f}')
    print(f'Precision: {precision:.2f}')
    print(f'Recall: {recall:.2f}')
    print(f'F1: {f1:.2f}')

## Model Building


In [123]:
model = RandomForestClassifier(random_state=seed)

## Model Evaluation


In [124]:
cross_validation_result = cross_validate(model, x_train, y_train, cv=10, scoring=('accuracy', 'precision', 'recall', 'f1', 'roc_auc'))

print("Cross validation results:")

print(f"Accuracy (mean): {cross_validation_result['test_accuracy'].mean():.2f} +- {cross_validation_result['test_accuracy'].std():.2f}")
print(f"Precision (mean): {cross_validation_result['test_precision'].mean():.2f} +- {cross_validation_result['test_precision'].std():.2f}")
print(f"Recall (mean): {cross_validation_result['test_recall'].mean():.2f} +- {cross_validation_result['test_recall'].std():.2f}")
print(f"F1 (mean): {cross_validation_result['test_f1'].mean():.2f} +- {cross_validation_result['test_f1'].std():.2f}")
print(f"ROC AUC (mean): {cross_validation_result['test_roc_auc'].mean():.2f} +- {cross_validation_result['test_roc_auc'].std():.2f}")



Cross validation results:
Accuracy (mean): 0.88 +- 0.04
Precision (mean): 0.89 +- 0.06
Recall (mean): 0.89 +- 0.08
F1 (mean): 0.89 +- 0.04
ROC AUC (mean): 0.94 +- 0.05


### Cross Validation with KFold

In [125]:
kfold = KFold(n_splits=10, random_state=seed, shuffle=True)

model = RandomForestClassifier(random_state=seed)

results = cross_validate (model, x_train, y_train, cv=kfold, scoring=('accuracy', 'precision', 'recall', 'f1', 'roc_auc'))

print("Cross validation results:")
print(f"Accuracy (mean): {results['test_accuracy'].mean():.2f} +- {results['test_accuracy'].std():.2f}")
print(f"Precision (mean): {results['test_precision'].mean():.2f} +- {results['test_precision'].std():.2f}")
print(f"Recall (mean): {results['test_recall'].mean():.2f} +- {results['test_recall'].std():.2f}")
print(f"F1 (mean): {results['test_f1'].mean():.2f} +- {results['test_f1'].std():.2f}")
print(f"ROC AUC (mean): {results['test_roc_auc'].mean():.2f} +- {results['test_roc_auc'].std():.2f}")

Cross validation results:
Accuracy (mean): 0.87 +- 0.06
Precision (mean): 0.87 +- 0.07
Recall (mean): 0.88 +- 0.07
F1 (mean): 0.87 +- 0.06
ROC AUC (mean): 0.92 +- 0.04


### Cross Validatoin with Stratified KFold

In [126]:
kfold = KFold(n_splits=10, random_state=seed, shuffle=True)
model = RandomForestClassifier(random_state=seed)

results = cross_validate(model, x_train, y_train, cv=kfold, scoring=('accuracy', 'precision', 'recall', 'f1', 'roc_auc'))

print("Cross validation results:")
print(f"Accuracy (mean): {results['test_accuracy'].mean():.2f} +- {results['test_accuracy'].std():.2f}")
print(f"Precision (mean): {results['test_precision'].mean():.2f} +- {results['test_precision'].std():.2f}")
print(f"Recall (mean): {results['test_recall'].mean():.2f} +- {results['test_recall'].std():.2f}")
print(f"F1 (mean): {results['test_f1'].mean():.2f} +- {results['test_f1'].std():.2f}")
print(f"ROC AUC (mean): {results['test_roc_auc'].mean():.2f} +- {results['test_roc_auc'].std():.2f}")


Cross validation results:
Accuracy (mean): 0.87 +- 0.06
Precision (mean): 0.87 +- 0.07
Recall (mean): 0.88 +- 0.07
F1 (mean): 0.87 +- 0.06
ROC AUC (mean): 0.92 +- 0.04


## Model Building with Hyperparameter Tuning


In [127]:
params = {
    'n_estimators': [10, 50, 100, 200],
    'max_depth': [None, 5, 10, 20],
}

model = RandomForestClassifier(random_state=seed)
kfold = StratifiedKFold(n_splits=10, random_state=seed, shuffle=True)
grid_search = GridSearchCV(model, param_grid=params, cv=kfold, scoring='roc_auc')

grid_search.fit(x_train, y_train)

pd.DataFrame(grid_search.cv_results_).sort_values('rank_test_score').head()

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_max_depth,param_n_estimators,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,split5_test_score,split6_test_score,split7_test_score,split8_test_score,split9_test_score,mean_test_score,std_test_score,rank_test_score
10,0.106531,0.00855,0.005244,0.000166,10.0,100,"{'max_depth': 10, 'n_estimators': 100}",0.938462,1.0,0.948718,0.948718,0.887179,0.967033,0.983516,0.923077,0.870879,0.903846,0.937143,0.039317,1
3,0.22888,0.023189,0.008956,0.001141,,200,"{'max_depth': None, 'n_estimators': 200}",0.938462,1.0,0.948718,0.953846,0.871795,0.967033,0.983516,0.925824,0.879121,0.901099,0.936941,0.040588,2
15,0.205412,0.016013,0.008283,0.000416,20.0,200,"{'max_depth': 20, 'n_estimators': 200}",0.938462,1.0,0.948718,0.953846,0.871795,0.967033,0.983516,0.925824,0.879121,0.901099,0.936941,0.040588,2
7,0.193592,0.009223,0.008108,0.000207,5.0,200,"{'max_depth': 5, 'n_estimators': 200}",0.928205,0.994872,0.923077,0.953846,0.876923,0.950549,1.0,0.934066,0.89011,0.917582,0.936923,0.037749,4
9,0.051507,0.002985,0.003944,0.000518,10.0,50,"{'max_depth': 10, 'n_estimators': 50}",0.935897,0.994872,0.953846,0.948718,0.887179,0.967033,0.983516,0.928571,0.85989,0.903846,0.936337,0.040535,5



## Model Evaluation


In [128]:
best_model = grid_search.best_estimator_

print(f"Best model: {best_model}")

y_pred = best_model.predict(x_test)

evaluate(y_test, y_pred)

Best model: RandomForestClassifier(max_depth=10, random_state=1855)
Accuracy: 0.96
Precision: 0.97
Recall: 0.94
F1: 0.96


# Comparing Different Models Performance

In [135]:
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=seed)
kfold = KFold(n_splits=10, random_state=seed, shuffle=True)

models = {
    'Random Forest': RandomForestClassifier(random_state=seed),
    'Logistic Regression': LogisticRegression(random_state=seed),
    'Decision Tree': DecisionTreeClassifier(random_state=seed),
    'Gradient Boosting': GradientBoostingClassifier(random_state=seed),
    'KNN': KNeighborsClassifier()
}

cross_validation_results = {}

for model_name, model in models.items():
    cross_validation_results[model_name]  = cross_val_score(model, x_train, y_train, cv=kfold, scoring='precision')


print("Cross validation results:")
cross_validation_df = pd.DataFrame(cross_validation_results).transpose()

cross_validation_df['mean'] = cross_validation_df.mean(axis=1)
cross_validation_df['std'] = cross_validation_df.std(axis=1)

cross_validation_df.sort_values('mean', ascending=False)


Cross validation results:


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,mean,std
Gradient Boosting,0.785714,0.857143,0.941176,0.866667,0.933333,0.846154,0.857143,0.866667,0.933333,1.0,0.888733,0.058741
Random Forest,0.8,0.846154,1.0,0.8125,0.9375,0.785714,0.785714,0.866667,0.933333,0.916667,0.868425,0.071144
KNN,0.714286,0.888889,1.0,0.8125,0.75,0.909091,0.909091,0.769231,1.0,0.666667,0.841975,0.11082
Logistic Regression,0.6875,0.857143,0.941176,0.8125,0.933333,0.785714,0.846154,0.857143,0.823529,0.642857,0.818705,0.090075
Decision Tree,0.8,0.75,0.941176,0.722222,0.833333,0.833333,0.75,0.875,0.769231,0.846154,0.812045,0.063603
