# Dataset Preparation


### Setup

In [48]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import warnings

# Fixed random seed reused by the splitting and cross-validation cells.
seed = 1855

# Silence every warning category for the rest of the session.
warnings.simplefilter('ignore')

## Data Collection


In [49]:
# Name of the seaborn sample dataset to analyse.
dataset_name = 'iris'

# Fetch the sample data by name and wrap the result in a pandas DataFrame.
dataset = pd.DataFrame(data=sns.load_dataset(name=dataset_name))

# Preview the first five rows.
dataset.head(n=5)

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,species
0,5.1,3.5,1.4,0.2,setosa
1,4.9,3.0,1.4,0.2,setosa
2,4.7,3.2,1.3,0.2,setosa
3,4.6,3.1,1.5,0.2,setosa
4,5.0,3.6,1.4,0.2,setosa


## Handling missing values


In [50]:
from pandas.api.types import is_numeric_dtype


def _fill_missing(column):
  """Return `column` with its missing values imputed.

  Numeric columns are filled with their median; every other column is filled
  with its most frequent value. A non-numeric column with no observed values
  has no mode, so it is returned unchanged instead of failing on `mode()[0]`.
  """
  if is_numeric_dtype(column):
    return column.fillna(column.median())

  modes = column.mode()
  if modes.empty:
    # Nothing to impute from: leave the column as it is.
    return column
  return column.fillna(modes.iloc[0])


dataset = dataset.apply(_fill_missing)

dataset.head()

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,species
0,5.1,3.5,1.4,0.2,setosa
1,4.9,3.0,1.4,0.2,setosa
2,4.7,3.2,1.3,0.2,setosa
3,4.6,3.1,1.5,0.2,setosa
4,5.0,3.6,1.4,0.2,setosa


## Encoding categorical features with One-Hot Encoding


In [51]:
# Every object-dtype column except the target is one-hot encoded.
# Filtering instead of `list.remove('species')` keeps the cell re-runnable:
# `remove` raises ValueError once 'species' is no longer an object column,
# e.g. after the label-encoding cell has replaced it with integers.
categorical_columns = [
    column
    for column in dataset.select_dtypes(include=['object']).columns
    if column != 'species'
]

print(categorical_columns)

dataset = pd.get_dummies(dataset, columns=categorical_columns)

['species']
[]


## Encoding the class label


In [58]:
# The target column 'species' holds three classes here; every distinct label
# is replaced by an integer code taken from its position in `unique()`.

unique_values = dataset['species'].unique()
mappings = {label: code for code, label in enumerate(unique_values)}

dataset['species'] = dataset['species'].map(mappings)

dataset.head()

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,species
0,5.1,3.5,1.4,0.2,0
1,4.9,3.0,1.4,0.2,0
2,4.7,3.2,1.3,0.2,0
3,4.6,3.1,1.5,0.2,0
4,5.0,3.6,1.4,0.2,0


# Model Building


In [95]:
from sklearn.model_selection import *
from sklearn.linear_model import *
from sklearn.metrics import *
from sklearn.ensemble import *
from sklearn.tree import *
from sklearn.svm import *
from sklearn.neighbors import *
from sklearn.naive_bayes import *

## Splitting the dataset into training and testing sets


In [63]:
# Target vector: the species label.
y = dataset['species']

# Feature matrix: every remaining column.
x = dataset.drop('species', axis=1)

# Hold out 20% of the rows as the test split, using the fixed seed.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.2,
    random_state=seed,
)

## Evaluation function


In [64]:
def evaluate(expected, predicted):
  """Print accuracy, precision, recall and F1 for a set of predictions.

  Precision, recall and F1 are computed with `average='weighted'`. Each score
  is printed on its own line with two decimals; nothing is returned.

  Args:
    expected: True class labels.
    predicted: Predicted class labels to compare against `expected`.
  """
  weighted = {'average': 'weighted'}

  results = {}
  results['accuracy'] = accuracy_score(expected, predicted)
  results['precision'] = precision_score(expected, predicted, **weighted)
  results['recall'] = recall_score(expected, predicted, **weighted)
  results['f1'] = f1_score(expected, predicted, **weighted)

  for metric, score in results.items():
    print(f'{metric.capitalize()}: {score:.2f}')

## Model Building


In [87]:
# Seed the booster with the notebook-wide seed so that repeated runs of the
# cross-validation and tuning cells produce the same scores.
model = GradientBoostingClassifier(random_state=seed)

## Model Evaluation


### Cross Validation

In [88]:
# Score the model with cv=10 on the training split.
results = cross_validate(
    model,
    x_train,
    y_train,
    cv=10,
    scoring=['accuracy', 'precision_weighted', 'recall_weighted', 'f1_weighted'],
    return_train_score=False,
)

# Summarise each quality metric as mean +- std, skipping the timing entries.
for metric, scores in results.items():
  if metric in ('fit_time', 'score_time'):
    continue
  print(f'{metric}: {np.mean(scores):.2f} +- {np.std(scores):.2f}')

pd.DataFrame(results)

test_accuracy: 0.96 +- 0.06
test_precision_weighted: 0.97 +- 0.04
test_recall_weighted: 0.96 +- 0.06
test_f1_weighted: 0.96 +- 0.06


Unnamed: 0,fit_time,score_time,test_accuracy,test_precision_weighted,test_recall_weighted,test_f1_weighted
0,0.201401,0.005725,1.0,1.0,1.0,1.0
1,0.185886,0.010014,0.916667,0.933333,0.916667,0.915344
2,0.188199,0.007167,1.0,1.0,1.0,1.0
3,0.198967,0.005829,0.916667,0.933333,0.916667,0.915344
4,0.19975,0.006959,1.0,1.0,1.0,1.0
5,0.196848,0.005628,1.0,1.0,1.0,1.0
6,0.18595,0.007512,1.0,1.0,1.0,1.0
7,0.176469,0.006694,0.916667,0.933333,0.916667,0.915344
8,0.186712,0.005565,1.0,1.0,1.0,1.0
9,0.173264,0.005663,0.833333,0.9,0.833333,0.826389


### Cross Validation with KFold

In [90]:

# Ten shuffled folds, with the shuffle fixed by the notebook-wide seed.
k_fold = KFold(n_splits=10, shuffle=True, random_state=seed)

results = cross_validate(
    model,
    x_train,
    y_train,
    cv=k_fold,
    scoring=['accuracy', 'precision_weighted', 'recall_weighted', 'f1_weighted'],
    return_train_score=False,
)

# Summarise each quality metric as mean +- std, skipping the timing entries.
for metric, scores in results.items():
  if metric in ('fit_time', 'score_time'):
    continue
  print(f'{metric}: {np.mean(scores):.2f} +- {np.std(scores):.2f}')

pd.DataFrame(results)

test_accuracy: 0.96 +- 0.06
test_precision_weighted: 0.97 +- 0.04
test_recall_weighted: 0.96 +- 0.06
test_f1_weighted: 0.96 +- 0.06


Unnamed: 0,fit_time,score_time,test_accuracy,test_precision_weighted,test_recall_weighted,test_f1_weighted
0,0.205567,0.005714,1.0,1.0,1.0,1.0
1,0.176067,0.005515,0.833333,0.9,0.833333,0.826389
2,0.171116,0.006097,0.916667,0.933333,0.916667,0.916667
3,0.184585,0.005594,1.0,1.0,1.0,1.0
4,0.179585,0.005678,0.916667,0.958333,0.916667,0.926768
5,0.187299,0.006671,1.0,1.0,1.0,1.0
6,0.18903,0.006935,0.916667,0.9375,0.916667,0.919414
7,0.175129,0.006643,1.0,1.0,1.0,1.0
8,0.184957,0.005665,1.0,1.0,1.0,1.0
9,0.186765,0.006259,1.0,1.0,1.0,1.0


### Cross Validation with StratifiedKFold

In [91]:

# Ten shuffled, stratified folds, with the shuffle fixed by the notebook-wide
# seed. This rebinds `k_fold`, which the later tuning and comparison cells reuse.
k_fold = StratifiedKFold(n_splits=10, shuffle=True, random_state=seed)

results = cross_validate(
    model,
    x_train,
    y_train,
    cv=k_fold,
    scoring=['accuracy', 'precision_weighted', 'recall_weighted', 'f1_weighted'],
    return_train_score=False,
)

# Summarise each quality metric as mean +- std, skipping the timing entries.
for metric, scores in results.items():
  if metric in ('fit_time', 'score_time'):
    continue
  print(f'{metric}: {np.mean(scores):.2f} +- {np.std(scores):.2f}')

pd.DataFrame(results)

test_accuracy: 0.96 +- 0.06
test_precision_weighted: 0.97 +- 0.04
test_recall_weighted: 0.96 +- 0.06
test_f1_weighted: 0.96 +- 0.06


Unnamed: 0,fit_time,score_time,test_accuracy,test_precision_weighted,test_recall_weighted,test_f1_weighted
0,0.183953,0.006317,1.0,1.0,1.0,1.0
1,0.177701,0.005884,0.833333,0.888889,0.833333,0.822222
2,0.17888,0.005666,0.916667,0.933333,0.916667,0.915344
3,0.179448,0.005547,0.916667,0.933333,0.916667,0.915344
4,0.183962,0.006981,1.0,1.0,1.0,1.0
5,0.207906,0.008788,1.0,1.0,1.0,1.0
6,0.237004,0.006,1.0,1.0,1.0,1.0
7,0.205852,0.006153,1.0,1.0,1.0,1.0
8,0.19058,0.00653,1.0,1.0,1.0,1.0
9,0.193597,0.006822,0.916667,0.9375,0.916667,0.916667


## Model Building with Hyperparameter Tuning


In [94]:
# Candidate values to try for each tuned hyperparameter.
params = dict(
  max_depth=[3, 5, 7, 9],
  learning_rate=[0.01, 0.1, 1, 10],
)

# Search every combination, scored by accuracy on the folds produced by the
# current `k_fold` splitter.
grid_search = GridSearchCV(
  estimator=model,
  param_grid=params,
  scoring='accuracy',
  cv=k_fold,
)

grid_search.fit(x_train, y_train)


## Tuned Model Evaluation on the Test Set


In [93]:
# `grid_search` is built with the default `refit=True`, so `best_estimator_`
# has already been refit on the whole training split with the winning
# parameters; fitting it a second time would only repeat that work.
best_model = grid_search.best_estimator_

# Score the tuned model on the held-out test split.
y_pred = best_model.predict(x_test)

evaluate(y_test, y_pred)

Accuracy: 0.90
Precision: 0.90
Recall: 0.90
F1: 0.90


# Comparing Different Models Performance

## Model Building

In [97]:
# Candidate classifiers to compare, keyed by display name. The tree-based
# estimators are seeded with the notebook-wide seed so that their scores are
# reproducible from run to run.
models = {
    'Logistic Regression': LogisticRegression(),
    'Decision Tree': DecisionTreeClassifier(random_state=seed),
    'SVM': SVC(),
    'KNN': KNeighborsClassifier(),
    'Naive Bayes': GaussianNB(),
    'Gradient Boosting': GradientBoostingClassifier(random_state=seed)
}

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,mean,std
Logistic Regression,1.0,0.916667,0.916667,1.0,0.916667,1.0,1.0,1.0,1.0,0.916667,0.966667,0.040825
SVM,1.0,0.916667,1.0,1.0,0.916667,1.0,0.916667,1.0,1.0,0.916667,0.966667,0.040825
KNN,1.0,1.0,0.916667,0.916667,0.916667,1.0,1.0,1.0,1.0,0.916667,0.966667,0.040825
Naive Bayes,1.0,0.916667,0.916667,0.916667,0.916667,1.0,1.0,1.0,1.0,1.0,0.966667,0.040825
Gradient Boosting,1.0,0.833333,0.916667,0.916667,1.0,1.0,1.0,1.0,1.0,0.916667,0.958333,0.055902
Decision Tree,0.916667,0.833333,0.916667,0.916667,0.916667,1.0,1.0,1.0,0.916667,0.916667,0.933333,0.05


## Model Comparison

In [98]:

# Per-fold accuracy of every candidate on the current `k_fold` splitter.
# A comprehension keeps the loop variables local, so the notebook-level
# `model` is not rebound by this cell.
results = {
    name: cross_val_score(estimator, x_train, y_train, cv=k_fold, scoring='accuracy')
    for name, estimator in models.items()
}

# One row per model, one column per fold.
fold_scores = pd.DataFrame(results).transpose()

# Both statistics are computed from the fold columns only. Previously `std`
# was computed after the `mean` column had been added, so it ran over the
# folds plus their mean. `ddof=0` yields the same values as before and matches
# the `np.std` summaries printed by the cross-validation cells.
result_df = fold_scores.assign(
    mean=fold_scores.mean(axis=1),
    std=fold_scores.std(axis=1, ddof=0),
)
result_df = result_df.sort_values(by=['mean', 'std'], ascending=False)
result_df

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,mean,std
Logistic Regression,1.0,0.916667,0.916667,1.0,0.916667,1.0,1.0,1.0,1.0,0.916667,0.966667,0.040825
SVM,1.0,0.916667,1.0,1.0,0.916667,1.0,0.916667,1.0,1.0,0.916667,0.966667,0.040825
KNN,1.0,1.0,0.916667,0.916667,0.916667,1.0,1.0,1.0,1.0,0.916667,0.966667,0.040825
Naive Bayes,1.0,0.916667,0.916667,0.916667,0.916667,1.0,1.0,1.0,1.0,1.0,0.966667,0.040825
Gradient Boosting,1.0,0.833333,0.916667,0.916667,1.0,1.0,1.0,1.0,1.0,0.916667,0.958333,0.055902
Decision Tree,0.916667,0.833333,0.916667,0.916667,0.916667,1.0,1.0,1.0,1.0,0.916667,0.941667,0.053359
