# Dataset Preparation


### Setup

In [270]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import warnings

warnings.filterwarnings('ignore')
seed = 69

## Data Collection


In [271]:
dataset_path = "classification-problems/legendary-pokemon-classification/dataset/pokemon.csv"

df = pd.read_csv(dataset_path)

df.head()

Unnamed: 0,abilities,against_bug,against_dark,against_dragon,against_electric,against_fairy,against_fight,against_fire,against_flying,against_ghost,...,percentage_male,pokedex_number,sp_attack,sp_defense,speed,type1,type2,weight_kg,generation,is_legendary
0,"['Overgrow', 'Chlorophyll']",1.0,1.0,1.0,0.5,0.5,0.5,2.0,2.0,1.0,...,88.1,1,65,65,45,grass,poison,6.9,1,0
1,"['Overgrow', 'Chlorophyll']",1.0,1.0,1.0,0.5,0.5,0.5,2.0,2.0,1.0,...,88.1,2,80,80,60,grass,poison,13.0,1,0
2,"['Overgrow', 'Chlorophyll']",1.0,1.0,1.0,0.5,0.5,0.5,2.0,2.0,1.0,...,88.1,3,122,120,80,grass,poison,100.0,1,0
3,"['Blaze', 'Solar Power']",0.5,1.0,1.0,1.0,0.5,1.0,0.5,1.0,1.0,...,88.1,4,60,50,65,fire,,8.5,1,0
4,"['Blaze', 'Solar Power']",0.5,1.0,1.0,1.0,0.5,1.0,0.5,1.0,1.0,...,88.1,5,80,65,80,fire,,19.0,1,0


## Handling missing values


In [272]:
from pandas.api.types import is_numeric_dtype

# checking if there are any missing values

df = df.apply(lambda x: x.fillna(x.median()) if is_numeric_dtype(x) else x.fillna(x.mode()[0]))


## Encoding categorical features with One-Hot Encoding


In [273]:
categorical_features = df.select_dtypes('object').columns.tolist()

print(categorical_features)

# it seems weird that capture_rate is an object while it should be numeric, let's investigate further

df.capture_rate.unique()
# there seem to be an outlire value containing '30 (Meteorite)255 (Core)', I'm just going to keep one of them
index = df.capture_rate.to_list().index("30 (Meteorite)255 (Core)")
df.capture_rate[index] = '255'

['abilities', 'capture_rate', 'classfication', 'japanese_name', 'name', 'type1', 'type2']


In [274]:
# now i'm going to convert the strings to number and remove it form the categorical features list

df.capture_rate = df.capture_rate.map(lambda x: int(x))

df.capture_rate

categorical_features.remove('capture_rate')

# I'll also ignore name and japanese_name from the one-shot encoding
categorical_features.remove('japanese_name')
categorical_features.remove('name')
print(categorical_features)

['abilities', 'classfication', 'type1', 'type2']


In [275]:
# abilities contains an array, I'm going to make a column for each value of the array


def to_camel_case(string):
  return string.lower().replace(' ', '_').replace('-', '_')

# parse the arrays of abilities which is a string
df.abilities = df.abilities.str.replace(' ', '').str.replace('-', '').str.replace("'", '').str.replace('[', '').str.replace(']', '')


df.abilities = df.abilities.str.split(',')

df.abilities = df.abilities.apply(lambda x: [to_camel_case(i) for i in x])


for ability in df.abilities.explode().unique():
  df[ability] = df.abilities.apply(lambda x: ability in x).astype(int)

df.drop('abilities', axis=1, inplace=True)


In [276]:
categorical_features = df.select_dtypes('object').columns.tolist()


categorical_features.remove('japanese_name')
df.drop(columns='japanese_name', inplace=True)
categorical_features.remove('name')
df.drop(columns='name', inplace=True)


df = pd.get_dummies(df, columns=categorical_features)

df.head()

Unnamed: 0,against_bug,against_dark,against_dragon,against_electric,against_fairy,against_fight,against_fire,against_flying,against_ghost,against_grass,...,type2_ghost,type2_grass,type2_ground,type2_ice,type2_normal,type2_poison,type2_psychic,type2_rock,type2_steel,type2_water
0,1.0,1.0,1.0,0.5,0.5,0.5,2.0,2.0,1.0,0.25,...,False,False,False,False,False,True,False,False,False,False
1,1.0,1.0,1.0,0.5,0.5,0.5,2.0,2.0,1.0,0.25,...,False,False,False,False,False,True,False,False,False,False
2,1.0,1.0,1.0,0.5,0.5,0.5,2.0,2.0,1.0,0.25,...,False,False,False,False,False,True,False,False,False,False
3,0.5,1.0,1.0,1.0,0.5,1.0,0.5,1.0,1.0,0.5,...,False,False,False,False,False,False,False,False,False,False
4,0.5,1.0,1.0,1.0,0.5,1.0,0.5,1.0,1.0,0.5,...,False,False,False,False,False,False,False,False,False,False


## Encoding binary class label


In [277]:
# for convenience I'm going to convert every binary column to -1 or 1

# Columns True/False
binary_columns = df.columns[(df.max() == 1) & (df.min() == 0)]

df[binary_columns] = df[binary_columns] * 2 - 1

df.head()


Unnamed: 0,against_bug,against_dark,against_dragon,against_electric,against_fairy,against_fight,against_fire,against_flying,against_ghost,against_grass,...,type2_ghost,type2_grass,type2_ground,type2_ice,type2_normal,type2_poison,type2_psychic,type2_rock,type2_steel,type2_water
0,1.0,1.0,1.0,0.5,0.5,0.5,2.0,2.0,1.0,0.25,...,-1,-1,-1,-1,-1,1,-1,-1,-1,-1
1,1.0,1.0,1.0,0.5,0.5,0.5,2.0,2.0,1.0,0.25,...,-1,-1,-1,-1,-1,1,-1,-1,-1,-1
2,1.0,1.0,1.0,0.5,0.5,0.5,2.0,2.0,1.0,0.25,...,-1,-1,-1,-1,-1,1,-1,-1,-1,-1
3,0.5,1.0,1.0,1.0,0.5,1.0,0.5,1.0,1.0,0.5,...,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1
4,0.5,1.0,1.0,1.0,0.5,1.0,0.5,1.0,1.0,0.5,...,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1


In [278]:
# Checking Correlation between the target column and the features
correlation = df.corr()['is_legendary'].sort_values(ascending=False)

correlation

# I see a really high correlation with the is_legendary column, I'm going to remove the column with a correlation higher than 0.8

df.drop(columns='base_egg_steps', inplace=True)

# Model Building


In [279]:
from sklearn.ensemble import *
from sklearn.tree import *
from sklearn.neighbors import *
from sklearn.naive_bayes import *
from sklearn.svm import *
from sklearn.linear_model import *

from sklearn.model_selection import *
from sklearn.metrics import *


## Splitting the dataset into training and testing sets


In [280]:
x = df.drop(columns='is_legendary')
y = df.is_legendary


x_train, x_test, y_train, y_test = train_test_split(x,y, test_size=0.2, random_state=seed)

## Evaluation function


In [281]:
def evaluate(expected, predicted):
    accuracy = accuracy_score(expected, predicted)
    precision = precision_score(expected, predicted)
    recall = recall_score(expected, predicted)
    f1 = f1_score(expected, predicted)
    
    print(f'Accuracy: {accuracy:.2f}')
    print(f'Precision: {precision:.2f}')
    print(f'Recall: {recall:.2f}')
    print(f'F1: {f1:.2f}')

## Model Building


In [282]:
model = KNeighborsClassifier()

## Model Evaluation


### Cross Validation

In [283]:
cv = 10
scoring = ['accuracy', 'precision', 'recall', 'f1', 'roc_auc']
cross_validation_result = cross_validate(model, x_train, y_train, cv = cv, scoring = scoring)

print("Cross validation results:")

print(f"Accuracy (mean): {cross_validation_result['test_accuracy'].mean():.2f} +- {cross_validation_result['test_accuracy'].std():.2f}")
print(f"Precision (mean): {cross_validation_result['test_precision'].mean():.2f} +- {cross_validation_result['test_precision'].std():.2f}")
print(f"Recall (mean): {cross_validation_result['test_recall'].mean():.2f} +- {cross_validation_result['test_recall'].std():.2f}")
print(f"F1 (mean): {cross_validation_result['test_f1'].mean():.2f} +- {cross_validation_result['test_f1'].std():.2f}")
print(f"ROC AUC (mean): {cross_validation_result['test_roc_auc'].mean():.2f} +- {cross_validation_result['test_roc_auc'].std():.2f}")



Cross validation results:
Accuracy (mean): 0.95 +- 0.03
Precision (mean): 0.76 +- 0.21
Recall (mean): 0.68 +- 0.23
F1 (mean): 0.70 +- 0.18
ROC AUC (mean): 0.98 +- 0.04


### Cross Validation with KFold

In [284]:
cv = KFold(n_splits=10, random_state=seed, shuffle=True)
scoring = ['accuracy', 'precision', 'recall', 'f1', 'roc_auc']
cross_validation_result = cross_validate(model, x_train, y_train, cv = cv, scoring = scoring)

print("Cross validation results:")

print(f"Accuracy (mean): {cross_validation_result['test_accuracy'].mean():.2f} +- {cross_validation_result['test_accuracy'].std():.2f}")
print(f"Precision (mean): {cross_validation_result['test_precision'].mean():.2f} +- {cross_validation_result['test_precision'].std():.2f}")
print(f"Recall (mean): {cross_validation_result['test_recall'].mean():.2f} +- {cross_validation_result['test_recall'].std():.2f}")
print(f"F1 (mean): {cross_validation_result['test_f1'].mean():.2f} +- {cross_validation_result['test_f1'].std():.2f}")
print(f"ROC AUC (mean): {cross_validation_result['test_roc_auc'].mean():.2f} +- {cross_validation_result['test_roc_auc'].std():.2f}")

Cross validation results:
Accuracy (mean): 0.96 +- 0.03
Precision (mean): 0.75 +- 0.15
Recall (mean): 0.75 +- 0.23
F1 (mean): 0.74 +- 0.17
ROC AUC (mean): 0.98 +- 0.02


### Cross Validation with StratifiedKFold

In [285]:
cv = StratifiedKFold(n_splits=10, random_state=seed, shuffle=True)
scoring = ['accuracy', 'precision', 'recall', 'f1', 'roc_auc']
cross_validation_result = cross_validate(model, x_train, y_train, cv = cv, scoring = scoring)

print("Cross validation results:")

print(f"Accuracy (mean): {cross_validation_result['test_accuracy'].mean():.2f} +- {cross_validation_result['test_accuracy'].std():.2f}")
print(f"Precision (mean): {cross_validation_result['test_precision'].mean():.2f} +- {cross_validation_result['test_precision'].std():.2f}")
print(f"Recall (mean): {cross_validation_result['test_recall'].mean():.2f} +- {cross_validation_result['test_recall'].std():.2f}")
print(f"F1 (mean): {cross_validation_result['test_f1'].mean():.2f} +- {cross_validation_result['test_f1'].std():.2f}")
print(f"ROC AUC (mean): {cross_validation_result['test_roc_auc'].mean():.2f} +- {cross_validation_result['test_roc_auc'].std():.2f}")

Cross validation results:
Accuracy (mean): 0.95 +- 0.02
Precision (mean): 0.75 +- 0.15
Recall (mean): 0.74 +- 0.21
F1 (mean): 0.72 +- 0.14
ROC AUC (mean): 0.98 +- 0.03


## Model Building with Hyperparameter Tuning


In [286]:
model = KNeighborsClassifier()
cv = StratifiedKFold(n_splits=10, shuffle=True, random_state=seed)
params = {
  'n_neighbors': [3,5,13],
  'leaf_size': [30, 20, 40]
}

grid_search = GridSearchCV(model, cv=cv, param_grid=params, scoring='f1' )

grid_search.fit(x_train, y_train)

print(f"Best params : {grid_search.best_params_}")

Best params : {'leaf_size': 30, 'n_neighbors': 3}



## Model Evaluation


In [287]:
best_model = grid_search.best_estimator_

best_model.fit(x_train, y_train)

y_pred = best_model.predict(x_test)

evaluate(y_test, y_pred)

Accuracy: 0.96
Precision: 0.76
Recall: 0.94
F1: 0.84


# Comparing Different Models Performance

## Model Building

In [288]:
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=seed)
kfold = KFold(n_splits=10, random_state=seed, shuffle=True)

models = {
    'Random Forest': RandomForestClassifier(random_state=seed),
    'Logistic Regression': LogisticRegression(random_state=seed),
    'Decision Tree': DecisionTreeClassifier(random_state=seed),
    'Gradient Boosting': GradientBoostingClassifier(random_state=seed),
    'KNN': KNeighborsClassifier(),
    'Perceptron': Perceptron(),
}

## Model Comparison

In [290]:
cross_validation_results = {}

for model_name, model in models.items():
    cross_validation_results[model_name]  = cross_val_score(model, x_train, y_train, cv=kfold, scoring='accuracy')


print("Cross validation results:")
cross_validation_df = pd.DataFrame(cross_validation_results).transpose()

cross_validation_df['mean'] = cross_validation_df.mean(axis=1)
cross_validation_df['std'] = cross_validation_df.std(axis=1)

cross_validation_df.sort_values('mean', ascending=False)

Cross validation results:


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,mean,std
Gradient Boosting,1.0,0.984375,0.984375,1.0,1.0,1.0,0.984375,1.0,1.0,0.96875,0.992188,0.010482
Decision Tree,1.0,0.984375,0.96875,1.0,1.0,1.0,0.984375,1.0,1.0,0.96875,0.990625,0.0125
Random Forest,0.96875,0.96875,0.984375,1.0,1.0,0.984375,0.984375,0.9375,1.0,0.96875,0.979688,0.018554
Logistic Regression,0.984375,0.96875,0.921875,0.953125,0.96875,0.984375,0.96875,0.953125,0.984375,0.9375,0.9625,0.02001
KNN,1.0,0.953125,0.9375,0.953125,0.984375,0.953125,1.0,0.890625,0.9375,0.953125,0.95625,0.031093
Perceptron,0.921875,0.921875,0.078125,0.9375,0.9375,0.90625,0.9375,0.828125,0.9375,0.078125,0.748437,0.336605
