# Pokemon Classification

In this notebook, we are going to make a model that can predict a Pokemon's type (Water, Fire, etc.) based on its stats!

In [1]:
import kagglehub
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
# Let's download our dataset from Kaggle
path = kagglehub.dataset_download("rounakbanik/pokemon")
df = pd.read_csv(f"{path}/pokemon.csv")
df.head()



Unnamed: 0,abilities,against_bug,against_dark,against_dragon,against_electric,against_fairy,against_fight,against_fire,against_flying,against_ghost,...,percentage_male,pokedex_number,sp_attack,sp_defense,speed,type1,type2,weight_kg,generation,is_legendary
0,"['Overgrow', 'Chlorophyll']",1.0,1.0,1.0,0.5,0.5,0.5,2.0,2.0,1.0,...,88.1,1,65,65,45,grass,poison,6.9,1,0
1,"['Overgrow', 'Chlorophyll']",1.0,1.0,1.0,0.5,0.5,0.5,2.0,2.0,1.0,...,88.1,2,80,80,60,grass,poison,13.0,1,0
2,"['Overgrow', 'Chlorophyll']",1.0,1.0,1.0,0.5,0.5,0.5,2.0,2.0,1.0,...,88.1,3,122,120,80,grass,poison,100.0,1,0
3,"['Blaze', 'Solar Power']",0.5,1.0,1.0,1.0,0.5,1.0,0.5,1.0,1.0,...,88.1,4,60,50,65,fire,,8.5,1,0
4,"['Blaze', 'Solar Power']",0.5,1.0,1.0,1.0,0.5,1.0,0.5,1.0,1.0,...,88.1,5,80,65,80,fire,,19.0,1,0


In [3]:
# Checkout column types
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 801 entries, 0 to 800
Data columns (total 41 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   abilities          801 non-null    object 
 1   against_bug        801 non-null    float64
 2   against_dark       801 non-null    float64
 3   against_dragon     801 non-null    float64
 4   against_electric   801 non-null    float64
 5   against_fairy      801 non-null    float64
 6   against_fight      801 non-null    float64
 7   against_fire       801 non-null    float64
 8   against_flying     801 non-null    float64
 9   against_ghost      801 non-null    float64
 10  against_grass      801 non-null    float64
 11  against_ground     801 non-null    float64
 12  against_ice        801 non-null    float64
 13  against_normal     801 non-null    float64
 14  against_poison     801 non-null    float64
 15  against_psychic    801 non-null    float64
 16  against_rock       801 non

In [4]:
# Let's start by removing some columns that are likely not very useful for us:
df_clean = df.drop(columns=["abilities", "capture_rate", "classfication", "japanese_name", "name", "pokedex_number", "type2", "generation", "is_legendary"])
df_clean.head()

Unnamed: 0,against_bug,against_dark,against_dragon,against_electric,against_fairy,against_fight,against_fire,against_flying,against_ghost,against_grass,...,defense,experience_growth,height_m,hp,percentage_male,sp_attack,sp_defense,speed,type1,weight_kg
0,1.0,1.0,1.0,0.5,0.5,0.5,2.0,2.0,1.0,0.25,...,49,1059860,0.7,45,88.1,65,65,45,grass,6.9
1,1.0,1.0,1.0,0.5,0.5,0.5,2.0,2.0,1.0,0.25,...,63,1059860,1.0,60,88.1,80,80,60,grass,13.0
2,1.0,1.0,1.0,0.5,0.5,0.5,2.0,2.0,1.0,0.25,...,123,1059860,2.0,80,88.1,122,120,80,grass,100.0
3,0.5,1.0,1.0,1.0,0.5,1.0,0.5,1.0,1.0,0.5,...,43,1059860,0.6,39,88.1,60,50,65,fire,8.5
4,0.5,1.0,1.0,1.0,0.5,1.0,0.5,1.0,1.0,0.5,...,58,1059860,1.1,58,88.1,80,65,80,fire,19.0


In [5]:
# Let's check for any missing values
df_clean.isnull().mean().loc[lambda x: x > 0]

height_m           0.024969
percentage_male    0.122347
weight_kg          0.024969
dtype: float64

In [6]:
# There are few missing values for height, weight, and % male. Let's just use the mean of the pokemon types
df_clean = df_clean.fillna(df_clean.groupby('type1').transform('mean'))
df_clean.isnull().mean().loc[lambda x: x > 0]

Series([], dtype: float64)

In [7]:
# Perform one-hot encoding on categorical target labels
df_clean = pd.get_dummies(df_clean, prefix="type", columns=["type1"]).astype(int)
df_clean.head()

Unnamed: 0,against_bug,against_dark,against_dragon,against_electric,against_fairy,against_fight,against_fire,against_flying,against_ghost,against_grass,...,type_ghost,type_grass,type_ground,type_ice,type_normal,type_poison,type_psychic,type_rock,type_steel,type_water
0,1,1,1,0,0,0,2,2,1,0,...,0,1,0,0,0,0,0,0,0,0
1,1,1,1,0,0,0,2,2,1,0,...,0,1,0,0,0,0,0,0,0,0
2,1,1,1,0,0,0,2,2,1,0,...,0,1,0,0,0,0,0,0,0,0
3,0,1,1,1,0,1,0,1,1,0,...,0,0,0,0,0,0,0,0,0,0
4,0,1,1,1,0,1,0,1,1,0,...,0,0,0,0,0,0,0,0,0,0


In [8]:
from sklearn.model_selection import train_test_split

y = df_clean.loc[:, df_clean.columns.str.startswith('type_')]
X = df_clean.loc[:, ~df_clean.columns.str.startswith('type_')]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.2)

In [9]:
# Now let's perform scaling to ensure data is on the same range
# And Let's start with a simple KNN
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsClassifier
from sklearn.pipeline import Pipeline

estimator = Pipeline([
    ('scaler', StandardScaler()),
    ('clf', KNeighborsClassifier(n_neighbors=4, weights='distance', p=1))
])

In [10]:
# Let's start with a simple KNN
estimator.fit(X_train, y_train)

In [11]:
from sklearn.metrics import classification_report, accuracy_score
y_hat = estimator.predict(X_test)
print(f"Accuracy: {accuracy_score(y_test, y_hat)}")
print(classification_report(y_test, y_hat, target_names=y.columns, zero_division=0))

Accuracy: 0.8509316770186336
               precision    recall  f1-score   support

     type_bug       0.93      0.87      0.90        15
    type_dark       1.00      0.62      0.77         8
  type_dragon       0.80      0.80      0.80         5
type_electric       0.86      1.00      0.92         6
   type_fairy       1.00      0.67      0.80         3
type_fighting       1.00      1.00      1.00         8
    type_fire       0.88      0.88      0.88         8
  type_flying       0.00      0.00      0.00         1
   type_ghost       0.67      1.00      0.80         4
   type_grass       1.00      0.96      0.98        23
  type_ground       0.50      0.60      0.55         5
     type_ice       1.00      0.67      0.80         6
  type_normal       0.90      0.95      0.92        19
  type_poison       0.80      0.80      0.80         5
 type_psychic       0.86      0.75      0.80         8
    type_rock       0.80      0.67      0.73        12
   type_steel       0.50      1.00 

In [12]:
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.model_selection import GridSearchCV
from sklearn.multioutput import MultiOutputClassifier

# Define the pipeline
estimator = Pipeline([
    ('sclr', StandardScaler()),
    ('clf', MultiOutputClassifier(LogisticRegression()))  # Wrap classifier
])

# Define parameter grid with 'estimator__' prefix for the underlying classifier
param_grid = [
    {
        'clf': [MultiOutputClassifier(LogisticRegression())],
        'clf__estimator__max_iter': [500, 1000, 5000],
        'clf__estimator__C': [0.01, 0.1, 1, 10, 100],
    },
    {
        'clf': [MultiOutputClassifier(RandomForestClassifier())],
        'clf__estimator__n_estimators': [50, 100, 500],
        'clf__estimator__min_samples_split': [2, 3, 5],
    },
    {
        'clf': [MultiOutputClassifier(SVC())],
        'clf__estimator__degree': [2, 3, 5],
        'clf__estimator__C': [0.01, 0.1, 1, 10, 100]
    },
]

# GridSearchCV
gs = GridSearchCV(estimator, param_grid, scoring='accuracy', refit=True, n_jobs=4)
gs.fit(X_train, y_train)


In [14]:
gs.best_params_

{'clf': MultiOutputClassifier(estimator=SVC()),
 'clf__estimator__C': 100,
 'clf__estimator__degree': 2}

In [17]:
estimator = gs.best_estimator_
y_hat = estimator.predict(X_test)
print('Accuracy:', accuracy_score(y_test, y_hat))
print(classification_report(y_test, y_hat, target_names=y.columns, zero_division=0))

Accuracy: 0.8322981366459627
               precision    recall  f1-score   support

     type_bug       0.93      0.87      0.90        15
    type_dark       1.00      0.88      0.93         8
  type_dragon       1.00      0.80      0.89         5
type_electric       1.00      1.00      1.00         6
   type_fairy       1.00      0.67      0.80         3
type_fighting       0.88      0.88      0.88         8
    type_fire       1.00      0.62      0.77         8
  type_flying       0.00      0.00      0.00         1
   type_ghost       0.57      1.00      0.73         4
   type_grass       1.00      0.91      0.95        23
  type_ground       0.67      0.80      0.73         5
     type_ice       0.67      0.33      0.44         6
  type_normal       1.00      1.00      1.00        19
  type_poison       0.80      0.80      0.80         5
 type_psychic       1.00      0.75      0.86         8
    type_rock       1.00      0.92      0.96        12
   type_steel       0.75      1.00 