In [25]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix, roc_curve, auc, precision_recall_curve, average_precision_score
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, classification_report
from xgboost import XGBClassifier

In [2]:
data=pd.read_csv("archive/Pokemon.csv")

In [3]:
data

Unnamed: 0,#,Name,Type 1,Type 2,Total,HP,Attack,Defense,Sp. Atk,Sp. Def,Speed,Generation,Legendary
0,1,Bulbasaur,Grass,Poison,318,45,49,49,65,65,45,1,False
1,2,Ivysaur,Grass,Poison,405,60,62,63,80,80,60,1,False
2,3,Venusaur,Grass,Poison,525,80,82,83,100,100,80,1,False
3,3,VenusaurMega Venusaur,Grass,Poison,625,80,100,123,122,120,80,1,False
4,4,Charmander,Fire,,309,39,52,43,60,50,65,1,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...
795,719,Diancie,Rock,Fairy,600,50,100,150,100,150,50,6,True
796,719,DiancieMega Diancie,Rock,Fairy,700,50,160,110,160,110,110,6,True
797,720,HoopaHoopa Confined,Psychic,Ghost,600,80,110,60,150,130,70,6,True
798,720,HoopaHoopa Unbound,Psychic,Dark,680,80,160,60,170,130,80,6,True


In [4]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 800 entries, 0 to 799
Data columns (total 13 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   #           800 non-null    int64 
 1   Name        800 non-null    object
 2   Type 1      800 non-null    object
 3   Type 2      414 non-null    object
 4   Total       800 non-null    int64 
 5   HP          800 non-null    int64 
 6   Attack      800 non-null    int64 
 7   Defense     800 non-null    int64 
 8   Sp. Atk     800 non-null    int64 
 9   Sp. Def     800 non-null    int64 
 10  Speed       800 non-null    int64 
 11  Generation  800 non-null    int64 
 12  Legendary   800 non-null    bool  
dtypes: bool(1), int64(9), object(3)
memory usage: 75.9+ KB


In [5]:
data.fillna("None", inplace=True)
data['Mega_Evolution']=np.where(data['Name'].str.contains('Mega'), 'Yes', 'No')
data.drop(['#','Name'], axis=1, inplace=True)

In [6]:
data

Unnamed: 0,Type 1,Type 2,Total,HP,Attack,Defense,Sp. Atk,Sp. Def,Speed,Generation,Legendary,Mega_Evolution
0,Grass,Poison,318,45,49,49,65,65,45,1,False,No
1,Grass,Poison,405,60,62,63,80,80,60,1,False,No
2,Grass,Poison,525,80,82,83,100,100,80,1,False,No
3,Grass,Poison,625,80,100,123,122,120,80,1,False,Yes
4,Fire,,309,39,52,43,60,50,65,1,False,No
...,...,...,...,...,...,...,...,...,...,...,...,...
795,Rock,Fairy,600,50,100,150,100,150,50,6,True,No
796,Rock,Fairy,700,50,160,110,160,110,110,6,True,Yes
797,Psychic,Ghost,600,80,110,60,150,130,70,6,True,No
798,Psychic,Dark,680,80,160,60,170,130,80,6,True,No


In [7]:
categorical_cols = ['Type 1', 'Type 2', 'Legendary']
data[categorical_cols] = data[categorical_cols].astype(str) 

encoder = OneHotEncoder(drop='first', sparse_output=False)
encoded_features = encoder.fit_transform(data[categorical_cols])
encoded_df = pd.DataFrame(encoded_features, columns=encoder.get_feature_names_out(categorical_cols))
data = data.drop(columns=categorical_cols).reset_index(drop=True)
data = pd.concat([data, encoded_df], axis=1)
data['Mega_Evolution'] = data['Mega_Evolution'].map({'No': 0, 'Yes': 1})

In [8]:
data

Unnamed: 0,Total,HP,Attack,Defense,Sp. Atk,Sp. Def,Speed,Generation,Mega_Evolution,Type 1_Dark,...,Type 2_Ground,Type 2_Ice,Type 2_None,Type 2_Normal,Type 2_Poison,Type 2_Psychic,Type 2_Rock,Type 2_Steel,Type 2_Water,Legendary_True
0,318,45,49,49,65,65,45,1,0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
1,405,60,62,63,80,80,60,1,0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
2,525,80,82,83,100,100,80,1,0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
3,625,80,100,123,122,120,80,1,1,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
4,309,39,52,43,60,50,65,1,0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
795,600,50,100,150,100,150,50,6,0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
796,700,50,160,110,160,110,110,6,1,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
797,600,80,110,60,150,130,70,6,0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
798,680,80,160,60,170,130,80,6,0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0


In [9]:
X = data.drop(columns=['Mega_Evolution'])
y=data['Mega_Evolution']

In [10]:
X_train,X_test,y_train,y_test = train_test_split(X, y, test_size=0.2, stratify=y, random_state=42)

In [11]:
print(" y train distribution:\n", y_train.value_counts())
print("y test distribution:\n", y_test.value_counts())

 y train distribution:
 Mega_Evolution
0    601
1     39
Name: count, dtype: int64
y test distribution:
 Mega_Evolution
0    150
1     10
Name: count, dtype: int64


In [12]:
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

In [47]:
svm = SVC(kernel='linear', C=100)
svm.fit(X_train, y_train)
y_pred_svm = svm.predict(X_test)

In [48]:
print("SVM Accuracy:", accuracy_score(y_test, y_pred_svm))

SVM Accuracy: 0.96875


In [29]:
rf = RandomForestClassifier(n_estimators=10, random_state=55)
rf.fit(X_train, y_train)
y_pred_rf = rf.predict(X_test)

In [30]:
print("Random Forest Accuracy:", accuracy_score(y_test, y_pred_rf))

Random Forest Accuracy: 0.9625


In [28]:
xgb = XGBClassifier(eval_metric='logloss', random_state=42)
xgb.fit(X_train, y_train)
xgb_pred = xgb.predict(X_test)
print("\nXGBoost Accuracy:", accuracy_score(y_test, xgb_pred))


XGBoost Accuracy: 0.96875
