In [1]:
#  import required libraries
import pandas as pd
import numpy as np
import tensorflow as tf

In [66]:
#  read data
file=r'Pokemon.csv'
data=pd.read_csv(file)

In [67]:
data.head()

Unnamed: 0,Name,Type 1,Type 2,Total,HP,Attack,Defense,Sp. Atk,Sp. Def,Speed,Generation,Legendary
0,Bulbasaur,Grass,Poison,318,45,49,49,65,65,45,Gen 1,False
1,Ivysaur,Grass,Poison,405,60,62,63,80,80,60,Gen 1,False
2,Venusaur,Grass,Poison,525,80,82,83,100,100,80,Gen 1,False
3,VenusaurMega Venusaur,Grass,Poison,625,80,100,123,122,120,80,Gen 1,False
4,Charmander,Fire,,309,39,52,43,60,50,65,Gen 1,False


In [68]:
data['Legendary'].value_counts(normalize=True)

Legendary
False    0.91875
True     0.08125
Name: proportion, dtype: float64

In [69]:
data['Legendary'].dtype

dtype('bool')

### Data cleaning

In [70]:
#  check missing values
data.isna().sum()

Name            0
Type 1          0
Type 2        386
Total           0
HP              0
Attack          0
Defense         0
Sp. Atk         0
Sp. Def         0
Speed           0
Generation      0
Legendary       0
dtype: int64

In [71]:
#  fill missing values
data['Type 2']=data['Type 2'].fillna('missing')

In [72]:
# check
data.isna().sum()

Name          0
Type 1        0
Type 2        0
Total         0
HP            0
Attack        0
Defense       0
Sp. Atk       0
Sp. Def       0
Speed         0
Generation    0
Legendary     0
dtype: int64

In [73]:
data.shape

(800, 12)

In [74]:
#  creating target
data['Legendary']=data['Legendary'].astype(int)

In [75]:
data.head()

Unnamed: 0,Name,Type 1,Type 2,Total,HP,Attack,Defense,Sp. Atk,Sp. Def,Speed,Generation,Legendary
0,Bulbasaur,Grass,Poison,318,45,49,49,65,65,45,Gen 1,0
1,Ivysaur,Grass,Poison,405,60,62,63,80,80,60,Gen 1,0
2,Venusaur,Grass,Poison,525,80,82,83,100,100,80,Gen 1,0
3,VenusaurMega Venusaur,Grass,Poison,625,80,100,123,122,120,80,Gen 1,0
4,Charmander,Fire,missing,309,39,52,43,60,50,65,Gen 1,0


In [76]:
data.drop(['Name','Total'],axis=1,inplace=True)

In [88]:
data_dummy=pd.get_dummies(data,drop_first=True,dtype=int)

In [89]:
data_dummy.shape

(800, 47)

In [90]:
data_dummy.head()

Unnamed: 0,HP,Attack,Defense,Sp. Atk,Sp. Def,Speed,Legendary,Type 1_Dark,Type 1_Dragon,Type 1_Electric,...,Type 2_Psychic,Type 2_Rock,Type 2_Steel,Type 2_Water,Type 2_missing,Generation_Gen 2,Generation_Gen 3,Generation_Gen 4,Generation_Gen 5,Generation_Gen 6
0,45,49,49,65,65,45,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,60,62,63,80,80,60,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,80,82,83,100,100,80,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,80,100,123,122,120,80,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,39,52,43,60,50,65,0,0,0,0,...,0,0,0,0,1,0,0,0,0,0


## Machine learning approach without Dimension reduction 

In [91]:
data1=data_dummy.copy()

In [92]:
y=data1['Legendary']
x=data1.drop('Legendary',axis=1)

In [93]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score

In [94]:
rf=RandomForestClassifier(n_estimators=100,n_jobs=-1,bootstrap=True)

In [95]:
x.dtypes

HP                  int64
Attack              int64
Defense             int64
Sp. Atk             int64
Sp. Def             int64
Speed               int64
Type 1_Dark         int64
Type 1_Dragon       int64
Type 1_Electric     int64
Type 1_Fairy        int64
Type 1_Fighting     int64
Type 1_Fire         int64
Type 1_Flying       int64
Type 1_Ghost        int64
Type 1_Grass        int64
Type 1_Ground       int64
Type 1_Ice          int64
Type 1_Normal       int64
Type 1_Poison       int64
Type 1_Psychic      int64
Type 1_Rock         int64
Type 1_Steel        int64
Type 1_Water        int64
Type 2_Dark         int64
Type 2_Dragon       int64
Type 2_Electric     int64
Type 2_Fairy        int64
Type 2_Fighting     int64
Type 2_Fire         int64
Type 2_Flying       int64
Type 2_Ghost        int64
Type 2_Grass        int64
Type 2_Ground       int64
Type 2_Ice          int64
Type 2_Normal       int64
Type 2_Poison       int64
Type 2_Psychic      int64
Type 2_Rock         int64
Type 2_Steel

In [96]:
scores=cross_val_score(rf,x,y,scoring='roc_auc',cv=5)

In [97]:
scores

array([0.94531659, 0.80873888, 0.88016745, 0.87885924, 0.97409733])

In [98]:
np.mean(scores)

0.8974358974358975

## Machin learning approach with Dimention reduction technique

In [99]:
from tensorflow.keras.models import Model,Sequential
from tensorflow.keras.layers import Dense,Input

In [100]:
x.shape

(800, 46)

In [102]:
#  before feeding all data to network use scaling
from sklearn.preprocessing import MinMaxScaler
scale=MinMaxScaler()
x_scale=scale.fit_transform(x)

In [104]:
x_scale.shape,x.shape

((800, 46), (800, 46))

In [105]:
embedding_dim=5

inputs=Input(shape=(x.shape[1],))

dense1=Dense(15,activation='relu')(inputs)

embedded_output=Dense(embedding_dim)(dense1)  # ------->>>>>>>>>. embed output

outputs=Dense(x.shape[1],activation='softmax')(embedded_output)

model=Model(inputs=inputs,outputs=outputs)

embedder=Model(inputs=inputs,outputs=embedded_output)

In [109]:
model.compile(optimizer='adam',loss='categorical_crossentropy',
              metrics=['mse'])
model.fit(x=x_scale,y=x_scale,epochs=100,batch_size=100)

Epoch 1/100


[1m8/8[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step - loss: 17.6503 - mse: 0.0710  
Epoch 2/100
[1m8/8[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 703us/step - loss: 17.4836 - mse: 0.0705
Epoch 3/100
[1m8/8[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 642us/step - loss: 17.6688 - mse: 0.0716
Epoch 4/100
[1m8/8[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 722us/step - loss: 17.8528 - mse: 0.0725
Epoch 5/100
[1m8/8[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 687us/step - loss: 17.6554 - mse: 0.0718
Epoch 6/100
[1m8/8[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 753us/step - loss: 17.4216 - mse: 0.0709
Epoch 7/100
[1m8/8[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 678us/step - loss: 17.3646 - mse: 0.0707
Epoch 8/100
[1m8/8[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 651us/step - loss: 17.3342 - mse: 0.0709
Epoch 9/100
[1m8/8[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 659us/step - loss:

<keras.src.callbacks.history.History at 0x15db13a30>

In [110]:
low_dim_type1=pd.DataFrame(embedder.predict(x_scale),columns=['t1_emb1','t1_emb2','t1_emb3','t1_emb4','t1_emb5'])
low_dim_type1.head()

[1m25/25[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 323us/step


Unnamed: 0,t1_emb1,t1_emb2,t1_emb3,t1_emb4,t1_emb5
0,146.957977,-133.468216,-151.063629,153.824814,-150.442459
1,161.95517,-147.083069,-166.496735,169.579865,-165.812851
2,182.649155,-165.860077,-187.789627,191.315201,-187.021118
3,199.629227,-181.229706,-205.218735,209.095306,-204.451096
4,149.088562,-135.396362,-153.200302,156.107147,-152.42421


In [111]:
low_dim_type1['Target']=y

In [112]:
x1=low_dim_type1.drop('Target',axis=1)
y1=low_dim_type1['Target']

In [113]:
rf=RandomForestClassifier(n_estimators=100,n_jobs=-1,bootstrap=True)

In [114]:
scores=cross_val_score(rf,x1,y1,scoring='roc_auc',cv=5)

In [115]:
scores.mean()

0.9065934065934066