In [43]:
import pandas as pd
from sklearn.neighbors import NearestNeighbors

# 1. Import data

In [44]:
df_pokemon = pd.read_csv('https://raw.githubusercontent.com/LucaSainteCroix/teaching-resources/main/exercises-data/pokemon.csv')
df_pokemon.head()

Unnamed: 0,#,Name,Type 1,Type 2,HP,Attack,Defense,Sp. Atk,Sp. Def,Speed,Generation,Legendary
0,1,Bulbasaur,Grass,Poison,45,49,49,65,65,45,1,False
1,2,Ivysaur,Grass,Poison,60,62,63,80,80,60,1,False
2,3,Venusaur,Grass,Poison,80,82,83,100,100,80,1,False
3,4,Mega Venusaur,Grass,Poison,80,100,123,122,120,80,1,False
4,5,Charmander,Fire,,39,52,43,60,50,65,1,False


In [45]:
df_pokemon.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 800 entries, 0 to 799
Data columns (total 12 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   #           800 non-null    int64 
 1   Name        800 non-null    object
 2   Type 1      800 non-null    object
 3   Type 2      414 non-null    object
 4   HP          800 non-null    int64 
 5   Attack      800 non-null    int64 
 6   Defense     800 non-null    int64 
 7   Sp. Atk     800 non-null    int64 
 8   Sp. Def     800 non-null    int64 
 9   Speed       800 non-null    int64 
 10  Generation  800 non-null    int64 
 11  Legendary   800 non-null    bool  
dtypes: bool(1), int64(8), object(3)
memory usage: 69.7+ KB


In [46]:
df_pokemon.isna().sum()

#               0
Name            0
Type 1          0
Type 2        386
HP              0
Attack          0
Defense         0
Sp. Atk         0
Sp. Def         0
Speed           0
Generation      0
Legendary       0
dtype: int64

# 2. Find closest pokemons

## 2.1 Prepare data

In [47]:
 # we noticed that there are missing values, we will not be able to identify these Type 2 so we can delete it
df_pokemon.drop("Type 2", axis=1, inplace=True)
df_pokemon.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 800 entries, 0 to 799
Data columns (total 11 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   #           800 non-null    int64 
 1   Name        800 non-null    object
 2   Type 1      800 non-null    object
 3   HP          800 non-null    int64 
 4   Attack      800 non-null    int64 
 5   Defense     800 non-null    int64 
 6   Sp. Atk     800 non-null    int64 
 7   Sp. Def     800 non-null    int64 
 8   Speed       800 non-null    int64 
 9   Generation  800 non-null    int64 
 10  Legendary   800 non-null    bool  
dtypes: bool(1), int64(8), object(2)
memory usage: 63.4+ KB


In [48]:
# The other solution would be to replace those with a 'no_type' type.

In [49]:
df_pokemon = pd.concat([df_pokemon, pd.get_dummies(df_pokemon["Type 1"])], axis = 1).drop(columns=['Type 1'])

In [50]:
df_pokemon = df_pokemon.drop(columns = ["#", "Generation"])
df_pokemon.head()

Unnamed: 0,Name,HP,Attack,Defense,Sp. Atk,Sp. Def,Speed,Legendary,Bug,Dark,...,Ghost,Grass,Ground,Ice,Normal,Poison,Psychic,Rock,Steel,Water
0,Bulbasaur,45,49,49,65,65,45,False,0,0,...,0,1,0,0,0,0,0,0,0,0
1,Ivysaur,60,62,63,80,80,60,False,0,0,...,0,1,0,0,0,0,0,0,0,0
2,Venusaur,80,82,83,100,100,80,False,0,0,...,0,1,0,0,0,0,0,0,0,0
3,Mega Venusaur,80,100,123,122,120,80,False,0,0,...,0,1,0,0,0,0,0,0,0,0
4,Charmander,39,52,43,60,50,65,False,0,0,...,0,0,0,0,0,0,0,0,0,0


In [52]:
number_columns = df_pokemon.select_dtypes('number').columns


In [53]:
from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler()

df_pokemon[number_columns] = scaler.fit_transform(df_pokemon[number_columns])

df_pokemon.head()

Unnamed: 0,Name,HP,Attack,Defense,Sp. Atk,Sp. Def,Speed,Legendary,Bug,Dark,...,Ghost,Grass,Ground,Ice,Normal,Poison,Psychic,Rock,Steel,Water
0,Bulbasaur,0.173228,0.237838,0.195556,0.298913,0.214286,0.228571,False,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,Ivysaur,0.232283,0.308108,0.257778,0.380435,0.285714,0.314286,False,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,Venusaur,0.311024,0.416216,0.346667,0.48913,0.380952,0.428571,False,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,Mega Venusaur,0.311024,0.513514,0.524444,0.608696,0.47619,0.428571,False,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,Charmander,0.149606,0.254054,0.168889,0.271739,0.142857,0.342857,False,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


## 2.2 Find non legendary pokemons (X)

In [54]:
# we create a new df without the legendary Pokemons (we cannot propose them to the champion)
# recommandations will come from that df, our model KNN will be fitted with this df
df_pokemon_nonlegendary = df_pokemon.loc[df_pokemon["Legendary"] == False]

In [55]:
df_pokemon_nonlegendary

Unnamed: 0,Name,HP,Attack,Defense,Sp. Atk,Sp. Def,Speed,Legendary,Bug,Dark,...,Ghost,Grass,Ground,Ice,Normal,Poison,Psychic,Rock,Steel,Water
0,Bulbasaur,0.173228,0.237838,0.195556,0.298913,0.214286,0.228571,False,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,Ivysaur,0.232283,0.308108,0.257778,0.380435,0.285714,0.314286,False,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,Venusaur,0.311024,0.416216,0.346667,0.489130,0.380952,0.428571,False,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,Mega Venusaur,0.311024,0.513514,0.524444,0.608696,0.476190,0.428571,False,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,Charmander,0.149606,0.254054,0.168889,0.271739,0.142857,0.342857,False,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
787,Gourgeist Super Size,0.330709,0.513514,0.520000,0.260870,0.261905,0.280000,False,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
788,Bergmite,0.212598,0.345946,0.355556,0.119565,0.071429,0.131429,False,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
789,Avalugg,0.370079,0.605405,0.795556,0.184783,0.123810,0.131429,False,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
790,Noibat,0.153543,0.135135,0.133333,0.190217,0.095238,0.285714,False,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [57]:
# keep only numeric variable, remove column # which as no use as an explanatory variable
X = df_pokemon_nonlegendary.drop(columns = ["Name", "Legendary"])
X

Unnamed: 0,HP,Attack,Defense,Sp. Atk,Sp. Def,Speed,Bug,Dark,Dragon,Electric,...,Ghost,Grass,Ground,Ice,Normal,Poison,Psychic,Rock,Steel,Water
0,0.173228,0.237838,0.195556,0.298913,0.214286,0.228571,0.0,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.232283,0.308108,0.257778,0.380435,0.285714,0.314286,0.0,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.311024,0.416216,0.346667,0.489130,0.380952,0.428571,0.0,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.311024,0.513514,0.524444,0.608696,0.476190,0.428571,0.0,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.149606,0.254054,0.168889,0.271739,0.142857,0.342857,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
787,0.330709,0.513514,0.520000,0.260870,0.261905,0.280000,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
788,0.212598,0.345946,0.355556,0.119565,0.071429,0.131429,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
789,0.370079,0.605405,0.795556,0.184783,0.123810,0.131429,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
790,0.153543,0.135135,0.133333,0.190217,0.095238,0.285714,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


## 2.3 Fit the model

In [58]:
# fit model with 3 neighbors
# note that this model is trained with X for which variables come from df_pokemon_nonlegendary
modelNN = NearestNeighbors(n_neighbors=3)
modelNN.fit(X)

## 2.4 Find champion's pokemon

In [59]:
# make a list of the champion's Pokemons
df_pokemon.loc[df_pokemon['Name'].str.contains('Giratina')]  # warning : Giratina has two different names

Unnamed: 0,Name,HP,Attack,Defense,Sp. Atk,Sp. Def,Speed,Legendary,Bug,Dark,...,Ghost,Grass,Ground,Ice,Normal,Poison,Psychic,Rock,Steel,Water
544,Giratina Altered Forme,0.586614,0.513514,0.511111,0.48913,0.47619,0.485714,True,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
545,Giratina Origin Forme,0.586614,0.621622,0.422222,0.597826,0.380952,0.485714,True,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [60]:
list_champion = ['Mewtwo','Lugia','Rayquaza','Giratina Origin Forme', 'Dialga', 'Palkia']

## 2.5 Find closest pokemons

In [64]:
# for each Pokemon of the list find 3 nearest neighbours
for Pokemon in list_champion:
    # for a Pokemon, pass only same columns as X that our model needs
    neighbors = modelNN.kneighbors(df_pokemon.loc[df_pokemon['Name'] == Pokemon, X.columns])
    print(f"Recommandations for Pokemon {Pokemon} :")
    # find row number (not row name) from the nearest neighbors into dataframe on which the model was fitted
    closest_pok_ind = neighbors[1][0]
    closest_pok = df_pokemon_nonlegendary['Name'].iloc[closest_pok_ind] # ! warning ! we can use df_pokemon_nonlegendary because X is based on this dataframe
    print("Closest Pokemons : ", list(closest_pok))
    print("Respectives distances : ", neighbors[0][0])
    print()

Recommandations for Pokemon Mewtwo :
Closest Pokemons :  ['Celebi', 'Mew', 'Mega Gardevoir']
Respectives distances :  [0.35105551 0.35105551 0.36313814]

Recommandations for Pokemon Lugia :
Closest Pokemons :  ['Cresselia', 'Celebi', 'Mew']
Respectives distances :  [0.23836078 0.30593964 0.30593964]

Recommandations for Pokemon Rayquaza :
Closest Pokemons :  ['Mega Garchomp', 'Salamence', 'Mega Salamence']
Respectives distances :  [0.22719263 0.24585022 0.28438836]

Recommandations for Pokemon Giratina Origin Forme :
Closest Pokemons :  ['Drifblim', 'Trevenant', 'Mega Banette']
Respectives distances :  [0.43188595 0.46285001 0.47072687]

Recommandations for Pokemon Dialga :
Closest Pokemons :  ['Mega Metagross', 'Metagross', 'Klinklang']
Respectives distances :  [0.342608   0.34558251 0.48074858]

Recommandations for Pokemon Palkia :
Closest Pokemons :  ['Mega Blastoise', 'Manaphy', 'Keldeo Ordinary Forme']
Respectives distances :  [0.2030705  0.31008034 0.32378038]



In [13]:
# Do the Same nut now filter by Type 1 before

In [14]:
# for each Pokemon of the list find 3 nearest neighbours
for Pokemon in list_champion:
    type_1 = df_pokemon.loc[df_pokemon['Name'] == Pokemon]['Type 1'].values[0]

    same_type_pokemons = df_pokemon_nonlegendary.loc[df_pokemon_nonlegendary['Type 1'] == type_1]
    X = same_type_pokemons.select_dtypes('number').drop(columns = "#")

    modelNN = NearestNeighbors(n_neighbors=3)
    modelNN.fit(X)
    # for a Pokemon, pass only same columns as X that our model needs
    neighbors = modelNN.kneighbors(df_pokemon.loc[df_pokemon['Name'] == Pokemon, X.columns])
    print(f"Recommandations for Pokemon {Pokemon} (Type 1 {type_1}):")
    # find row number (not row name) from the nearest neighbors into dataframe on which the model was fitted
    closest_pok_ind = neighbors[1][0]
    closest_pok = same_type_pokemons['Name'].iloc[closest_pok_ind] # ! warning ! we can use df_pokemon_nonlegendary because X is based on this dataframe
    print("Closest Pokemons : ", list(closest_pok))
    print("Respectives distances : ", neighbors[0][0])
    print()

Recommandations for Pokemon Mewtwo (Type 1 Psychic):
Closest Pokemons :  ['Mew', 'Celebi', 'Espeon']
Respectives distances :  [64.43601477 64.44377394 74.88658091]

Recommandations for Pokemon Lugia (Type 1 Psychic):
Closest Pokemons :  ['Cresselia', 'Celebi', 'Mew']
Respectives distances :  [46.10856753 64.43601477 64.44377394]

Recommandations for Pokemon Rayquaza (Type 1 Dragon):
Closest Pokemons :  ['Mega Garchomp', 'Salamence', 'Mega Salamence']
Respectives distances :  [44.37341546 46.36809248 57.00877125]

Recommandations for Pokemon Giratina Origin Forme (Type 1 Ghost):
Closest Pokemons :  ['Drifblim', 'Trevenant', 'Gourgeist Super Size']
Respectives distances :  [ 88.61151167  97.0051545  104.29765098]

Recommandations for Pokemon Dialga (Type 1 Steel):
Closest Pokemons :  ['Metagross', 'Mega Metagross', 'Klinklang']
Respectives distances :  [65.19969325 66.71581522 93.01075207]

Recommandations for Pokemon Palkia (Type 1 Water):
Closest Pokemons :  ['Mega Blastoise', 'Manaphy