In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
for folder, _subdirs, entries in os.walk('/kaggle/input'):
    for entry in entries:
        full_path = os.path.join(folder, entry)
        print(full_path)

# You can write up to 5GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/pokemon-database/Pokemon Database.csv


In [2]:
import pandas as pd
import sklearn as skl
import plotly as plt
import csv

# NOTE: `plt` in this notebook is the plotly package (see `import plotly as plt` above),
# not matplotlib.pyplot as the alias might suggest.
# Switch plotly into offline notebook mode so figures render inline.
# NOTE(review): connected=True presumably loads plotly.js from a CDN instead of
# embedding it in the notebook - confirm against the plotly documentation.
plt.offline.init_notebook_mode(connected=True)

First, we load the data, do some preprocessing, keep the columns we are interested in, and rename them in the process: 

In [3]:
data_path = "/kaggle/input/pokemon-database/Pokemon Database.csv"

# Load the CSV and key every row by its 'Pokemon Id'.
df_raw = pd.read_csv(data_path).set_index('Pokemon Id')

# Rows that reference an 'Original Pokemon ID' take the 'Legendary Type'
# of the row they point at.
has_original = df_raw['Original Pokemon ID'].notna()
original_ids = df_raw.loc[has_original, 'Original Pokemon ID']
df_raw.loc[has_original, 'Legendary Type'] = list(df_raw.loc[original_ids, 'Legendary Type'])

In [4]:
# Raw CSV column name -> short column name used in the rest of the notebook.
# Only the columns listed here are kept.
column_name_dict = {
    'Pokedex Number': 'nid',
    'Pokemon Name': 'name',
    'Alternate Form Name': 'form',
    'Legendary Type': 'legendary',
    'Pokemon Height': 'height',
    'Pokemon Weight': 'weight',
    'Primary Type': 'type_1',
    'Secondary Type': 'type_2',
    'Health Stat': 'hp',
    'Attack Stat': 'atk',
    'Defense Stat': 'def',
    'Special Attack Stat': 'satk',
    'Special Defense Stat': 'sdef',
    'Speed Stat': 'spd',
    'Base Stat Total': 'bst',
    'EV Yield Total': 'ev_total',
}


def _strip_quotes(value):
    """Return `value` without double quotes; non-string values (e.g. NaN) become ''."""
    return value.replace('"', '') if isinstance(value, str) else ''


# Build `df` in one chain on a fresh frame. The previous version assigned columns
# onto a slice of `df_raw`, which triggered pandas' SettingWithCopyWarning; selecting,
# renaming and assigning in a chain never writes to a view and keeps the cell idempotent.
df = (
    df_raw[list(column_name_dict)]
    .rename(columns=column_name_dict)
    .assign(
        name=lambda d: d['name'].str.replace('"', '', regex=False),
        form=lambda d: d['form'].map(_strip_quotes),
        type_1=lambda d: d['type_1'].map(_strip_quotes),
        type_2=lambda d: d['type_2'].map(_strip_quotes),
    )
    # NOTE(review): 'legendary' only has its missing values filled; any double quotes
    # in its values are left in place, unlike the text columns above.
    .fillna(value={'form': '', 'legendary': ''})
    # Turn the 'Pokemon Id' index back into an ordinary column.
    .reset_index()
)
df.head()



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



Unnamed: 0,Pokemon Id,nid,name,form,legendary,height,weight,type_1,type_2,hp,atk,def,satk,sdef,spd,bst,ev_total
0,1,1,Bulbasaur,,,0.7,6.9,Grass,Poison,45,49,49,65,65,45,318,1
1,2,2,Ivysaur,,,1.0,13.0,Grass,Poison,60,62,63,80,80,60,405,2
2,3,3,Venusaur,,,2.0,100.0,Grass,Poison,80,82,83,100,100,80,525,3
3,4,3,Venusaur,Mega,,2.4,155.5,Grass,Poison,80,100,123,122,120,80,625,3
4,5,4,Charmander,,,0.6,8.5,Fire,,39,52,43,60,50,65,309,1


Next, we extract the numeric columns we are going to use and normalize them: 

In [5]:
# The six base stats; 'ev_total' is added to them for the analysis below.
COL_STATS = ['hp', 'atk', 'def', 'satk', 'sdef', 'spd']
# Every distinct primary type, alphabetically.
TYPE_LIST = sorted(set(df['type_1']))

df_stats = df[[*COL_STATS, 'ev_total']]
summary_before = df_stats.describe().loc[['mean', 'std']]
display(summary_before.style.set_caption('Before normalization'))

def normalize(df, population=None):
    """Standardize the columns of `df` to z-scores.

    Parameters
    ----------
    df : pandas.DataFrame
        Numeric frame to standardize.
    population : pandas.DataFrame, optional
        Frame whose per-column mean and standard deviation define the scaling.
        Defaults to `df` itself, so a subset can be scaled with the statistics
        of the full data set by passing the full frame here.

    Returns
    -------
    pandas.DataFrame
        `(df - mean) / std`, column by column. Columns with zero spread are not
        special-cased.
    """
    if population is None:
        population = df
    # mean()/std() yield the same numbers as the 'mean'/'std' rows of describe()
    # (std is the sample standard deviation) without also computing quantiles.
    center = population.mean()
    spread = population.std()
    return (df - center) / spread

# Sanity check: after normalization every column should have mean 0 and std 1.
summary_after = normalize(df_stats).describe().loc[['mean', 'std']]
display(summary_after.style.set_caption('After normalization'))

Unnamed: 0,hp,atk,def,satk,sdef,spd,ev_total
mean,70.518587,80.942379,74.951673,73.317844,72.483271,68.844796,1.934015
std,26.805946,32.413067,31.099652,32.555803,27.872579,30.066764,0.753482


Unnamed: 0,hp,atk,def,satk,sdef,spd,ev_total
mean,0.0,0.0,0.0,0.0,0.0,0.0,-0.0
std,1.0,1.0,1.0,1.0,1.0,1.0,1.0


After that we can perform PCA (Principal Component Analysis) on them: 

In [6]:
from sklearn.decomposition import PCA

# Fit a full PCA (all components kept) on the standardized stats.
pca = PCA(random_state=227)
pca.fit(normalize(df_stats))
pcs = pca.components_

# Row labels shared by both tables: pc0, pc1, ...
pc_labels = [f"pc{i}" for i in range(len(pcs))]

# Fraction of the total variance explained by each component.
df_var_r = pd.DataFrame({'var_r': pca.explained_variance_ratio_}, index=pc_labels)

# Loadings: one row per component, one column per input feature.
df_pc = pd.DataFrame(pcs, index=pc_labels, columns=COL_STATS + ['ev_total'])

In [7]:
# Loadings next to the explained-variance ratio, colour-coded over the whole table.
loadings_and_variance = pd.concat([df_pc, df_var_r], axis=1)
display(
    loadings_and_variance.style
    .background_gradient(cmap='bwr_r', axis=None)
    .format("{:.3}")
)

# Undo the normalization by scaling each loading with the std of its column,
# then rescale pc0 so that its ev_total entry equals 1.
df_pc_revNorm = df_pc.mul(df_stats.describe().loc['std'], axis='columns')
pc0_ev_total = df_pc_revNorm.loc['pc0', 'ev_total']
df_pc_revNorm.loc['pc0'] = df_pc_revNorm.loc['pc0'] / pc0_ev_total
display(df_pc_revNorm.loc[['pc0']].style.format("{:.4}"))

Unnamed: 0,hp,atk,def,satk,sdef,spd,ev_total,var_r
pc0,0.359,0.381,0.332,0.391,0.391,0.277,0.483,0.494
pc1,0.123,0.00118,0.595,-0.305,0.243,-0.69,-0.0562,0.16
pc2,-0.311,-0.65,-0.0236,0.386,0.573,-0.0469,0.0105,0.115
pc3,0.782,-0.259,-0.451,0.138,-0.00626,-0.314,0.00807,0.0962
pc4,0.258,-0.295,0.0477,-0.698,0.318,0.505,0.0253,0.062
pc5,-0.0472,0.511,-0.369,-0.0539,0.58,-0.0571,-0.507,0.0385
pc6,0.281,-0.129,0.439,0.311,-0.147,0.297,-0.711,0.0334


Unnamed: 0,hp,atk,def,satk,sdef,spd,ev_total
pc0,26.39,33.89,28.37,34.91,29.92,22.87,1.0


Here we can see some interesting results. I include the **total EV (effort value) yield** because it usually correlates with what "stage" a Pokemon is at: for example, in 3-stage evolution lines it goes from 1 to 3 across the stages, and for single-stage legendary Pokemon it's usually 3. This also works for "intermediate" single-stage Pokemon like Skarmory, Druddigon, and Torkoal, all of which have a total EV yield of 2. 

`pc0` clearly shows how the stats of a Pokemon grow with its stage. It also explains nearly 50% of the variation in the input data. By reversing the normalization and doing some calculations with `pc0`, we can see that when the total EV yield goes up by 1, the total stat increases by about 176.9 in general.

Among the components besides `pc0`, `pc1` and `pc2` take up the main portion of the variation; both have nearly 0 on total EV yield, and each explains more than 10% of the data variation. `pc1` shows the tradeoff mainly between **defense** and **speed**, which appears in game as the difference between *tanks* and *sweepers*. That is, Pokemon with more positive `pc1` components tend to be defensive and bulkier and take less damage, but at the same time are slower and usually have less attack. On the other hand, Pokemon with more negative `pc1` components are faster and can hit before their opponents more easily, at the cost of lower defense and HP. 

`pc2` shows the difference between *physical-oriented* Pokemon and *special-oriented* Pokemon. In the game, there are 3 types of moves: *physical*, *special*, and *status*. *Physical moves* usually calculate their damage from the physical stats (**attack** and **defense**), and *special moves* from the special stats (**special attack** and **special defense**). *Status moves* inflict a status on various objects, for example the user itself, the target, or even the game field. Some examples of the effects of status moves include increasing the attack of the user, decreasing the defense of the target, or altering the weather of the game field. Since the effects of different types of moves depend on different stats, it's normal that players categorize Pokemon as more physical-oriented or more special-oriented. One example is the infamous "SkarBliss" combination, which consists of Skarmory and Blissey:  

In [8]:
# Show the rows of the two "SkarBliss" members.
df.loc[df['name'].isin(['Skarmory', 'Blissey'])]

Unnamed: 0,Pokemon Id,nid,name,form,legendary,height,weight,type_1,type_2,hp,atk,def,satk,sdef,spd,bst,ev_total
220,351,227,Skarmory,,,1.7,50.5,Steel,Flying,65,80,140,40,70,70,465,2
235,366,242,Blissey,,,1.5,46.8,Normal,,255,10,10,75,135,55,540,3


For the reason given above, attackers are often either physical attackers or special attackers based on their stats, and are then usually given the corresponding type of moves. In the "SkarBliss" combination, Skarmory has high defense (physical wall) and Blissey has high special defense (special wall) and high HP. At the same time, Skarmory also has moves that set up entry hazards (which damage opposing Pokemon when they switch in) and moves that force the opponent to switch (a role which is often called phaser). 

`pc5` and `pc6` are somewhat interesting. Although they only take up 6% of the explained variation, both of them mostly increase the base stats while decreasing total EV yield. Furthermore, the stats increased by `pc5` and by `pc6` are complementary, meaning Pokemon with positive `pc5` or `pc6` components will have higher stats compared to other Pokemon with the same total EV yield. What does this mean? We will see after we analyze deeper...

To explore the data further and take other factors into account, for example the types of the Pokemon and whether the Pokemon is legendary, I chart the data using an interactive scatter plot. This also uses the dropdown widgets from `ipywidgets` for selecting the x axis, y axis, and the color. You probably need to edit the notebook to make the widgets work. The size of each dot represents the total EV yield of that Pokemon. 

In [9]:
import plotly.express as px
import ipywidgets as widgets

# Pokemon type -> CSS colour name used for that type in the scatter plots.
TYPE_COLOR_MAP = dict(
    Bug='lightgreen',
    Dark='black',
    Dragon='blue',
    Electric='yellow',
    Fairy='fuchsia',
    Fighting='orange',
    Fire='red',
    Flying='skyblue',
    Ghost='midnightblue',
    Grass='green',
    Ground='brown',
    Ice='aqua',
    Normal='gray',
    Poison='purple',
    Psychic='violet',
    Rock='teal',
    Steel='silver',
    Water='navy',
)

# PC scores of every Pokemon. The index is taken from `df` explicitly so the
# column-wise concat below lines rows up by construction rather than by both
# frames happening to share a default index.
pc_scores = pd.DataFrame(
    pca.transform(normalize(df_stats)),
    columns=[f"pc{i}" for i in range(len(pcs))],
    index=df.index,
)
df_plot = pd.concat(
    [df[['nid', 'name', 'form', 'type_1', 'type_2', 'ev_total', 'legendary']], pc_scores],
    axis=1,
)

# Fill the blanks with .loc instead of chained assignment
# (`df_plot.col[mask] = ...`), so the pandas chained-assignment warning no longer
# has to be switched off globally for this cell to run cleanly.
# Single-type Pokemon reuse their primary type as the secondary type.
no_second_type = df_plot['type_2'] == ''
df_plot.loc[no_second_type, 'type_2'] = df_plot.loc[no_second_type, 'type_1']
# Empty labels are shown as the text 'None' in the plots.
df_plot.loc[df_plot['legendary'] == '', 'legendary'] = 'None'
df_plot.loc[df_plot['form'] == '', 'form'] = 'None'

def show_pcs_fig(df):
    """Bind `df` and return a function that draws a scatter plot of it.

    The returned function takes `x_axis`, `y_axis` and `color` (all column
    names of `df`) and returns the plotly figure. Marker size follows
    'ev_total'; the name, form and legendary columns are shown on hover.
    """
    def scatter_for(x_axis, y_axis, color):
        # Options that stay the same for every plot drawn from this frame.
        fixed_options = {
            'size': 'ev_total',
            'hover_data': ['name','form','legendary'],
            'size_max': 6,
            'color_discrete_map': TYPE_COLOR_MAP,
            'category_orders': {'type_1': TYPE_LIST, 'type_2': TYPE_LIST},
        }
        return px.scatter(df, x=x_axis, y=y_axis, color=color, **fixed_options)

    return scatter_for


# Dropdowns for the two plotted components (default pc1 vs pc2) and the colour column.
pcs_str = ['pc' + str(i) for i in range(len(pcs))]
color_choices = ['type_1', 'type_2', 'legendary']

x_dropdown = widgets.Dropdown(options=pcs_str, value=pcs_str[1])
y_dropdown = widgets.Dropdown(options=pcs_str, value=pcs_str[2])
class_dropdown = widgets.Dropdown(options=color_choices, value=color_choices[0])

# Redraw the scatter plot whenever one of the dropdowns changes.
plot_from_dropdowns = show_pcs_fig(df_plot)
_ = widgets.interact(
    plot_from_dropdowns,
    x_axis=x_dropdown,
    y_axis=y_dropdown,
    color=class_dropdown,
)

Besides that, we can also calculate the statistics of the components of Pokemon of different types. Here, a Pokemon counts as a given type if either of its types is that type. For example, Skarmory is included in both Steel type and Flying type. 

In [10]:
def _describe_pcs_by_type():
    """Return {type: describe() table of the PC scores of all Pokemon having that type}.

    A Pokemon counts towards a type when either type_1 or type_2 equals it. The
    stats are normalized with the statistics of the full `df_stats` population
    before being projected with the fitted `pca`.
    """
    pc_names = [f"pc{i}" for i in range(len(pcs))]
    summaries = {}
    for t in TYPE_LIST:
        has_type = (df.type_1 == t) | (df.type_2 == t)
        scores = pd.DataFrame(
            pca.transform(normalize(df_stats[has_type], df_stats)),
            columns=pc_names,
        )
        summaries[t] = scores.describe()
    return summaries


# Project each type once, then show the 'mean' and 'std' rows as two styled
# tables (one column per type, one row per component).
pc_summary_by_type = _describe_pcs_by_type()
for stat, cmap in (('mean', 'bwr_r'), ('std', 'OrRd')):
    stat_table = pd.DataFrame(
        {t: summary.loc[stat] for t, summary in pc_summary_by_type.items()}
    )
    display(
        stat_table.style
        .background_gradient(cmap=cmap, axis=1)
        .format("{:.3}")
        .set_caption(stat)
    )

Unnamed: 0,Bug,Dark,Dragon,Electric,Fairy,Fighting,Fire,Flying,Ghost,Grass,Ground,Ice,Normal,Poison,Psychic,Rock,Steel,Water
pc0,-0.73,0.19,1.44,0.11,0.175,0.532,0.356,0.213,0.204,-0.221,-0.00373,0.289,-0.575,-0.212,0.732,0.102,0.748,-0.151
pc1,0.124,-0.323,-0.0899,-0.595,0.108,-0.0445,-0.383,-0.533,0.0968,0.184,0.573,0.12,-0.198,-0.116,-0.2,1.03,0.959,0.0698
pc2,0.0566,-0.338,-0.129,0.295,0.663,-0.714,0.0948,0.0103,0.352,0.117,-0.552,-0.0401,-0.25,0.204,0.665,-0.346,-0.246,0.0892
pc3,-0.244,0.00658,0.148,-0.125,0.108,-0.206,0.0499,-0.017,-0.15,0.0397,-0.096,0.221,0.341,0.11,0.108,-0.452,-0.57,0.0596
pc4,0.116,-0.0337,-0.162,-0.131,-0.038,0.133,-0.241,0.202,-0.219,-0.136,-0.00616,0.0254,0.364,-0.0143,-0.0783,-0.0305,-0.104,-0.0339
pc5,-0.0911,0.133,0.00319,-0.136,0.0196,0.205,0.0077,-0.0714,0.0224,-0.0434,-0.0877,0.19,0.0608,-0.0231,-0.0217,-0.107,-0.154,-0.0409
pc6,-0.221,-0.0443,0.166,0.0691,-0.187,-0.211,0.0287,0.0153,0.0145,-0.093,0.0205,0.0946,0.0068,-0.0377,0.0196,0.102,0.0824,0.0921


Unnamed: 0,Bug,Dark,Dragon,Electric,Fairy,Fighting,Fire,Flying,Ghost,Grass,Ground,Ice,Normal,Poison,Psychic,Rock,Steel,Water
pc0,1.73,1.81,2.23,1.62,1.96,1.68,1.76,1.83,1.77,1.68,1.86,1.79,1.73,1.93,2.0,1.59,1.77,1.75
pc1,1.23,0.948,0.902,1.01,0.852,0.83,0.868,0.758,1.3,0.76,1.02,0.998,0.742,0.983,1.17,1.35,1.26,0.927
pc2,0.954,0.701,0.863,0.702,0.96,0.714,0.797,0.777,0.897,0.691,0.657,1.13,0.687,0.729,0.864,1.22,0.992,0.807
pc3,0.641,0.772,1.05,0.586,0.929,0.671,0.461,0.557,0.796,0.586,0.856,0.665,1.15,0.635,0.817,0.807,0.846,0.808
pc4,0.624,0.697,0.786,0.682,0.638,0.603,0.604,0.601,0.678,0.585,0.588,0.641,0.596,0.728,0.768,0.641,0.691,0.591
pc5,0.665,0.494,0.468,0.356,0.477,0.593,0.531,0.499,0.549,0.385,0.473,0.479,0.488,0.485,0.476,0.592,0.616,0.546
pc6,0.606,0.397,0.601,0.472,0.47,0.433,0.415,0.484,0.493,0.474,0.519,0.417,0.412,0.64,0.484,0.435,0.481,0.456


In the statistics, one thing that immediately stands out is how high the mean of the `pc0` component of Dragon type Pokemon is. This is expected, however, considering how many Dragon type Pokemon are legendary, all of which have a total EV yield of 3. 

There are also some other interesting things related to my interpretation of the components and the types of the Pokemon. To see them in detail, we first need to define functions for filtering Pokemon by type: 

In [11]:
# Descriptive (non-stat) columns of every Pokemon ...
prop_columns = ['nid', 'name', 'form', 'legendary', 'height', 'weight', 'type_1', 'type_2', 'ev_total']
df_prop_ = df.loc[:, prop_columns]

# ... and its coordinates in principal-component space.
pc_columns = [f"pc{i}" for i in range(len(pcs))]
df_pca_ = pd.DataFrame(pca.transform(normalize(df_stats)), columns=pc_columns)

# Side-by-side: properties first, PC scores last.
df_pca = pd.concat([df_prop_, df_pca_], axis=1)

In [12]:
def type_filter(df, t):
    """Return a copy of the rows of `df` whose type_1 or type_2 equals `t`.

    A constant column 'type' holding `t` is inserted seven columns from the end,
    i.e. directly before pc0..pc6 when those are the last seven columns.
    The input frame is not modified.
    """
    has_type = (df.type_1 == t) | (df.type_2 == t)
    # Filter the frame that was passed in. The previous version indexed the
    # global `df_pca` here, so the `df` argument only supplied the mask.
    df_type_ = df[has_type].copy()
    df_type_.insert(len(df_type_.columns)-7, 'type', t)
    return df_type_

def types_filter(df, ts):
    """Stack `type_filter(df, t)` for every `t` in `ts`, in the given order.

    A Pokemon that has two of the requested types appears once per matching
    type, each time with a different value in the 'type' column.
    """
    return pd.concat([type_filter(df,t) for t in ts], axis=0)

# Example: every Pokemon that is Flying or Steel (dual Flying/Steel ones appear twice).
types_filter(df=df_pca, ts=['Flying', 'Steel'])

Unnamed: 0,nid,name,form,legendary,height,weight,type_1,type_2,ev_total,type,pc0,pc1,pc2,pc3,pc4,pc5,pc6
6,6,Charizard,,,1.7,90.5,Fire,Flying,3,Flying,1.743344,-0.926298,0.496311,-0.016220,-0.014164,-0.576141,-0.313946
8,6,Charizard,Mega Y,,1.7,100.5,Fire,Flying,3,Flying,2.999104,-1.132346,1.305179,0.028636,-0.925487,0.281003,-0.075003
23,12,Butterfree,,,1.1,32.0,Bug,Flying,3,Flying,0.170578,-0.723801,1.227397,0.410765,-0.028847,-0.842905,-1.194176
60,41,Zubat,,,0.8,7.5,Poison,Flying,1,Flying,-2.959718,-0.396269,-0.067489,-0.064655,0.266143,0.011497,-0.238112
61,42,Golbat,,,1.6,55.0,Poison,Flying,2,Flying,0.214878,-0.579369,-0.116144,-0.098051,0.692666,0.008137,0.084745
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1023,618,Stunfisk,Galar,,0.7,20.5,Ground,Steel,2,Steel,0.548931,1.646004,-0.258040,1.125177,0.078208,-0.075054,0.185359
1029,303,Mawile,Mega,,1.0,23.5,Steel,Fairy,2,Steel,0.507783,1.659343,-0.006087,-1.402177,-0.003766,0.311726,-0.147406
1031,448,Lucario,Mega,,1.3,57.5,Fighting,Steel,2,Steel,2.090328,-1.390873,-0.615077,-0.883934,-1.299023,0.568217,0.937641
1039,413,Wormadam,Trash,,0.5,6.5,Bug,Steel,2,Steel,-0.063041,1.319912,0.810391,-0.181908,-0.161285,0.085744,-0.327363


For example, using `pc1` in the following chart, we can see that Flying types and Electric types are generally faster, while Rock types and Steel types are more defensive: 

In [13]:
# Compare the fast types (Flying, Electric) with the defensive ones (Steel, Rock).
speed_vs_defense_types = types_filter(df_pca, ['Flying', 'Electric', 'Steel', 'Rock'])
show_pcs_fig(speed_vs_defense_types)(x_axis='pc1', y_axis='pc2', color='type')

In this chart, we can see 2 outliers for `pc1`: Regieleki on the negative side and Shuckle on the positive side. It's not hard to see why by looking at their stats: 

In [14]:
# Show the stats of the two pc1 outliers.
df.loc[df['name'].isin(['Regieleki', 'Shuckle'])]

Unnamed: 0,Pokemon Id,nid,name,form,legendary,height,weight,type_1,type_2,hp,atk,def,satk,sdef,spd,bst,ev_total
206,337,213,Shuckle,,,0.6,20.5,Bug,Rock,20,10,230,10,230,5,505,2
1068,1598,894,Regieleki,,"""Sub-Legendary""",1.2,145.0,Electric,,80,100,50,100,50,200,580,3


Likewise, using `pc2` in the following chart, we can see that Fighting types and Ground types are generally more physical-oriented, while Fairy types and Psychic types are more special-oriented:

In [15]:
# Compare the physical types (Fighting, Ground) with the special ones (Fairy, Psychic).
physical_vs_special_types = types_filter(df_pca, ['Fighting', 'Ground', 'Fairy', 'Psychic'])
show_pcs_fig(physical_vs_special_types)(x_axis='pc1', y_axis='pc2', color='type')

Now let's try to answer what `pc5` and `pc6` represent. Plotting with `pc5` and `pc6` gives us this: 

In [16]:
# All Pokemon in the pc5/pc6 plane, coloured by alternate form.
show_pcs_fig(df_plot)(x_axis='pc5', y_axis='pc6', color='form')

Some Pokemon with large positive `pc5` or `pc6` are in Mega form, Primal form, or Eternamax form. These forms are temporary in game and come with requirements, for example needing the Pokemon to hold a special item, but at the same time they give the Pokemon an advantage by increasing stats, changing ability and such. Thus, we can see that `pc5` and `pc6` can be used to represent how high the stats of a Pokemon are compared to Pokemon of the same stage. Furthermore, we can plot legendary Pokemon using `pc5` and `pc6`: 

In [17]:
# Keep only rows whose legendary label contains the word "legendary" (case-insensitive).
legendary_rows = df_plot[df_plot['legendary'].map(lambda label: 'legendary' in label.lower())]
fig = show_pcs_fig(legendary_rows)('pc5', 'pc6', 'legendary')

# Dotted anti-diagonal (pc5 + pc6 = 0) that separates the two groups.
diagonal_style = {'color': 'MediumPurple', 'width': 1, 'dash': "dot"}
fig.add_shape(type="line", x0=-2, y0=2, x1=2, y1=-2, line=diagonal_style)

In this chart we can see 2 groups, one above the diagonal (`pc5+pc6>0`) and one below it (`pc5+pc6<0`). Most of the Pokemon above the line are either so-called "box legendaries", i.e. legendary Pokemon that appeared on the box art of a game, or some form of the other legendaries, for example Mega Latias and Mega Latios.  