Import standard python exploration modules

In [1]:
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt
import seaborn as sns

Load pokemon.csv into the variable df

In [2]:
df = pd.read_csv('../resources/pokemon.csv')

Look at the properties of df, .shape for dimensions and .columns for index names

In [3]:
df.shape

(801, 41)

In [4]:
df.columns

Index(['abilities', 'against_bug', 'against_dark', 'against_dragon',
       'against_electric', 'against_fairy', 'against_fight', 'against_fire',
       'against_flying', 'against_ghost', 'against_grass', 'against_ground',
       'against_ice', 'against_normal', 'against_poison', 'against_psychic',
       'against_rock', 'against_steel', 'against_water', 'attack',
       'base_egg_steps', 'base_happiness', 'base_total', 'capture_rate',
       'classfication', 'defense', 'experience_growth', 'height_m', 'hp',
       'japanese_name', 'name', 'percentage_male', 'pokedex_number',
       'sp_attack', 'sp_defense', 'speed', 'type1', 'type2', 'weight_kg',
       'generation', 'is_legendary'],
      dtype='object')

How are against_ columns determined?

In [5]:
df.head()

Unnamed: 0,abilities,against_bug,against_dark,against_dragon,against_electric,against_fairy,against_fight,against_fire,against_flying,against_ghost,...,percentage_male,pokedex_number,sp_attack,sp_defense,speed,type1,type2,weight_kg,generation,is_legendary
0,"['Overgrow', 'Chlorophyll']",1.0,1.0,1.0,0.5,0.5,0.5,2.0,2.0,1.0,...,88.1,1,65,65,45,grass,poison,6.9,1,0
1,"['Overgrow', 'Chlorophyll']",1.0,1.0,1.0,0.5,0.5,0.5,2.0,2.0,1.0,...,88.1,2,80,80,60,grass,poison,13.0,1,0
2,"['Overgrow', 'Chlorophyll']",1.0,1.0,1.0,0.5,0.5,0.5,2.0,2.0,1.0,...,88.1,3,122,120,80,grass,poison,100.0,1,0
3,"['Blaze', 'Solar Power']",0.5,1.0,1.0,1.0,0.5,1.0,0.5,1.0,1.0,...,88.1,4,60,50,65,fire,,8.5,1,0
4,"['Blaze', 'Solar Power']",0.5,1.0,1.0,1.0,0.5,1.0,0.5,1.0,1.0,...,88.1,5,80,65,80,fire,,19.0,1,0


From domain knowledge, the against_**type** features are the damage multipliers used when a move of **type** is used against the pokemon

Checking that base_total is the total stat value for a pokemon

In [14]:
stats = ['attack', 'defense', 'hp', 'sp_attack', 'sp_defense', 'speed']
df[stats].sum(axis=1)[:5], df['base_total'][:5]

(0    318
 1    405
 2    625
 3    309
 4    405
 dtype: int64, 0    318
 1    405
 2    625
 3    309
 4    405
 Name: base_total, dtype: int64)

Checking for null values

In [24]:
801 - df.count()

abilities              0
against_bug            0
against_dark           0
against_dragon         0
against_electric       0
against_fairy          0
against_fight          0
against_fire           0
against_flying         0
against_ghost          0
against_grass          0
against_ground         0
against_ice            0
against_normal         0
against_poison         0
against_psychic        0
against_rock           0
against_steel          0
against_water          0
attack                 0
base_egg_steps         0
base_happiness         0
base_total             0
capture_rate           0
classfication          0
defense                0
experience_growth      0
height_m              20
hp                     0
japanese_name          0
name                   0
percentage_male       98
pokedex_number         0
sp_attack              0
sp_defense             0
speed                  0
type1                  0
type2                384
weight_kg             20
generation             0


Missing 384 type2  
* For pokemon with a single type  

Missing 98 percentage_male  
* For pokemon with a single gender  

Missing 20 height_m and weight_kg  
* Unsure of the reason why

Look at pokemon missing the percentage_male feature

In [27]:
df['name'][df['percentage_male'].isna()].head()

80     Magnemite
81      Magneton
99       Voltorb
100    Electrode
119       Staryu
Name: name, dtype: object

Oh, these are the genderless pokemon, ie legendaries and other ungendered pokemon  
Are single-gender pokemon given 0/100 in the percentage_male feature? Yes

In [29]:
df['percentage_male'][df['name'] == 'Miltank']

240    0.0
Name: percentage_male, dtype: float64

In [30]:
df['percentage_male'][df['name'] == 'Gallade']

474    100.0
Name: percentage_male, dtype: float64

No reason yet to fill missing values in percentage_male feature now that I know the reason that they are missing

Look at the missing height and weight measurements

In [87]:
df[['name', 'height_m', 'weight_kg']][df['height_m'].isna() & df['weight_kg'].isna()].count()

name         20
height_m      0
weight_kg     0
dtype: int64

All pokemon are missing height and weight or neither

In [88]:
df[['name', 'height_m', 'weight_kg']][df['height_m'].isna() & df['weight_kg'].isna()]

Unnamed: 0,name,height_m,weight_kg
18,Rattata,,
19,Raticate,,
25,Raichu,,
26,Sandshrew,,
27,Sandslash,,
36,Vulpix,,
37,Ninetales,,
49,Diglett,,
50,Dugtrio,,
51,Meowth,,


These are the pokemon with multiple forms, I would like to include both

I can hand encode the base form values from Serebii and add the alternate forms to the dataframe  
What are the values for type if the alternate forms have a different type, eg Rattata Normal but Alolan Rattata Normal/Dark

In [98]:
df['height_m'].iloc[18] = 0.3
df['weight_kg'].iloc[18] = 3.5
df['height_m'].iloc[19] = 0.7
df['weight_kg'].iloc[19] = 18.5
df['height_m'].iloc[25] = 0.8
df['weight_kg'].iloc[25] = 30.0
df['height_m'].iloc[26] = 0.6
df['weight_kg'].iloc[26] = 12.0
df['height_m'].iloc[27] = 1.0
df['weight_kg'].iloc[27] = 29.5
df['height_m'].iloc[36] = 0.6
df['weight_kg'].iloc[36] = 9.9
df['height_m'].iloc[37] = 1.1
df['weight_kg'].iloc[37] = 19.9 # Continue below
df['height_m'].iloc[49] = 0.3
df['weight_kg'].iloc[49] = 3.5
df['height_m'].iloc[50] = 0.3
df['weight_kg'].iloc[50] = 3.5
df['height_m'].iloc[51] = 0.3
df['weight_kg'].iloc[51] = 3.5
df['height_m'].iloc[52] = 0.3
df['weight_kg'].iloc[52] = 3.5
df['height_m'].iloc[73] = 0.3
df['weight_kg'].iloc[73] = 3.5
df['height_m'].iloc[74] = 0.3
df['weight_kg'].iloc[74] = 3.5
df['height_m'].iloc[75] = 0.3
df['weight_kg'].iloc[75] = 3.5
df['height_m'].iloc[87] = 0.3
df['weight_kg'].iloc[87] = 3.5
df['height_m'].iloc[88] = 0.3
df['weight_kg'].iloc[88] = 3.5
df['height_m'].iloc[102] = 0.3
df['weight_kg'].iloc[102] = 3.5
df['height_m'].iloc[104] = 0.3
df['weight_kg'].iloc[104] = 3.5
df['height_m'].iloc[719] = 0.3
df['weight_kg'].iloc[719] = 3.5
df['height_m'].iloc[744] = 0.3
df['weight_kg'].iloc[744] = 3.5

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self._setitem_with_indexer(indexer, value)


height_m     0.3
weight_kg    3.5
Name: 18, dtype: float64

What is the classification feature? Is that the fake scientific name, ie 'Flower-dino' for Bulbasaur  
**classi**fication is misspelled as **class**fication

In [33]:
df[['name', 'classfication']].head()

Unnamed: 0,name,classfication
0,Bulbasaur,Seed Pokémon
1,Ivysaur,Seed Pokémon
2,Venusaur,Seed Pokémon
3,Charmander,Lizard Pokémon
4,Charmeleon,Flame Pokémon


What is experience_growth a measure of? Category for ease of exp gain, exp points to lvl 100, or something else

In [39]:
df[['name', 'experience_growth']][:5], df[['name', 'experience_growth']][20:25]

(         name  experience_growth
 0   Bulbasaur            1059860
 1     Ivysaur            1059860
 2    Venusaur            1059860
 3  Charmander            1059860
 4  Charmeleon            1059860,        name  experience_growth
 20  Spearow            1000000
 21   Fearow            1000000
 22    Ekans            1000000
 23    Arbok            1000000
 24  Pikachu            1000000)

Values are the same as Serebii's Experience Growth measure:  
Erratic - 600,000 EXP  
Fast - 800,000 EXP  
Medium-Fast - 1,000,000 EXP  
Medium-Slow - 1,059,860 EXP  
Slow - 1,250,000 EXP  
Fluctuating - 1,640,000 EXP

Add categorical feature for experience growth

In [42]:
def expCat(row): #Categorize experience_growth feature according to Serebii's data
    if row == 600000:
        return 'Erratic'
    elif row == 800000:
        return 'Fast'
    elif row == 1000000:
        return 'Medium-Fast'
    elif row == 1059860:
        return 'Medium-Slow'
    elif row == 1250000:
        return 'Slow'
    elif row == 1640000:
        return 'Fluctuating'
df['experience_cat'] = df['experience_growth'].apply(expCat)

In [47]:
df['experience_cat'].count()
#Check that there are no null-values, I was not certain that all values of eperience_growth were included in else ifs

801

Is the japanese_name category in katakana (script for onomatopoeic and imported words) or romaji (latin spellings of Japanese words)? Both

In [50]:
df['japanese_name'].head()

0    Fushigidaneフシギダネ
1     Fushigisouフシギソウ
2    Fushigibanaフシギバナ
3        Hitokageヒトカゲ
4         Lizardoリザード
Name: japanese_name, dtype: object

Split japanese_name feature into romaji and katakana

In [80]:
df['romaji_name'] = df['japanese_name'].str.extract(r'([a-zA-Z0-9\s\.\-:]+)')
#Regex for latin characters, could not see an easy way to match mars and venus symbols (male and female Nidoran)
df['romaji_name'].head()

0    Fushigidane
1     Fushigisou
2    Fushigibana
3       Hitokage
4        Lizardo
Name: romaji_name, dtype: object

Python does not seem to have great support for capturing katakana, I did not have plans for how to use a katakana feature so I will settle for the romaji

Is the generation feature in roman numerals or arabic? Arabic

In [84]:
df['generation'].unique()

array([1, 2, 3, 4, 5, 6, 7])

The is_legendary feature is binary

In [85]:
df['is_legendary'].unique()

array([0, 1])

In [48]:
df.columns

Index(['abilities', 'against_bug', 'against_dark', 'against_dragon',
       'against_electric', 'against_fairy', 'against_fight', 'against_fire',
       'against_flying', 'against_ghost', 'against_grass', 'against_ground',
       'against_ice', 'against_normal', 'against_poison', 'against_psychic',
       'against_rock', 'against_steel', 'against_water', 'attack',
       'base_egg_steps', 'base_happiness', 'base_total', 'capture_rate',
       'classfication', 'defense', 'experience_growth', 'height_m', 'hp',
       'japanese_name', 'name', 'percentage_male', 'pokedex_number',
       'sp_attack', 'sp_defense', 'speed', 'type1', 'type2', 'weight_kg',
       'generation', 'is_legendary', 'experience_cat'],
      dtype='object')