# Preprocessing cell for the Pokémon Data Science Project

In [205]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
#import missingno as msno

# Global plotting defaults: small 4x3-inch figures rendered at 60 dpi.
# sns.set_theme() is applied first because it updates matplotlib's rcParams;
# the explicit rcParams assignments that follow then apply to every figure
# created later in the notebook.
# No plt.figure() call is made here: it would only create (and display) an
# empty figure without affecting subsequent plots.
sns.set_theme(rc={'figure.figsize': (4, 3)})
plt.rcParams['figure.figsize'] = [4, 3]
plt.rcParams['figure.dpi'] = 60

<Figure size 240x180 with 0 Axes>

### Data reading

In [206]:
# Load the Pokémon dataset from CSV; index_col=None means no CSV column is used
# as the index, so rows get pandas' default integer index.
data = pd.read_csv("../data/Pokemon.csv", index_col=None)

### About the data, from the Bulbapedia:

The dataset was extracted using the file bulbapediascrapper.py. The main objective of this notebook is to preprocess the attributes in order to perform visualizations and AI in further stages.

### This notebook's tasks:

1. Initial and basic exploration of contents of all datasets
2. Preprocessing

## 1. Initial exploration and cleaning of contents of all datasets

In [207]:
# Plotting switch. NOTE(review): this flag is not read anywhere in the visible
# part of the notebook -- confirm it is still used, or wire it into the plot helpers.
PLOT = True

def describe_feature(data, feat):
    """Print a textual summary of one column of a DataFrame.

    The summary contains the dtype, the first and last five rows, the number
    of missing values, the number of distinct values and the full value-count
    table. Integer and float columns additionally get min / mean / median /
    max / std; every other column gets its array of unique values instead.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame that holds the column.
    feat : str
        Label of the column to describe.

    Returns
    -------
    None
        Everything is written to standard output.
    """
    # Select the column once instead of re-slicing the frame for every line.
    column = data.loc[:, feat]

    print(f"- Type: {column.dtype}")
    print(f"- First rows:\n{column.head(5)}")
    print(f"- Last rows:\n{column.tail(5)}")
    print(f"- Number of missing values: {column.isna().sum()}")
    print(f"- Number of distinct values: {column.nunique()}")
    # Lift pandas' row limit so the value-count table is never truncated.
    with pd.option_context('display.max_rows', None):
        print(f"- Unique value counts:\n{column.value_counts()}")

    # Accept every integer/float width (int32, Int64, float32, ...), not only
    # the literal 'int64'/'float64'; booleans are deliberately left out.
    is_number = (pd.api.types.is_integer_dtype(column)
                 or pd.api.types.is_float_dtype(column))
    if is_number:
        print(f"- Min: {column.min()}")
        print(f"- Mean: {column.mean()}")
        print(f"- Median: {column.median()}")
        print(f"- Max: {column.max()}")
        print(f"- Std: {column.std()}")
    else:
        print(f"- Unique values: {column.unique()}")


def remove_first_last_letter(series, char):
    """Parse stringified lists such as "['Grass', 'Poison']" into Python lists.

    Each element is processed in three steps:
    1. the first and last character (the brackets) are removed when the string
       has at least three characters, otherwise it is left untouched;
    2. the first and last character of the result (the outer quotes) are removed;
    3. the remainder is split on the sequence quote + ``char`` + space + quote.

    Parameters
    ----------
    series : iterable of str (typically a pandas.Series)
        Stringified lists to parse.
    char : str
        Separator character used between the quoted items (e.g. ``','``).

    Returns
    -------
    pandas.Series
        Series of lists of strings. When ``series`` has an index it is
        preserved, so the result aligns correctly when assigned back to the
        originating DataFrame.
    """
    import re  # local import: only needed to escape the separator

    unbracketed = pd.Series(
        [text[1:-1] if len(text) >= 3 else text for text in series],
        # Keep the caller's index (None -> default index for plain iterables).
        index=getattr(series, "index", None),
        # Force object dtype so the .str accessor also works on empty input.
        dtype=object,
    )
    # Multi-character patterns are interpreted as regular expressions by
    # Series.str.split, so escape the separator to match it literally.
    separator = re.escape(f"'{char} '")
    return unbracketed.str[1:-1].str.split(separator)

def get_first_element(series):
    """Return a Series holding the element at position 0 of every item of ``series``."""
    return series.apply(lambda items: items[0])

def get_second_element(series):
    """Return the second element of every list in ``series``.

    Lists with fewer than two elements (including empty lists) yield the
    placeholder string ``'None'``, the same placeholder ``get_third_element``
    uses for lists that are too short.

    Parameters
    ----------
    series : pandas.Series
        Series whose items are lists (or other sized, indexable sequences).

    Returns
    -------
    pandas.Series
        Second element of each item, or ``'None'`` when there is none.
    """
    def process_list_second(lst):
        # Any list that is too short -- not only length-1 lists -- gets the
        # placeholder, so no implicit Python None leaks into the column.
        return lst[1] if len(lst) >= 2 else 'None'

    return series.apply(process_list_second)


def get_third_element(series):
    """Return the third element of every list in ``series``.

    Lists holding fewer than three elements yield the placeholder string 'None'.
    """
    return series.apply(lambda items: items[2] if len(items) >= 3 else 'None')


def remove_last_letter(series):
    """Return a Series in which every item has its last element sliced off (``item[:-1]``)."""
    return series.apply(lambda item: item[:-1])

def pieplot(dt, feat, title):
    """Draw a pie chart of the value frequencies of one column.

    Parameters
    ----------
    dt : pandas.DataFrame
        Frame that holds the column.
    feat : str
        Column whose distinct values become the pie slices. When the column
        is not present in ``dt`` nothing is drawn.
    title : str
        Title shown above the chart.

    Returns
    -------
    None
        The figure is displayed with ``plt.show()``.
    """
    if feat not in dt.columns:
        return

    # One value_counts() call provides both labels and slice sizes, instead
    # of filtering the whole frame once per distinct value.
    counts = dt[feat].value_counts()

    _, ax = plt.subplots()
    ax.pie(counts.tolist(), labels=counts.index.tolist(), autopct='%1.1f%%')
    # The title argument was previously accepted but never applied.
    ax.set_title(title)
    plt.show()


def violinplot(dt, feat, title, div=None):
    """Show a seaborn violin plot of column ``feat`` of ``dt``.

    When ``div`` is given it is used as the x-axis grouping column and the
    plot is requested with ``split=True``; otherwise a single violin is drawn.
    Nothing happens when ``feat`` is not a column of ``dt``.
    """
    if feat not in dt.columns:
        return

    if div is None:
        sns.violinplot(data=dt, y=feat)
    else:
        sns.violinplot(data=dt, x=div, y=feat, split=True)

    plt.title(title)
    plt.show()


Let's understand the content of the data, starting with a general overview of the Pokémon dataset.

In [208]:
# High-level overview of the loaded frame: its shape, its column labels and
# the total number of missing cells across all columns.
print("Pokémon dataset")
print(f"Number of rows: {data.shape[0]}, number of columns: {data.shape[1]}")
print(f"Column names: {data.columns}")
print(f"Number of missing values: {data.isna().sum().sum()}")

Pokémon dataset
Number of rows: 1179, number of columns: 47
Column names: Index(['DexNumber', 'Name', 'Type', 'Abilities', 'HiddenAbility', 'Generation',
       'Hp', 'Attack', 'Defense', 'SpecialAttack', 'SpecialDefense', 'Speed',
       'TotalStats', 'Weight', 'Height', 'GenderProbM', 'Category',
       'CatchRate', 'EggCycles', 'EggGroup', 'LevelingRate', 'BaseFriendship',
       'IsLegendary', 'IsMythical', 'IsUltraBeast', 'HasMega', 'EvoStage',
       'TotalEvoStages', 'PreevoName', 'DamageFromNormal',
       'DamageFromFighting', 'DamageFromFlying', 'DamageFromPoison',
       'DamageFromGround', 'DamageFromRock', 'DamageFromBug',
       'DamageFromGhost', 'DamageFromSteel', 'DamageFromFire',
       'DamageFromWater', 'DamageFromGrass', 'DamageFromElectric',
       'DamageFromPsychic', 'DamageFromIce', 'DamageFromDragon',
       'DamageFromDark', 'DamageFromFairy'],
      dtype='object')
Number of missing values: 0


### Attribute information:

1. **DexNumber**: Number of the Pokémon for the national dex
2. **Name**: Name of the Pokémon
3. **Type**: Pokémon's typing as a list
4. **Abilities**: Pokémon's abilities as a list
5. **HiddenAbility**: Pokémon's Hidden Ability
6. **Generation**: The generation where it was introduced
7. **Hp**: Hp base stat
8. **Attack**: Attack base stat
9. **Defense**: Defense base stat
10. **SpecialAttack**: Special attack base stat
11. **SpecialDefense**: Special defense base stat
12. **Speed**: Speed base stat
13. **TotalStats**: Total stats (sum of the previous six stats)
14. **Weight**: Weight in kg
15. **Height**: Height in m
16. **GenderProbM**: Probability of a Pokémon of that species being male (if it has unknown gender, it will be None)
17. **Category**: Category of that Pokémon (some distinct Pokémon share the same category, and it may vary between evolutions)
18. **CatchRate**: Capture rate of that Pokémon
19. **EggCycles**: Number of cycles (steps, the number of steps in each cycle varies among games) to hatch an egg of that Pokémon
20. **EggGroup**: Egg Group(s) of that Pokémon
21. **LevelingRate**: Class of the XP growth of that Pokémon
22. **BaseFriendship**: Base friendship of that Pokémon
23. **IsLegendary**: Denotes if it is a legendary pokemon
24. **IsMythical**: Denotes if it is a mythical pokemon
25. **IsUltraBeast**: Denotes if it is an ultra beast
26. **HasMega**: Has a Mega evolution
27. **EvoStage**: Evolution Stage of that Pokémon
28. **TotalEvoStages**: Total evolution stages for that Pokémon
29. **PreevoName**: Name of the Preevolution (in case that Pokémon has one)
30. **DamageFrom(Type)**: Amount of damage taken for a specific attack type

In [209]:
data.describe()

Unnamed: 0,DexNumber,Hp,Attack,Defense,SpecialAttack,SpecialDefense,Speed,TotalStats,Weight,Height,...,DamageFromSteel,DamageFromFire,DamageFromWater,DamageFromGrass,DamageFromElectric,DamageFromPsychic,DamageFromIce,DamageFromDragon,DamageFromDark,DamageFromFairy
count,1179.0,1179.0,1179.0,1179.0,1179.0,1179.0,1179.0,1179.0,1179.0,1179.0,...,1179.0,1179.0,1179.0,1179.0,1179.0,1179.0,1179.0,1179.0,1179.0,1179.0
mean,514.177269,70.810857,78.972858,73.07888,71.973707,71.754029,69.173028,442.075488,66.536811,1.205344,...,0.998092,1.150127,1.050891,0.994275,1.039652,0.985369,1.199746,0.964377,1.062341,1.090755
std,296.629667,26.49238,30.317913,28.87529,31.254204,27.24567,29.568612,122.72611,119.956411,1.216914,...,0.516028,0.694725,0.594129,0.72549,0.63179,0.518389,0.736413,0.385435,0.469953,0.534795
min,1.0,1.0,5.0,5.0,10.0,20.0,5.0,180.0,0.1,0.1,...,0.25,0.25,0.25,0.25,0.0,0.0,0.25,0.0,0.25,0.25
25%,255.5,51.0,55.0,50.0,50.0,50.0,45.0,330.0,8.35,0.5,...,0.5,0.5,0.5,0.5,0.5,1.0,0.5,1.0,1.0,1.0
50%,525.0,70.0,75.0,70.0,65.0,70.0,67.0,464.0,28.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
75%,762.5,85.0,100.0,90.0,95.0,90.0,90.0,525.0,70.75,1.5,...,1.0,2.0,1.0,1.0,1.0,1.0,2.0,1.0,1.0,1.0
max,1025.0,255.0,181.0,230.0,180.0,230.0,200.0,1125.0,999.9,20.0,...,4.0,4.0,4.0,4.0,4.0,4.0,4.0,2.0,4.0,4.0


### Feature DexNumber

In [210]:
describe_feature(data, 'DexNumber')

- Type: int64
- First rows:
0    494
1      1
2      2
3      3
4      4
Name: DexNumber, dtype: int64
- Last rows:
1174    1022
1175    1023
1176    1024
1177    1024
1178    1025
Name: DexNumber, dtype: int64
- Number of missing values: 0
int64
- Number of distinct values: 1025
- Unique value counts:
479     6
670     5
671     5
649     5
669     5
351     4
585     4
555     4
586     4
931     4
1017    4
386     4
741     4
845     3
745     3
646     3
898     3
413     3
412     3
978     3
800     3
52      3
718     3
550     3
79      2
89      2
101     2
100     2
716     2
902     2
901     2
774     2
720     2
211     2
423     2
724     2
88      2
103     2
422     2
421     2
83      2
964     2
483     2
80      2
78      2
77      2
76      2
157     2
648     2
74      2
105     2
549     2
925     2
628     2
916     2
128     2
215     2
144     2
122     2
145     2
146     2
849     2
641     2
618     2
642     2
705     2
706     2
645     2
110     2
905   

Let us see the amount of Pokémon that have each amount of forms.

In [211]:
# First count the rows (forms) recorded under each DexNumber, then count how
# many DexNumbers share each form count.
forms_per_species = data['DexNumber'].value_counts()
print("Number of forms / Number of Pokémon with that amount of forms")
print(forms_per_species.value_counts())

Number of forms / Number of Pokémon with that amount of forms
1    914
2     87
3     11
4      8
5      4
6      1
Name: DexNumber, dtype: int64


Let us see which Pokémon have more than one form (DexNumber + Name)

In [212]:
# Collect the distinct names recorded under each DexNumber and print the
# DexNumbers that appear under more than one name.
names = data.groupby(['DexNumber'])['Name'].unique()
for dex_number, form_names in names.items():
    if len(form_names) > 1:
        print(dex_number, form_names)

19 ['Rattata' 'Alolan Rattata']
20 ['Raticate' 'Alolan Raticate']
26 ['Raichu' 'Alolan Raichu']
27 ['Sandshrew' 'Alolan Sandshrew']
28 ['Sandslash' 'Alolan Sandslash']
37 ['Vulpix' 'Alolan Vulpix']
38 ['Ninetales' 'Alolan Ninetales']
50 ['Diglett' 'Alolan Diglett']
51 ['Dugtrio' 'Alolan Dugtrio']
52 ['Meowth' 'Alolan Meowth' 'Galarian Meowth']
53 ['Persian' 'Alolan Persian']
58 ['Growlithe' 'Hisuian Growlithe']
59 ['Arcanine' 'Hisuian Arcanine']
74 ['Geodude' 'Alolan Geodude']
75 ['Graveler' 'Alolan Graveler']
76 ['Golem' 'Alolan Golem']
77 ['Ponyta' 'Galarian Ponyta']
78 ['Rapidash' 'Galarian Rapidash']
79 ['Slowpoke' 'Galarian Slowpoke']
80 ['Slowbro' 'Galarian Slowbro']
83 ["Farfetch'd" "Galarian Farfetch'd"]
88 ['Grimer' 'Alolan Grimer']
89 ['Muk' 'Alolan Muk']
100 ['Voltorb' 'Hisuian Voltorb']
101 ['Electrode' 'Hisuian Electrode']
103 ['Exeggutor' 'Alolan Exeggutor']
105 ['Marowak' 'Alolan Marowak']
110 ['Weezing' 'Galarian Weezing']
122 ['Mr. Mime' 'Galarian Mr. Mime']
128 ['Taur

### Feature Name

In [213]:
describe_feature(data, 'Name')

- Type: object
- First rows:
0       Victini
1     Bulbasaur
2       Ivysaur
3      Venusaur
4    Charmander
Name: Name, dtype: object
- Last rows:
1174               Iron Boulder
1175                 Iron Crown
1176      Terapagos Normal Form
1177    Terapagos Terastal Form
1178                  Pecharunt
Name: Name, dtype: object
- Number of missing values: 0
object
- Number of distinct values: 1179
- Unique value counts:
Victini                           1
Florges Orange Flower             1
Aromatisse                        1
Spritzee                          1
Aegislash Blade Forme             1
Aegislash Shield Forme            1
Doublade                          1
Honedge                           1
Meowstic Female                   1
Meowstic Male                     1
Espurr                            1
Furfrou Natural Form              1
Pangoro                           1
Pancham                           1
Gogoat                            1
Skiddo                          

Every Pokémon and form has a different name, so Name is more suitable as the primary key than DexNumber. That's funny.

### Feature Type

In [214]:
describe_feature(data, 'Type')

- Type: object
- First rows:
0    ['Psychic', 'Fire']
1    ['Grass', 'Poison']
2    ['Grass', 'Poison']
3    ['Grass', 'Poison']
4               ['Fire']
Name: Type, dtype: object
- Last rows:
1174     ['Rock', 'Psychic']
1175    ['Steel', 'Psychic']
1176              ['Normal']
1177              ['Normal']
1178     ['Poison', 'Ghost']
Name: Type, dtype: object
- Number of missing values: 0
object
- Number of distinct values: 217
- Unique value counts:
['Normal']                  80
['Water']                   80
['Grass']                   48
['Psychic']                 45
['Fire']                    37
['Electric']                35
['Fairy']                   34
['Normal', 'Flying']        30
['Fighting']                29
['Bug']                     25
['Ice']                     21
['Rock']                    19
['Ghost']                   18
['Ground']                  17
['Poison']                  16
['Dark']                    15
['Grass', 'Poison']         14
['Bug', 'Flying'

Before doing anything else, we are going to divide this feature into two: Type1 and Type2. If the Pokémon has only one type, Type2 will be exactly the same as Type1.

In [215]:
# Parse the stringified type list into a real list, then split it into two columns.
data['Type'] = remove_first_last_letter(data['Type'], ',')
data['Type1'] = get_first_element(data['Type'])
data['Type2'] = get_second_element(data['Type'])
# get_second_element marks single-typed Pokémon with the string "None";
# for those rows Type2 is filled with the value of Type1.
data.loc[data['Type2'] == "None", 'Type2'] = data['Type1']

describe_feature(data, 'Type1')
describe_feature(data, 'Type2')
# Drop the parsed list column now that it has been split.
# NOTE: after this line the cell cannot be re-run without reloading the data,
# because 'Type' no longer exists.
data = data.drop('Type', axis=1)

- Type: object
- First rows:
0    Psychic
1      Grass
2      Grass
3      Grass
4       Fire
Name: Type1, dtype: object
- Last rows:
1174      Rock
1175     Steel
1176    Normal
1177    Normal
1178    Poison
Name: Type1, dtype: object
- Number of missing values: 0
object
- Number of distinct values: 18
- Unique value counts:
Water       145
Normal      135
Grass       111
Bug          91
Psychic      76
Fire         72
Electric     71
Rock         64
Dark         53
Poison       48
Ground       46
Fighting     45
Fairy        44
Dragon       43
Ice          42
Steel        41
Ghost        40
Flying       12
Name: Type1, dtype: int64
- Unique values: ['Psychic' 'Grass' 'Fire' 'Water' 'Bug' 'Normal' 'Dark' 'Poison'
 'Electric' 'Ground' 'Ice' 'Fairy' 'Steel' 'Fighting' 'Rock' 'Ghost'
 'Dragon' 'Flying']
- Type: object
- First rows:
0      Fire
1    Poison
2    Poison
3    Poison
4      Fire
Name: Type2, dtype: object
- Last rows:
1174    Psychic
1175    Psychic
1176     Normal
1177     N

Types are stored in order (Type1, Type2), and this must be taken into account in any further analysis: there are no in-game differences between being Normal/Ghost and being Ghost/Normal, so a Pokémon's typing behaves much more like a set than like a list.

### Feature Abilities

In [216]:
# Print every raw Abilities value to inspect the stringified-list format.
# NOTE(review): this emits one line per row of the dataset; consider printing
# only a sample (e.g. data['Abilities'].head(20)) to keep the output short.
for elem in data['Abilities']:
    print(elem)

['Victory Star']
['Overgrow']
['Overgrow']
['Overgrow']
['Blaze']
['Blaze']
['Blaze']
['Torrent']
['Torrent']
['Torrent']
['Shield Dust']
['Shed Skin']
['Compound Eyes']
['Shield Dust']
['Shed Skin']
['Swarm']
['Keen Eye', 'Tangled Feet']
['Keen Eye', 'Tangled Feet']
['Keen Eye', 'Tangled Feet']
['Run Away', 'Guts']
['Gluttony', 'Hustle']
['Run Away', 'Guts']
['Gluttony', 'Hustle']
['Keen Eye']
['Keen Eye']
['Intimidate', 'Shed Skin']
['Intimidate', 'Shed Skin']
['Static']
['Static']
['Surge Surfer']
['Sand Veil']
['Snow Cloak']
['Sand Veil']
['Snow Cloak']
['Poison Point', 'Rivalry']
['Poison Point', 'Rivalry']
['Poison Point', 'Rivalry']
['Poison Point', 'Rivalry']
['Poison Point', 'Rivalry']
['Poison Point', 'Rivalry']
['Cute Charm', 'Magic Guard']
['Cute Charm', 'Magic Guard']
['Flash Fire']
['Snow Cloak']
['Flash Fire']
['Snow Cloak']
['Cute Charm', 'Competitive']
['Cute Charm', 'Competitive']
['Inner Focus']
['Inner Focus']
['Chlorophyll']
['Chlorophyll']
['Chlorophyll']
['Effect

In [217]:
# Parse the stringified ability lists into real lists.
data['Abilities'] = remove_first_last_letter(data['Abilities'], ',')
data['HiddenAbility'] = remove_first_last_letter(data['HiddenAbility'], ',')
# Split the regular abilities into two columns; get_second_element returns the
# string 'None' when a Pokémon has a single regular ability.
data['Ability1'] = get_first_element(data['Abilities'])
data['Ability2'] = get_second_element(data['Abilities'])
# Only the first parsed element is kept for HiddenAbility.
data['HiddenAbility'] = get_first_element(data['HiddenAbility'])
# Drop the parsed list column now that it has been split (this makes the cell
# non-re-runnable without reloading the data).
data = data.drop('Abilities', axis=1)

In [218]:
describe_feature(data, 'Ability1')

- Type: object
- First rows:
0    Victory Star
1        Overgrow
2        Overgrow
3        Overgrow
4           Blaze
Name: Ability1, dtype: object
- Last rows:
1174         Quark Drive
1175         Quark Drive
1176          Tera Shift
1177          Tera Shell
1178    Poison Puppeteer
Name: Ability1, dtype: object
- Number of missing values: 0
object
- Number of distinct values: 237
- Unique value counts:
Levitate            39
Chlorophyll         34
Swift Swim          31
Intimidate          31
Keen Eye            29
Overgrow            28
Pressure            28
Blaze               28
Torrent             28
Swarm               23
Sturdy              21
Run Away            20
Poison Point        17
Thick Fat           16
Flower Veil         16
Shed Skin           16
Pickup              15
Inner Focus         14
Water Absorb        13
Clear Body          13
Cute Charm          13
Static              13
Guts                13
Flash Fire          12
Natural Cure        12
Oblivious      

In [219]:
describe_feature(data, 'Ability2')

- Type: object
- First rows:
0    None
1    None
2    None
3    None
4    None
Name: Ability2, dtype: object
- Last rows:
1174    None
1175    None
1176    None
1177    None
1178    None
Name: Ability2, dtype: object
- Number of missing values: 0
object
- Number of distinct values: 135
- Unique value counts:
None                591
Sturdy               18
Shell Armor          16
Own Tempo            15
Gluttony             14
Frisk                14
Inner Focus          11
Sap Sipper           10
Infiltrator          10
Hustle               10
Technician           10
Keen Eye              9
Ice Body              9
Hydration             9
Sniper                9
Flash Fire            9
Early Bird            9
Oblivious             8
Unburden              8
Rock Head             8
Static                8
Klutz                 8
Sheer Force           8
Unnerve               7
Flame Body            7
Competitive           7
Magic Guard           7
Serene Grace          6
Storm Drain       

In [220]:
# Show the distinct values of each ability column, one array per column.
for ability_col in ('Ability1', 'Ability2', 'HiddenAbility'):
    print(data[ability_col].unique())

['Victory Star' 'Overgrow' 'Blaze' 'Torrent' 'Shield Dust' 'Shed Skin'
 'Compound Eyes' 'Swarm' 'Keen Eye' 'Run Away' 'Gluttony' 'Intimidate'
 'Static' 'Surge Surfer' 'Sand Veil' 'Snow Cloak' 'Poison Point'
 'Cute Charm' 'Flash Fire' 'Inner Focus' 'Chlorophyll' 'Effect Spore'
 'Pickup' 'Limber' 'Fur Coat' 'Damp' 'Vital Spirit' 'Water Absorb'
 'Synchronize' 'Guts' 'Clear Body' 'Rock Head' 'Magnet Pull' 'Oblivious'
 'Quick Draw' 'Steadfast' 'Thick Fat' 'Stench' 'Poison Touch'
 'Shell Armor' 'Levitate' 'Cursed Body' 'Insomnia' 'Hyper Cutter'
 'Soundproof' 'Frisk' 'Own Tempo' 'Lightning Rod' 'Natural Cure'
 'Early Bird' 'Swift Swim' 'Illuminate' 'Flame Body' 'Volt Absorb' 'Trace'
 'Immunity' 'Pressure' 'Competitive' 'Defiant' 'Berserk' 'Hustle' 'Sturdy'
 'Speed Boost' 'Curious Medicine' 'Shadow Tag' 'Serene Grace'
 'Magma Armor' 'Weak Armor' 'Suction Cups' 'Sand Stream' 'Truant'
 'Wonder Guard' 'Pure Power' 'Plus' 'Minus' 'Liquid Ooze' 'Rough Skin'
 'Water Veil' 'White Smoke' 'Battle Armor

We will replace NaN values, 'nan' strings and empty strings with the placeholder string 'None'

In [221]:
# Normalise every "missing ability" representation ('nan' strings, empty
# strings and real missing values) to the single placeholder string 'None'.
for ability_col in ('Ability1', 'Ability2', 'HiddenAbility'):
    cleaned = data[ability_col].replace(['nan', ''], ['None', 'None'])
    data[ability_col] = cleaned.fillna('None')

In [222]:
describe_feature(data, 'HiddenAbility')

- Type: object
- First rows:
0           None
1    Chlorophyll
2    Chlorophyll
3    Chlorophyll
4    Solar Power
Name: HiddenAbility, dtype: object
- Last rows:
1174    None
1175    None
1176    None
1177    None
1178    None
Name: HiddenAbility, dtype: object
- Number of missing values: 0
object
- Number of distinct values: 167
- Unique value counts:
None                236
Telepathy            24
Sheer Force          22
Regenerator          20
Overcoat             19
Unnerve              18
Weak Armor           17
Infiltrator          16
Rattled              16
Symbiosis            16
Sand Force           15
Pickpocket           14
Sand Veil            14
Damp                 12
Technician           12
Analytic             12
Inner Focus          12
Swift Swim           12
Mold Breaker         11
Scrappy              11
Gluttony             11
Moxie                11
Defiant              11
Thick Fat            11
Iron Fist            10
Speed Boost          10
Sap Sipper           

### Feature Generation

In [223]:
describe_feature(data, 'Generation')

- Type: object
- First rows:
0    V
1    I
2    I
3    I
4    I
Name: Generation, dtype: object
- Last rows:
1174    IX
1175    IX
1176    IX
1177    IX
1178    IX
Name: Generation, dtype: object
- Number of missing values: 0
object
- Number of distinct values: 9
- Unique value counts:
V       176
I       151
VIII    146
III     141
IX      136
IV      123
VII     116
II      100
VI       90
Name: Generation, dtype: int64
- Unique values: ['V' 'I' 'VII' 'VIII' 'IX' 'II' 'III' 'IV' 'VI']


We can here do a really basic preprocessing in order to obtain the integer number from the roman numeral:

In [224]:
# Translate the Roman-numeral generation labels into their integer values.
roman_to_int = {
    'I': 1, 'II': 2, 'III': 3,
    'IV': 4, 'V': 5, 'VI': 6,
    'VII': 7, 'VIII': 8, 'IX': 9,
}
data['Generation'] = data['Generation'].replace(roman_to_int)
describe_feature(data, 'Generation')

- Type: int64
- First rows:
0    5
1    1
2    1
3    1
4    1
Name: Generation, dtype: int64
- Last rows:
1174    9
1175    9
1176    9
1177    9
1178    9
Name: Generation, dtype: int64
- Number of missing values: 0
int64
- Number of distinct values: 9
- Unique value counts:
5    176
1    151
8    146
3    141
9    136
4    123
7    116
2    100
6     90
Name: Generation, dtype: int64
- Min: 1
- Mean: 4.995759117896522
- Median: 5.0
- Max: 9
- Std: 2.613628054861519


TODO: Fix the generation of regional forms, which is not handled correctly: the "Galarian", "Alolan", etc. forms must each be assigned to their respective generation.

### Feature Hp

In [225]:
describe_feature(data, 'Hp')

- Type: int64
- First rows:
0    100
1     45
2     60
3     80
4     39
Name: Hp, dtype: int64
- Last rows:
1174    90
1175    90
1176    90
1177    95
1178    88
Name: Hp, dtype: int64
- Number of missing values: 0
int64
- Number of distinct values: 109
- Unique value counts:
60     89
70     85
50     78
80     61
75     57
40     57
65     56
45     49
55     48
90     48
100    48
95     30
85     29
35     20
30     17
78     16
74     15
105    14
68     14
110    13
38     11
71     11
58     10
72     10
44     10
88      9
59      8
120     8
115     8
62      8
61      8
67      7
82      7
125     7
91      7
76      7
48      7
73      7
57      6
20      6
41      6
97      6
79      6
52      6
43      5
64      5
92      5
150     5
130     5
109     4
77      4
108     4
42      4
25      4
63      4
83      4
106     4
86      4
54      4
103     3
46      3
111     3
10      3
53      3
89      3
66      3
126     3
69      3
51      2
140     2
56      2
123     2
3

This feature is quite simple: it represents the HP base stat of that Pokémon. A higher value means a higher HP stat.

### Feature Attack

In [226]:
describe_feature(data, 'Attack')

- Type: int64
- First rows:
0    100
1     49
2     62
3     82
4     52
Name: Attack, dtype: int64
- Last rows:
1174    120
1175     72
1176     65
1177     95
1178     88
Name: Attack, dtype: int64
- Number of missing values: 0
int64
- Number of distinct values: 120
- Unique value counts:
100    64
65     64
80     52
85     50
60     50
70     48
55     48
75     47
50     46
90     44
120    41
95     41
45     38
40     32
30     26
115    25
105    24
110    21
130    20
125    20
35     19
92     11
135    10
48     10
20     10
38      8
140     8
73      8
82      8
52      8
64      8
63      8
53      8
25      8
72      8
83      7
160     7
78      6
69      6
84      6
131     6
98      6
62      6
58      5
150     5
81      5
29      5
112     5
68      5
61      4
117     4
96      4
101     4
123     4
89      4
67      4
76      4
41      4
66      4
77      4
56      4
47      4
107     4
103     3
87      3
108     3
59      3
71      3
127     3
86      3
57      

This feature is quite simple: it represents the Attack base stat of that Pokémon. A higher value means a higher Attack stat.

### Feature Defense

In [227]:
describe_feature(data, 'Defense')

- Type: int64
- First rows:
0    100
1     49
2     63
3     83
4     43
Name: Defense, dtype: int64
- Last rows:
1174     80
1175    100
1176     85
1177    110
1178    160
Name: Defense, dtype: int64
- Number of missing values: 0
int64
- Number of distinct values: 108
- Unique value counts:
70     80
60     72
50     68
80     61
65     58
90     54
40     52
45     50
100    49
55     48
95     43
75     40
85     36
35     31
115    22
30     20
110    18
105    17
120    16
130    15
67     13
48     13
63     10
58      9
62      8
72      8
140     8
77      8
68      8
78      8
20      7
125     7
53      7
39      6
84      6
107     6
43      6
52      6
86      5
97      5
51      5
91      5
66      5
57      5
71      5
79      5
44      4
150     4
76      4
135     4
88      4
15      4
99      4
41      4
64      4
25      4
69      4
131     4
145     4
42      4
83      4
73      4
37      4
160     3
49      3
38      3
34      3
121     3
54      3
92      3
89    

This feature is quite simple: it represents the Defense base stat of that Pokémon. A higher value means a higher Defense stat.

### Feature SpecialAttack

In [228]:
describe_feature(data, 'SpecialAttack')

- Type: int64
- First rows:
0    100
1     65
2     80
3    100
4     60
Name: SpecialAttack, dtype: int64
- Last rows:
1174     68
1175    122
1176     65
1177    105
1178     88
Name: SpecialAttack, dtype: int64
- Number of missing values: 0
int64
- Number of distinct values: 119
- Unique value counts:
40     76
60     71
50     64
65     60
55     54
45     52
70     46
80     45
100    41
95     41
35     40
30     39
85     38
90     31
125    27
75     26
105    23
120    21
110    20
25     16
130    12
135    11
53     11
61     11
20     10
115     9
83      9
81      8
63      8
150     8
74      7
98      7
112     7
59      7
62      7
58      7
29      6
145     6
91      5
44      5
92      5
54      5
97      5
73      5
43      4
69      4
57      4
109     4
68      4
114     4
10      4
48      4
140     4
72      4
56      4
86      4
79      4
15      4
87      4
77      3
37      3
137     3
103     3
127     3
170     3
131     3
78      3
42      3
47      3
71  

This feature is quite simple: it represents the SpecialAttack base stat of that Pokémon. A higher value means a higher SpecialAttack stat.

### Feature SpecialDefense

In [229]:
describe_feature(data, 'SpecialDefense')

- Type: int64
- First rows:
0    100
1     65
2     80
3    100
4     50
Name: SpecialDefense, dtype: int64
- Last rows:
1174    108
1175    108
1176     85
1177    110
1178     88
Name: SpecialDefense, dtype: int64
- Number of missing values: 0
int64
- Number of distinct values: 105
- Unique value counts:
70     73
50     71
80     66
60     63
65     63
55     60
75     59
45     52
90     52
40     51
95     42
100    40
85     39
35     29
30     28
105    21
110    17
25     16
120    16
115    13
63     11
48     10
130    10
79      9
86      8
56      8
20      7
51      7
154     7
107     7
82      7
96      6
150     6
81      6
53      6
58      6
128     6
61      6
135     6
52      6
62      6
72      5
89      5
71      5
67      5
54      5
41      5
42      5
98      4
101     4
125     4
108     4
83      4
37      4
73      4
69      4
140     4
87      4
77      3
66      3
116     3
38      3
43      3
97      3
64      3
49      3
31      3
44      3
117     2
92

This feature is quite simple: it represents the SpecialDefense base stat of that Pokémon. A higher value means a higher SpecialDefense stat.

### Feature Speed

In [230]:
describe_feature(data, 'Speed')

- Type: int64
- First rows:
0    100
1     45
2     60
3     80
4     65
Name: Speed, dtype: int64
- Last rows:
1174    124
1175     98
1176     60
1177     85
1178     88
Name: Speed, dtype: int64
- Number of missing values: 0
int64
- Number of distinct values: 125
- Unique value counts:
60     61
50     59
65     54
70     51
85     48
30     48
45     44
40     43
90     43
80     40
55     38
95     37
75     35
100    33
35     33
110    21
20     19
105    15
15     14
92     13
42     13
25     13
36     11
115    11
97     11
58      9
120     9
43      9
99      9
108     8
72      8
130     8
101     8
82      7
68      7
32      7
48      7
86      6
64      6
135     6
66      6
98      6
91      6
34      6
71      6
78      6
150     6
93      6
44      5
52      5
106     5
67      5
77      5
57      5
83      5
46      5
109     5
10      5
111     5
125     5
38      4
56      4
104     4
121     4
29      4
73      4
81      4
84      4
74      4
28      4
39      4


This feature is quite simple: it represents the Speed base stat of that Pokémon. A higher value means a higher Speed stat.

### Feature TotalStats

In [231]:
describe_feature(data, 'TotalStats')

- Type: int64
- First rows:
0    600
1    318
2    405
3    625
4    309
Name: TotalStats, dtype: int64
- Last rows:
1174    590
1175    590
1176    700
1177    700
1178    600
Name: TotalStats, dtype: int64
- Number of missing values: 0
int64
- Number of distinct values: 222
- Unique value counts:
600     43
580     33
500     31
570     30
490     29
300     27
405     26
485     23
530     21
420     21
495     20
680     20
480     19
510     19
475     19
525     19
330     18
310     17
700     16
305     16
410     15
520     15
505     15
460     14
470     14
540     13
320     13
290     12
455     11
335     11
280     11
440     10
350     10
325     10
340     10
590     10
535     10
430     10
550      9
450      9
515      9
250      9
270      8
355      8
303      8
555      8
390      8
245      7
315      7
400      7
210      7
275      7
425      7
385      6
360      6
308      6
465      6
370      6
265      6
295      6
476      6
435      6
534      5
240    

This feature represents the sum of all the six previous base stats for each Pokémon. This feature can be really informative about the overall power of that Pokémon.

In [232]:
# Consistency check: TotalStats should equal the sum of the six base stats.
# Vectorised so that it does not rely on the row labels being 0..n-1 (the
# previous range()/.at[] loop looked rows up by label) and so that each sum is
# computed only once.
stat_columns = ['Hp', 'Attack', 'Defense', 'SpecialAttack', 'SpecialDefense', 'Speed']
# skipna=False keeps the semantics of a plain "+" chain (a missing stat makes
# the total missing instead of being silently treated as 0).
computed_total = data[stat_columns].sum(axis=1, skipna=False)
mismatched = data['TotalStats'] != computed_total
# One line per inconsistent row: name, declared total, recomputed total.
for name, declared, computed in zip(data.loc[mismatched, 'Name'],
                                    data.loc[mismatched, 'TotalStats'],
                                    computed_total[mismatched]):
    print(name, declared, computed)

Venusaur 625 525
Charizard 634 534
Blastoise 630 530
Beedrill 495 395
Pidgeot 579 479
Pikachu 430 320
Alakazam 600 500
Gengar 600 500
Kangaskhan 590 490
Pinsir 600 500
Gyarados 640 540
Eevee 435 325
Aerodactyl 615 515
Mewtwo 780 680
Ampharos 610 510
Steelix 610 510
Scizor 600 500
Heracross 600 500
Houndoom 600 500
Tyranitar 700 600
Sceptile 630 530
Blaziken 630 530
Swampert 635 535
Gardevoir 618 518
Sableye 480 380
Mawile 480 380
Aggron 630 530
Medicham 510 410
Manectric 575 475
Sharpedo 560 460
Camerupt 560 460
Altaria 590 490
Banette 555 455
Absol 565 465
Glalie 580 480
Salamence 700 600
Metagross 700 600
Latias 700 600
Latios 700 600
Kyogre 770 670
Groudon 770 670
Rayquaza 780 680
Lopunny 580 480
Garchomp 700 600
Lucario 625 525
Abomasnow 594 494
Gallade 618 518
Rotom 520 440
Audino 545 445
Darmanitan Standard Mode 540 480
Kyurem 700 660
Greninja 640 530
Zygarde 50% Forme 708 600
Zygarde 10% Forme 708 486
Diancie 700 600
Wishiwashi Solo Form 620 175
Minior Meteor Form 500 440
Necroz

The mismatches seem to correspond to Pokémon with Mega Evolutions or alternative forms. We are going to replace the stored totals with the values recomputed from the six base stats.

In [233]:
# Overwrite the scraped TotalStats with the row-wise sum of the six base stats
# (skipna=False keeps the NaN-propagating behaviour of plain addition).
base_stat_columns = ['Hp', 'Attack', 'Defense', 'SpecialAttack', 'SpecialDefense', 'Speed']
data['TotalStats'] = data[base_stat_columns].sum(axis=1, skipna=False)

### Feature Weight

In [234]:
describe_feature(data, 'Weight')

- Type: float64
- First rows:
0      4.0
1      6.9
2     13.0
3    100.0
4      8.5
Name: Weight, dtype: float64
- Last rows:
1174    162.5
1175    156.0
1176      6.5
1177      6.5
1178      0.3
Name: Weight, dtype: float64
- Number of missing values: 0
float64
- Number of distinct values: 481
- Unique value counts:
30.0     15
0.3      15
1.0      13
6.5      13
5.0      12
8.0      12
120.0    12
28.0     12
15.0     11
4.0      11
8.5      11
12.0     10
6.0       9
0.1       9
40.0      9
19.5      9
60.0      9
18.0      9
2.0       8
25.0      8
11.0      8
3.4       8
9.0       8
10.0      8
3.5       7
11.5      7
1.5       7
2.5       7
9.5       7
35.0      7
16.0      7
12.5      7
10.5      6
105.0     6
3.0       6
17.5      6
0.8       6
1.2       6
210.0     6
5.5       6
1.8       6
20.0      6
110.0     6
55.0      6
90.0      5
230.0     5
32.0      5
79.5      5
29.5      5
0.5       5
0.9       5
32.5      5
38.0      5
2.1       5
33.0      5
7.0       5
7.5     

This is a simple feature indicating the weight (in kilograms) of the Pokémon. A higher value will mean heavier Pokémon.

### Feature Height

In [235]:
describe_feature(data, 'Height')

- Type: float64
- First rows:
0    0.4
1    0.7
2    1.0
3    2.0
4    0.6
Name: Height, dtype: float64
- Last rows:
1174    1.5
1175    1.6
1176    0.2
1177    0.2
1178    0.3
Name: Height, dtype: float64
- Number of missing values: 0
float64
- Number of distinct values: 54
- Unique value counts:
0.6     102
0.3      95
0.4      82
1.0      78
0.5      77
1.2      72
1.5      62
0.8      57
0.7      53
1.1      48
0.9      43
1.6      42
1.3      42
1.4      41
0.2      40
2.0      33
1.8      33
1.7      29
1.9      23
3.0      12
2.1      12
2.5      11
0.1      11
2.2      10
2.4       7
2.3       7
3.5       5
2.8       4
4.5       4
3.2       4
2.9       3
2.7       3
3.8       3
3.6       3
5.0       3
5.5       2
4.2       2
9.2       2
5.4       2
5.2       2
4.0       2
3.7       1
3.3       1
7.0       1
2.6       1
6.2       1
14.5      1
5.8       1
3.9       1
3.4       1
6.5       1
8.8       1
20.0      1
12.0      1
Name: Height, dtype: int64
- Min: 0.1
- Mean: 1.20534

This is a simple feature indicating the height (in meters) of the Pokémon. A higher value will mean taller Pokémon.

### Feature GenderProbM

In [236]:
describe_feature(data, 'GenderProbM')

- Type: object
- First rows:
0        -
1    0.875
2    0.875
3    0.875
4    0.875
Name: GenderProbM, dtype: object
- Last rows:
1174      -
1175      -
1176    0.5
1177    0.5
1178      -
Name: GenderProbM, dtype: object
- Number of missing values: 0
object
- Number of distinct values: 8
- Unique value counts:
0.5      709
-        193
0.875    137
0.0       56
0.25      31
1.0       30
0.75      21
0.125      2
Name: GenderProbM, dtype: int64
- Unique values: ['-' '0.875' '0.5' '0.0' '1.0' '0.25' '0.75' '0.125']


This feature indicates the proportion of male Pokémon of that species. The value '-' indicates that the Pokémon has no gender. We need to process it in order to use it as a continuous attribute. For this reason, we will replace those '-' values with 0.5 and add a column indicating that those Pokémon have no gender.

In [237]:
# Flag genderless Pokémon ('-') in NoGender and turn GenderProbM into a float column,
# using 0.5 as a neutral placeholder for the genderless rows.
genderless = data['GenderProbM'] == '-'
# Guard against re-execution: once GenderProbM has been converted no '-' is left, and
# rebuilding the flag would silently reset every NoGender value to 0.
if genderless.any() or 'NoGender' not in data.columns:
    data['NoGender'] = genderless.astype('int64')
    data.loc[genderless, 'GenderProbM'] = 0.5
data['GenderProbM'] = data['GenderProbM'].astype('float64')

describe_feature(data, 'GenderProbM')
describe_feature(data, 'NoGender')

- Type: float64
- First rows:
0    0.500
1    0.875
2    0.875
3    0.875
4    0.875
Name: GenderProbM, dtype: float64
- Last rows:
1174    0.5
1175    0.5
1176    0.5
1177    0.5
1178    0.5
Name: GenderProbM, dtype: float64
- Number of missing values: 0
float64
- Number of distinct values: 7
- Unique value counts:
0.500    902
0.875    137
0.000     56
0.250     31
1.000     30
0.750     21
0.125      2
Name: GenderProbM, dtype: int64
- Min: 0.0
- Mean: 0.5297921967769296
- Median: 0.5
- Max: 1.0
- Std: 0.19161192761655535
- Type: int64
- First rows:
0    1
1    0
2    0
3    0
4    0
Name: NoGender, dtype: int64
- Last rows:
1174    1
1175    1
1176    0
1177    0
1178    1
Name: NoGender, dtype: int64
- Number of missing values: 0
int64
- Number of distinct values: 2
- Unique value counts:
0    986
1    193
Name: NoGender, dtype: int64
- Min: 0
- Mean: 0.1636980491942324
- Median: 0.0
- Max: 1
- Std: 0.37015836158762744


### Feature Category

In [238]:
describe_feature(data, 'Category')

- Type: object
- First rows:
0    Victory Pokémon
1       Seed Pokémon
2       Seed Pokémon
3       Seed Pokémon
4     Lizard Pokémon
Name: Category, dtype: object
- Last rows:
1174        Paradox Pokémon
1175        Paradox Pokémon
1176           Tera Pokémon
1177           Tera Pokémon
1178    Subjugation Pokémon
Name: Category, dtype: object
- Number of missing values: 0
object
- Number of distinct values: 718
- Unique value counts:
Paradox Pokémon          22
Mouse Pokémon            13
Single Bloom Pokémon     10
Dragon Pokémon            9
Fox Pokémon               9
Bagworm Pokémon           8
Season Pokémon            8
Flame Pokémon             6
Puppy Pokémon             6
Mushroom Pokémon          6
Plasma Pokémon            6
Poison Pin Pokémon        6
Balloon Pokémon           6
Mole Pokémon              5
Paleozoic Pokémon         5
Garden Pokémon            5
Fossil Pokémon            5
Ball Pokémon              4
DNA Pokémon               4
Tadpole Pokémon           4


This feature represents the Dex category of the Pokémon. We will use it as a categorical attribute.

### Feature CatchRate

In [239]:
describe_feature(data, 'CatchRate')

- Type: int64
- First rows:
0     3
1    45
2    45
3    45
4    45
Name: CatchRate, dtype: int64
- Last rows:
1174     10
1175     10
1176    255
1177    255
1178      3
Name: CatchRate, dtype: int64
- Number of missing values: 0
int64
- Number of distinct values: 37
- Unique value counts:
45     351
3      111
190    109
255     95
75      87
120     77
60      68
90      54
30      35
200     21
225     19
25      18
50      15
180     15
100     14
10      13
150     10
235      8
127      8
70       6
140      6
5        6
55       4
6        4
125      3
65       3
80       3
20       3
130      2
15       2
220      2
170      2
155      1
145      1
35       1
160      1
205      1
Name: CatchRate, dtype: int64
- Min: 3
- Mean: 92.69974554707379
- Median: 60.0
- Max: 255
- Std: 76.09049717636522


This feature indicates the catch rate of the Pokémon. Lower values indicate lower catch rates. We will use it as it is.

### Feature EggCycles

In [240]:
describe_feature(data, 'EggCycles')

- Type: int64
- First rows:
0    120
1     20
2     20
3     20
4     20
Name: EggCycles, dtype: int64
- Last rows:
1174    50
1175    50
1176     5
1177     5
1178    20
Name: EggCycles, dtype: int64
- Number of missing values: 0
int64
- Number of distinct values: 11
- Unique value counts:
20     622
15     181
120     99
25      77
40      58
10      32
30      31
50      29
35      26
80      20
5        4
Name: EggCycles, dtype: int64
- Min: 5
- Mean: 30.966921119592875
- Median: 20.0
- Max: 120
- Std: 28.975103620459258


This attribute indicates how many cycles it takes for a Pokémon to hatch from an egg. Lower values indicate faster hatching times. We will use it as it is.

### Feature EggGroup

In [241]:
describe_feature(data, 'EggGroup')

- Type: object
- First rows:
0                       -
1     ['Monster' 'Grass']
2     ['Monster' 'Grass']
3     ['Monster' 'Grass']
4    ['Monster' 'Dragon']
Name: EggGroup, dtype: object
- Last rows:
1174    -
1175    -
1176    -
1177    -
1178    -
Name: EggGroup, dtype: object
- Number of missing values: 0
object
- Number of distinct values: 67
- Unique value counts:
['Field']                     221
-                             189
['Bug']                        75
['Mineral']                    63
['Flying']                     57
['Amorphous']                  49
['Human-Like']                 44
['Grass']                      40
['Fairy']                      36
['Water 1' 'Field']            27
['Water 2']                    27
['Monster' 'Dragon']           20
['Water 1']                    19
['Water 3']                    19
['Monster']                    19
['Monster' 'Water 1']          18
['Field' 'Fairy']              16
['Field' 'Human-Like']         14
['Dragon']    

We will process it like the types feature. In addition, there are several Pokémon with no egg group (Pokémon that cannot breed, or that belong to the "No Eggs Discovered" egg group), so we will substitute their values with "No Eggs Discovered".

In [242]:
# Parse the raw EggGroup strings with the notebook helpers and spread them over
# two flat columns, EggGroup1 and EggGroup2.
data['EggGroup'] = remove_first_last_letter(data['EggGroup'], '')
data['EggGroup1'] = get_first_element(data['EggGroup'])
data['EggGroup2'] = get_second_element(data['EggGroup'])

# Each column has its own "no value" placeholder; map both to the same label.
for column, placeholder in (('EggGroup1', ''), ('EggGroup2', 'None')):
    data.loc[data[column] == placeholder, column] = 'No Eggs Discovered'

for column in ('EggGroup1', 'EggGroup2'):
    describe_feature(data, column)

# The combined column is no longer needed once it has been split.
data = data.drop(columns='EggGroup')

- Type: object
- First rows:
0    No Eggs Discovered
1               Monster
2               Monster
3               Monster
4               Monster
Name: EggGroup1, dtype: object
- Last rows:
1174    No Eggs Discovered
1175    No Eggs Discovered
1176    No Eggs Discovered
1177    No Eggs Discovered
1178    No Eggs Discovered
Name: EggGroup1, dtype: object
- Number of missing values: 0
object
- Number of distinct values: 15
- Unique value counts:
Field                 281
No Eggs Discovered    189
Water 1               104
Bug                    91
Monster                85
Mineral                74
Flying                 73
Fairy                  60
Grass                  53
Amorphous              52
Human-Like             48
Water 2                31
Water 3                19
Dragon                 18
Ditto                   1
Name: EggGroup1, dtype: int64
- Unique values: ['No Eggs Discovered' 'Monster' 'Bug' 'Flying' 'Field' 'Fairy' 'Grass'
 'Water 1' 'Human-Like' 'Water 3' 'Minera

This feature needs to be processed just like the Types.

### Feature LevelingRate

In [243]:
# Clean the raw LevelingRate strings with the remove_last_letter helper (defined
# outside this excerpt), then summarise the cleaned column.
# NOTE(review): presumably the helper strips one trailing character left by the
# scraper — confirm; if so, re-running this cell would trim the values again.
data['LevelingRate'] = remove_last_letter(data['LevelingRate'])
describe_feature(data, 'LevelingRate')

- Type: object
- First rows:
0           Slow
1    Medium Slow
2    Medium Slow
3    Medium Slow
4    Medium Slow
Name: LevelingRate, dtype: object
- Last rows:
1174    Slow
1175    Slow
1176    Slow
1177    Slow
1178    Slow
Name: LevelingRate, dtype: object
- Number of missing values: 0
object
- Number of distinct values: 6
- Unique value counts:
Medium Fast    484
Slow           298
Medium Slow    279
Fast            71
Erratic         33
Fluctuating     14
Name: LevelingRate, dtype: int64
- Unique values: ['Slow' 'Medium Slow' 'Medium Fast' 'Fast' 'Fluctuating' 'Erratic']


These are all possible categories for the Leveling Rate

### Feature BaseFriendship

In [244]:
describe_feature(data, 'BaseFriendship')

- Type: int64
- First rows:
0    100
1     70
2     70
3     70
4     70
Name: BaseFriendship, dtype: int64
- Last rows:
1174     0
1175     0
1176    50
1177    50
1178     0
Name: BaseFriendship, dtype: int64
- Number of missing values: 0
int64
- Number of distinct values: 8
- Unique value counts:
70     763
50     195
0       98
35      80
100     20
140     12
90       8
20       3
Name: BaseFriendship, dtype: int64
- Min: 0
- Mean: 59.72858354537744
- Median: 70.0
- Max: 140
- Std: 22.928862208698874


This feature indicates the base friendship of the Pokémon. Lower values indicate lower base friendship. We will use it as it is.

### Feature IsLegendary

In [245]:
describe_feature(data, 'IsLegendary')

- Type: int64
- First rows:
0    0
1    0
2    0
3    0
4    0
Name: IsLegendary, dtype: int64
- Last rows:
1174    0
1175    0
1176    1
1177    1
1178    0
Name: IsLegendary, dtype: int64
- Number of missing values: 0
int64
- Number of distinct values: 2
- Unique value counts:
0    1082
1      97
Name: IsLegendary, dtype: int64
- Min: 0
- Mean: 0.08227311280746395
- Median: 0.0
- Max: 1
- Std: 0.2748969679773158


This feature indicates whether the Pokémon is a Legendary Pokémon (1) or not (0). We will use it as it is.

### Feature IsMythical

In [246]:
describe_feature(data, 'IsMythical')

- Type: int64
- First rows:
0    1
1    0
2    0
3    0
4    0
Name: IsMythical, dtype: int64
- Last rows:
1174    0
1175    0
1176    0
1177    0
1178    1
Name: IsMythical, dtype: int64
- Number of missing values: 0
int64
- Number of distinct values: 2
- Unique value counts:
0    1145
1      34
Name: IsMythical, dtype: int64
- Min: 0
- Mean: 0.02883799830364716
- Median: 0.0
- Max: 1
- Std: 0.16742204951428044


This feature indicates whether the Pokémon is a Mythical Pokémon (1) or not (0). We will use it as it is.

### Feature IsUltraBeast

In [247]:
describe_feature(data, 'IsUltraBeast')

- Type: int64
- First rows:
0    0
1    0
2    0
3    0
4    0
Name: IsUltraBeast, dtype: int64
- Last rows:
1174    0
1175    0
1176    0
1177    0
1178    0
Name: IsUltraBeast, dtype: int64
- Number of missing values: 0
int64
- Number of distinct values: 2
- Unique value counts:
0    1168
1      11
Name: IsUltraBeast, dtype: int64
- Min: 0
- Mean: 0.009329940627650551
- Median: 0.0
- Max: 1
- Std: 0.09618076259995513


This feature indicates whether the Pokémon is an Ultra Beast (1) or not (0). We will use it as it is.

### Feature HasMega

In [248]:
describe_feature(data, 'HasMega')

- Type: int64
- First rows:
0    0
1    0
2    0
3    1
4    0
Name: HasMega, dtype: int64
- Last rows:
1174    0
1175    0
1176    0
1177    0
1178    0
Name: HasMega, dtype: int64
- Number of missing values: 0
int64
- Number of distinct values: 2
- Unique value counts:
0    1132
1      47
Name: HasMega, dtype: int64
- Min: 0
- Mean: 0.03986429177268872
- Median: 0.0
- Max: 1
- Std: 0.19572332930837602


This feature indicates whether the Pokémon has a Mega Evolution (1) or not (0). We will use it as it is.

### Feature EvoStage

In [249]:
describe_feature(data, 'EvoStage')

- Type: int64
- First rows:
0    1
1    1
2    2
3    3
4    1
Name: EvoStage, dtype: int64
- Last rows:
1174    1
1175    1
1176    1
1177    1
1178    1
Name: EvoStage, dtype: int64
- Number of missing values: 0
int64
- Number of distinct values: 3
- Unique value counts:
1    638
2    407
3    134
Name: EvoStage, dtype: int64
- Min: 1
- Mean: 1.5725190839694656
- Median: 1.0
- Max: 3
- Std: 0.687352156241489


This feature indicates the evolution stage of the Pokémon, with possible values being 1 (first evolution stage), 2 and 3 respectively. We will use it as it is.

### Feature TotalEvoStages

In [250]:
describe_feature(data, 'TotalEvoStages')

- Type: int64
- First rows:
0    1
1    3
2    3
3    3
4    3
Name: TotalEvoStages, dtype: int64
- Last rows:
1174    1
1175    1
1176    1
1177    1
1178    1
Name: TotalEvoStages, dtype: int64
- Number of missing values: 0
int64
- Number of distinct values: 3
- Unique value counts:
2    537
3    375
1    267
Name: TotalEvoStages, dtype: int64
- Min: 1
- Mean: 2.0916030534351147
- Median: 2.0
- Max: 3
- Std: 0.732525268127781


This feature indicates the total number of stages in the evolution line to which the Pokémon belongs. A value of 1 indicates that the Pokémon has no evolutions, while values of 2 and 3 indicate that the Pokémon's evolution line has 2 or 3 stages respectively. We will use it as it is.

### Feature PreevoName

In [251]:
describe_feature(data, 'PreevoName')

- Type: object
- First rows:
0    No Preevolution
1    No Preevolution
2          Bulbasaur
3            Ivysaur
4    No Preevolution
Name: PreevoName, dtype: object
- Last rows:
1174    No Preevolution
1175    No Preevolution
1176    No Preevolution
1177    No Preevolution
1178    No Preevolution
Name: PreevoName, dtype: object
- Number of missing values: 0
object
- Number of distinct values: 486
- Unique value counts:
No Preevolution                638
Eevee                            8
Flabébé                          5
Floette                          5
Darumaka                         3
Tyrogue                          3
Applin                           3
Rockruff                         3
Basculin White-Striped Form      2
Kubfu                            2
Slowpoke                         2
Galarian Slowpoke                2
Quilava                          2
Exeggcute                        2
Cubone                           2
Koffing                          2
Toxel           

This feature contains the name of the Preevolution of each Pokémon, in case that Pokémon has a preevolution. We will keep it as it is.

### Feature DamageFromNormal

In [252]:
describe_feature(data, 'DamageFromNormal')

- Type: float64
- First rows:
0    1.0
1    1.0
2    1.0
3    1.0
4    1.0
Name: DamageFromNormal, dtype: float64
- Last rows:
1174    0.5
1175    0.5
1176    1.0
1177    1.0
1178    0.0
Name: DamageFromNormal, dtype: float64
- Number of missing values: 0
float64
- Number of distinct values: 4
- Unique value counts:
1.00    945
0.50    148
0.00     79
0.25      7
Name: DamageFromNormal, dtype: int64
- Min: 0.0
- Mean: 0.8657760814249363
- Median: 1.0
- Max: 1.0
- Std: 0.289453212181203


This feature indicates the damage multiplier for Normal type attacks to that Pokémon. We will use it as it is.

### Feature DamageFromFighting

In [253]:
describe_feature(data, 'DamageFromFighting')

- Type: float64
- First rows:
0    0.5
1    0.5
2    0.5
3    0.5
4    1.0
Name: DamageFromFighting, dtype: float64
- Last rows:
1174    1.0
1175    1.0
1176    2.0
1177    2.0
1178    0.0
Name: DamageFromFighting, dtype: float64
- Number of missing values: 0
float64
- Number of distinct values: 6
- Unique value counts:
1.00    467
2.00    283
0.50    266
0.00     79
0.25     60
4.00     24
Name: DamageFromFighting, dtype: int64
- Min: 0.0
- Mean: 1.0831212892281594
- Median: 1.0
- Max: 4.0
- Std: 0.7542207852588667


This feature indicates the damage multiplier for Fighting type attacks to that Pokémon. We will use it as it is.

### Feature DamageFromFlying

In [254]:
describe_feature(data, 'DamageFromFlying')

- Type: float64
- First rows:
0    1.0
1    2.0
2    2.0
3    2.0
4    1.0
Name: DamageFromFlying, dtype: float64
- Last rows:
1174    0.5
1175    0.5
1176    1.0
1177    1.0
1178    1.0
Name: DamageFromFlying, dtype: float64
- Number of missing values: 0
float64
- Number of distinct values: 5
- Unique value counts:
1.00    704
2.00    260
0.50    185
4.00     15
0.25     15
Name: DamageFromFlying, dtype: int64
- Min: 0.25
- Mean: 1.1706955046649703
- Median: 1.0
- Max: 4.0
- Std: 0.5937814630649937


This feature indicates the damage multiplier for Flying type attacks to that Pokémon. We will use it as it is.

### Feature DamageFromPoison

In [255]:
describe_feature(data, 'DamageFromPoison')

- Type: float64
- First rows:
0    1.0
1    1.0
2    1.0
3    1.0
4    1.0
Name: DamageFromPoison, dtype: float64
- Last rows:
1174    0.50
1175    0.00
1176    1.00
1177    1.00
1178    0.25
Name: DamageFromPoison, dtype: float64
- Number of missing values: 0
float64
- Number of distinct values: 6
- Unique value counts:
1.00    663
0.50    232
2.00    170
0.00     83
0.25     26
4.00      5
Name: DamageFromPoison, dtype: int64
- Min: 0.0
- Mean: 0.9715860899067006
- Median: 1.0
- Max: 4.0
- Std: 0.5601912222116779


This feature indicates the damage multiplier for Poison type attacks to that Pokémon. We will use it as it is.

### Feature DamageFromGround

In [256]:
describe_feature(data, 'DamageFromGround')

- Type: float64
- First rows:
0    2.0
1    1.0
2    1.0
3    1.0
4    2.0
Name: DamageFromGround, dtype: float64
- Last rows:
1174    2.0
1175    2.0
1176    1.0
1177    1.0
1178    2.0
Name: DamageFromGround, dtype: float64
- Number of missing values: 0
float64
- Number of distinct values: 6
- Unique value counts:
1.00    580
2.00    281
0.50    151
0.00    128
4.00     33
0.25      6
Name: DamageFromGround, dtype: int64
- Min: 0.0
- Mean: 1.1458863443596268
- Median: 1.0
- Max: 4.0
- Std: 0.782899007753477


This feature indicates the damage multiplier for Ground type attacks to that Pokémon. We will use it as it is.

### Feature DamageFromRock

In [257]:
describe_feature(data, 'DamageFromRock')

- Type: float64
- First rows:
0    2.0
1    1.0
2    1.0
3    1.0
4    2.0
Name: DamageFromRock, dtype: float64
- Last rows:
1174    1.0
1175    0.5
1176    1.0
1177    1.0
1178    1.0
Name: DamageFromRock, dtype: float64
- Number of missing values: 0
float64
- Number of distinct values: 5
- Unique value counts:
1.00    667
2.00    284
0.50    190
4.00     28
0.25     10
Name: DamageFromRock, dtype: int64
- Min: 0.25
- Mean: 1.2251908396946565
- Median: 1.0
- Max: 4.0
- Std: 0.6703373400442587


This feature indicates the damage multiplier for Rock type attacks to that Pokémon. We will use it as it is.

### Feature DamageFromBug

In [258]:
describe_feature(data, 'DamageFromBug')

- Type: float64
- First rows:
0    1.0
1    1.0
2    1.0
3    1.0
4    0.5
Name: DamageFromBug, dtype: float64
- Last rows:
1174    2.00
1175    1.00
1176    1.00
1177    1.00
1178    0.25
Name: DamageFromBug, dtype: float64
- Number of missing values: 0
float64
- Number of distinct values: 5
- Unique value counts:
1.00    518
0.50    371
2.00    201
0.25     74
4.00     15
Name: DamageFromBug, dtype: int64
- Min: 0.25
- Mean: 1.0042408821034776
- Median: 1.0
- Max: 4.0
- Std: 0.631887379279677


This feature indicates the damage multiplier for Bug type attacks to that Pokémon. We will use it as it is.

### Feature DamageFromGhost

In [259]:
describe_feature(data, 'DamageFromGhost')

- Type: float64
- First rows:
0    2.0
1    1.0
2    1.0
3    1.0
4    1.0
Name: DamageFromGhost, dtype: float64
- Last rows:
1174    2.0
1175    2.0
1176    0.0
1177    0.0
1178    2.0
Name: DamageFromGhost, dtype: float64
- Number of missing values: 0
float64
- Number of distinct values: 5
- Unique value counts:
1.0    769
2.0    180
0.0    154
0.5     72
4.0      4
Name: DamageFromGhost, dtype: int64
- Min: 0.0
- Mean: 1.001696352841391
- Median: 1.0
- Max: 4.0
- Std: 0.5739067316059406


This feature indicates the damage multiplier for Ghost type attacks to that Pokémon. We will use it as it is.

### Feature DamageFromSteel

In [260]:
describe_feature(data, 'DamageFromSteel')

- Type: float64
- First rows:
0    0.5
1    1.0
2    1.0
3    1.0
4    0.5
Name: DamageFromSteel, dtype: float64
- Last rows:
1174    2.0
1175    0.5
1176    1.0
1177    1.0
1178    1.0
Name: DamageFromSteel, dtype: float64
- Number of missing values: 0
float64
- Number of distinct values: 5
- Unique value counts:
1.00    643
0.50    354
2.00    165
0.25     11
4.00      6
Name: DamageFromSteel, dtype: int64
- Min: 0.25
- Mean: 0.9980916030534351
- Median: 1.0
- Max: 4.0
- Std: 0.5160276690924519


This feature indicates the damage multiplier for Steel type attacks to that Pokémon. We will use it as it is.

### Feature DamageFromFire

In [261]:
describe_feature(data, 'DamageFromFire')

- Type: float64
- First rows:
0    0.5
1    2.0
2    2.0
3    2.0
4    0.5
Name: DamageFromFire, dtype: float64
- Last rows:
1174    0.5
1175    2.0
1176    1.0
1177    1.0
1178    1.0
Name: DamageFromFire, dtype: float64
- Number of missing values: 0
float64
- Number of distinct values: 5
- Unique value counts:
1.00    542
0.50    305
2.00    277
0.25     30
4.00     25
Name: DamageFromFire, dtype: int64
- Min: 0.25
- Mean: 1.1501272264631044
- Median: 1.0
- Max: 4.0
- Std: 0.6947245309851134


This feature indicates the damage multiplier for Fire type attacks to that Pokémon. We will use it as it is.

### Feature DamageFromWater

In [262]:
describe_feature(data, 'DamageFromWater')

- Type: float64
- First rows:
0    2.0
1    0.5
2    0.5
3    0.5
4    2.0
Name: DamageFromWater, dtype: float64
- Last rows:
1174    2.0
1175    1.0
1176    1.0
1177    1.0
1178    1.0
Name: DamageFromWater, dtype: float64
- Number of missing values: 0
float64
- Number of distinct values: 5
- Unique value counts:
1.00    641
0.50    319
2.00    185
0.25     18
4.00     16
Name: DamageFromWater, dtype: int64
- Min: 0.25
- Mean: 1.0508905852417303
- Median: 1.0
- Max: 4.0
- Std: 0.5941291474718582


This feature indicates the damage multiplier for Water type attacks to that Pokémon. We will use it as it is.

### Feature DamageFromGrass

In [263]:
describe_feature(data, 'DamageFromGrass')

- Type: float64
- First rows:
0    0.50
1    0.25
2    0.25
3    0.25
4    0.50
Name: DamageFromGrass, dtype: float64
- Last rows:
1174    2.0
1175    0.5
1176    1.0
1177    1.0
1178    0.5
Name: DamageFromGrass, dtype: float64
- Number of missing values: 0
float64
- Number of distinct values: 5
- Unique value counts:
1.00    451
0.50    398
2.00    187
0.25    113
4.00     30
Name: DamageFromGrass, dtype: int64
- Min: 0.25
- Mean: 0.9942748091603053
- Median: 1.0
- Max: 4.0
- Std: 0.7254902065953254


This feature indicates the damage multiplier for Grass type attacks to that Pokémon. We will use it as it is.

### Feature DamageFromElectric

In [264]:
describe_feature(data, 'DamageFromElectric')

- Type: float64
- First rows:
0    1.0
1    0.5
2    0.5
3    0.5
4    1.0
Name: DamageFromElectric, dtype: float64
- Last rows:
1174    1.0
1175    1.0
1176    1.0
1177    1.0
1178    1.0
Name: DamageFromElectric, dtype: float64
- Number of missing values: 0
float64
- Number of distinct values: 6
- Unique value counts:
1.00    605
0.50    235
2.00    230
0.00     86
0.25     13
4.00     10
Name: DamageFromElectric, dtype: int64
- Min: 0.0
- Mean: 1.0396522476675147
- Median: 1.0
- Max: 4.0
- Std: 0.6317899474582438


This feature indicates the damage multiplier for Electric type attacks to that Pokémon. We will use it as it is.

### Feature DamageFromPsychic

In [265]:
describe_feature(data, 'DamageFromPsychic')

- Type: float64
- First rows:
0    0.5
1    2.0
2    2.0
3    2.0
4    1.0
Name: DamageFromPsychic, dtype: float64
- Last rows:
1174    0.50
1175    0.25
1176    1.00
1177    1.00
1178    2.00
Name: DamageFromPsychic, dtype: float64
- Number of missing values: 0
float64
- Number of distinct values: 6
- Unique value counts:
1.00    770
0.50    171
2.00    142
0.00     82
0.25      9
4.00      5
Name: DamageFromPsychic, dtype: int64
- Min: 0.0
- Mean: 0.9853689567430025
- Median: 1.0
- Max: 4.0
- Std: 0.5183886115164228


This feature indicates the damage multiplier for Psychic type attacks to that Pokémon. We will use it as it is.

### Feature DamageFromIce

In [266]:
describe_feature(data, 'DamageFromIce')

- Type: float64
- First rows:
0    0.5
1    2.0
2    2.0
3    2.0
4    0.5
Name: DamageFromIce, dtype: float64
- Last rows:
1174    1.0
1175    0.5
1176    1.0
1177    1.0
1178    1.0
Name: DamageFromIce, dtype: float64
- Number of missing values: 0
float64
- Number of distinct values: 5
- Unique value counts:
1.00    531
0.50    304
2.00    296
4.00     34
0.25     14
Name: DamageFromIce, dtype: int64
- Min: 0.25
- Mean: 1.1997455470737914
- Median: 1.0
- Max: 4.0
- Std: 0.7364133954639672


This feature indicates the damage multiplier for Ice type attacks to that Pokémon. We will use it as it is.

### Feature DamageFromDragon

In [267]:
describe_feature(data, 'DamageFromDragon')

- Type: float64
- First rows:
0    1.0
1    1.0
2    1.0
3    1.0
4    1.0
Name: DamageFromDragon, dtype: float64
- Last rows:
1174    1.0
1175    0.5
1176    1.0
1177    1.0
1178    1.0
Name: DamageFromDragon, dtype: float64
- Number of missing values: 0
float64
- Number of distinct values: 4
- Unique value counts:
1.0    950
0.0     83
2.0     76
0.5     70
Name: DamageFromDragon, dtype: int64
- Min: 0.0
- Mean: 0.9643765903307888
- Median: 1.0
- Max: 2.0
- Std: 0.3854349700323602


This feature indicates the damage multiplier for Dragon type attacks to that Pokémon. We will use it as it is.

### Feature DamageFromDark

In [268]:
describe_feature(data, 'DamageFromDark')

- Type: float64
- First rows:
0    2.0
1    1.0
2    1.0
3    1.0
4    1.0
Name: DamageFromDark, dtype: float64
- Last rows:
1174    2.0
1175    2.0
1176    1.0
1177    1.0
1178    2.0
Name: DamageFromDark, dtype: float64
- Number of missing values: 0
float64
- Number of distinct values: 5
- Unique value counts:
1.00    786
0.50    209
2.00    172
0.25      8
4.00      4
Name: DamageFromDark, dtype: int64
- Min: 0.25
- Mean: 1.0623409669211197
- Median: 1.0
- Max: 4.0
- Std: 0.46995280163555914


This feature indicates the damage multiplier for Dark type attacks to that Pokémon. We will use it as it is.

### Feature DamageFromFairy

In [269]:
describe_feature(data, 'DamageFromFairy')

- Type: float64
- First rows:
0    0.5
1    0.5
2    0.5
3    0.5
4    0.5
Name: DamageFromFairy, dtype: float64
- Last rows:
1174    1.0
1175    0.5
1176    1.0
1177    1.0
1178    0.5
Name: DamageFromFairy, dtype: float64
- Number of missing values: 0
float64
- Number of distinct values: 5
- Unique value counts:
1.00    766
0.50    213
2.00    182
4.00     12
0.25      6
Name: DamageFromFairy, dtype: int64
- Min: 0.25
- Mean: 1.090754877014419
- Median: 1.0
- Max: 4.0
- Std: 0.5347946213850298


This feature indicates the damage multiplier for Fairy type attacks to that Pokémon. We will use it as it is.

## 2. Final asserts and data storage

First, we will ensure that there are no missing values

In [270]:
# Final invariant: the preprocessed dataset must not contain any missing value.
# On failure, report which columns still have gaps and how many.
missing_per_column = data.isna().sum()
assert missing_per_column.sum() == 0, (
    f"Missing values found:\n{missing_per_column[missing_per_column > 0]}"
)

And then we will sort the dataset columns and store it

In [271]:
data.columns

Index(['DexNumber', 'Name', 'HiddenAbility', 'Generation', 'Hp', 'Attack',
       'Defense', 'SpecialAttack', 'SpecialDefense', 'Speed', 'TotalStats',
       'Weight', 'Height', 'GenderProbM', 'Category', 'CatchRate', 'EggCycles',
       'LevelingRate', 'BaseFriendship', 'IsLegendary', 'IsMythical',
       'IsUltraBeast', 'HasMega', 'EvoStage', 'TotalEvoStages', 'PreevoName',
       'DamageFromNormal', 'DamageFromFighting', 'DamageFromFlying',
       'DamageFromPoison', 'DamageFromGround', 'DamageFromRock',
       'DamageFromBug', 'DamageFromGhost', 'DamageFromSteel', 'DamageFromFire',
       'DamageFromWater', 'DamageFromGrass', 'DamageFromElectric',
       'DamageFromPsychic', 'DamageFromIce', 'DamageFromDragon',
       'DamageFromDark', 'DamageFromFairy', 'Type1', 'Type2', 'Ability1',
       'Ability2', 'NoGender', 'EggGroup1', 'EggGroup2'],
      dtype='object')

In [272]:
# Final column layout, built from thematic groups so the order is easy to read.
identity_columns = ['DexNumber', 'Name', 'Type1', 'Type2', 'Ability1', 'Ability2',
                    'HiddenAbility', 'Generation']
stat_columns = ['Hp', 'Attack', 'Defense', 'SpecialAttack', 'SpecialDefense', 'Speed',
                'TotalStats']
trait_columns = ['Weight', 'Height', 'GenderProbM', 'NoGender', 'Category', 'CatchRate',
                 'EggCycles', 'EggGroup1', 'EggGroup2', 'LevelingRate', 'BaseFriendship']
status_columns = ['IsLegendary', 'IsMythical', 'IsUltraBeast', 'HasMega', 'EvoStage',
                  'TotalEvoStages', 'PreevoName']
damage_columns = ['DamageFromNormal', 'DamageFromFighting', 'DamageFromFlying',
                  'DamageFromPoison', 'DamageFromGround', 'DamageFromRock',
                  'DamageFromBug', 'DamageFromGhost', 'DamageFromSteel', 'DamageFromFire',
                  'DamageFromWater', 'DamageFromGrass', 'DamageFromElectric',
                  'DamageFromPsychic', 'DamageFromIce', 'DamageFromDragon',
                  'DamageFromDark', 'DamageFromFairy']

# Select the columns in their final order, then persist the preprocessed dataset.
column_order = identity_columns + stat_columns + trait_columns + status_columns + damage_columns
data = data.loc[:, column_order]
data.to_csv('../data/PokemonPreproc.csv', index=False)