<a href="https://colab.research.google.com/github/DanielRabinovitz/pokemon_classifier/blob/main/datacleaning.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [7]:
#This block downloads the dataset, cleans out useless data, and adds a column for ranks to be scored numerically

import pandas as pd
import io

#raw dataset csv, hosted in the project's github repo
dataset_url = 'https://raw.githubusercontent.com/DanielRabinovitz/pokemon_classifier/main/gen8pokemondata%20(1).csv'
#read the csv straight from the url into the working dataframe
df = pd.read_csv(filepath_or_buffer=dataset_url)

#dictionary to set each ranking to a number
#'(PU)' scores 0; the tiers PU, NU, RU, UU score 1 through 4, and the
#matching 'BL' rank of each tier scores half a point above that tier
ranks_dict = {'(PU)': 0}
for tier_score, tier_name in enumerate(('PU', 'NU', 'RU', 'UU'), start=1):
  ranks_dict[tier_name] = tier_score
  ranks_dict[tier_name + 'BL'] = tier_score + 0.5
ranks_dict.update({'OU': 5, 'Uber': 6})

#ranks to remove
other_ranks = [
  'LC',
  'NFE',
]

#drop mons whose rank is listed in other_ranks (those ranks have no entry in ranks_dict)
#isin builds the keep/drop mask in one pass, so no helper column has to be
#inserted and popped; .copy() makes the filtered frame independent of the raw download
df = df.loc[~df['Rank'].isin(other_ranks)].copy()

#get rid of extra data: every column from 'height' through
#'number_pokemon_with_typing' (label slices include both ends), plus 'pokedex_number'
extra_cols = list(df.loc[:, 'height':'number_pokemon_with_typing'].columns)
extra_cols.append('pokedex_number')
df = df.drop(columns=extra_cols)

#convert each remaining rank to its number from ranks_dict
rank_col = df['Rank']
numbered_ranks = rank_col.map(ranks_dict)

#map() leaves NaN for any rank that is not a key of ranks_dict; raise KeyError
#(as a direct dictionary lookup would) rather than silently keep unscored rows
unscored = numbered_ranks.isna()
if unscored.any():
  raise KeyError('ranks missing from ranks_dict: ' + repr(rank_col[unscored].unique().tolist()))

#add rank numbers at column position 1
df.insert(1, 'numbered_rank', numbered_ranks)

df.head()

Unnamed: 0,Rank,numbered_rank,name,abilities,typing,hp,attack,defense,special_attack,special_defense,speed,defense_vs_normal,defense_vs_fire,defense_vs_water,defense_vs_electric,defense_vs_grass,defense_vs_ice,defense_vs_fighting,defense_vs_poison,defense_vs_ground,defense_vs_flying,defense_vs_psychic,defense_vs_bug,defense_vs_rock,defense_vs_ghost,defense_vs_dragon,defense_vs_dark,defense_vs_steel,defense_vs_fairy
2,RUBL,3.5,Venusaur,Overgrow~Chlorophyll,Grass~Poison,80,82,83,100,100,80,1.0,2.0,0.5,0.5,0.25,2.0,0.5,1.0,1.0,2.0,2.0,1.0,1.0,1.0,1.0,1.0,1.0,0.5
5,PU,1.0,Charizard,Blaze~Solar Power,Fire~Flying,78,84,78,109,85,100,1.0,0.5,2.0,2.0,0.25,1.0,0.5,1.0,0.0,1.0,1.0,0.25,4.0,1.0,1.0,1.0,0.5,0.5
8,NU,2.0,Blastoise,Torrent~Rain Dish,Water,79,83,100,85,105,78,1.0,0.5,0.5,2.0,2.0,0.5,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.5,1.0
11,(PU),0.0,Butterfree,Compound Eyes~Tinted Lens,Bug~Flying,60,45,50,90,80,70,1.0,2.0,1.0,2.0,0.25,2.0,0.25,1.0,0.0,2.0,1.0,0.5,4.0,1.0,1.0,1.0,1.0,1.0
13,(PU),0.0,Raichu,Static~Lightning Rod,Electric,60,90,55,90,80,110,1.0,1.0,1.0,0.5,1.0,1.0,1.0,1.0,2.0,0.5,1.0,1.0,1.0,1.0,1.0,1.0,0.5,1.0


In [8]:
#list of types in order
type_list = [
  'bug', 'dark', 'dragon', 'electric', 'fairy', 'fight',
  'fire', 'flying', 'ghost', 'grass', 'ground', 'ice',
  'normal', 'poison', 'psychic', 'rock', 'steel', 'water',
]

#lookup tables in both directions so row/column indices are easy to switch between:
#type name to position in type_list, and position back to type name
type_dict = {type_name: position for position, type_name in enumerate(type_list)}
reverse_type_dict = {position: type_name for type_name, position in type_dict.items()}

#make dataframe of type attack multipliers
type_mat_url = 'https://raw.githubusercontent.com/DanielRabinovitz/pokemon_classifier/main/type_matrix.csv'
#strip surrounding whitespace from every header name as part of the load,
#so columns can be looked up by the plain type name
type_matrix = pd.read_csv(type_mat_url).rename(columns=str.strip)
type_matrix.head()

Unnamed: 0,bug,dark,dragon,electric,fairy,fighting,fire,flying,ghost,grass,ground,water,steel,rock,psychic,poison,normal,ice,ice.1
0,1.0,1.0,1.0,1.0,1.0,0.5,2.0,2.0,1.0,0.5,0.5,1.0,1.0,2.0,1.0,1.0,1.0,1.0,1.0
1,2.0,0.5,1.0,1.0,1.0,2.0,1.0,1.0,0.5,1.0,1.0,1.0,1.0,1.0,0.0,1.0,1.0,1.0,1.0
2,1.0,1.0,2.0,0.5,2.0,1.0,0.5,1.0,1.0,0.5,1.0,0.5,1.0,1.0,1.0,1.0,1.0,2.0,2.0
3,1.0,1.0,1.0,0.5,1.0,1.0,1.0,0.5,1.0,1.0,2.0,1.0,0.5,1.0,1.0,1.0,1.0,1.0,1.0
4,0.5,0.5,0.0,1.0,1.0,0.5,1.0,1.0,1.0,1.0,1.0,1.0,2.0,1.0,1.0,2.0,1.0,1.0,1.0


In [9]:
#Gets us the offensive effectiveness of each type/type combo

test_dual = 'Grass~Ground'
test_single = 'Water'

def offensive_calcs(type_string):
  """Return the attack multipliers for a single type or a '~'-separated dual type.

  Reads the module-level ``type_matrix`` DataFrame, whose columns are looked
  up by lower-case type name.

  Parameters
  ----------
  type_string : str
      Typing such as 'Water' or 'Grass~Ground'. Case and whitespace around
      each type name are ignored.

  Returns
  -------
  list or pandas.Series
      Dual type: a list holding, for each row of type_matrix, the larger of
      the two types' multipliers.
      Single type: that type's column of type_matrix, returned as-is.

  Raises
  ------
  KeyError
      If a type name is not a column of type_matrix.
  """
  type_string = type_string.lower()

  #for a single type just return its attack multiplier column
  if '~' not in type_string:
    return type_matrix[type_string.strip()]

  #dual types have a ~ separator; partition splits on the first one
  type1, _, type2 = type_string.partition('~')

  #get vector of attack multipliers from type_matrix for each type
  type_1_numbers = type_matrix[type1.strip()]
  type_2_numbers = type_matrix[type2.strip()]

  #at each row compare the two columns, picking the bigger number each time
  #zip pairs the rows by position, so this works for however many rows the
  #matrix has instead of assuming exactly 18 label-indexed rows
  return [max(first, second) for first, second in zip(type_1_numbers, type_2_numbers)]



In [10]:
#adding in offensive types

#one column per type, named in type_list order
offense_cols = ['offense_vs_' + t for t in type_list]

#get every mon's offensive multipliers in a single pass over the typing column
#list(...)[:len(offense_cols)] reads the values by position whether
#offensive_calcs returned a list (dual type) or a Series (single type)
offense_rows = [
  list(offensive_calcs(typing))[:len(offense_cols)]
  for typing in df['typing']
]

#build the whole block as floats up front, aligned to df's index, instead of
#creating integer 0 columns and overwriting them with fractional multipliers
#one cell at a time
offense_df = pd.DataFrame(offense_rows, index=df.index, columns=offense_cols, dtype=float)

#assign column by column so re-running the cell overwrites the existing
#columns rather than adding duplicates
for col in offense_cols:
  df[col] = offense_df[col]


  

In [11]:
#type chart for reference
#https://gamesmeta.com/wp-content/uploads/2019/11/pokemon-sword-and-shield-type-chart.png

In [18]:
#each total is the row-wise sum of a run of adjacent columns
#(label slices include both the first and the last column):
#  bst - base stat total: the sum of hp, speed, attacks, and defenses
#  ttd - type total defense: the sum of the pokemon's defensive multipliers
#        A higher ttd is bad, it means the pokemon has more weaknesses
#  tto - type total offense: the sum of the pokemon's offensive multipliers
#        A higher tto is good, it means the pokemon hits more types for harder
totals = [
  ('bst', 'hp', 'speed'),
  ('ttd', 'defense_vs_normal', 'defense_vs_fairy'),
  ('tto', 'offense_vs_bug', 'offense_vs_water'),
]

for total_name, first_col, last_col in totals:
  #skip totals that already exist so the cell can be re-run
  if total_name not in df:
    #put the total directly after the last column it sums, found by name
    #rather than by a hard-coded position
    insert_at = df.columns.get_loc(last_col) + 1
    row_sums = df.loc[:, first_col:last_col].sum(axis=1)
    df.insert(insert_at, total_name, row_sums, allow_duplicates=False)

#save the cleaned data before attempting the download, so the file exists
#even when the download step is unavailable
df.to_csv('pokemondata_clean.csv')

#offer the file as a browser download when running in Colab
try:
  from google.colab import files
except ImportError:
  print('google.colab not available, skipping browser download of pokemondata_clean.csv')
else:
  files.download('pokemondata_clean.csv')

#preview goes last: only the final expression of a cell is rendered
df.head()


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [13]:
#Descriptive stats using bsts and type scores
import statistics

#one sub-frame per main tier, selected by the exact Rank label
Uber, OU, UU, RU, NU, PU, sub_PU = (
  df.loc[df.Rank == tier_label]
  for tier_label in ('Uber', 'OU', 'UU', 'RU', 'NU', 'PU', '(PU)')
)

#printed name of each tier, strongest first; the '(PU)' frame is reported as Untiered
tier_frames = [
  ('Uber', Uber),
  ('OU', OU),
  ('UU', UU),
  ('RU', RU),
  ('NU', NU),
  ('PU', PU),
  ('Untiered', sub_PU),
]

#print the median of each score for every tier, one group of lines per score
for group_number, score_name in enumerate(('bst', 'ttd', 'tto')):
  #blank separator between groups, none before the first or after the last
  if group_number > 0:
    print('\n')
  for tier_name, tier_frame in tier_frames:
    print(f'{tier_name} {score_name} median:', statistics.median(tier_frame[score_name]))





Uber bst median: 680
OU bst median: 570.0
UU bst median: 530
RU bst median: 525
NU bst median: 500.0
PU bst median: 480
Untiered bst median: 485.0


Uber ttd median: 18.25
OU ttd median: 18.0
UU ttd median: 18.75
RU ttd median: 18.0
NU ttd median: 18.5
PU ttd median: 19.0
Untiered ttd median: 19.375


Uber tto median: 21.5
OU tto median: 22.5
UU tto median: 23.0
RU tto median: 22.0
NU tto median: 21.25
PU tto median: 21.5
Untiered tto median: 20.0
