# Pokemon Scraping

# Setup

In [16]:
import requests
import pandas as pd
import numpy as np
from bs4 import BeautifulSoup
from copy import copy

In [17]:
# Global settings of the scraper.
# This section is used to filter part of the data for debugging
VERBOSE = True
CSV_FILE_OUTPUT = 'pokemons.csv'

GENERATION_FIRST = 1
GENERATION_LAST  = 9

# Scrape every single pokemon page
SCRIPE_POKEMON_PAGE = True
SCRIPE_LEGENDARY_PAGE = True
SCRIPE_EVOLUTIONS = True
SCRIPE_RESISTANCES = False

# By default, legendaries are categorized as legendary, sublegendary or mythical
# Setting this to True will flag them all as 'legendary'
COMBINE_ALL_LEGENDARIES = True

# We scrape the type list instead of a static declaration so that it
# doesn't break if pokemondb.net changes its column order
TYPE_LIST = []
pokemon_url = 'https://pokemondb.net/pokedex/bulbasaur'
html = requests.get(pokemon_url)
# Fail fast on an HTTP error: parsing an error page would silently leave
# TYPE_LIST empty and TYPE_NUMBERS at 0
html.raise_for_status()
pokemon_data = BeautifulSoup(html.text, 'lxml')
for type_table in pokemon_data.find_all('table', class_='type-table'):
    # Named `type_header` rather than `type`: at module level the loop variable
    # would otherwise replace the builtin `type` for the rest of the notebook
    for type_header in type_table.find_all('th'):
        TYPE_LIST.append(type_header.a.get('title').lower())

# Number of types, i.e. the length of one complete set of resistances
TYPE_NUMBERS = len(TYPE_LIST)

In [18]:
def url_to_soup(url) -> BeautifulSoup:
    """
    Downloads a page and parses it with BeautifulSoup (lxml parser).

    Args:
        url: address of the page to download

    Returns:
        BeautifulSoup: the parsed page

    Raises:
        RuntimeError: if the server answers with a status code other than 200
    """
    response = requests.get(url)

    # A plain string cannot be raised in Python, so a real exception type is
    # used; the URL and status code are included to make failures traceable
    if response.status_code != 200:
        raise RuntimeError(f'Request Error: {url} returned status {response.status_code}')

    return BeautifulSoup(response.text, 'lxml')

# Scraping

## Single Page Pokemon

### About type resistances

At first I wanted to scrape the resistances from the pokemon page. However, I realized that the resistances listed there already combine the effect of the pokemon's types and its ability, which can be problematic.

In my project, I thus decided to recalculate the resistances based solely on the pokemon typing, as integrating the ability effects is beyond the scope of my analysis. However I left this in the script if people have a use for it.

#### Explaining the problem with scraping resistances
In some cases a pokemon can have different abilities, and the page then shows one set of resistances per ability. It is then possible to filter out the effect of the ability by taking the maximum of each resistance with the function `max_resistances`.

Example for Koffing

![Resistances of Koffing with Levitate](images/koffing-resistances-levitate.png)

![Resistances of Koffing with Neutralizing Gas](images/koffing-resistances-neutralizing-gas.png)

In this example, the function would select *against_ground* = 2, effectively ignoring the *levitate* ability.

![Resistances of Haunter](images/haunter-resistances.png)

However, some pokemon only have one ability, and that ability affects their resistances, so it cannot be filtered out this way. This is for example the case with **Haunter**, which can only have the *levitate* ability.



In [19]:
def max_resistances(resistances: list[float], type_count: "int | None" = None) -> list[float]:
    """
    Returns the highest set of resistances present on a page.
    This is used to ignore abilities that give extra resistances

    The flat input is cut into consecutive sets of `type_count` values and,
    for each type, the largest multiplier across all the sets is kept.

    Args:
        resistances (list[float]): list of all the resistances, the length should be n*type_count
        type_count (int | None): number of values in one set; defaults to TYPE_NUMBERS

    Returns:
        list[float]: list of type_count resistances (empty if `resistances` is empty)

    Raises:
        ValueError: if the input cannot be cut into complete sets of `type_count` values
    """
    if type_count is None:
        type_count = TYPE_NUMBERS

    # No resistance found on the page: nothing to combine
    if not resistances:
        return []

    # An incomplete set means the page layout is not the expected one;
    # report it instead of recursing/looping on a truncated remainder
    if type_count <= 0 or len(resistances) % type_count != 0:
        raise ValueError(
            f'expected a multiple of {type_count} resistances, got {len(resistances)}'
        )

    resistance_sets = [
        resistances[start:start + type_count]
        for start in range(0, len(resistances), type_count)
    ]

    # zip(*sets) groups the values type by type across all the sets
    return [max(values_for_type) for values_for_type in zip(*resistance_sets)]

In [20]:
def get_resistances(soup):
    """
    Reads the resistances displayed in the type tables of `soup`.

    Every <td> of every table with the 'type-table' class is converted to a
    multiplier; a cell whose text is not a known symbol counts as 1.0.
    The flat list is then reduced with `max_resistances`.

    Args:
        soup: parsed page (or part of a page) containing the type tables

    Returns:
        list[float]: the result of `max_resistances` on all the multipliers found
    """
    # Text displayed in a cell -> damage multiplier
    multiplier_by_symbol = {
        '0': 0.0,
        '¼': 0.25,
        '½': 0.5,
        '2': 2.0,
        '4': 4.0,
    }

    multipliers = [
        multiplier_by_symbol.get(cell.text.strip(), 1.0)
        for table in soup.find_all('table', class_='type-table')
        for cell in table.find_all('td')
    ]

    return max_resistances(multipliers)

### Scraping

In [21]:
def scrape_single_pokemon(name: str, id: int=0, form: str=None) -> dict:
    """
    Scrapes the vitals, training and breeding data of one pokemon page.

    Args:
        name (str): name of the pokemon, used to build the URL when `id` is 0
        id (int): pokedex number; when non-zero the page is looked up by id instead of name
        form (str): label of the tab (form) to read; the first tab is used
            when no tab label matches

    Returns:
        dict: the scraped fields (species, height_m, weight_kg, abilities,
            training and breeding data) and, if SCRIPE_RESISTANCES is set,
            one 'against_<type>' entry per type of TYPE_LIST
    """
    pokemon = {}

    if id != 0:
        pokemon_url = f'https://pokemondb.net/pokedex/{id}'
    else:
        pokemon_url = f'https://pokemondb.net/pokedex/{name}'

    pokemon_data = url_to_soup(pokemon_url)

    all_pokemon_forms = pokemon_data.find_all('div', class_='sv-tabs-panel')

    # Selects the right tab based on the form studied
    tab_labels = pokemon_data.find('div', class_='sv-tabs-tab-list').find_all('a', class_='sv-tabs-tab', recursive=False)
    index = 0
    for i, label in enumerate(tab_labels):
        if label.get_text(strip=True) == form:
            index = i

    current_form = all_pokemon_forms[index]

    data = current_form.find_all('table', class_='vitals-table')

    vitals = data[0].find_all('td')
    abilities = vitals[5].find_all('span')
    hidden_ability = vitals[5].find('small')
    pokemon['species']      = vitals[2].get_text(strip=True)
    # Only the metric value is kept: everything from the unit onwards is dropped
    pokemon['height_m']     = vitals[3].get_text(strip=True).replace('\xa0', ' ').partition(' m')[0]
    pokemon['weight_kg']    = vitals[4].get_text(strip=True).replace('\xa0', ' ').partition(' kg')[0]
    pokemon['ability_0']    = abilities[0].a.get_text(strip=True) if len(abilities) > 0 else np.nan
    # The second ability is the second <span>, not the first one again
    pokemon['ability_1']    = abilities[1].a.get_text(strip=True) if len(abilities) > 1 else np.nan
    # The key is always set so that every scraped pokemon has the same fields
    if hidden_ability is not None:
        pokemon['ability_hidden'] = hidden_ability.get_text(strip=True).replace('(hidden ability)', '')
    else:
        pokemon['ability_hidden'] = np.nan

    training = data[1].find_all('td')
    pokemon['ev_yield']     = training[0].get_text(strip=True)
    pokemon['catch_rate']   = training[1].get_text(strip=True).partition('(')[0]
    pokemon['growth_rate']  = training[4].get_text(strip=True)

    breeding = data[2].find_all('td')
    pokemon['egg_groups'] = breeding[0].get_text(strip=True)
    if breeding[1].get_text(strip=True) != 'Genderless':
        male, female = breeding[1].find_all('span')
        pokemon['male_percent']   = float(male.get_text(strip=True).partition('%')[0])
        pokemon['female_percent'] = float(female.get_text(strip=True).partition('%')[0])
    else:
        pokemon['male_percent']   = np.nan
        pokemon['female_percent'] = np.nan
    if breeding[2].get_text(strip=True) != '—':
        # The cell holds the cycles followed by a parenthesis; steps are derived at 257 per cycle
        pokemon['egg_cycles'] = int(breeding[2].get_text(strip=True).partition('(')[0])
        pokemon['egg_steps']  = pokemon['egg_cycles'] * 257
    else:
        pokemon['egg_cycles'] = np.nan
        pokemon['egg_steps']  = np.nan

    if SCRIPE_RESISTANCES:
        for type_name, resistance in zip(TYPE_LIST, get_resistances(current_form)):
            pokemon['against_'+type_name] = resistance

    return pokemon

# Quick manual check: scrape one alternate form and print the resulting dict
print(scrape_single_pokemon(name='Palkia', form='Origin Forme'))

{'species': 'Spatial Pokémon', 'height_m': '6.3', 'weight_kg': '660.0', 'ability_0': 'Pressure', 'ability_1': nan, 'ability_hidden': 'Telepathy', 'ev_yield': '3 Sp. Atk', 'catch_rate': '3', 'growth_rate': 'Slow', 'egg_groups': 'Undiscovered', 'male_percent': nan, 'female_percent': nan, 'egg_cycles': 120, 'egg_steps': 30840}


## Scrape 'em all

In [22]:
# One dict per row of the stats tables; turned into the DataFrame `df` at the end
pokemons = []

# We loop over the different pages so we can infer the generation of a pokemon
for gen in range(GENERATION_FIRST, GENERATION_LAST+1):

    url = f'https://pokemondb.net/pokedex/stats/gen{gen}'
    pokemon_list_soup = url_to_soup(url)
    rows = pokemon_list_soup.find_all('tr')[1:] # Ignore first row because it contains the headers, not data

    for row in rows:

        cols = row.find_all('td')

        image       = cols[0].img.get('src')  # read but currently not stored in the output
        number      = int(cols[0].text.strip())
        name        = cols[1].a.text.strip()
        # The form, when there is one, is the <small> element of the name cell.
        # Looking the tag up directly avoids depending on the number of child nodes
        form_tag    = cols[1].find('small')
        form        = form_tag.text.strip() if form_tag is not None else np.nan
        types       = cols[2].find_all('a', class_='type-icon')
        type1           = types[0].text
        type2           = types[1].text if len(types) > 1 else np.nan
        stats       = cols[3].text.strip()
        hp          = cols[4].text.strip()
        attack      = cols[5].text.strip()
        defense     = cols[6].text.strip()
        sp_attack   = cols[7].text.strip()
        sp_defense  = cols[8].text.strip()
        speed       = cols[9].text.strip()

        pokemon = {
            'id': number,
            'name': name,
            'form': form,
            'gen': gen,
            'type1': type1,
            'type2': type2,
            'stats': stats,
            'hp': hp,
            'attack': attack,
            'defense': defense,
            'sp_attack': sp_attack,
            'sp_defense': sp_defense,
            'speed': speed,
        }

        if SCRIPE_POKEMON_PAGE:
            # NOTE(review): `form` is not forwarded here, so every form of a pokemon
            # receives the data of the first tab of its page - confirm this is intended
            pokemon |= scrape_single_pokemon(name=name, id=number)

        pokemons.append(pokemon)

        if VERBOSE:
            print(f'{number} {name}')

df = pd.DataFrame(pokemons)

1 Bulbasaur
2 Ivysaur
3 Venusaur
4 Charmander
5 Charmeleon
6 Charizard
7 Squirtle
8 Wartortle
9 Blastoise
10 Caterpie
11 Metapod
12 Butterfree
13 Weedle
14 Kakuna
15 Beedrill
16 Pidgey
17 Pidgeotto
18 Pidgeot
19 Rattata
20 Raticate
21 Spearow
22 Fearow
23 Ekans
24 Arbok
25 Pikachu
26 Raichu
27 Sandshrew
28 Sandslash
29 Nidoran♀
30 Nidorina
31 Nidoqueen
32 Nidoran♂
33 Nidorino
34 Nidoking
35 Clefairy
36 Clefable
37 Vulpix
38 Ninetales
39 Jigglypuff
40 Wigglytuff
41 Zubat
42 Golbat
43 Oddish
44 Gloom
45 Vileplume
46 Paras
47 Parasect
48 Venonat
49 Venomoth
50 Diglett
51 Dugtrio
52 Meowth
53 Persian
54 Psyduck
55 Golduck
56 Mankey
57 Primeape
58 Growlithe
59 Arcanine
60 Poliwag
61 Poliwhirl
62 Poliwrath
63 Abra
64 Kadabra
65 Alakazam
66 Machop
67 Machoke
68 Machamp
69 Bellsprout
70 Weepinbell
71 Victreebel
72 Tentacool
73 Tentacruel
74 Geodude
75 Graveler
76 Golem
77 Ponyta
78 Rapidash
79 Slowpoke
80 Slowbro
81 Magnemite
82 Magneton
83 Farfetch'd
84 Doduo
85 Dodrio
86 Seel
87 Dewgong
88 G

## Legendaries

In [23]:
# Stops the cell when the legendary scraping is disabled in the settings
if not SCRIPE_LEGENDARY_PAGE:
    raise RuntimeError('SCRIPE_LEGENDARY_PAGE not enabled in settings')

legendary_soup = url_to_soup('https://www.serebii.net/pokemon/legendary.shtml')

# The i-th 'trainer' table of the page is stored in the i-th column below
# (assumes the page lists its tables in this order - TODO confirm)
categories = ['sublegendary', 'legendary', 'mythical']
for position, category_table in enumerate(legendary_soup.find_all('table', class_='trainer')):
    column = categories[position]
    df[column] = 0
    # Each nested table is one pokemon; its name is read from the second row
    for card in category_table.find_all('table'):
        legendary_name = card.select_one('tr:nth-of-type(2)').text.strip()
        df.loc[df['name'] == legendary_name, column] = 1

In [24]:
if COMBINE_ALL_LEGENDARIES:
    # Sub-legendaries and mythicals are flagged as 'legendary' too,
    # then their dedicated columns are removed
    merged_categories = ['sublegendary', 'mythical']
    promoted = df[merged_categories].eq(1).any(axis=1)
    df.loc[promoted, 'legendary'] = 1
    df = df.drop(columns=merged_categories)

## Evolutions

Evolutions are a bit finicky.
_Nincada_ in particular is treated manually because it is the only pokemon that can evolve and result in two pokemons (as of gen 9).

In [25]:
# Stops the cell when the evolution scraping is disabled in the settings
if not SCRIPE_EVOLUTIONS:
    raise RuntimeError('SCRIPE_EVOLUTIONS not enabled in settings')

# Nincada is a very special case because it can give you two pokemons
# It is the only pokemon having this behaviour (as of gen 9)
# We treat it manually rather than increase the complexity for a single case
def nincada():
    """
    Writes the evolution line of the Nincada family directly in `df`.

    All three members get 'Nincada' as evo_1; Nincada and Ninjask get
    'Ninjask' as evo_2 while Shedinja gets itself as evo_2.
    """
    second_stage_of = {
        'Nincada': 'Ninjask',
        'Ninjask': 'Ninjask',
        'Shedinja': 'Shedinja',
    }
    for member, second_stage in second_stage_of.items():
        rows = df['name'] == member
        df.loc[rows, 'evo_1'] = 'Nincada'
        df.loc[rows, 'evo_2'] = second_stage

def get_evo_line(soup, evos=None, forms=None, details=None):
    """
    Reads one evolution line and writes it in the evo_1/evo_2/evo_3 columns of `df`.

    The direct children of `soup` are read in order. When the line splits,
    the function calls itself once per branch with what was read so far,
    so each branch is written as its own complete line.

    Args:
        soup: element holding the evolution line (or a branch of it)
        evos (list): names already read before this branch
        forms (list): forms matching `evos` (None when a pokemon has no form)
        details (list): evolution conditions already read; they are collected
            but not written to `df`

    Returns:
        None: the result is written directly in the global `df`
    """
    # Work on copies so that the branches of a split (and the caller)
    # never see each other's additions
    evos = list(evos) if evos is not None else []
    forms = list(forms) if forms is not None else []
    details = list(details) if details is not None else []

    for child in soup.find_all(recursive=False):

        # An element without a class attribute yields None
        css_classes = child.get('class') or []

        # If the lines split, we will call the function again with the information already scraped
        # Returns because that evolution line will be treated in the recursion
        if 'infocard-evo-split' in css_classes:
            for branch in child.find_all('div', class_='infocard-list-evo'):
                get_evo_line(branch, evos, forms, details)
            return

        # Stores the evolution conditions
        if 'infocard-arrow' in css_classes:
            details.append(child.text[1:-1])
            continue

        card_data = child.find('span', class_='infocard-lg-data')
        if card_data is None:
            # Neither an arrow, a split nor a pokemon card: nothing to read
            continue

        name = card_data.a.text

        # Special case, see the nincada function
        if name == 'Nincada':
            nincada()
            return

        evos.append(name)

        # Line can get 3 <small>: id, (form,), types
        # Form is optional and won't be displayed if it isn't relevant
        # We thus need to count the number of <small> elements to see if it's relevant to look for the form
        html_small = card_data.find_all('small')
        forms.append(html_small[1].text if len(html_small) == 3 else None)

    for name, form in zip(evos, forms):
        mask = (df['name'] == name)
        if form is None:
            mask &= (df['form'].isna())
        else:
            mask &= (df['form'] == form)

        # Only the stages that exist are written (at most three columns),
        # so a shorter chain does not fail and a longer one still fills evo_3
        for stage, evo_name in enumerate(evos[:3], start=1):
            df.loc[mask, f'evo_{stage}'] = evo_name



evolution_soup = url_to_soup('https://pokemondb.net/evolution')

# Sets all the evolution data in df, one evolution line at a time
for filter_block in evolution_soup.find_all('div', class_='infocard-filter-block'):
    for evo_line in filter_block.find_all('div', class_='infocard-list-evo', recursive=False):
        get_evo_line(evo_line)


# Single stage pokemons (not listed on the evolution page) are their own first stage
without_line = df['evo_1'].isna()
df.loc[without_line, 'evo_1'] = df['name']

# The stage of a pokemon (1, 2 or 3) is the position of its name in its line
for stage_number in (1, 2, 3):
    df.loc[df['name'] == df[f'evo_{stage_number}'], 'evo_stage'] = stage_number

# A pokemon is the final stage of its line when no stage is recorded after its own
evo_stage = df['evo_stage']
is_final = (
    (evo_stage == 3)
    | ((evo_stage == 2) & df['evo_3'].isna())
    | ((evo_stage == 1) & df['evo_2'].isna())
)
df['evo_is_final'] = 0
df.loc[is_final, 'evo_is_final'] = 1
        

# Drop duplicates

We keep all the pokemons that have types or stats variations, but we drop the different forms that have the exact same numbers (example: Dudunsparce)

In [26]:
# Rows sharing the same name, types and base stats are considered duplicates
duplicate_key = [
    'name',
    'type1',
    'type2',
    'hp',
    'attack',
    'defense',
    'sp_attack',
    'sp_defense',
    'speed',
]
df = df.drop_duplicates(subset=duplicate_key)
df = df.reset_index(drop=True)

# Results

In [27]:
# Displays the final DataFrame as the output of the notebook cell
df

Unnamed: 0,id,name,form,gen,type1,type2,stats,hp,attack,defense,...,male_percent,female_percent,egg_cycles,egg_steps,legendary,evo_1,evo_2,evo_3,evo_stage,evo_is_final
0,1,Bulbasaur,,1,Grass,Poison,318,45,49,49,...,87.5,12.5,20.0,5140.0,0,Bulbasaur,Ivysaur,Venusaur,1.0,0
1,2,Ivysaur,,1,Grass,Poison,405,60,62,63,...,87.5,12.5,20.0,5140.0,0,Bulbasaur,Ivysaur,Venusaur,2.0,0
2,3,Venusaur,,1,Grass,Poison,525,80,82,83,...,87.5,12.5,20.0,5140.0,0,Bulbasaur,Ivysaur,Venusaur,3.0,1
3,4,Charmander,,1,Fire,,309,39,52,43,...,87.5,12.5,20.0,5140.0,0,Charmander,Charmeleon,Charizard,1.0,0
4,5,Charmeleon,,1,Fire,,405,58,64,58,...,87.5,12.5,20.0,5140.0,0,Charmander,Charmeleon,Charizard,2.0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1194,1023,Iron Crown,,9,Steel,Psychic,590,90,72,100,...,,,,,0,Iron Crown,,,1.0,1
1195,1024,Terapagos,Normal Form,9,Normal,,450,90,65,85,...,50.0,50.0,,,1,Terapagos,,,1.0,1
1196,1024,Terapagos,Terastal Form,9,Normal,,600,95,95,110,...,50.0,50.0,,,1,Terapagos,,,1.0,1
1197,1024,Terapagos,Stellar Form,9,Normal,,700,160,105,110,...,50.0,50.0,,,1,Terapagos,,,1.0,1


In [28]:
# Saves the final DataFrame to the file chosen in the settings
df.to_csv(path_or_buf=CSV_FILE_OUTPUT)