# Scraping Data from Pokédex

In [1]:
"https://pokemondb.net/pokedex/national" #main page for Pokémon Pokédex

'https://pokemondb.net/pokedex/national'

In [2]:
from bs4 import BeautifulSoup
import requests

- get links for each pokemon from the main page
- check if pokemon has other forms (deoxys has attack, defense, speed. Some has mega evolution. Some have Galarian forms
- scrape pokedex data, training, breeding, base stats, type defenses, evolution chart

In [3]:
main_page = 'https://pokemondb.net/pokedex/national'
base_page = 'https://pokemondb.net'

In [4]:
main_soup = BeautifulSoup(requests.get(main_page).text, 'lxml')

In [5]:
#print(main_soup.prettify())

## Collecting all href values of each Pokémon

Links for each pokemon's individual stats under class='ent-name'

In [6]:
href_list = []

In [7]:
for link in main_soup.find_all(class_='ent-name',href=True):
    href_list.append(link['href'])

In [8]:
#Checking that matches with total number of pokemon (1010)
len(href_list)

1010

Using the first pokemon page, get all the headings to data that will be collected to for dataframe titles

In [9]:
pokemon_page = base_page + href_list[0]
pokemon_page

'https://pokemondb.net/pokedex/bulbasaur'

In [10]:
pokemon_soup = BeautifulSoup(requests.get(pokemon_page).text, 'lxml')

In [11]:
headings = pokemon_soup.find_all('h2')
headings

[<h2>Pokédex data</h2>,
 <h2>Training</h2>,
 <h2>Breeding</h2>,
 <h2>Base stats</h2>,
 <h2>Type defenses</h2>,
 <h2>Evolution chart</h2>,
 <h2>Bulbasaur changes</h2>,
 <h2>Pokédex entries</h2>,
 <h2>Moves learned by Bulbasaur</h2>,
 <h2>Bulbasaur sprites</h2>,
 <h2>Where to find Bulbasaur</h2>,
 <h2>Answers to Bulbasaur questions</h2>,
 <h2>Other languages</h2>,
 <h2> </h2>,
 <h2><a href="/etymology">Name origin</a></h2>]

In [12]:
heading_titles = [title.text.strip() for title in headings ]

In [13]:
#Only using 'Pokédex data','Training','Breeding','Base stats','Type defenses','Evolution chart'
heading_titles

['Pokédex data',
 'Training',
 'Breeding',
 'Base stats',
 'Type defenses',
 'Evolution chart',
 'Bulbasaur changes',
 'Pokédex entries',
 'Moves learned by Bulbasaur',
 'Bulbasaur sprites',
 'Where to find Bulbasaur',
 'Answers to Bulbasaur questions',
 'Other languages',
 '',
 'Name origin']

In [14]:
heading_titles = heading_titles[:5]
heading_titles

['Pokédex data', 'Training', 'Breeding', 'Base stats', 'Type defenses']

### Sections below will be heading 2s under the titles just collected

In [15]:
subtitles = []
for i, titles in enumerate(heading_titles):
    target_heading = pokemon_soup.find('h2', text=titles)
    next_sibling = target_heading.find_next()
    #next element names need to be table or div

    while next_sibling.name != 'table' and next_sibling.name != 'div':
        next_sibling = next_sibling.find_next()
        
    tags = next_sibling.find_all('th')
    #if it has a title value, use that instead of just the text
    set_text = []
    for th in tags: 
        a_tag = th.find('a', title=True)
        if a_tag:
            set_text.append(a_tag['title'])
        else:
            set_text.append(th.get_text())
    subtitles.append(set_text)

In [16]:
subtitles

[['National №', 'Type', 'Species', 'Height', 'Weight', 'Abilities', 'Local №'],
 ['EV yield', 'Catch rate', 'Base Friendship', 'Base Exp.', 'Growth Rate'],
 ['Egg Groups', 'Gender', 'Egg cycles'],
 ['HP',
  'Attack',
  'Defense',
  'Sp. Atk',
  'Sp. Def',
  'Speed',
  'Total',
  '',
  'Min',
  'Max'],
 ['Normal',
  'Fire',
  'Water',
  'Electric',
  'Grass',
  'Ice',
  'Fighting',
  'Poison',
  'Ground',
  'Flying',
  'Psychic',
  'Bug',
  'Rock',
  'Ghost',
  'Dragon',
  'Dark',
  'Steel',
  'Fairy']]

In [17]:
#removing the min and max at end 
del subtitles[3][7:]
subtitles[3]

['HP', 'Attack', 'Defense', 'Sp. Atk', 'Sp. Def', 'Speed', 'Total']

In [18]:
#adding min and max stats after each stat
base_stats_titles = []
for i, title in enumerate(subtitles[3]):
    base_stats_titles.append(title)
    if(i<len(subtitles[3])-1):
        base_stats_titles.append("Min " + title)
        base_stats_titles.append("Max " + title)

In [19]:
subtitles[3]=base_stats_titles
subtitles

[['National №', 'Type', 'Species', 'Height', 'Weight', 'Abilities', 'Local №'],
 ['EV yield', 'Catch rate', 'Base Friendship', 'Base Exp.', 'Growth Rate'],
 ['Egg Groups', 'Gender', 'Egg cycles'],
 ['HP',
  'Min HP',
  'Max HP',
  'Attack',
  'Min Attack',
  'Max Attack',
  'Defense',
  'Min Defense',
  'Max Defense',
  'Sp. Atk',
  'Min Sp. Atk',
  'Max Sp. Atk',
  'Sp. Def',
  'Min Sp. Def',
  'Max Sp. Def',
  'Speed',
  'Min Speed',
  'Max Speed',
  'Total'],
 ['Normal',
  'Fire',
  'Water',
  'Electric',
  'Grass',
  'Ice',
  'Fighting',
  'Poison',
  'Ground',
  'Flying',
  'Psychic',
  'Bug',
  'Rock',
  'Ghost',
  'Dragon',
  'Dark',
  'Steel',
  'Fairy']]

creating empty dataframe with headings

In [20]:
import pandas as pd

In [21]:
col_names = ['Name', 'Image']
for title in subtitles:
    col_names.extend(title)

In [30]:
df = pd.DataFrame(columns=col_names)
df

Unnamed: 0,Name,Image,National №,Type,Species,Height,Weight,Abilities,Local №,EV yield,...,Ground,Flying,Psychic,Bug,Rock,Ghost,Dragon,Dark,Steel,Fairy


# Create Functions for each data section

In [23]:
#this function will take the soup and returns the names on the tabs for the pokemon
def FindNames(soup):
    names = soup.find(class_='sv-tabs-tab-list').text.strip().split('\n')
    return names

In [24]:
#this function will return the href for the image for the pokemon
def FindImage(soup):
    href = soup.find('img')['src']
    return href

In [25]:
#This function will return the data from the pokedex, training, breeding, and base stats tables
def TableData(table):
    stats = [] 
    for i, data in enumerate(table):
        if '\n\n' not in data.text:
            stats.append(data.text.strip())
    return stats

In [26]:
import re

In [27]:
#This function will return the type defenses for the pokemon
def TypeTable(table):
    def_stats = [] 
    for i, data in enumerate(table):
        text = str(data)
        substring = "fx-"

        # Find the index where the substring occurs
        start_index = text.find(substring)

        if start_index != -1:
            # Extract the characters after the substring that are numbers
            result = re.findall(r'\d+', text[start_index + len(substring):])
            def_stats.append(result[0])
        else:
            print("Substring not found")
    return def_stats

# Scrape Data

- for loop for each pokemon href from main page
- get list of names on tab
- for loop for each tab
    - get data for pokedex
    - get data for training
    - get data for breeding
    - get data for base stats
    - get data for defense types
    - combine data and append on to dataframe

In [31]:
for i,href in enumerate(href_list):
    pokemon_list = []
    #if i >= 26:
        #break
    #Make soup 
    page = base_page + href
    soup = BeautifulSoup(requests.get(page).text, 'lxml')
    
    #Get list of names on page tab
    names = FindNames(soup)
    
    image = soup.find('div', class_='sv-tabs-panel')
    
    for j, name in enumerate(names):
        pokemon = []
        pokemon.append(name)
        
        #Get image url 
        if(j==0):
            pokemon.append(FindImage(image))
        else:
            images = []
            next_sib = image.find_next_siblings('div',limit = len(names))
            images = FindImage(next_sib[j-1])
            pokemon.append(images)
            
        #Get pokedex, training, breeding, and base stats data
        tables = soup.find_all('table', class_='vitals-table')
        for k,table in enumerate(tables):
            if k >3:
                break
            pokemon.extend(TableData(table.find_all('td')))
            
        #Get defense types data
        def_stats = soup.find('div', class_='resp-scroll text-center').find_all('td')
        pokemon.extend(TypeTable(def_stats))
        
        #append on to df
        pokemon_list.append(pokemon)
        df.loc[len(df)] = pokemon

In [46]:
df

Unnamed: 0,Name,Image,National №,Type,Species,Height,Weight,Abilities,Local №,EV yield,...,Ground,Flying,Psychic,Bug,Rock,Ghost,Dragon,Dark,Steel,Fairy
0,Bulbasaur,https://img.pokemondb.net/artwork/bulbasaur.jpg,0001,Grass Poison,Seed Pokémon,0.7 m (2′04″),6.9 kg (15.2 lbs),1. OvergrowChlorophyll (hidden ability),0001 (Red/Blue/Yellow)0226 (Gold/Silver/Crysta...,1 Sp. Atk,...,100,200,200,100,100,100,100,100,100,50
1,Ivysaur,https://img.pokemondb.net/artwork/ivysaur.jpg,0002,Grass Poison,Seed Pokémon,1.0 m (3′03″),13.0 kg (28.7 lbs),1. OvergrowChlorophyll (hidden ability),0002 (Red/Blue/Yellow)0227 (Gold/Silver/Crysta...,"1 Sp. Atk, 1 Sp. Def",...,100,200,200,100,100,100,100,100,100,50
2,Venusaur,https://img.pokemondb.net/artwork/venusaur.jpg,0003,Grass Poison,Seed Pokémon,2.0 m (6′07″),100.0 kg (220.5 lbs),1. OvergrowChlorophyll (hidden ability),0003 (Red/Blue/Yellow)0228 (Gold/Silver/Crysta...,"2 Sp. Atk, 1 Sp. Def",...,100,200,200,100,100,100,100,100,100,50
3,Mega Venusaur,https://img.pokemondb.net/artwork/venusaur-meg...,0003,Grass Poison,Seed Pokémon,2.0 m (6′07″),100.0 kg (220.5 lbs),1. OvergrowChlorophyll (hidden ability),0003 (Red/Blue/Yellow)0228 (Gold/Silver/Crysta...,"2 Sp. Atk, 1 Sp. Def",...,100,200,200,100,100,100,100,100,100,50
4,Charmander,https://img.pokemondb.net/artwork/charmander.jpg,0004,Fire,Lizard Pokémon,0.6 m (2′00″),8.5 kg (18.7 lbs),1. BlazeSolar Power (hidden ability),0004 (Red/Blue/Yellow)0229 (Gold/Silver/Crysta...,1 Speed,...,200,100,100,50,200,100,100,100,50,50
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1189,Iron Valiant,https://img.pokemondb.net/artwork/iron-valiant...,1006,Fairy Fighting,Paradox Pokémon,1.4 m (4′07″),35.0 kg (77.2 lbs),1. Quark Drive,0398 (Scarlet/Violet),3 Attack,...,100,200,200,25,50,100,0,25,200,200
1190,Koraidon,https://img.pokemondb.net/artwork/koraidon.jpg,1007,Fighting Dragon,Paradox Pokémon,2.5 m (8′02″),303.0 kg (668.0 lbs),1. Orichalcum Pulse,0399 (Scarlet/Violet),3 Attack,...,100,200,200,50,50,100,200,50,100,400
1191,Miraidon,https://img.pokemondb.net/artwork/miraidon.jpg,1008,Electric Dragon,Paradox Pokémon,3.5 m (11′06″),240.0 kg (529.1 lbs),1. Hadron Engine,0400 (Scarlet/Violet),3 Sp. Atk,...,200,50,100,100,100,100,200,100,50,200
1192,Walking Wake,https://img.pokemondb.net/artwork/walking-wake...,1009,Water Dragon,Paradox Pokémon,3.5 m (11′06″),280.0 kg (617.3 lbs),1. Protosynthesis,—,—,...,100,100,100,100,100,100,200,100,50,200


# Data Cleaning

In [48]:
df.isna().sum()

Name               0
Image              0
National №         0
Type               0
Species            0
Height             0
Weight             0
Abilities          0
Local №            0
EV yield           0
Catch rate         0
Base Friendship    0
Base Exp.          0
Growth Rate        0
Egg Groups         0
Gender             0
Egg cycles         0
HP                 0
Min HP             0
Max HP             0
Attack             0
Min Attack         0
Max Attack         0
Defense            0
Min Defense        0
Max Defense        0
Sp. Atk            0
Min Sp. Atk        0
Max Sp. Atk        0
Sp. Def            0
Min Sp. Def        0
Max Sp. Def        0
Speed              0
Min Speed          0
Max Speed          0
Total              0
Normal             0
Fire               0
Water              0
Electric           0
Grass              0
Ice                0
Fighting           0
Poison             0
Ground             0
Flying             0
Psychic            0
Bug          

Even though none of the cells have null values, some have '—'

In [49]:
df.columns[df.eq('—').any()]

Index(['Local №', 'EV yield', 'Catch rate', 'Base Friendship', 'Base Exp.',
       'Growth Rate', 'Gender', 'Egg cycles'],
      dtype='object')

In [52]:
pd.set_option('display.max_columns', None)
df[df.eq('—').any(axis=1)]

Unnamed: 0,Name,Image,National №,Type,Species,Height,Weight,Abilities,Local №,EV yield,Catch rate,Base Friendship,Base Exp.,Growth Rate,Egg Groups,Gender,Egg cycles,HP,Min HP,Max HP,Attack,Min Attack,Max Attack,Defense,Min Defense,Max Defense,Sp. Atk,Min Sp. Atk,Max Sp. Atk,Sp. Def,Min Sp. Def,Max Sp. Def,Speed,Min Speed,Max Speed,Total,Normal,Fire,Water,Electric,Grass,Ice,Fighting,Poison,Ground,Flying,Psychic,Bug,Rock,Ghost,Dragon,Dark,Steel,Fairy
1192,Walking Wake,https://img.pokemondb.net/artwork/walking-wake...,1009,Water Dragon,Paradox Pokémon,3.5 m (11′06″),280.0 kg (617.3 lbs),1. Protosynthesis,—,—,—,—,—,—,Undiscovered,—,—,99,308,402,83,153,291,91,168,309,125,229,383,83,153,291,109,200,348,590,100,25,25,100,100,100,100,100,100,100,100,100,100,100,200,100,50,200
1193,Iron Leaves,https://img.pokemondb.net/artwork/iron-leaves.jpg,1010,Grass Psychic,Paradox Pokémon,1.5 m (4′11″),125.0 kg (275.6 lbs),1. Quark Drive,—,—,—,—,—,—,Undiscovered,—,—,90,290,384,130,238,394,88,162,302,70,130,262,108,198,346,104,191,337,590,100,200,50,50,50,200,50,200,50,200,50,400,100,200,100,200,100,100


Will replace with 'n/a' since it's only 2 rows 

# Testing functions

In [78]:
#To split when abilities have hidden 
text = "1. BlazeSolar Power (hidden ability)"

for i in range(len(text) - 1):
    if text[i].islower() and text[i+1].isupper():
        index = i
        break

print("Index:", index)

Index: 7
