# Scraping Data from Pokédex

In [1]:
"https://pokemondb.net/pokedex/national" #main page for Pokémon Pokédex

'https://pokemondb.net/pokedex/national'

In [2]:
from bs4 import BeautifulSoup
import requests

- get links for each pokemon from the main page
- check if a pokemon has other forms (e.g. Deoxys has Attack, Defense and Speed forms; some have Mega Evolutions; some have Galarian forms)
- scrape pokedex data, training, breeding, base stats, type defenses, evolution chart

In [3]:
main_page = 'https://pokemondb.net/pokedex/national'
base_page = 'https://pokemondb.net'

In [4]:
# Download the national Pokédex index page and parse it with the lxml parser.
# A timeout keeps the cell from hanging forever on a stalled connection, and
# raise_for_status() stops us from silently parsing an HTTP error page.
main_response = requests.get(main_page, timeout=30)
main_response.raise_for_status()
main_soup = BeautifulSoup(main_response.text, 'lxml')

In [5]:
print(main_soup.prettify())

<!DOCTYPE html>
<html lang="en">
 <head>
  <meta charset="utf-8"/>
  <title>
   List of Pokémon (sprites gallery) | Pokémon Database
  </title>
  <link href="https://img.pokemondb.net" rel="preconnect"/>
  <link href="https://s.pokemondb.net" rel="preconnect"/>
  <style>
   @font-face{font-family:"Fira Sans";font-style:normal;font-weight:400;font-display:swap;src:url("/static/fonts/fira-sans-v10-latin-400.woff2") format("woff2");unicode-range:U+0000-00FF,U+0131,U+0152-0153,U+02BB-02BC,U+02C6,U+02DA,U+02DC,U+2000-206F,U+2074,U+20AC,U+2122,U+2191,U+2193,U+2212,U+2215,U+FEFF,U+FFFD}@font-face{font-family:"Fira Sans";font-style:italic;font-weight:400;font-display:swap;src:url("/static/fonts/fira-sans-v10-latin-400i.woff2") format("woff2");unicode-range:U+0000-00FF,U+0131,U+0152-0153,U+02BB-02BC,U+02C6,U+02DA,U+02DC,U+2000-206F,U+2074,U+20AC,U+2122,U+2191,U+2193,U+2212,U+2215,U+FEFF,U+FFFD}@font-face{font-family:"Fira Sans";font-style:normal;font-weight:700;font-display:swap;src:url("/stati

## Collecting all href values of each Pokémon

Links to each Pokémon's individual stats page are found in the elements with class='ent-name'.

In [6]:
href_list = []

In [7]:
# Collect the href of every element with class 'ent-name' that carries an href.
# The list is rebuilt in a single expression (instead of appending to an
# existing list) so re-running this cell never duplicates the links.
href_list = [link['href'] for link in main_soup.find_all(class_='ent-name', href=True)]

In [8]:
#Checking that matches with total number of pokemon (1010)
len(href_list)

1010

Using the first Pokémon's page, get all the section headings for the data that will be collected; these will be used as the dataframe column titles.

In [9]:
pokemon_page = base_page + href_list[0]
pokemon_page

'https://pokemondb.net/pokedex/bulbasaur'

In [10]:
# Download and parse the first Pokémon's page.
# The timeout avoids an indefinite hang; raise_for_status() surfaces HTTP
# errors instead of letting an error page be parsed as if it were data.
pokemon_response = requests.get(pokemon_page, timeout=30)
pokemon_response.raise_for_status()
pokemon_soup = BeautifulSoup(pokemon_response.text, 'lxml')

In [11]:
titles = pokemon_soup.find_all('h2')
titles

[<h2>Pokédex data</h2>,
 <h2>Training</h2>,
 <h2>Breeding</h2>,
 <h2>Base stats</h2>,
 <h2>Type defenses</h2>,
 <h2>Evolution chart</h2>,
 <h2>Bulbasaur changes</h2>,
 <h2>Pokédex entries</h2>,
 <h2>Moves learned by Bulbasaur</h2>,
 <h2>Bulbasaur sprites</h2>,
 <h2>Where to find Bulbasaur</h2>,
 <h2>Answers to Bulbasaur questions</h2>,
 <h2>Other languages</h2>,
 <h2> </h2>,
 <h2><a href="/etymology">Name origin</a></h2>]

In [12]:
titles1 = [title.text.strip() for title in titles ]

In [13]:
# Only the first five sections are kept (the next cell slices titles1[:5]):
# 'Pokédex data', 'Training', 'Breeding', 'Base stats', 'Type defenses'.
# 'Evolution chart' appears in this list but is dropped by that slice.
titles1

['Pokédex data',
 'Training',
 'Breeding',
 'Base stats',
 'Type defenses',
 'Evolution chart',
 'Bulbasaur changes',
 'Pokédex entries',
 'Moves learned by Bulbasaur',
 'Bulbasaur sprites',
 'Where to find Bulbasaur',
 'Answers to Bulbasaur questions',
 'Other languages',
 '',
 'Name origin']

In [14]:
titles1 = titles1[:5]
titles1

['Pokédex data', 'Training', 'Breeding', 'Base stats', 'Type defenses']

### The sub-headings collected below are the table header labels found under each of the section titles just collected

In [153]:
# For every kept section title: find its <h2> on the page, move forward to the
# first <table> or <div> that follows it, and collect that element's <th>
# labels. The result is one list of sub-headings per entry of titles1.
titles2 = []
for section_title in titles1:
    # `string=` is the current spelling of the deprecated `text=` argument.
    target_heading = pokemon_soup.find('h2', string=section_title)
    if target_heading is None:
        raise ValueError(f"No <h2> heading found for section {section_title!r}")

    # Passing a list of tag names makes find_next skip straight to the first
    # following <table> or <div>; it returns None if there is none.
    container = target_heading.find_next(['table', 'div'])
    if container is None:
        raise ValueError(f"No <table>/<div> found after section {section_title!r}")

    set_text = []
    for th in container.find_all('th'):
        # If the header cell wraps a link with a title attribute, use that
        # title instead of the cell text (e.g. 'Normal' rather than 'Nor').
        a_tag = th.find('a', title=True)
        set_text.append(a_tag['title'] if a_tag else th.get_text())
    titles2.append(set_text)

In [154]:
titles2

[['National №', 'Type', 'Species', 'Height', 'Weight', 'Abilities', 'Local №'],
 ['EV yield', 'Catch rate', 'Base Friendship', 'Base Exp.', 'Growth Rate'],
 ['Egg Groups', 'Gender', 'Egg cycles'],
 ['HP',
  'Attack',
  'Defense',
  'Sp. Atk',
  'Sp. Def',
  'Speed',
  'Total',
  '',
  'Min',
  'Max'],
 ['Normal',
  'Fire',
  'Water',
  'Electric',
  'Grass',
  'Ice',
  'Fighting',
  'Poison',
  'Ground',
  'Flying',
  'Psychic',
  'Bug',
  'Rock',
  'Ghost',
  'Dragon',
  'Dark',
  'Steel',
  'Fairy']]

In [155]:
# titles2[3] is the 'Base stats' header list. Keep only its first seven labels
# (HP ... Total); this drops the blank header and the trailing 'Min' / 'Max'.
# Note: this mutates titles2 in place.
del titles2[3][7:]
titles2[3]

['HP', 'Attack', 'Defense', 'Sp. Atk', 'Sp. Def', 'Speed', 'Total']

In [157]:
# Follow every stat name with its "Min ..." and "Max ..." variants; the final
# entry of the list is kept on its own, without Min/Max companions.
base_stats = titles2[3]
final_position = len(base_stats) - 1
base_stats_titles = []
for position, stat_name in enumerate(base_stats):
    base_stats_titles.append(stat_name)
    if position != final_position:
        base_stats_titles.extend(["Min " + stat_name, "Max " + stat_name])
base_stats_titles

['HP',
 'Min HP',
 'Max HP',
 'Attack',
 'Min Attack',
 'Max Attack',
 'Defense',
 'Min Defense',
 'Max Defense',
 'Sp. Atk',
 'Min Sp. Atk',
 'Max Sp. Atk',
 'Sp. Def',
 'Min Sp. Def',
 'Max Sp. Def',
 'Speed',
 'Min Speed',
 'Max Speed',
 'Total']

In [158]:
titles2[3]=base_stats_titles
titles2

[['National №', 'Type', 'Species', 'Height', 'Weight', 'Abilities', 'Local №'],
 ['EV yield', 'Catch rate', 'Base Friendship', 'Base Exp.', 'Growth Rate'],
 ['Egg Groups', 'Gender', 'Egg cycles'],
 ['HP',
  'Min HP',
  'Max HP',
  'Attack',
  'Min Attack',
  'Max Attack',
  'Defense',
  'Min Defense',
  'Max Defense',
  'Sp. Atk',
  'Min Sp. Atk',
  'Max Sp. Atk',
  'Sp. Def',
  'Min Sp. Def',
  'Max Sp. Def',
  'Speed',
  'Min Speed',
  'Max Speed',
  'Total'],
 ['Normal',
  'Fire',
  'Water',
  'Electric',
  'Grass',
  'Ice',
  'Fighting',
  'Poison',
  'Ground',
  'Flying',
  'Psychic',
  'Bug',
  'Rock',
  'Ghost',
  'Dragon',
  'Dark',
  'Steel',
  'Fairy']]

In [159]:
# Build (section, sub-heading) tuples so they can later serve as two-level
# dataframe column labels. A section without sub-headings gets one ('section', '') pair.
titles = []
for position, section in enumerate(titles1):
    subtitles = titles2[position]
    if len(subtitles):
        pairs = [(section, subtitle) for subtitle in subtitles]
    else:
        pairs = [(section, '')]
    titles.extend(pairs)
titles

[('Pokédex data', 'National №'),
 ('Pokédex data', 'Type'),
 ('Pokédex data', 'Species'),
 ('Pokédex data', 'Height'),
 ('Pokédex data', 'Weight'),
 ('Pokédex data', 'Abilities'),
 ('Pokédex data', 'Local №'),
 ('Training', 'EV yield'),
 ('Training', 'Catch rate'),
 ('Training', 'Base Friendship'),
 ('Training', 'Base Exp.'),
 ('Training', 'Growth Rate'),
 ('Breeding', 'Egg Groups'),
 ('Breeding', 'Gender'),
 ('Breeding', 'Egg cycles'),
 ('Base stats', 'HP'),
 ('Base stats', 'Min HP'),
 ('Base stats', 'Max HP'),
 ('Base stats', 'Attack'),
 ('Base stats', 'Min Attack'),
 ('Base stats', 'Max Attack'),
 ('Base stats', 'Defense'),
 ('Base stats', 'Min Defense'),
 ('Base stats', 'Max Defense'),
 ('Base stats', 'Sp. Atk'),
 ('Base stats', 'Min Sp. Atk'),
 ('Base stats', 'Max Sp. Atk'),
 ('Base stats', 'Sp. Def'),
 ('Base stats', 'Min Sp. Def'),
 ('Base stats', 'Max Sp. Def'),
 ('Base stats', 'Speed'),
 ('Base stats', 'Min Speed'),
 ('Base stats', 'Max Speed'),
 ('Base stats', 'Total'),
 ('Ty

creating empty dataframe with headings

In [160]:
import pandas as pd

In [161]:
# Extra leading columns that do not come from a section table; their
# second-level (sub-column) label is left empty.
titles1_add =['Name','Image']
titles2_add = ['','']
titles1_final = titles1_add + titles1
# NOTE(review): titles2 holds lists while titles2_add holds strings, so the
# result mixes both kinds of element — confirm this is intended.
titles2_final = titles2_add + titles2
# Pair each extra column with its empty sub-label: [('Name', ''), ('Image', '')].
titles_add = list(zip(titles1_add, titles2_add))
titles_add

[('Name', ''), ('Image', '')]

In [162]:
final_titles = titles_add + titles
final_titles

[('Name', ''),
 ('Image', ''),
 ('Pokédex data', 'National №'),
 ('Pokédex data', 'Type'),
 ('Pokédex data', 'Species'),
 ('Pokédex data', 'Height'),
 ('Pokédex data', 'Weight'),
 ('Pokédex data', 'Abilities'),
 ('Pokédex data', 'Local №'),
 ('Training', 'EV yield'),
 ('Training', 'Catch rate'),
 ('Training', 'Base Friendship'),
 ('Training', 'Base Exp.'),
 ('Training', 'Growth Rate'),
 ('Breeding', 'Egg Groups'),
 ('Breeding', 'Gender'),
 ('Breeding', 'Egg cycles'),
 ('Base stats', 'HP'),
 ('Base stats', 'Min HP'),
 ('Base stats', 'Max HP'),
 ('Base stats', 'Attack'),
 ('Base stats', 'Min Attack'),
 ('Base stats', 'Max Attack'),
 ('Base stats', 'Defense'),
 ('Base stats', 'Min Defense'),
 ('Base stats', 'Max Defense'),
 ('Base stats', 'Sp. Atk'),
 ('Base stats', 'Min Sp. Atk'),
 ('Base stats', 'Max Sp. Atk'),
 ('Base stats', 'Sp. Def'),
 ('Base stats', 'Min Sp. Def'),
 ('Base stats', 'Max Sp. Def'),
 ('Base stats', 'Speed'),
 ('Base stats', 'Min Speed'),
 ('Base stats', 'Max Speed'),


In [163]:
df = pd.DataFrame(columns = pd.MultiIndex.from_tuples(final_titles))
pd.set_option('display.max_columns', None)
df

Unnamed: 0_level_0,Name,Image,Pokédex data,Pokédex data,Pokédex data,Pokédex data,Pokédex data,Pokédex data,Pokédex data,Training,Training,Training,Training,Training,Breeding,Breeding,Breeding,Base stats,Base stats,Base stats,Base stats,Base stats,Base stats,Base stats,Base stats,Base stats,Base stats,Base stats,Base stats,Base stats,Base stats,Base stats,Base stats,Base stats,Base stats,Base stats,Type defenses,Type defenses,Type defenses,Type defenses,Type defenses,Type defenses,Type defenses,Type defenses,Type defenses,Type defenses,Type defenses,Type defenses,Type defenses,Type defenses,Type defenses,Type defenses,Type defenses,Type defenses
Unnamed: 0_level_1,Unnamed: 1_level_1,Unnamed: 2_level_1,National №,Type,Species,Height,Weight,Abilities,Local №,EV yield,Catch rate,Base Friendship,Base Exp.,Growth Rate,Egg Groups,Gender,Egg cycles,HP,Min HP,Max HP,Attack,Min Attack,Max Attack,Defense,Min Defense,Max Defense,Sp. Atk,Min Sp. Atk,Max Sp. Atk,Sp. Def,Min Sp. Def,Max Sp. Def,Speed,Min Speed,Max Speed,Total,Normal,Fire,Water,Electric,Grass,Ice,Fighting,Poison,Ground,Flying,Psychic,Bug,Rock,Ghost,Dragon,Dark,Steel,Fairy


In [169]:
# Flat (single-level) column names: the two extra columns followed by every
# sub-heading of every section, in order.
col_names = ['Name', 'Image'] + [subtitle for group in titles2 for subtitle in group]

In [175]:
df2 = pd.DataFrame(columns=col_names)
df2

Unnamed: 0,Name,Image,National №,Type,Species,Height,Weight,Abilities,Local №,EV yield,Catch rate,Base Friendship,Base Exp.,Growth Rate,Egg Groups,Gender,Egg cycles,HP,Min HP,Max HP,Attack,Min Attack,Max Attack,Defense,Min Defense,Max Defense,Sp. Atk,Min Sp. Atk,Max Sp. Atk,Sp. Def,Min Sp. Def,Max Sp. Def,Speed,Min Speed,Max Speed,Total,Normal,Fire,Water,Electric,Grass,Ice,Fighting,Poison,Ground,Flying,Psychic,Bug,Rock,Ghost,Dragon,Dark,Steel,Fairy


# Create Functions for each data section

In [80]:
def FindNames(soup):
    """Return the form names shown on a Pokémon page's tab list.

    Looks up the first element with class ``sv-tabs-tab-list`` and splits its
    text on newlines, one name per line.

    Args:
        soup: Parsed page (or any object exposing a compatible ``find``).

    Returns:
        list[str]: The tab names in page order, each stripped of surrounding
        whitespace and with blank lines removed. An empty list is returned
        when the page has no tab list, instead of raising ``AttributeError``.
    """
    tab_list = soup.find(class_='sv-tabs-tab-list')
    if tab_list is None:
        return []
    lines = tab_list.text.strip().split('\n')
    # Drop blank lines and stray whitespace so callers only get real names.
    return [line.strip() for line in lines if line.strip()]

In [81]:
def FindImage(soup):
    """Return the ``href`` attribute of the first ``<a>`` element in ``soup``.

    In this notebook it is called with a tab panel, where that first link
    is the Pokémon's artwork image.
    """
    first_anchor = soup.find('a')
    return first_anchor['href']

In [72]:
def TableData(table):
    """Return the stripped text of each cell in ``table``.

    Cells whose text contains a blank line (two consecutive newlines) are
    skipped; every other cell contributes its text with surrounding
    whitespace removed, in the original order.
    """
    return [cell.text.strip() for cell in table if '\n\n' not in cell.text]

In [133]:
import re

In [137]:
def TypeTable(table):
    """Extract the effectiveness number from each type-defense cell.

    Each cell is converted to its string (markup) form and searched for the
    first ``fx-`` that is immediately followed by digits, e.g. the ``100`` in
    ``class="type-fx-cell type-fx-100"``.

    Args:
        table: Iterable of cells; anything whose ``str()`` is the cell markup.

    Returns:
        list[str]: The digit strings found, in cell order. A cell without an
        ``fx-<digits>`` marker prints "Substring not found" and is skipped,
        so the result can be shorter than ``table``.
    """
    def_stats = []
    for cell in table:
        # Anchoring the digits directly to "fx-" avoids both an IndexError when
        # no digits follow and picking up unrelated digits later in the markup.
        match = re.search(r'fx-(\d+)', str(cell))
        if match:
            def_stats.append(match.group(1))
        else:
            print("Substring not found")
    return def_stats

In [142]:
# Collect every <td> cell inside the 'resp-scroll text-center' div of the tab panel.
# NOTE(review): `temp` is only assigned in a later cell of this notebook (the
# "Scrape Data" section), so this cell fails on a fresh top-to-bottom run.
def_table = temp.find('div', class_='resp-scroll text-center').find_all('td')
def_table
#def_list = def_table.find_all('td')

[<td class="type-fx-cell type-fx-100" title="Normal → Fire/Flying = normal effectiveness"></td>,
 <td class="type-fx-cell type-fx-50" title="Fire → Fire/Flying = not very effective">½</td>,
 <td class="type-fx-cell type-fx-200" title="Water → Fire/Flying = super-effective">2</td>,
 <td class="type-fx-cell type-fx-200" title="Electric → Fire/Flying = super-effective">2</td>,
 <td class="type-fx-cell type-fx-25" title="Grass → Fire/Flying = not very effective">¼</td>,
 <td class="type-fx-cell type-fx-100" title="Ice → Fire/Flying = normal effectiveness"></td>,
 <td class="type-fx-cell type-fx-50" title="Fighting → Fire/Flying = not very effective">½</td>,
 <td class="type-fx-cell type-fx-100" title="Poison → Fire/Flying = normal effectiveness"></td>,
 <td class="type-fx-cell type-fx-0" title="Ground → Fire/Flying = no effect">0</td>,
 <td class="type-fx-cell type-fx-100" title="Flying → Fire/Flying = normal effectiveness"></td>,
 <td class="type-fx-cell type-fx-100" title="Psychic → Fire

In [141]:
# Parse the effectiveness numbers out of the <td> cells held in def_table.
# (The previous argument, def_list, is not defined anywhere in the notebook.)
test_stats = TypeTable(def_table)
test_stats

['100',
 '50',
 '200',
 '200',
 '25',
 '100',
 '50',
 '100',
 '0',
 '100',
 '100',
 '25',
 '400',
 '100',
 '100',
 '100',
 '50',
 '50']

In [73]:
# Extract the cell text of the first four vitals tables, one list per table.
# NOTE(review): `tables1` is only assigned in a later cell of this notebook (the
# "Scrape Data" section), so this cell fails on a fresh top-to-bottom run.
pokedex = TableData(tables1[0].find_all('td'))
training = TableData(tables1[1].find_all('td'))
breeding = TableData(tables1[2].find_all('td'))
base = TableData(tables1[3].find_all('td'))

In [145]:
# Assemble one flat row for Charizard: first tab name, image link, the cell
# text of the first four vitals tables, then the type-defense numbers.
charizard = [FindNames(test_soup)[0], FindImage(temp)]
for table_position in range(4):
    vitals_cells = tables1[table_position].find_all('td')
    charizard.extend(TableData(vitals_cells))
def_stats = temp.find('div', class_='resp-scroll text-center').find_all('td')
charizard.extend(TypeTable(def_stats))
charizard

['Charizard',
 'https://img.pokemondb.net/artwork/large/charizard.jpg',
 '0006',
 'Fire Flying',
 'Flame Pokémon',
 '1.7\xa0m (5′07″)',
 '90.5\xa0kg (199.5\xa0lbs)',
 '1. BlazeSolar Power (hidden ability)',
 "0006 (Red/Blue/Yellow)0231 (Gold/Silver/Crystal)0006 (FireRed/LeafGreen)0236 (HeartGold/SoulSilver)0085 (X/Y — Central Kalos)0006 (Let's Go Pikachu/Let's Go Eevee)0380 (Sword/Shield)",
 '3 Sp. Atk',
 '45 (5.9% with PokéBall, full HP)',
 '50 (normal)',
 '267',
 'Medium Slow',
 'Dragon, Monster',
 '87.5% male, 12.5% female',
 '20 (4,884–5,140 steps)',
 '78',
 '266',
 '360',
 '84',
 '155',
 '293',
 '78',
 '144',
 '280',
 '109',
 '200',
 '348',
 '85',
 '157',
 '295',
 '100',
 '184',
 '328',
 '534',
 '100',
 '50',
 '200',
 '200',
 '25',
 '100',
 '50',
 '100',
 '0',
 '100',
 '100',
 '25',
 '400',
 '100',
 '100',
 '100',
 '50',
 '50']

In [146]:
len(charizard)

54

In [179]:
df2 = pd.DataFrame(columns=col_names)

In [186]:
df2.loc[0] = charizard
df2

Unnamed: 0,Name,Image,National №,Type,Species,Height,Weight,Abilities,Local №,EV yield,Catch rate,Base Friendship,Base Exp.,Growth Rate,Egg Groups,Gender,Egg cycles,HP,Min HP,Max HP,Attack,Min Attack,Max Attack,Defense,Min Defense,Max Defense,Sp. Atk,Min Sp. Atk,Max Sp. Atk,Sp. Def,Min Sp. Def,Max Sp. Def,Speed,Min Speed,Max Speed,Total,Normal,Fire,Water,Electric,Grass,Ice,Fighting,Poison,Ground,Flying,Psychic,Bug,Rock,Ghost,Dragon,Dark,Steel,Fairy
0,Charizard,https://img.pokemondb.net/artwork/large/chariz...,6,Fire Flying,Flame Pokémon,1.7 m (5′07″),90.5 kg (199.5 lbs),1. BlazeSolar Power (hidden ability),0006 (Red/Blue/Yellow)0231 (Gold/Silver/Crysta...,3 Sp. Atk,"45 (5.9% with PokéBall, full HP)",50 (normal),267,Medium Slow,"Dragon, Monster","87.5% male, 12.5% female","20 (4,884–5,140 steps)",78,266,360,84,155,293,78,144,280,109,200,348,85,157,295,100,184,328,534,100,50,200,200,25,100,50,100,0,100,100,25,400,100,100,100,50,50


# Scrape Data

In [129]:
#for loop for each pokemon href from main page
#get list of names on tab
#for loop for each tab
    #get data for pokedex
    #get data for training
    #get data for breeding
    #get data for base stats
    #get data for defense types
    #combine data and append on to dataframe

In [28]:
# Download and parse a page that has several forms (tabs) to test against.
# The timeout avoids an indefinite hang; raise_for_status() surfaces HTTP
# errors instead of letting an error page be parsed as if it were data.
test_page = 'https://pokemondb.net/pokedex/charizard'
test_response = requests.get(test_page, timeout=30)
test_response.raise_for_status()
test_soup = BeautifulSoup(test_response.text, 'lxml')

In [29]:
names = test_soup.find(class_='sv-tabs-tab-list').text.strip().split('\n')
size = len(names)
names

['Charizard', 'Mega Charizard X', 'Mega Charizard Y']

In [30]:
temp = test_soup.find('div', class_='sv-tabs-panel active')

In [31]:
temp.find('a')['href']

'https://img.pokemondb.net/artwork/large/charizard.jpg'

In [32]:
sibs = temp.find_next_siblings('div',limit = 2)

In [33]:
sibs[0].find('a')['href']

'https://img.pokemondb.net/artwork/large/charizard-mega-x.jpg'

In [34]:
#Finds data for pokedex data, training, breeding, and base stats
tables1 = temp.find_all('table', class_='vitals-table')

Pokedex

In [35]:
# Print the raw text of every cell in the first vitals table and keep a copy of each.
pokedex_table = tables1[0].find_all('td')
pokedex_data = []
for cell in pokedex_table:
    cell_text = cell.text
    print('data:' + '\t' + cell_text)
    pokedex_data.append(cell_text)

data:	0006
data:	
Fire Flying 
data:	Flame Pokémon
data:	1.7 m (5′07″)
data:	90.5 kg (199.5 lbs)
data:	1. BlazeSolar Power (hidden ability)
data:	0006 (Red/Blue/Yellow)0231 (Gold/Silver/Crystal)0006 (FireRed/LeafGreen)0236 (HeartGold/SoulSilver)0085 (X/Y — Central Kalos)0006 (Let's Go Pikachu/Let's Go Eevee)0380 (Sword/Shield)


Training Table

In [36]:
# Print the raw text of every cell in the second vitals table.
training_table = tables1[1].find_all('td')

for cell in training_table:
    print('data:' + '\t' + cell.text)

data:	
3 Sp. Atk 
data:	
45 (5.9% with PokéBall, full HP)

data:	
50 (normal)

data:	267
data:	Medium Slow


Breeding Table

In [37]:
# Print the raw text of every cell in the third vitals table.
breeding_table = tables1[2].find_all('td')

for cell in breeding_table:
    print('data:' + '\t' + cell.text)

data:	Dragon, Monster
data:	87.5% male, 12.5% female
data:	20 (4,884–5,140 steps)



Base Stats

In [45]:
# From the fourth vitals table keep only the cells whose text has no newline.
base_stats_table = tables1[3].find_all('td')
temp_list = [cell.text for cell in base_stats_table if '\n' not in cell.text]
temp_list

['78',
 '266',
 '360',
 '84',
 '155',
 '293',
 '78',
 '144',
 '280',
 '109',
 '200',
 '348',
 '85',
 '157',
 '295',
 '100',
 '184',
 '328',
 '534']

Type Defenses

In [112]:
def_table = temp.find('div', class_='resp-scroll text-center')
def_table

<div class="resp-scroll text-center"><table class="type-table type-table-pokedex"><tr><th><a class="type-icon type-normal type-cell type-abbr" href="/type/normal" title="Normal">Nor</a></th><th><a class="type-icon type-fire type-cell type-abbr" href="/type/fire" title="Fire">Fir</a></th><th><a class="type-icon type-water type-cell type-abbr" href="/type/water" title="Water">Wat</a></th><th><a class="type-icon type-electric type-cell type-abbr" href="/type/electric" title="Electric">Ele</a></th><th><a class="type-icon type-grass type-cell type-abbr" href="/type/grass" title="Grass">Gra</a></th><th><a class="type-icon type-ice type-cell type-abbr" href="/type/ice" title="Ice">Ice</a></th><th><a class="type-icon type-fighting type-cell type-abbr" href="/type/fighting" title="Fighting">Fig</a></th><th><a class="type-icon type-poison type-cell type-abbr" href="/type/poison" title="Poison">Poi</a></th><th><a class="type-icon type-ground type-cell type-abbr" href="/type/ground" title="Ground"

In [103]:
# def_table is a single <div> element here (the result of find), so search it
# directly; indexing it with [0] would be treated as an attribute lookup.
def_table.find_all('td')[0]

<td class="type-fx-cell type-fx-100" title="Normal → Fire/Flying = normal effectiveness"></td>

# Testing stuff

In [78]:
# Two abilities are run together without a separator when one is hidden.
# Find the position of the first lowercase letter that is immediately followed
# by an uppercase letter; that is where the text should be split.
text = "1. BlazeSolar Power (hidden ability)"

# next(..., None) leaves index as None when there is no such boundary, rather
# than leaving it unbound (or holding a stale value from an earlier run).
index = next(
    (i for i in range(len(text) - 1) if text[i].islower() and text[i + 1].isupper()),
    None,
)

print("Index:", index)

Index: 7
