# Scraping Data from Pokédex

In [32]:
"https://pokemondb.net/pokedex/national" #main page for Pokémon Pokédex

'https://pokemondb.net/pokedex/national'

In [33]:
from bs4 import BeautifulSoup
import requests

- get links for each pokemon from the main page
- check if pokemon has other forms (deoxys has attack, defense, speed. Some has mega evolution. Some have Galarian forms
- scrape pokedex data, training, breeding, base stats, type defenses, evolution chart

In [34]:
main_page = 'https://pokemondb.net/pokedex/national'
base_page = 'https://pokemondb.net'

In [35]:
main_soup = BeautifulSoup(requests.get(main_page).text, 'lxml')

In [36]:
#print(main_soup.prettify())

## Collecting all href values of each Pokémon

Links for each pokemon's individual stats under class='ent-name'

In [37]:
href_list = []

In [38]:
for link in main_soup.find_all(class_='ent-name',href=True):
    href_list.append(link['href'])

In [39]:
#Checking that matches with total number of pokemon (1010)
len(href_list)

1010

Using the first pokemon page, get all the headings to data that will be collected to for dataframe titles

In [40]:
pokemon_page = base_page + href_list[0]
pokemon_page

'https://pokemondb.net/pokedex/bulbasaur'

In [41]:
pokemon_soup = BeautifulSoup(requests.get(pokemon_page).text, 'lxml')

In [42]:
headings = pokemon_soup.find_all('h2')
headings

[<h2>Pokédex data</h2>,
 <h2>Training</h2>,
 <h2>Breeding</h2>,
 <h2>Base stats</h2>,
 <h2>Type defenses</h2>,
 <h2>Evolution chart</h2>,
 <h2>Bulbasaur changes</h2>,
 <h2>Pokédex entries</h2>,
 <h2>Moves learned by Bulbasaur</h2>,
 <h2>Bulbasaur sprites</h2>,
 <h2>Where to find Bulbasaur</h2>,
 <h2>Answers to Bulbasaur questions</h2>,
 <h2>Other languages</h2>,
 <h2> </h2>,
 <h2><a href="/etymology">Name origin</a></h2>]

In [43]:
heading_titles = [title.text.strip() for title in headings ]

In [44]:
#Only using 'Pokédex data','Training','Breeding','Base stats','Type defenses','Evolution chart'
heading_titles

['Pokédex data',
 'Training',
 'Breeding',
 'Base stats',
 'Type defenses',
 'Evolution chart',
 'Bulbasaur changes',
 'Pokédex entries',
 'Moves learned by Bulbasaur',
 'Bulbasaur sprites',
 'Where to find Bulbasaur',
 'Answers to Bulbasaur questions',
 'Other languages',
 '',
 'Name origin']

In [45]:
heading_titles = heading_titles[:5]
heading_titles

['Pokédex data', 'Training', 'Breeding', 'Base stats', 'Type defenses']

### Sections below will be heading 2s under the titles just collected

In [46]:
subtitles = []
for i, titles in enumerate(heading_titles):
    target_heading = pokemon_soup.find('h2', text=titles)
    next_sibling = target_heading.find_next()
    #next element names need to be table or div

    while next_sibling.name != 'table' and next_sibling.name != 'div':
        next_sibling = next_sibling.find_next()
        
    tags = next_sibling.find_all('th')
    #if it has a title value, use that instead of just the text
    set_text = []
    for th in tags: 
        a_tag = th.find('a', title=True)
        if a_tag:
            set_text.append(a_tag['title'])
        else:
            set_text.append(th.get_text())
    subtitles.append(set_text)

In [47]:
subtitles

[['National №', 'Type', 'Species', 'Height', 'Weight', 'Abilities', 'Local №'],
 ['EV yield', 'Catch rate', 'Base Friendship', 'Base Exp.', 'Growth Rate'],
 ['Egg Groups', 'Gender', 'Egg cycles'],
 ['HP',
  'Attack',
  'Defense',
  'Sp. Atk',
  'Sp. Def',
  'Speed',
  'Total',
  '',
  'Min',
  'Max'],
 ['Normal',
  'Fire',
  'Water',
  'Electric',
  'Grass',
  'Ice',
  'Fighting',
  'Poison',
  'Ground',
  'Flying',
  'Psychic',
  'Bug',
  'Rock',
  'Ghost',
  'Dragon',
  'Dark',
  'Steel',
  'Fairy']]

In [48]:
#removing the min and max at end 
del subtitles[3][7:]
subtitles[3]

['HP', 'Attack', 'Defense', 'Sp. Atk', 'Sp. Def', 'Speed', 'Total']

In [49]:
#adding min and max stats after each stat
base_stats_titles = []
for i, title in enumerate(subtitles[3]):
    base_stats_titles.append(title)
    if(i<len(subtitles[3])-1):
        base_stats_titles.append("Min " + title)
        base_stats_titles.append("Max " + title)

In [50]:
subtitles[3]=base_stats_titles
subtitles

[['National №', 'Type', 'Species', 'Height', 'Weight', 'Abilities', 'Local №'],
 ['EV yield', 'Catch rate', 'Base Friendship', 'Base Exp.', 'Growth Rate'],
 ['Egg Groups', 'Gender', 'Egg cycles'],
 ['HP',
  'Min HP',
  'Max HP',
  'Attack',
  'Min Attack',
  'Max Attack',
  'Defense',
  'Min Defense',
  'Max Defense',
  'Sp. Atk',
  'Min Sp. Atk',
  'Max Sp. Atk',
  'Sp. Def',
  'Min Sp. Def',
  'Max Sp. Def',
  'Speed',
  'Min Speed',
  'Max Speed',
  'Total'],
 ['Normal',
  'Fire',
  'Water',
  'Electric',
  'Grass',
  'Ice',
  'Fighting',
  'Poison',
  'Ground',
  'Flying',
  'Psychic',
  'Bug',
  'Rock',
  'Ghost',
  'Dragon',
  'Dark',
  'Steel',
  'Fairy']]

In [51]:
#Replacing titles with № to Number since it will be hard to use
final_subtitles = subtitles
final_subtitles[0][0] = 'National Number'
final_subtitles[0][6] = 'Local Number'
final_subtitles

[['National Number',
  'Type',
  'Species',
  'Height',
  'Weight',
  'Abilities',
  'Local Number'],
 ['EV yield', 'Catch rate', 'Base Friendship', 'Base Exp.', 'Growth Rate'],
 ['Egg Groups', 'Gender', 'Egg cycles'],
 ['HP',
  'Min HP',
  'Max HP',
  'Attack',
  'Min Attack',
  'Max Attack',
  'Defense',
  'Min Defense',
  'Max Defense',
  'Sp. Atk',
  'Min Sp. Atk',
  'Max Sp. Atk',
  'Sp. Def',
  'Min Sp. Def',
  'Max Sp. Def',
  'Speed',
  'Min Speed',
  'Max Speed',
  'Total'],
 ['Normal',
  'Fire',
  'Water',
  'Electric',
  'Grass',
  'Ice',
  'Fighting',
  'Poison',
  'Ground',
  'Flying',
  'Psychic',
  'Bug',
  'Rock',
  'Ghost',
  'Dragon',
  'Dark',
  'Steel',
  'Fairy']]

creating empty dataframe with headings

In [52]:
import pandas as pd

In [53]:
col_names = ['Name', 'Image', 'Generation']
for title in final_subtitles:
    col_names.extend(title)

In [54]:
df = pd.DataFrame(columns=col_names)
df

Unnamed: 0,Name,Image,Generation,National Number,Type,Species,Height,Weight,Abilities,Local Number,...,Ground,Flying,Psychic,Bug,Rock,Ghost,Dragon,Dark,Steel,Fairy


# Create Functions for each data section

In [55]:
#this function will take the soup and returns the names on the tabs for the pokemon
def FindNames(soup):
    names = soup.find(class_='sv-tabs-tab-list').text.strip().split('\n')
    return names

In [56]:
#this function will return the href for the image for the pokemon
def FindImage(soup):
    href = soup.find('img')['src']
    return href

In [57]:
#this function will get the generation of the pokemon 
def FindGen(soup):
    gen = soup.find('abbr').text
    return gen

In [58]:
#This function will return the data from the pokedex, training, breeding, and base stats tables
def TableData(table):
    stats = [] 
    for i, data in enumerate(table):
        if '\n\n' not in data.text:
            stats.append(data.text.strip())
    return stats

In [59]:
import re

In [60]:
#This function will return the type defenses for the pokemon
def TypeTable(table):
    def_stats = [] 
    for i, data in enumerate(table):
        text = str(data)
        substring = "fx-"

        # Find the index where the substring occurs
        start_index = text.find(substring)

        if start_index != -1:
            # Extract the characters after the substring that are numbers
            result = re.findall(r'\d+', text[start_index + len(substring):])
            def_stats.append(result[0])
        else:
            print("Substring not found")
    return def_stats

# Scrape Data

- for loop for each pokemon href from main page
- get list of names on tab
- for loop for each tab
    - get data for pokedex
    - get data for training
    - get data for breeding
    - get data for base stats
    - get data for defense types
    - combine data and append on to dataframe

In [61]:
%%time
for i,href in enumerate(href_list):

    #Make soup 
    page = base_page + href
    soup = BeautifulSoup(requests.get(page).text, 'lxml')
    
    #Get list of names on page tab
    names = FindNames(soup)
    
    image = soup.find('div', class_='sv-tabs-panel')
    
    for j, name in enumerate(names):
        pokemon = []
        pokemon.append(name)
        
        #Get image url 
        if(j==0):
            pokemon.append(FindImage(image))
        else:
            images = []
            next_sib = image.find_next_siblings('div',limit = len(names))
            images = FindImage(next_sib[j-1])
            pokemon.append(images)
        
        #Get generation of pokemon
        generation = FindGen(soup)
        pokemon.append(generation)
        
        #Get pokedex, training, breeding, and base stats data
        tables = soup.find_all('table', class_='vitals-table')
        for k,table in enumerate(tables):
            if k >3:
                break
            pokemon.extend(TableData(table.find_all('td')))
            
        #Get defense types data
        def_stats = soup.find('div', class_='resp-scroll text-center').find_all('td')
        pokemon.extend(TypeTable(def_stats))
        
        #append on to df
        df.loc[len(df)] = pokemon

Wall time: 1min 55s


In [62]:
df

Unnamed: 0,Name,Image,Generation,National Number,Type,Species,Height,Weight,Abilities,Local Number,...,Ground,Flying,Psychic,Bug,Rock,Ghost,Dragon,Dark,Steel,Fairy
0,Bulbasaur,https://img.pokemondb.net/artwork/bulbasaur.jpg,Generation 1,0001,Grass Poison,Seed Pokémon,0.7 m (2′04″),6.9 kg (15.2 lbs),1. OvergrowChlorophyll (hidden ability),0001 (Red/Blue/Yellow)0226 (Gold/Silver/Crysta...,...,100,200,200,100,100,100,100,100,100,50
1,Ivysaur,https://img.pokemondb.net/artwork/ivysaur.jpg,Generation 1,0002,Grass Poison,Seed Pokémon,1.0 m (3′03″),13.0 kg (28.7 lbs),1. OvergrowChlorophyll (hidden ability),0002 (Red/Blue/Yellow)0227 (Gold/Silver/Crysta...,...,100,200,200,100,100,100,100,100,100,50
2,Venusaur,https://img.pokemondb.net/artwork/venusaur.jpg,Generation 1,0003,Grass Poison,Seed Pokémon,2.0 m (6′07″),100.0 kg (220.5 lbs),1. OvergrowChlorophyll (hidden ability),0003 (Red/Blue/Yellow)0228 (Gold/Silver/Crysta...,...,100,200,200,100,100,100,100,100,100,50
3,Mega Venusaur,https://img.pokemondb.net/artwork/venusaur-meg...,Generation 1,0003,Grass Poison,Seed Pokémon,2.0 m (6′07″),100.0 kg (220.5 lbs),1. OvergrowChlorophyll (hidden ability),0003 (Red/Blue/Yellow)0228 (Gold/Silver/Crysta...,...,100,200,200,100,100,100,100,100,100,50
4,Charmander,https://img.pokemondb.net/artwork/charmander.jpg,Generation 1,0004,Fire,Lizard Pokémon,0.6 m (2′00″),8.5 kg (18.7 lbs),1. BlazeSolar Power (hidden ability),0004 (Red/Blue/Yellow)0229 (Gold/Silver/Crysta...,...,200,100,100,50,200,100,100,100,50,50
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1189,Iron Valiant,https://img.pokemondb.net/artwork/iron-valiant...,Generation 9,1006,Fairy Fighting,Paradox Pokémon,1.4 m (4′07″),35.0 kg (77.2 lbs),1. Quark Drive,0398 (Scarlet/Violet),...,100,200,200,25,50,100,0,25,200,200
1190,Koraidon,https://img.pokemondb.net/artwork/koraidon.jpg,Generation 9,1007,Fighting Dragon,Paradox Pokémon,2.5 m (8′02″),303.0 kg (668.0 lbs),1. Orichalcum Pulse,0399 (Scarlet/Violet),...,100,200,200,50,50,100,200,50,100,400
1191,Miraidon,https://img.pokemondb.net/artwork/miraidon.jpg,Generation 9,1008,Electric Dragon,Paradox Pokémon,3.5 m (11′06″),240.0 kg (529.1 lbs),1. Hadron Engine,0400 (Scarlet/Violet),...,200,50,100,100,100,100,200,100,50,200
1192,Walking Wake,https://img.pokemondb.net/artwork/walking-wake...,Generation 9,1009,Water Dragon,Paradox Pokémon,3.5 m (11′06″),280.0 kg (617.3 lbs),1. Protosynthesis,—,...,100,100,100,100,100,100,200,100,50,200


# Export Uncleaned Data

In [36]:
df.to_csv('Datasets/Uncleaned_Pokemon_Stats.csv', index=False, encoding='utf-8-sig')

# Scrape Legendary and Mythical List

In [71]:
page = 'https://www.serebii.net/pokemon/legendary.shtml#legend'

In [72]:
soup = BeautifulSoup(requests.get(page).text, 'lxml')

In [109]:
tables = soup.find_all('table', class_='trainer')

In [137]:
legendary=tables[1].find_all('td', valign='top')

In [145]:
legendary[0].find_all('a')[1].text

'Mewtwo'

In [150]:
legendary_list = []
for i in range(0, len(legendary)):
    pokemon_info = legendary[i].find_all('a')
    name = pokemon_info[1].text
    legendary_list.append(name)
legendary_list

['Mewtwo',
 'Lugia',
 'Ho-Oh',
 'Kyogre',
 'Groudon',
 'Rayquaza',
 'Dialga',
 'Palkia',
 'Giratina',
 'Reshiram',
 'Zekrom',
 'Kyurem',
 'Xerneas',
 'Yveltal',
 'Zygarde',
 'Cosmog',
 'Cosmoem',
 'Solgaleo',
 'Lunala',
 'Necrozma',
 'Zacian',
 'Zamazenta',
 'Eternatus',
 'Calyrex',
 'Koraidon',
 'Miraidon']

In [151]:
mythical = tables[2].find_all('td', valign='top')

In [152]:
mythical_list = []
for i in range(0, len(mythical)):
    pokemon_info = mythical[i].find_all('a')
    name = pokemon_info[1].text
    mythical_list.append(name)
mythical_list

['Mew',
 'Celebi',
 'Jirachi',
 'Deoxys',
 'Phione',
 'Manaphy',
 'Darkrai',
 'Shaymin',
 'Arceus',
 'Victini',
 'Keldeo',
 'Meloetta',
 'Genesect',
 'Diancie',
 'Hoopa',
 'Volcanion',
 'Magearna',
 'Marshadow',
 'Zeraora',
 'Meltan',
 'Melmetal',
 'Zarude']

# Create Dataframe

In [154]:
combined_list = [(legendary, 'Legendary') for legendary in legendary_list] + [(mythical, 'Mythical') for mythical in mythical_list]
combined_list

[('Mewtwo', 'Legendary'),
 ('Lugia', 'Legendary'),
 ('Ho-Oh', 'Legendary'),
 ('Kyogre', 'Legendary'),
 ('Groudon', 'Legendary'),
 ('Rayquaza', 'Legendary'),
 ('Dialga', 'Legendary'),
 ('Palkia', 'Legendary'),
 ('Giratina', 'Legendary'),
 ('Reshiram', 'Legendary'),
 ('Zekrom', 'Legendary'),
 ('Kyurem', 'Legendary'),
 ('Xerneas', 'Legendary'),
 ('Yveltal', 'Legendary'),
 ('Zygarde', 'Legendary'),
 ('Cosmog', 'Legendary'),
 ('Cosmoem', 'Legendary'),
 ('Solgaleo', 'Legendary'),
 ('Lunala', 'Legendary'),
 ('Necrozma', 'Legendary'),
 ('Zacian', 'Legendary'),
 ('Zamazenta', 'Legendary'),
 ('Eternatus', 'Legendary'),
 ('Calyrex', 'Legendary'),
 ('Koraidon', 'Legendary'),
 ('Miraidon', 'Legendary'),
 ('Mew', 'Mythical'),
 ('Celebi', 'Mythical'),
 ('Jirachi', 'Mythical'),
 ('Deoxys', 'Mythical'),
 ('Phione', 'Mythical'),
 ('Manaphy', 'Mythical'),
 ('Darkrai', 'Mythical'),
 ('Shaymin', 'Mythical'),
 ('Arceus', 'Mythical'),
 ('Victini', 'Mythical'),
 ('Keldeo', 'Mythical'),
 ('Meloetta', 'Mythical

# Export Dataframe

In [155]:
df2 = pd.DataFrame(combined_list, columns=['Pokemon', 'Legendary/Mythical'])
df2

Unnamed: 0,Pokemon,Legendary/Mythical
0,Mewtwo,Legendary
1,Lugia,Legendary
2,Ho-Oh,Legendary
3,Kyogre,Legendary
4,Groudon,Legendary
5,Rayquaza,Legendary
6,Dialga,Legendary
7,Palkia,Legendary
8,Giratina,Legendary
9,Reshiram,Legendary


In [156]:
df2.to_csv('Datasets/Legendary_Mythical_List.csv', index=False)