<center><h1>Website Data Scraping</h1><br><h2>Pokemon generation 8 Pokedex</h2></center>

In [23]:
# import the libraries
import requests
import pandas as pd
from bs4 import BeautifulSoup

In [24]:
# Download the Galar Pokédex listing page that the rest of the notebook parses.
# The timeout keeps the cell from hanging forever on a stalled connection, and
# raise_for_status() stops here on an HTTP error instead of parsing an error page.
page = requests.get('https://www.serebii.net/swordshield/galarpokedex.shtml', timeout=30)
page.raise_for_status()

In [25]:
# Build the BeautifulSoup object from the text of the downloaded page,
# using the HTML parser that ships with the Python standard library.
soup = BeautifulSoup(page.text, 'html.parser')

In [26]:
# find_all returns every <table> on the page as a list;
# the Pokédex data we want is in the 2nd table, i.e. index [1].
table = soup.find_all('table')[1]

In [27]:
# <tr> is the tag for a table row and <td> the tag for a standard cell.
# Take the 3rd row (index [2]) of the table and collect all of its <td> cells:
# this flat list of cells contains the bulk of the info.
bulk = table.find_all('tr')[2].find_all('td')

In [28]:
# Column names of the pandas DataFrame, in display order.
# Every column starts out as its own empty list that the scraping loops append to.
poke_dic = {
    column: []
    for column in (
        'No.', 'Name',
        'Ability1', 'Ability2', 'Hidden_Ability',
        'Type1', 'Type2',
        'HP', 'Att', 'Def', 'S.Att', 'S.Def', 'Spd',
        'Weight_kg', 'Height_m', 'Weight_lbs', 'Height_ft',
        'Cap_Rate', 'Egg_Steps', 'Classification',
    )
}

In [29]:
# Every Pokémon type name, lower-case.
# Used to pick the type out of the pieces of a split string.
poke_types = (
    'normal fighting flying poison ground rock '
    'bug ghost steel fire water grass '
    'electric psychic ice dragon dark fairy'
).split()

In [30]:
# Sanity check: show the list of type names.
print(poke_types)

['normal', 'fighting', 'flying', 'poison', 'ground', 'rock', 'bug', 'ghost', 'steel', 'fire', 'water', 'grass', 'electric', 'psychic', 'ice', 'dragon', 'dark', 'fairy']


In [31]:
# Sanity check: show the dictionary of columns before any scraping loop has filled it.
print(poke_dic)

{'No.': [], 'Name': [], 'Ability1': [], 'Ability2': [], 'Hidden_Ability': [], 'Type1': [], 'Type2': [], 'HP': [], 'Att': [], 'Def': [], 'S.Att': [], 'S.Def': [], 'Spd': [], 'Weight_kg': [], 'Height_m': [], 'Weight_lbs': [], 'Height_ft': [], 'Cap_Rate': [], 'Egg_Steps': [], 'Classification': []}


In [32]:
# `bulk` is a flat list of <td> cells in which every Pokémon occupies 12
# consecutive cells. Each field is therefore read with a stride of 12 from a
# fixed starting offset: 3 -> name, 4 -> abilities, 5 -> types, 6..11 -> base stats.
CELLS_PER_POKEMON = 12
STAT_COLUMNS = ('HP', 'Att', 'Def', 'S.Att', 'S.Def', 'Spd')


def _type_from_link(link):
    """Return the type name found in a type-icon link.

    The page shows types as images only, so the name is recovered from the
    markup of the link: the '.gif"' suffix is removed, the markup is split on
    '/', and the first piece that is a known name in `poke_types` is returned.
    Raises IndexError when no piece matches a known type.
    """
    pieces = str(link).replace('.gif"', '').split('/')
    return [piece for piece in pieces if piece in poke_types][0]


# Names, plus a running number 1..N for each of them.
# The numbers follow the amount of names actually scraped, so the 'No.' and
# 'Name' columns always have the same length.
pokemon_names = [
    bulk[i].get_text(strip=True)
    for i in range(3, len(bulk), CELLS_PER_POKEMON)
]
poke_dic['No.'].extend(range(1, len(pokemon_names) + 1))
poke_dic['Name'].extend(pokemon_names)

# Abilities: each ability is a link inside the ability cell.
#   3 links -> ability 1, ability 2, hidden ability
#   2 links -> ability 1, hidden ability
#   otherwise only the first link is used, as ability 1
# The links are read from bulk[i] directly; re-querying the whole table for
# every Pokémon would return the very same cell.
for i in range(4, len(bulk), CELLS_PER_POKEMON):
    ability_links = bulk[i].find_all('a')
    if len(ability_links) == 3:
        poke_dic['Ability1'].append(ability_links[0].get_text())
        poke_dic['Ability2'].append(ability_links[1].get_text())
        poke_dic['Hidden_Ability'].append(ability_links[2].get_text())
    elif len(ability_links) == 2:
        poke_dic['Ability1'].append(ability_links[0].get_text())
        poke_dic['Ability2'].append('NaN')
        poke_dic['Hidden_Ability'].append(ability_links[1].get_text())
    else:
        poke_dic['Ability1'].append(ability_links[0].get_text())
        poke_dic['Ability2'].append('NaN')
        poke_dic['Hidden_Ability'].append('NaN')

# Types: a Pokémon has one or two types, each shown as a linked image.
for i in range(5, len(bulk), CELLS_PER_POKEMON):
    type_links = bulk[i].find_all('a')
    if len(type_links) == 2:
        poke_dic['Type1'].append(_type_from_link(type_links[0]))
        poke_dic['Type2'].append(_type_from_link(type_links[1]))
    elif len(type_links) == 1:
        poke_dic['Type1'].append(_type_from_link(type_links[0]))
        poke_dic['Type2'].append('NaN')
    else:
        # Unexpected cell layout: still add a row so that the type columns
        # stay aligned with every other column.
        poke_dic['Type1'].append('NaN')
        poke_dic['Type2'].append('NaN')

# Base stats: six consecutive cells starting at offset 6, in STAT_COLUMNS order.
for i in range(6, len(bulk), CELLS_PER_POKEMON):
    for offset, column in enumerate(STAT_COLUMNS):
        poke_dic[column].append(int(bulk[i + offset].get_text()))



In [33]:
# Weight, height, classification, capture rate and egg steps are not on the
# listing page: each Pokémon has them on its own page, linked from its name cell.
# A single Session reuses the connection across all requests, the timeout keeps
# one stalled request from hanging the whole loop, and raise_for_status() stops
# on an HTTP error instead of parsing an error page.
with requests.Session() as session:
    for i in range(3, len(bulk), 12):
        page_char = session.get(
            'https://www.serebii.net' + bulk[i].find('a').get('href'),
            timeout=30,
        )
        page_char.raise_for_status()
        soup_char = BeautifulSoup(page_char.text, 'html.parser')

        # 'fooinfo' cells of the 2nd 'dextable', from the 5th cell on:
        # [0] classification, [1] height, [2] weight, [3] capture rate, [4] egg steps
        table_char = soup_char.find_all('table', class_="dextable")[1].find_all(class_="fooinfo")[4:]

        classification = table_char[0].get_text(strip=True)

        # Weight: the text is split on 's' (presumably the 's' of 'lbs' - confirm
        # against the page). The piece before it is the weight in lbs. Two pieces
        # is the usual layout; one particular Pokémon has its info organized
        # differently, and there the kg value is at the start of the third piece,
        # before ' / '.
        weight_parts = table_char[2].get_text(strip=True).split('s')
        weight_lbs = float(weight_parts[0].replace('lb', ''))
        if len(weight_parts) == 2:
            weight_kg = float(weight_parts[1].replace('kg', ''))
        else:
            weight_kg = float(weight_parts[2].split(' / ')[0].replace('kg', ''))

        # Height: the text is split on '"'. The piece before it is feet'inches and
        # is converted to decimal feet. The same special layout as above puts the
        # metre value at the start of the third piece, before '/'.
        height_parts = table_char[1].get_text(strip=True).split('"')
        feet = height_parts[0].split("'")
        height_ft = round(float(feet[0]) + float(feet[1]) / 12, 3)
        if len(height_parts) == 2:
            height_m = float(height_parts[1].replace('m', ''))
        else:
            height_m = float(height_parts[2].split('/')[0].replace('m ', ''))

        cap_rate = float(table_char[3].get_text(strip=True))
        egg_steps = float(table_char[4].get_text(strip=True).replace(',', ''))

        # Append only after every field parsed, so that a parsing error never
        # leaves some columns one row longer than the others.
        poke_dic['Classification'].append(classification)
        poke_dic['Weight_lbs'].append(weight_lbs)
        poke_dic['Weight_kg'].append(weight_kg)
        poke_dic['Height_ft'].append(height_ft)
        poke_dic['Height_m'].append(height_m)
        poke_dic['Cap_Rate'].append(cap_rate)
        poke_dic['Egg_Steps'].append(egg_steps)

In [34]:
# Turn the filled-in dictionary into a DataFrame: one column per key, one row per Pokémon.
JPokeDex8 = pd.DataFrame(poke_dic)

In [35]:
# The 'Name' column holds each English name directly followed by the Japanese name.
# Not sure whether to keep that, so a copy with only the English name is made further below.
JPokeDex8.head()

Unnamed: 0,No.,Name,Ability1,Ability2,Hidden_Ability,Type1,Type2,HP,Att,Def,S.Att,S.Def,Spd,Weight_kg,Height_m,Weight_lbs,Height_ft,Cap_Rate,Egg_Steps,Classification
0,1,Grookeyサルノリ,Overgrow,,Grassy Surge,grass,,50,65,50,40,40,65,5.0,0.3,11.0,1.0,45.0,5120.0,Chimp Pokémon
1,2,Thwackeyバチンキー,Overgrow,,Grassy Surge,grass,,70,85,70,55,60,80,14.0,0.7,30.9,2.333,45.0,5120.0,Beat Pokémon
2,3,Rillaboomゴリランダー,Overgrow,,Grassy Surge,grass,,100,125,90,60,70,85,90.0,2.1,198.4,6.917,45.0,5120.0,Drummer Pokémon
3,4,Scorbunnyヒバニー,Blaze,,Libero,fire,,50,71,40,40,40,69,4.5,0.3,9.9,1.0,45.0,5120.0,Rabbit Pokémon
4,5,Rabootラビフット,Blaze,,Libero,fire,,65,86,60,55,60,94,9.0,0.6,19.8,2.0,45.0,5120.0,Rabbit Pokémon


In [36]:
# The standard-library `string` module provides constants for the ASCII alphabet.
# Print the ASCII letters (lower-case, then upper-case) for reference.
import string
print(string.ascii_letters)

abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ


In [37]:
def in_ascii(word):
    """Return `word` with every non-ASCII character removed.

    Used to drop the Japanese name that follows the English name in the
    'Name' column. All ASCII characters are kept, not only letters, so that
    English names containing spaces or punctuation survive intact
    (e.g. "Mr. Mime", "Farfetch'd", "Type: Null"). Leading and trailing
    whitespace left behind by the removal is stripped.

    Parameters
    ----------
    word : str
        The text to clean.

    Returns
    -------
    str
        The ASCII-only text.
    """
    return ''.join(char for char in word if ord(char) < 128).strip()

In [38]:
# Build PokeDex8 as a separate copy of JPokeDex8 whose 'Name' column has been
# passed through in_ascii; JPokeDex8 itself is left unchanged.
PokeDex8 = JPokeDex8.assign(Name=JPokeDex8['Name'].apply(in_ascii))

In [39]:
# Preview the copy whose 'Name' column went through in_ascii.
PokeDex8.head()

Unnamed: 0,No.,Name,Ability1,Ability2,Hidden_Ability,Type1,Type2,HP,Att,Def,S.Att,S.Def,Spd,Weight_kg,Height_m,Weight_lbs,Height_ft,Cap_Rate,Egg_Steps,Classification
0,1,Grookey,Overgrow,,Grassy Surge,grass,,50,65,50,40,40,65,5.0,0.3,11.0,1.0,45.0,5120.0,Chimp Pokémon
1,2,Thwackey,Overgrow,,Grassy Surge,grass,,70,85,70,55,60,80,14.0,0.7,30.9,2.333,45.0,5120.0,Beat Pokémon
2,3,Rillaboom,Overgrow,,Grassy Surge,grass,,100,125,90,60,70,85,90.0,2.1,198.4,6.917,45.0,5120.0,Drummer Pokémon
3,4,Scorbunny,Blaze,,Libero,fire,,50,71,40,40,40,69,4.5,0.3,9.9,1.0,45.0,5120.0,Rabbit Pokémon
4,5,Raboot,Blaze,,Libero,fire,,65,86,60,55,60,94,9.0,0.6,19.8,2.0,45.0,5120.0,Rabbit Pokémon


Example uses:<br>
Grab all the Pokemon with the ability 'Flame Body'

In [40]:
# Keep the rows where any of the three ability columns equals 'Flame Body'.
ability_columns = ['Ability1', 'Ability2', 'Hidden_Ability']
has_flame_body = PokeDex8[ability_columns].eq('Flame Body').any(axis=1)
PokeDex8[has_flame_body]

Unnamed: 0,No.,Name,Ability1,Ability2,Hidden_Ability,Type1,Type2,HP,Att,Def,S.Att,S.Def,Spd,Weight_kg,Height_m,Weight_lbs,Height_ft,Cap_Rate,Egg_Steps,Classification
158,159,Sizzlipede,Flash Fire,White Smoke,Flame Body,fire,bug,50,65,45,50,50,45,1.0,0.7,2.2,2.333,190.0,5120.0,Radiator Pokémon
159,160,Centiskorch,Flash Fire,White Smoke,Flame Body,fire,bug,100,115,65,90,90,65,120.0,3.0,264.6,9.833,75.0,5120.0,Radiator Pokémon
161,162,Carkol,Steam Engine,Flame Body,Flash Fire,rock,fire,80,60,90,60,70,50,78.0,1.1,172.0,3.583,120.0,3840.0,Coal Pokémon
162,163,Coalossal,Steam Engine,Flame Body,Flash Fire,rock,fire,110,80,120,80,90,30,310.5,2.8,684.5,9.167,45.0,3840.0,Coal Pokémon
286,287,Litwick,Flash Fire,Flame Body,Infiltrator,ghost,fire,50,30,55,65,55,20,3.1,0.3,6.8,1.0,190.0,5120.0,Candle Pokémon
287,288,Lampent,Flash Fire,Flame Body,Infiltrator,ghost,fire,60,40,60,95,60,55,13.0,0.6,28.7,2.0,90.0,5120.0,Lamp Pokémon
288,289,Chandelure,Flash Fire,Flame Body,Infiltrator,ghost,fire,60,55,90,145,90,80,34.3,1.0,75.6,3.25,45.0,5120.0,Luring Pokémon


Save both DataFrames as CSV files in your current directory (uncomment the lines below to write them)

In [41]:
# PokeDex8.to_csv('PokeDex8.csv',index=False)
# JPokeDex8.to_csv('JPokeDex8.csv',index=False)