# Pokemon Games Sales and Review Web Scraping

In [2]:
# import relevant libraries.
import pandas as pd
from bs4 import BeautifulSoup as bes
import requests

In [3]:
# Gather the HTML code for the website.
url = 'https://en.wikipedia.org/wiki/Pok%C3%A9mon_(video_game_series)'

# A timeout keeps this cell from hanging indefinitely on a stalled connection.
page = requests.get(url, timeout=30)

# Fail loudly on a 4xx/5xx response instead of silently parsing an error page.
page.raise_for_status()

# Name the parser explicitly: the generic 'html' feature lets BeautifulSoup pick
# whichever HTML parser happens to be installed, so results could vary by machine.
soup = bes(page.text, 'html.parser')

In [4]:
# Exploratory peek: find('table') returns only the first <table> in the document.
# The output below shows that this is the article's infobox (class "infobox hproduct"),
# so the sales table has to be selected more specifically.
soup.find('table')

<table class="infobox hproduct" style="float: right; width: 22em; border-spacing: 2px;"><tbody><tr><th class="infobox-above fn" colspan="2" style="font-size:125%;font-style:italic;">Pokémon</th></tr><tr><td class="infobox-image" colspan="2"><span class="mw-default-size" typeof="mw:File/Frameless"><a class="mw-file-description" href="/wiki/File:International_Pok%C3%A9mon_logo.svg"><img class="mw-file-element" data-file-height="99" data-file-width="269" decoding="async" height="92" src="//upload.wikimedia.org/wikipedia/commons/thumb/9/98/International_Pok%C3%A9mon_logo.svg/250px-International_Pok%C3%A9mon_logo.svg.png" srcset="//upload.wikimedia.org/wikipedia/commons/thumb/9/98/International_Pok%C3%A9mon_logo.svg/375px-International_Pok%C3%A9mon_logo.svg.png 1.5x, //upload.wikimedia.org/wikipedia/commons/thumb/9/98/International_Pok%C3%A9mon_logo.svg/500px-International_Pok%C3%A9mon_logo.svg.png 2x" width="250"/></a></span></td></tr><tr><th class="infobox-label" scope="row" style="white-

In [5]:
# Grab the HTML code for the table we wish to scrape.
# Passing the full string to class_ matches the table's exact class attribute value.
PKMNGamesTable = soup.find('table', class_ = "wikitable plainrowheaders")

# find() returns None when nothing matches; stop here with a clear message rather
# than failing later with an AttributeError on None.
if PKMNGamesTable is None:
    raise ValueError(
        'No <table class="wikitable plainrowheaders"> found on the page; '
        'the page layout may have changed.'
    )

In [6]:
# Slicing is done here. find_all('th') returns the column-title cells first and
# then the row-header cells that hold the data for the "Game" column.

# Collect every <th> cell once instead of searching the table twice.
all_header_cells = PKMNGamesTable.find_all('th')

# Count the column titles from the first table row rather than hard-coding the number,
# so the split point follows the table if a column is added or removed.
header_count = len(PKMNGamesTable.find('tr').find_all('th'))

#Column titles sliced from PKMNGamesTable.
column_titles_html = all_header_cells[:header_count]

#Game titles sliced from PKMNGamesTable.
game_column_html = all_header_cells[header_count:]

In [7]:
# Reduce every header cell to its text content with surrounding whitespace removed.
def _clean_text(cell):
    """Return the cell's text with leading and trailing whitespace stripped."""
    return cell.text.strip()

column_titles = list(map(_clean_text, column_titles_html))
game_column = list(map(_clean_text, game_column_html))

# Show both cleaned lists, separated by blank lines.
print("column_titles = ", column_titles)
print("\n")
print("game_column = ", game_column)

column_titles =  ['Game', 'Year', 'Units sold(in millions)', 'GameRankings', 'Metacritic']


game_column =  ['Pokémon Red and Blue', 'Pokémon Yellow', 'Pokémon Gold and Silver', 'Pokémon Crystal', 'Pokémon Ruby and Sapphire', 'Pokémon FireRed and LeafGreen', 'Pokémon Emerald', 'Pokémon Diamond and Pearl', 'Pokémon Platinum', 'Pokémon HeartGold and SoulSilver', 'Pokémon Black and White', 'Pokémon Black 2 and White 2', 'Pokémon X and Y', 'Pokémon Omega Ruby and Alpha Sapphire', 'Pokémon Sun and Moon', 'Pokémon Ultra Sun and Ultra Moon', "Pokémon: Let's Go, Pikachu! and Let's Go, Eevee!", 'Pokémon Sword and Shield', 'Pokémon Brilliant Diamond and Shining Pearl', 'Pokémon Legends: Arceus', 'Pokémon Scarlet and Violet']


In [8]:
# Start from an empty dataframe whose columns mirror the scraped table's header.
df = pd.DataFrame(data = None, columns = column_titles)

df

Unnamed: 0,Game,Year,Units sold(in millions),GameRankings,Metacritic


In [9]:
# Collect every table row (<tr>) of the games table, header row included.
column_data_html = PKMNGamesTable.find_all(name = 'tr')

In [10]:
# Add data into pandas dataframe.
# The rows are gathered in a plain list and the dataframe is built once at the end.
# This avoids growing `df` one row at a time and makes the cell safe to re-run:
# `df` is rebuilt from scratch instead of receiving duplicate rows.
data_rows = column_data_html[1:]  # skip the first <tr>, which is the header row

# Every data row needs a matching game title, otherwise the columns would misalign.
if len(data_rows) > len(game_column):
    raise ValueError(
        f"Found {len(data_rows)} data rows but only {len(game_column)} game titles."
    )

records = []
for game, row in zip(game_column, data_rows):
    # Extract the remaining columns of data from the row's <td> cells.
    cells = [entry.text.strip() for entry in row.find_all('td')]

    # The game title (taken from the <th> cells earlier) goes into the first column.
    record = [game, *cells]

    # A row with a different number of cells cannot be mapped onto the column titles.
    if len(record) != len(column_titles):
        raise ValueError(
            f"Row for {game!r} has {len(record)} values; expected {len(column_titles)}."
        )
    records.append(record)

df = pd.DataFrame(records, columns = column_titles)

In [11]:
# View the extracted dataset before any cleaning.
# As the output shows, values still carry citation markers such as "[70]",
# and missing Metacritic scores appear as "-".
df

Unnamed: 0,Game,Year,Units sold(in millions),GameRankings,Metacritic
0,Pokémon Red and Blue,1996,31.37[70],88%[71][72],-
1,Pokémon Yellow,1998,14.64[70],85%[73],-
2,Pokémon Gold and Silver,1999,23.73[74],90%[75][76],-
3,Pokémon Crystal,2000,6.39[70],80%[77],-
4,Pokémon Ruby and Sapphire,2002,16.22[78],84%[79][80],82/100[81]
5,Pokémon FireRed and LeafGreen,2004,12[78],81%[82][83],81/100[84][85]
6,Pokémon Emerald,2004,6.32[86],77%[87],76/100[88]
7,Pokémon Diamond and Pearl,2006,17.67[89],85%[90][91],85/100[92][93]
8,Pokémon Platinum,2008,7.06[94],83%[95],84/100[96]
9,Pokémon HeartGold and SoulSilver,2009,12.72[89],88%[97][98],87/100[99][100]


In [12]:
# Convert to CSV for cleaning and analysis.
# The file is written relative to the working directory: an absolute, user-specific
# path fails on any other machine and exposes a local username.
OUTPUT_CSV = 'PokemonGamesData.csv'
df.to_csv(OUTPUT_CSV, index=False)