In [50]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from urllib.request import Request, urlopen
import requests
from bs4 import BeautifulSoup as soup
import re

https://medium.com/@raiyanquaium/how-to-web-scrape-using-beautiful-soup-in-python-without-running-into-http-error-403-554875e5abed

## RegExs to invoke for scraping metacritic

In [75]:
find_meta = re.compile(r'metascore_w')
find_critic_rev_num = re.compile(r'\/game\/pc\/.*\/critic-reviews')
find_user_rev_num = re.compile(r'\/game\/pc\/.*\/user-reviews')
find_genre = re.compile(r"Board Games|Trivie|Puzzle|Party \/ Minigame|Massively Multiplayer Online|Sports|Driving|Racing|Open-World|Tactics|MOBA|Real-Time|Historic|Strategy|Turn-Based|4X|Vehicle|Simulation|Virtual|Sandbox|Fantasy|Action RPG|Role-Playing|3D|2D|Rhythm|Survival|Beat-'Em-Up|Fighting|Platformer|Action|Adventure|Sci-Fi|Shooter|First-Person|Arcade")
find_esrb = find_esrb = re.compile('EC|E|E10+|T|M|AO|RP')

## Use the following block to get the list of urls to scrape through.

In [None]:
# urls = []
# for index in range(28):
#     sublist =[]
#     url = 'https://www.metacritic.com/browse/games/release-date/available/pc/metascore?page='+str(index)
#     req = Request(url, headers={'User-Agent': 'Mozilla/5.0'})
#     webpage = urlopen(req).read()
#     meta = soup(webpage,"html.parser")
#     for ref in meta.find_all('a',href = find_game):
#         sublist.append('https://www.metacritic.com'+ref.get('href'))
#     urls.extend(sublist[:-27])
# print(urls)

## Saving to csv so I don't have to wait a few minutes for later

In [91]:
meta_url_df = pd.DataFrame(data = {'urls':urls})
meta_url_df.to_csv("meta_url.csv", sep=',',index=False)

In [133]:
urls = pd.read_csv('meta_url.csv')
urls = urls['urls'].tolist()

## Function to scrape a page

In [79]:
def scrape(page):
    '''
    Function to scrape webpage. Takes an input variable of page, converts it to be usable in beautiful soup,
    which is the website to be scraped.Has an output of a list which will have the values of data.
    '''
    data = [] #title, critic_rating, user_rating, num_critic_rev, num_user_rev, genres, esrb_ratings
    #soupify
    url = page
    req = Request(url, headers={'User-Agent': 'Mozilla/5.0'})
    webpage = urlopen(req).read()
    page = soup(webpage,"html.parser")
    
    #title
    data.append(page.find('title').text[:-28])
    
    #critic_rating
    try:
        data.append(page.find(itemprop="ratingValue").getText())
    except:
        data.append(0)
    #user_rating
    try:
        data.append(page.find_all(class_=find_meta)[2].getText())
    except:
        data.append(0)
        
    #num_critic_rev
    try:
        data.append(page.find_all(href=find_critic_rev_num)[2].getText().replace("\n","").replace(" ","").replace("Critics",""))
    except:
        data.append(0)
    #num_user_rev
    try:
        data.append(page.find_all(href=find_user_rev_num)[2].getText().replace(' Ratings', ""))
    except:
        data.append(0)
        
    #genres
    genres = []
    for row in page.find_all('span', class_="data"):
        text = row.getText()
        if re.search(find_genre,text):
            genres.append(re.search(find_genre,text).group())
    data.append(list(set(genres)))
    
    #esrb_ratings
    ratings = []
    for row in page.find_all('span', class_="data"):
        text = row.getText()
        if re.search(find_esrb,text):
            ratings.append(re.search(find_esrb,text).group())
    data.append(list(set(ratings)))
    
    return data

In [81]:
# Test the scrape!
print(scrape(urls[0]))

['Half-Life 2', '96', '9.1', '81', '9729', ['Arcade', 'Action', 'Sci-Fi', 'Shooter', 'First-Person'], ['M']]


## Run the following code block to scrape metacritic and format

In [None]:
# metacritic = []
# index = 0
# for row in urls:
#     print(str(index)+row)
#     metacritic.append(scrape(row))
# print("done!")

# SUCCEESS!
Scraped data!

Now time to make a csv so I don't have to wait over a few hours for it to finish.

In [100]:
meta_df = pd.DataFrame.from_records(metacritic, columns = ['title','critic_rating','user_rating','num_critic_rev','num_user_rev','genres','esrb_rating'])
meta_df.to_csv('metacritic.csv')

AssertionError: 7 columns passed, passed data had 5599 columns

In [106]:
meta_df = pd.read_csv('metacritic.csv') # For some reason my csv has an extra column, so let's get rid of that.

In [108]:
meta_df.drop(['Unnamed: 0'], axis=1, inplace= True)
meta_df

KeyError: "['Unnamed: 0'] not found in axis"

## Cross referencing VGChartz

http://www.vgchartz.com/games/games.php?page=2&results=200&name=&console=PC&keyword=&publisher=&genre=&order=TotalShipped&ownership=Both&boxart=Both&banner=Both&showdeleted=&region=All&goty_year=&developer=&direction=DESC&showtotalsales=1&shownasales=0&showpalsales=0&showjapansales=0&showothersales=0&showpublisher=0&showdeveloper=0&showreleasedate=0&showlastupdate=0&showvgchartzscore=0&showcriticscore=0&showuserscore=0&showshipped=1&alphasort=&showmultiplat=No

http://www.vgchartz.com/games/games.php?name=&keyword=&console=PC&region=All&developer=&publisher=&goty_year=&genre=&boxart=Both&banner=Both&ownership=Both&showmultiplat=No&results=200&order=TotalShipped&showtotalsales=0&showtotalsales=1&showpublisher=0&showvgchartzscore=0&shownasales=0&showdeveloper=0&showcriticscore=0&showpalsales=0&showreleasedate=0&showuserscore=0&showjapansales=0&showlastupdate=0&showothersales=0&showshipped=0&showshipped=1

http://www.vgchartz.com/games/games.php?page=3&results=200&name=&console=PC&keyword=&publisher=&genre=&order=TotalShipped&ownership=Both&boxart=Both&banner=Both&showdeleted=&region=All&goty_year=&developer=&direction=DESC&showtotalsales=1&shownasales=0&showpalsales=0&showjapansales=0&showothersales=0&showpublisher=0&showdeveloper=0&showreleasedate=0&showlastupdate=0&showvgchartzscore=0&showcriticscore=0&showuserscore=0&showshipped=1&alphasort=&showmultiplat=No


## RegExs to invoke for scraping VGChartz

In [110]:
find_num = re.compile(r'(\d+){1,3}(,(\d+){1,3}){1,6}')
find_url = re.compile(r'http:\/\/www.vgchartz.com\/game\/.*')

In [111]:
vgurls = []
for index in range(56):
    sublist =[]
    url = 'http://www.vgchartz.com/games/games.php?page='+str(index)+'&results=200&name=&console=PC&keyword=&publisher=&genre=&order=TotalShipped&ownership=Both&boxart=Both&banner=Both&showdeleted=&region=All&goty_year=&developer=&direction=DESC&showtotalsales=1&shownasales=0&showpalsales=0&showjapansales=0&showothersales=0&showpublisher=0&showdeveloper=0&showreleasedate=0&showlastupdate=0&showvgchartzscore=0&showcriticscore=0&showuserscore=0&showshipped=1&alphasort=&showmultiplat=No'
    req = Request(url, headers={'User-Agent': 'Mozilla/5.0'})
    webpage = urlopen(req).read()
    vg = soup(webpage,"html.parser")
    for ref in vg.find_all('a',href = find_url)[10:]:
        sublist.append(ref.get('href'))
    vgurls.extend(sublist)
print(vgurls)

['http://www.vgchartz.com/game/215988/playerunknowns-battlegrounds/?region=All', 'http://www.vgchartz.com/game/47724/minecraft/?region=All', 'http://www.vgchartz.com/game/33535/garrys-mod/?region=All', 'http://www.vgchartz.com/game/12531/counter-strike-source/?region=All', 'http://www.vgchartz.com/game/43865/portal-2/?region=All', 'http://www.vgchartz.com/game/83197/grand-theft-auto-v/?region=All', 'http://www.vgchartz.com/game/24178/diablo-iii/?region=All', 'http://www.vgchartz.com/game/51349/terraria/?region=All', 'http://www.vgchartz.com/game/7415/rollercoaster-tycoon-3/?region=All', 'http://www.vgchartz.com/game/12414/portal/?region=All', 'http://www.vgchartz.com/game/7214/starcraft/?region=All', 'http://www.vgchartz.com/game/76807/rust/?region=All', 'http://www.vgchartz.com/game/7279/half-life-2/?region=All', 'http://www.vgchartz.com/game/53148/the-binding-of-isaac/?region=All', 'http://www.vgchartz.com/game/7417/guild-wars/?region=All', 'http://www.vgchartz.com/game/6359/myst/?re

## Saving this list to csv to access later

In [112]:
vg_url = pd.DataFrame(data = {'vgurls':vgurls})
vg_url.to_csv("vg_url.csv", sep=',',index=False)

In [113]:
vg_url = pd.read_csv('vg_url.csv')

In [114]:
def scrape_vg(page):
    '''
    Meant to scrape vgchartz website. Takes an argument of page, that is the url, then returns a list of length 2.
    The first index is a string of the website, and the second is how much it shipped up to date.
    '''
    
    data = [] #title, units shipped to date
    if(response.status_code != 200):
        time.sleep(10)# Thanks Nick Sherwin. You da bomb. This helps make sure 
    #convert string url to parsable website through bs4
    vgpage = page
    req = Request(vgpage, headers={'User-Agent': 'Mozilla/5.0'})
    webpage = urlopen(req).read()
    vgpage = soup(webpage,"html.parser")
    
    #add the title of the current video game page being processed to data
    try:
        data.append(vgpage.find('a', class_="white").getText())
    except:
        data.append(np.nan)
    #add the number of units that are shipped and cast as int
    for row in vgpage.find_all('b'):
        text = row.getText()
        if re.search(find_num,text):
            try:
                data.append(int((re.search(find_num,text).group()).replace(',','')))
            except:
                data.append(0)
    return data


In [116]:
# Test the scrape
print(scrape_vg('http://www.vgchartz.com/game/47724/minecraft/?region=All'))

['Minecraft', 31137550]


## Before we actually scrape, let's whittle it down.
Not all the games in both websites will be available. So let's just scrape what overlaps.

In [144]:
meta_vg = []
for meta in urls:
    meta_chunk = meta.rsplit("/")[-1]
    for vg in vgurls:
        vg_chunk = vg.rsplit("/")[-2]
        if meta_chunk == vg_chunk:
            meta_vg.append(vg)
print(meta_vg)

['http://www.vgchartz.com/game/7279/half-life-2/?region=All', 'http://www.vgchartz.com/game/7279/half-life-2/?region=All', 'http://www.vgchartz.com/game/83197/grand-theft-auto-v/?region=All', 'http://www.vgchartz.com/game/83197/grand-theft-auto-v/?region=All', 'http://www.vgchartz.com/game/36981/out-of-the-park-baseball-2007/?region=All', 'http://www.vgchartz.com/game/7278/half-life/?region=All', 'http://www.vgchartz.com/game/7278/half-life/?region=All', 'http://www.vgchartz.com/game/7278/half-life/?region=All', 'http://www.vgchartz.com/game/7278/half-life/?region=All', 'http://www.vgchartz.com/game/7217/bioshock/?region=All', 'http://www.vgchartz.com/game/7217/bioshock/?region=All', 'http://www.vgchartz.com/game/7302/baldurs-gate-ii-shadows-of-amn/?region=All', 'http://www.vgchartz.com/game/43865/portal-2/?region=All', 'http://www.vgchartz.com/game/43865/portal-2/?region=All', 'http://www.vgchartz.com/game/49111/the-elder-scrolls-v-skyrim/?region=All', 'http://www.vgchartz.com/game/28

In [None]:
vg_data = []
index = 0
for row in meta_vg:
    try:
        vg_data.append(scrape_vg(row))
        print(str(index)+": "+row)
        index +=1
    
    except:
        index += 1
        continue

In [None]:
vgurls[0].rsplit("/")[-2]

In [314]:
print(len(meta_vg))

3492
