In [34]:
import numpy as np
import pandas as pd
import pickle as pkl

import random
import requests
import re
import os
import time
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from selenium.common.exceptions import NoSuchElementException, TimeoutException, WebDriverException

### Set up ChromeDriver for Selenium

In [35]:
chromedriver = "C:\\Users\\Daniela\\chromedriver\\chromedriver.exe"
os.environ["webdriver.chrome.driver"] = chromedriver

We want to get the links for games directly to save the trouble of having Selenium click on links later on.

In [36]:
# Create an empty list to collect game links.
game_links = []

In [38]:
# The range is just the number of pages showed on Metacritic.

for i in range(157):
    url = f'http://www.metacritic.com/browse/games/score/metascore/all/all/filtered?sort=desc&page={i}'
    
    driver = webdriver.Chrome(chromedriver)
    driver.get(url)
    
    soup = BeautifulSoup(driver.page_source, 'html.parser')
    
    tags = soup.find_all('div', attrs={'class':'product_item product_title'})
    
    for tag in tags:
        link = tag.find('a')['href']
        game_links.append(link)
    
    print(f'Finished retrieving links on page number {i}')

Finished retrieving links on page number 0
Finished retrieving links on page number 1
Finished retrieving links on page number 2
Finished retrieving links on page number 3
Finished retrieving links on page number 4
Finished retrieving links on page number 5
Finished retrieving links on page number 6
Finished retrieving links on page number 7
Finished retrieving links on page number 8
Finished retrieving links on page number 9
Finished retrieving links on page number 10
Finished retrieving links on page number 11
Finished retrieving links on page number 12
Finished retrieving links on page number 13
Finished retrieving links on page number 14
Finished retrieving links on page number 15
Finished retrieving links on page number 16
Finished retrieving links on page number 17
Finished retrieving links on page number 18
Finished retrieving links on page number 19
Finished retrieving links on page number 20
Finished retrieving links on page number 21
Finished retrieving links on page number 2

For each game (using the game links we gathered), gather the following information:

- Game Title
- Release Year
- Publisher
- Genre
- Platform
- Metascore (aka. Review score for game)
- Average User Score
- Number of Players (that can play the game)

In [39]:
# First, since all the links obtained are part of an URL. Need to fix that.

full_game_links = ['https://www.metacritic.com' + x for x in game_links]

In [40]:
title = []
release_year = []
publisher = []
genre = []
platform = []
metascore = []
avg_userscore = []
no_players = []

In [41]:
# Set up scraper

link_count = 0 #to keep track of how many links the scraper has gone through

for link in full_game_links:
    driver = webdriver.Chrome(chromedriver)
    driver.implicitly_wait(3) #add this so we don't need to wait for all webpage elements to load.
    driver.get(link)

    link_count += 1
    
    soup = BeautifulSoup(driver.page_source, 'html.parser')

    # get game title
    if soup.find('h1') == None:
        title.append('page not found')
    else:
        title.append(soup.find('h1').text)

    # get release year of game
    if soup.find('span', attrs={'class':'data', 'itemprop':'datePublished'}) == None:
        release_year.append('not specified')
    else:
        release_year.append(int(soup.find('span', attrs={'class':'data', 'itemprop':'datePublished'}).text[-4:]))

    # get publisher/developer of game
    if soup.find('li', attrs={'class':'summary_detail developer'}) == None:
        publisher.append('not specified')
    else:
        publisher.append(soup.find('li', attrs={'class':'summary_detail developer'}).text.replace('Developer:','').replace('\n','').replace(' ',''))

    # get genre(s) of game
    if len(soup.find_all('span', attrs={'class':'data', 'itemprop':'genre'})) == 1:
        genre.append(soup.find_all('span', attrs={'class':'data', 'itemprop':'genre'}).text)
    elif len(soup.find_all('span', attrs={'class':'data', 'itemprop':'genre'})) == 0:
        genre.append('no genre')
    else:    
        multi_genres = ';'.join([genre.text for genre in soup.find_all('span', attrs={'class':'data', 'itemprop':'genre'})])
        genre.append(multi_genres)

    # get platform of game
    if soup.find('span', attrs={'itemprop':'device'}) == None:
        platform.append('not specified')
    else:
        platform.append(soup.find('span', attrs={'itemprop':'device'}).text.replace('\n','').replace(' ',''))

    # get metascore of game
    if soup.find('span', attrs={'itemprop':'ratingValue'}) == None:
        metascore.append('not specified')
    else:
        metascore.append(soup.find('span', attrs={'itemprop':'ratingValue'}).text)

    # get average userscore of game
    if soup.find('div', attrs={'class':'userscore_wrap feature_userscore'}) == None:
        avg_userscore.append('not specified')
    else:
        avg_userscore.append(soup.find('div', attrs={'class':'userscore_wrap feature_userscore'}).text.replace('\nUser Score\n\n','')[0:3])

    # get the number of players that can play the game
    if soup.find('li', attrs={'class':'summary_detail product_players'}) == None:
        no_players.append('not specified')
    else:
        no_players.append(soup.find('li', attrs={'class':'summary_detail product_players'}).text.replace('\n# of players:\n','').replace('\n',''))

    print(f'Finished gathering Link # {link_count}, title: {len(title)}, year: {len(release_year)}, publisher: {len(publisher)}')
    print(f'genre: {len(genre)}, platform: {len(platform)}, metascore: {len(metascore)}, userscore: {len(avg_userscore)}, no_players: {len(no_players)}')
    
    driver.quit()

Finished gathering Link # 1, title: 1, year: 1, publisher: 1
genre: 1, platform: 1, metascore: 1, userscore: 1, no_players: 1
Finished gathering Link # 2, title: 2, year: 2, publisher: 2
genre: 2, platform: 2, metascore: 2, userscore: 2, no_players: 2
Finished gathering Link # 3, title: 3, year: 3, publisher: 3
genre: 3, platform: 3, metascore: 3, userscore: 3, no_players: 3
Finished gathering Link # 4, title: 4, year: 4, publisher: 4
genre: 4, platform: 4, metascore: 4, userscore: 4, no_players: 4
Finished gathering Link # 5, title: 5, year: 5, publisher: 5
genre: 5, platform: 5, metascore: 5, userscore: 5, no_players: 5
Finished gathering Link # 6, title: 6, year: 6, publisher: 6
genre: 6, platform: 6, metascore: 6, userscore: 6, no_players: 6
Finished gathering Link # 7, title: 7, year: 7, publisher: 7
genre: 7, platform: 7, metascore: 7, userscore: 7, no_players: 7
Finished gathering Link # 8, title: 8, year: 8, publisher: 8
genre: 8, platform: 8, metascore: 8, userscore: 8, no_pla

Finished gathering Link # 63, title: 63, year: 63, publisher: 63
genre: 63, platform: 63, metascore: 63, userscore: 63, no_players: 63
Finished gathering Link # 64, title: 64, year: 64, publisher: 64
genre: 64, platform: 64, metascore: 64, userscore: 64, no_players: 64
Finished gathering Link # 65, title: 65, year: 65, publisher: 65
genre: 65, platform: 65, metascore: 65, userscore: 65, no_players: 65
Finished gathering Link # 66, title: 66, year: 66, publisher: 66
genre: 66, platform: 66, metascore: 66, userscore: 66, no_players: 66
Finished gathering Link # 67, title: 67, year: 67, publisher: 67
genre: 67, platform: 67, metascore: 67, userscore: 67, no_players: 67
Finished gathering Link # 68, title: 68, year: 68, publisher: 68
genre: 68, platform: 68, metascore: 68, userscore: 68, no_players: 68
Finished gathering Link # 69, title: 69, year: 69, publisher: 69
genre: 69, platform: 69, metascore: 69, userscore: 69, no_players: 69
Finished gathering Link # 70, title: 70, year: 70, publ

Finished gathering Link # 123, title: 123, year: 123, publisher: 123
genre: 123, platform: 123, metascore: 123, userscore: 123, no_players: 123
Finished gathering Link # 124, title: 124, year: 124, publisher: 124
genre: 124, platform: 124, metascore: 124, userscore: 124, no_players: 124
Finished gathering Link # 125, title: 125, year: 125, publisher: 125
genre: 125, platform: 125, metascore: 125, userscore: 125, no_players: 125
Finished gathering Link # 126, title: 126, year: 126, publisher: 126
genre: 126, platform: 126, metascore: 126, userscore: 126, no_players: 126
Finished gathering Link # 127, title: 127, year: 127, publisher: 127
genre: 127, platform: 127, metascore: 127, userscore: 127, no_players: 127
Finished gathering Link # 128, title: 128, year: 128, publisher: 128
genre: 128, platform: 128, metascore: 128, userscore: 128, no_players: 128
Finished gathering Link # 129, title: 129, year: 129, publisher: 129
genre: 129, platform: 129, metascore: 129, userscore: 129, no_playe

WebDriverException: Message: chrome not reachable
  (Session info: chrome=69.0.3497.100)
  (Driver info: chromedriver=2.42.591088 (7b2b2dca23cca0862f674758c9a3933e685c27d5),platform=Windows NT 10.0.17134 x86_64)


In [42]:
# Put all information collected in lists into a DataFrame.

df = pd.DataFrame(columns=['Title','Year','Publisher','Genre','Platform','Metascore','Avg_Userscore','No_Players'])
df['Title'] = title
df['Year'] = release_year
df['Publisher'] = publisher
df['Genre'] = genre
df['Platform'] = platform
df['Metascore'] = metascore
df['Avg_Userscore'] = avg_userscore
df['No_Players'] = no_players

In [43]:
# An quick shot of what the DataFrame should look like
df.head()

Unnamed: 0,Title,Year,Publisher,Genre,Platform,Metascore,Avg_Userscore,No_Players
0,The Legend of Zelda: Ocarina of Time,not specified,Nintendo,no genre,not specified,99,9.1,1 Player
1,Tony Hawk's Pro Skater 2,not specified,NeversoftEntertainment,no genre,not specified,98,7.4,1-2
2,Grand Theft Auto IV,not specified,RockstarNorth,no genre,not specified,98,7.5,1 Player
3,SoulCalibur,not specified,Namco,no genre,not specified,98,8.7,1-2
4,Grand Theft Auto IV,not specified,RockstarNorth,no genre,not specified,98,7.9,1 Player


In [44]:
# Save all data with Pickle.

with open('final_game_general_data.pkl', 'wb') as picklefile:
    pkl.dump(df, picklefile)

The code below is another scraper for gathering user comments for each game scraped.

In [45]:
# All the game links that could not be accessed or found; do not need these.

df[df.Year == 'not specified']

Unnamed: 0,Title,Year,Publisher,Genre,Platform,Metascore,Avg_Userscore,No_Players
0,The Legend of Zelda: Ocarina of Time,not specified,Nintendo,no genre,not specified,99,9.1,1 Player
1,Tony Hawk's Pro Skater 2,not specified,NeversoftEntertainment,no genre,not specified,98,7.4,1-2
2,Grand Theft Auto IV,not specified,RockstarNorth,no genre,not specified,98,7.5,1 Player
3,SoulCalibur,not specified,Namco,no genre,not specified,98,8.7,1-2
4,Grand Theft Auto IV,not specified,RockstarNorth,no genre,not specified,98,7.9,1 Player
5,Super Mario Galaxy,not specified,Nintendo,no genre,not specified,97,9.0,No Online Multiplayer
6,Super Mario Galaxy 2,not specified,NintendoEADTokyo,no genre,not specified,97,9.1,No Online Multiplayer
7,Grand Theft Auto V,not specified,RockstarNorth,no genre,not specified,97,7.8,Up to 30
8,Grand Theft Auto V,not specified,RockstarNorth,no genre,not specified,97,8.3,Up to 16
9,Grand Theft Auto V,not specified,RockstarNorth,no genre,not specified,97,8.3,Up to 16


In [46]:
'''
Metacritic is a well-structured website, so we can modify all game links to guide the scraper directly to the
user reviews page, without having the scraper to click through links.
'''

full_game_user_links = [x + '/user-reviews' for x in full_game_links]

In [47]:
game_title = []
game_platform = []
usernames = []
userscores = []
comments = []
no_helpfulness = []

In [48]:
game_count = 0

for index in range(0,4000): # set the number of games you would like the scraper to scrape
    
    game_count += 1
    page_number = 1
    
    driver = webdriver.Chrome(chromedriver)
    driver.implicitly_wait(3)
    driver.get(df['user_reviews_link'][index])

    print(f"Starting to gather comments for {df['Title'][index]}... page {page_number}")
    
    for _ in range(100): #each review page shows a maximum of 100 user reviews; do this to expand longer reviews
        try:
            expand = driver.find_element_by_link_text('Expand')
            expand.click()
        except WebDriverException or NoSuchElementException:
            break
    
    soup = BeautifulSoup(driver.page_source, 'html.parser')
    
    # get username
    for user in soup.find_all('div', attrs={'class':'name'}):
        usernames.append(user.text)
        
    # get user scores
    for grade in soup.find_all('div', attrs={'class':'review_grade'}):
        userscores.append(int(grade.text))
        
    # get comments (need to expand comments)
    for review in soup.find_all('div', attrs={'class':'review_body'}):
        comments.append(review.text.replace('… Collapse', ''))
        game_title.append(df['Title'][index])
        game_platform.append(df['Platform'][index])
    
    # get how helpful a review is
    for vote in soup.find_all('span', attrs={'class':'total_ups'}):
        no_helpfulness.append(vote.text)
   
    for _ in range(100):
        try:
            page_number += 1
            print(f'starting page {page_number}...')
            next_page = driver.find_element_by_link_text('next')
            next_page.click()

            for _ in range(100):
                try:
                    expand = driver.find_element_by_link_text('Expand')
                    expand.click()
                except WebDriverException or NoSuchElementException:
                    break
            
            soup = BeautifulSoup(driver.page_source, 'html.parser')

            # get usernames
            for user in soup.find_all('div', attrs={'class':'name'}):
                usernames.append(user.text)

            # get user scores
            for grade in soup.find_all('div', attrs={'class':'review_grade'}):
                userscores.append(int(grade.text))

            # get comments (need to expand comments)
            for review in soup.find_all('div', attrs={'class':'review_body'}):
                comments.append(review.text.replace('… Collapse', ''))
                game_title.append(df['Title'][index])
                game_platform.append(df['Platform'][index])

            # get how helpful a review is
            for vote in soup.find_all('span', attrs={'class':'total_ups'}):
                no_helpfulness.append(vote.text)
                
        except NoSuchElementException:
            print(f"Finished **{df['Title'][index]}** game # {game_count} - titles: {len(game_title)}; platforms: {len(game_platform)} users: {len(usernames)}; scores: {len(userscores)}")
            print(f"comments: {len(comments)}; helpful rec'd: {len(no_helpfulness)}")
            break
            
    driver.quit()

KeyError: 'user_reviews_link'

In [None]:
# Save all lists as Pickled files.

with open('actual_game_title_3420.pkl','wb') as titlefile:
    pkl.dump(game_title, titlefile)
    
with open('actual_game_platform_3420.pkl','wb') as platformfile:
    pkl.dump(game_platform, platformfile)
    
with open('actual_usernames_3420.pkl','wb') as userfile:
    pkl.dump(usernames, userfile)
    
with open('actual_userscores_3420.pkl','wb') as scorefile:
    pkl.dump(userscores, scorefile)
    
with open('actual_usercomments_3420.pkl','wb') as commentfile:
    pkl.dump(comments, commentfile)
    
with open('actual_helpfulness_3420.pkl','wb') as helpfulfile:
    pkl.dump(no_helpfulness, helpfulfile)