In [1]:
import requests
from bs4 import BeautifulSoup
import re

import pandas as pd

# Scraping games info from metacritic

We will use [Metacritic](https://www.metacritic.com/browse/games/score/metascore/all/all/filtered?sort=desc) data to create a dataframe with data on games across all platforms and all time.

## The dataset

We want to get a dataframe with all games with columns:
* **name**: The name of the game
* **platform**: platform it was released on
* **r-date**: date it was released
* **score**: average score given by critics (metascore)
* **user score**: average score given by users in the website
* **developer**: game developer
* **genre**: genre of the game (can be multiple)
* **players**: Number of players (some games don't have this information)
* **critics**: number of critics reviewing the game
* **users**: Number of metacritic users that reviewed the game

**All data was collected on November 10th, 2020.**

## Steps used in scraper:

* Create a dictionary `pages` that will contain the DataFrame objects from all pages. Each entry is a pandas DataFrame with data from the games in each site page. There are, currently, 180 pages of rated games.
* For each page, create a dictionary `data_page` of empty lists to be filled with the data from each game. As each page displays 100 games, each of these lists should contain 100 elements (except for the last page).
* Use `requests` to fetch the URL of each page and `BeautifulSoup` to parse the HTML.
* Loop through all games in each page and scrape the relevant data. Note that 'developer', 'genre', 'players', 'critics' and 'users' are found on each game's own page, so we need to fetch that page for every game. Its URL is the `href` of an `a` tag with a `title` class.
* A few `if` checks in the scraper handle elements that are missing (lookups that return `None`): for example, some games have no number-of-players information, and some have no user reviews because they had not been released yet.
* After all data is collected (and it **takes a few hours** - a bit more than 15 on my laptop), all dataframes in the `pages` dictionary are concatenated to create a single one with all game data.
* The dataframe is exported to a csv file.
* I enjoy the awesome new dataset and all I can do with it!


In [None]:
# Seconds to wait on any single HTTP request. Without a timeout a stalled
# connection would block this multi-hour scrape indefinitely.
REQUEST_TIMEOUT = 30

# Number of listing pages to walk; the loop requests ?page=0 .. ?page=N_PAGES-1.
N_PAGES = 180

# Header sent with every request (same value for listing and game pages).
user_agent = {'User-agent': 'Mozilla/5.0'}


def _find_path(node, *steps):
    """Walk a chain of BeautifulSoup ``find`` calls without crashing on a miss.

    Each step is a ``(tag_name, find_kwargs)`` pair that is passed to
    ``node.find``. Returns the tag reached after the last step, or ``None``
    as soon as ``node`` or any intermediate lookup is ``None``.
    """
    for tag_name, find_kwargs in steps:
        if node is None:
            return None
        node = node.find(tag_name, **find_kwargs)
    return node


def _strip_all_whitespace(text):
    """Return ``text`` with every newline and space character removed.

    NOTE(review): this also deletes the spaces *inside* multi-word values
    (platform and developer names). Kept as-is so the output matches
    previously exported files - confirm whether ``str.strip`` was intended.
    """
    return text.replace('\n', '').replace(' ', '')


def _first_score_text(game, class_names):
    """Return the text of the first score ``div`` found among ``class_names``.

    The score badge carries a different class string depending on its value,
    so each candidate is tried in order. Raises ``IndexError`` when none of
    the candidates is present in ``game``.
    """
    candidates = [game.find('div', class_=name) for name in class_names]
    return [tag.text for tag in candidates if tag is not None][0]


pages = {}

for page in range(N_PAGES):

    # One list per output column; every game appends exactly one value to each.
    data_page = {
        'name': [],
        'platform': [],
        'r-date': [],
        'score': [],
        'user score': [],
        'developer': [],
        'genre': [],
        'players': [],
        'critics': [],
        'users': []
    }

    # Site inside metacritic listing "Game Releases by Score"
    url = 'https://www.metacritic.com/browse/games/score/metascore/all/all/filtered?page='+str(page)
    response = requests.get(url, headers=user_agent, timeout=REQUEST_TIMEOUT)
    soup = BeautifulSoup(response.text, 'html.parser')

    # Printing out current page
    print(50*'=', "In page: ", page)

    # Loop through all games in current page
    for game in soup.find_all('td', class_='clamp-summary-wrap'):
        # Name (also echoed on a single, overwritten line as progress feedback)
        name = game.find('h3').text
        data_page['name'].append(name)
        print(name, end="\r")

        # Platform
        platform = game.find('span', class_='data').text
        data_page['platform'].append(_strip_all_whitespace(platform))

        # Release date: third <span> inside the "clamp-details" div
        data_page['r-date'].append(game.select('div.clamp-details span')[2].text)

        # MetaScore (has different classes depending on score)
        data_page['score'].append(_first_score_text(game, [
            'metascore_w large game positive',
            'metascore_w large game mixed',
            'metascore_w large game negative',
        ]))

        # User Score (has different classes depending on score)
        data_page['user score'].append(_first_score_text(game, [
            'metascore_w user large game positive',
            'metascore_w user large game mixed',
            'metascore_w user large game negative',
            'metascore_w user large game tbd',
        ]))

        # Into the game page: the remaining fields are only shown there.
        url_info = 'https://www.metacritic.com' + game.find('a', class_='title')['href']
        response_info = requests.get(url_info, headers=user_agent, timeout=REQUEST_TIMEOUT)
        soup_info = BeautifulSoup(response_info.text, 'html.parser')

        # Get developer info
        developer = _find_path(
            soup_info,
            ('li', {'class_': 'summary_detail developer'}),
            ('span', {'class_': 'data'}),
        )
        if developer is not None:
            data_page['developer'].append(_strip_all_whitespace(developer.text))
        else:
            data_page['developer'].append('No info')

        # Get genre info (multiple genres are separated by commas in our entry)
        genres = soup_info.find('li', class_='summary_detail product_genre')
        if genres is not None:
            genre = ''
            for item in genres.find_all('span', class_='data'):
                # Only prepend a comma once something has been collected.
                genre = (genre + ',' + item.text) if genre else item.text
            data_page['genre'].append(genre)
        else:
            data_page['genre'].append('No info')

        # Get number of players
        players = _find_path(
            soup_info,
            ('li', {'class_': 'summary_detail product_players'}),
            ('span', {'class_': 'data'}),
        )
        if players is not None:
            data_page['players'].append(players.text)
        else:
            data_page['players'].append('No info')

        # Get number of critics. Every step of the lookup is guarded, so a
        # page without the expected markup yields '0' instead of raising.
        critics = _find_path(
            soup_info,
            ('div', {'class_': 'score_summary metascore_summary'}),
            ('div', {'class_': 'summary'}),
            ('a', {}),
            ('span', {}),
        )
        if critics is not None:
            data_page['critics'].append(_strip_all_whitespace(critics.text))
        else:
            data_page['critics'].append('0')

        # Get number of users (same guarded lookup; '0' when anything is missing)
        users = _find_path(
            soup_info,
            ('div', {'class_': 'details side_details'}),
            ('div', {'class_': 'score_summary'}),
            ('span', {'class_': 'count'}),
            ('a', {}),
        )
        if users is not None:
            # Drop the trailing " Ratings" label, keeping only the number.
            data_page['users'].append(re.sub(r' Ratings$', '', users.text))
        else:
            data_page['users'].append('0')

    # create a dict entry to store the dataframe for each page
    pages[str(page)] = pd.DataFrame(data_page)

    # export page data as csv, so finished pages survive an interrupted run
    pages[str(page)].to_csv('games_data-page'+str(page)+'.csv', index=False)

## Concatenating all dataframes inside 'pages' dictionary

In [None]:
# Gather the per-page DataFrames stored in `pages`, in insertion order,
# into a plain list ready for concatenation.
frames = list(pages.values())

In [None]:
# Stack the per-page frames on top of each other (row-wise) into one table.
df_ultimate = pd.concat(frames, axis=0)

Resetting the index (not really necessary...)

In [None]:
# Replace the repeated per-page row labels with one continuous 0..n-1 index.
n_rows = len(df_ultimate)
df_ultimate.index = pd.RangeIndex(n_rows)

## Exporting to csv file

In [None]:
# Write the combined table to disk; the row index is left out of the file.
df_ultimate.to_csv(path_or_buf='games-data.csv', index=False)