# Game urls

In [34]:
# Imports
import requests
from bs4 import BeautifulSoup
from time import sleep
from time import time
from random import randint
from IPython.core.display import clear_output
from warnings import warn
import pandas as pd

# Request headers passed to every requests.get call in this file.
# The User-Agent is a browser-like string
# NOTE(review): presumably needed because the site rejects the default requests User-Agent - confirm
headers = {'User-Agent':'Mozilla/5.0 (X11; Ubuntu; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.149 Safari/537.36 RuxitSynthetic/1.0 v316542848 t18859'}

In [35]:
# Get all game urls
categories = ['ps4', 'xboxone', 'switch', 'pc', 'ios', 'wii-u', '3ds', 'vita']

# List to store game urls (kept in first-seen order)
game_urls = []
# The same urls as a set, so the duplicate check does not rescan the whole list
seen_game_urls = set()


def _collect_game_urls(page_html):
    """Add the url of every game listed on one browse page to game_urls.

    page_html is the parsed BeautifulSoup document of a browse page.
    Urls that were already collected are skipped.
    """
    # Get the game containers
    game_containers = page_html.find_all('li', class_='product game_product')
    # First and last games have different class names
    game_containers.append(page_html.find('li', class_='product game_product first_product'))
    game_containers.append(page_html.find('li', class_='product game_product last_product'))

    for container in game_containers:
        # find() returns None when the page has no first/last item
        if container is None or container.a is None:
            continue
        game_url = container.a.get('href')
        # Append to list if not exists
        if game_url and game_url not in seen_game_urls:
            seen_game_urls.add(game_url)
            game_urls.append(game_url)


def _next_browse_page(page_html):
    """Return the <a> tag of the next page button, or None if there is none."""
    next_button = page_html.find('span', class_='flipper next')
    if next_button is None:
        return None
    # The button has no <a> child when there is no further page
    return next_button.a


# For each category
for category in categories:
    page_url = 'https://www.metacritic.com/browse/games/release-date/available/'+category+'/date'
    # Text printed to monitor the loop: the full url for the first page,
    # then the relative reference of each next page
    shown_url = page_url

    while True:
        # Download the page
        response = requests.get(page_url, headers=headers)
        # Pause the loop
        sleep(randint(8,15))
        # Monitor the loop
        print(shown_url)
        clear_output(wait=True)

        # Parse the content of the request with BeautifulSoup
        page_html = BeautifulSoup(response.text, 'html.parser')

        # Get the game urls of this page
        _collect_game_urls(page_html)

        # Go to the next page, stop when the button has no reference
        next_page = _next_browse_page(page_html)
        if next_page is None:
            break
        shown_url = next_page['href']
        page_url = 'https://www.metacritic.com'+shown_url

/browse/games/release-date/available/vita/date?page=8


In [37]:
# Put the collected game urls into a single-column dataframe
game_urls_df = pd.DataFrame(data=dict(url=game_urls))

In [41]:
# Show the (rows, columns) of the dataframe, i.e. how many game urls were collected
game_urls_df.shape

(77817, 1)

In [42]:
# Save the game urls into a csv (the dataframe index is not written)
game_urls_df.to_csv('game_urls.csv', index=False)

# Game details

In [None]:
# Lists to store the scraped data
# Every list gets exactly one entry per game, so they always have the same length
urls = []
names = []
release_dates = []
game_platforms = []
descriptions = []
developers = []
genres_list = []
online_players_list = []


def _text_or_none(tag):
    """Return the text of a tag, or None when the tag was not found."""
    return tag.text if tag is not None else None


def _data_text(container):
    """Return the text of the 'data' span inside container, or None if either is missing."""
    if container is None:
        return None
    return _text_or_none(container.find('span', class_='data'))


for game in game_urls:
    # Download the page
    response = requests.get('https://www.metacritic.com'+game+'/details', headers=headers)
    # Pause the loop
    sleep(randint(8,15))
    # Monitor the loop
    print(game)
    clear_output(wait=True)

    # Parse the content of the request with BeautifulSoup
    page_html = BeautifulSoup(response.text, 'html.parser')

    # The name
    name = _text_or_none(page_html.h1)

    # The release date
    release_date = _data_text(page_html.find('li', class_='summary_detail release_data'))

    # The game platforms
    primary_platform = _text_or_none(page_html.find('span', class_='platform'))
    if primary_platform is not None:
        primary_platform = primary_platform.replace('\n','').strip()
    other_platforms = _data_text(page_html.find('li', class_='summary_detail product_platforms'))
    if other_platforms is not None:
        other_platforms = other_platforms.replace('\n', '').replace(' ','')
    # Join whatever was found; a page without the other platforms section keeps only the primary one
    found_platforms = [p for p in (primary_platform, other_platforms) if p]
    platforms = ','.join(found_platforms) if found_platforms else None

    # The description
    description = _data_text(page_html.find('div', class_='summary_detail product_summary'))

    # Developer, genres, and number of online players
    # NOTE(review): the values are read by position (cells 1, 2, 3 of the second
    # product_details block); confirm the row order is the same on every page
    details_blocks = page_html.find_all('div', class_='product_details')
    details_values = details_blocks[1].find_all('td') if len(details_blocks) > 1 else []
    developer = details_values[1].text if len(details_values) > 1 else None
    genres = None
    if len(details_values) > 2:
        genres = details_values[2].text.replace('\r\n', '').replace(' ', '')
    online_players = details_values[3].text if len(details_values) > 3 else None

    # Store the whole row only once every field is parsed, missing fields as None
    # The game url will be used to assign an id for ratings & reviews
    urls.append(game)
    names.append(name)
    release_dates.append(release_date)
    game_platforms.append(platforms)
    descriptions.append(description)
    developers.append(developer)
    genres_list.append(genres)
    online_players_list.append(online_players)

In [None]:
# Store into a dataframe
# One column per scraped list, one row per game
games = pd.DataFrame(data=dict(
    url=urls,
    name=names,
    release_date=release_dates,
    platforms=game_platforms,
    description=descriptions,
    developer=developers,
    genres=genres_list,
    number_online_players=online_players_list,
))

In [None]:
# Save into a csv for further processing if needed (the dataframe index is not written)
games.to_csv('games.csv', index=False)

# Game ratings & reviews

In [None]:
# Lists to store the scraped data
urls = []
user_names = []
ratings = []
reviews = []
review_dates = []


def _collect_user_reviews(page_html, game):
    """Append the users, ratings, reviews and dates of one user reviews page to the lists.

    page_html is the parsed BeautifulSoup document of the page and game is the
    game url stored next to every user name. Nothing is appended when the page
    has no user reviews section.
    """
    # Reviews section
    all_reviews = page_html.find('ol', class_='reviews user_reviews')

    # If there is no user reviews section
    if all_reviews is None:
        return

    # NOTE(review): users, ratings, reviews and dates are collected independently,
    # so the lists only stay aligned if every review has all four elements - confirm

    # The users
    for user in all_reviews.find_all('div', class_='name'):
        user_names.append(user.text.replace('\n', ''))
        # Save the game url
        urls.append(game)

    # The ratings
    for grade in all_reviews.find_all('div', class_='review_grade'):
        # The grade shown on the page is halved before it is stored
        ratings.append(int(grade.text.replace('\n', ''))/2)

    # The reviews
    for body in all_reviews.find_all('div', class_='review_body'):
        # A review body without a span is stored as a single space
        reviews.append(body.span.text if body.span else ' ')

    # The review dates
    for date in all_reviews.find_all('div', class_='date'):
        review_dates.append(date.text)


def _next_reviews_page(page_html):
    """Return the <a> tag of the next page button, or None if there is none."""
    next_button = page_html.find('span', class_='flipper next')
    if next_button is None:
        return None
    # The button has no <a> child when there is no further page
    return next_button.a


for game in game_urls:
    page_url = 'https://www.metacritic.com'+game+'/user-reviews'
    # Text printed to monitor the loop: the game url for the first page,
    # then the relative reference of each next page
    shown_url = game

    while True:
        # Download the page
        response = requests.get(page_url, headers=headers)
        # Pause the loop
        sleep(randint(8,15))
        # Monitor the loop
        print(shown_url)
        clear_output(wait=True)

        # Parse the content of the request with BeautifulSoup
        page_html = BeautifulSoup(response.text, 'html.parser')

        # Store the reviews of this page
        _collect_user_reviews(page_html, game)

        # Go to the next page, stop when the button has no reference
        next_page = _next_reviews_page(page_html)
        if next_page is None:
            break
        shown_url = next_page['href']
        page_url = 'https://www.metacritic.com'+shown_url

In [None]:
# One column per scraped list, one row per user review
game_ratings = pd.DataFrame(data=dict(
    url=urls,
    user_name=user_names,
    rating=ratings,
    review=reviews,
    review_date=review_dates,
))

In [None]:
# Save the ratings & reviews into a csv (the dataframe index is not written)
game_ratings.to_csv('game_ratings.csv', index=False)