# Data Collection: List the Best and Worst Games
As we are trying to answer the question of "What makes a game a bestseller?" it seems a   
good place to start would be to look at which games are actually bestsellers.  
First we will scrape some lists of games to create shortlists of the 'best' games  
according to commercial success (revenue) or by overall popularity (the number of players).

In [None]:
#Import necessary libraries
mport requests
from bs4 import BeautifulSoup
from slugify import slugify
from selenium import webdriver 
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import Select
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import pandas as pd
import numpy as np
import getpass
import pickle
import re
from time import sleep
from random import randint 
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline 
pd.set_option('max_rows',None)
pd.set_option('max_columns',None)

# The RAWG dataset
Load the RAWG community list of game data scraped from RAWG's web API.  
The RAWG game dataset contains:
- unique rawg community ids for each game.
- titles
- release date information
- aggregated user ratings from its own community
- aggregated ratings from metacritic (where available)
- average length
- available platforms
- genres
- esrb rating

There are other columns that are not necessary for our purposes, such as 'tba' and 'genre_ids',
so we can drop them.


In [None]:
# Read the RAWG export from disk.
rawg_data = pd.read_csv('games_data.csv')
rawg_data.columns
# Discard the saved index column and the fields we do not need.
rawg_data = rawg_data.drop(columns=['Unnamed: 0', 'tba', 'genre_ids'])
rawg_data.shape

In [None]:
# Add a column for overall sentiment.
We will also create a new column to denote whether a given game has an overall positive or negative rating.  
For now, we can think of these as very general sentiment labels for each game.

As we are using the Metacritic score as the main indicator of user attitude, we will use its 0-100 scale as the basis for 
classification. We can rescale this to a 1-5 scale by dividing the Metacritic score by 20 and applying the np.ceil() function.
This should give us a rating of the overall sentiment towards each game, ranging from 1 ('Very Negative') to 5 ('Very Positive').

In [None]:
# define a function to convert metacritic score to a broad rating classifier.
def metacritic_reception(x):
    """Convert a 0-100 Metacritic score into a broad reception label.

    The score is divided by 20 and rounded up, giving a band from 1 to 5 that
    is mapped to a label from 'Very Negative' (1) to 'Very Positive' (5).

    Args:
        x: Metacritic score on the 0-100 scale. ``None`` or NaN marks a game
            that has no score.

    Returns:
        The reception label, or ``np.nan`` when the score is missing.

    Raises:
        KeyError: If the score falls outside the 0-100 range.
    """
    sentiment_scale = {
        1: 'Very Negative',
        2: 'Negative',
        3: 'Neutral',
        4: 'Positive',
        5: 'Very Positive',
    }
    # Games without a score cannot be classified; propagate the missing value
    # instead of failing on the label lookup.
    if x is None or np.isnan(x):
        return np.nan
    band = int(np.ceil(x / 20))
    # A score of exactly 0 rounds to band 0, which has no label; it belongs
    # to the lowest band.
    if x == 0:
        band = 1
    return sentiment_scale[band]

# Test: metacritic_reception(72.977)


In [None]:
# Label every game with the broad reception category derived from its
# Metacritic score.
# NOTE(review): rawg_refined is not defined in the cells visible here;
# presumably it is a refined copy of rawg_data - confirm.
rawg_refined['user reception'] = rawg_refined['metacritic_score'].apply(metacritic_reception)
rawg_refined.head()

In [None]:
# set up a scraping method to scrape games in the list.
# define function to convert game titles to a searchable format
def name_searchable(x_input):
    """Convert a game title into its slug form for use in a search URL.

    Args:
        x_input: The game title to convert.

    Returns:
        The value returned by ``slugify`` for the title.
    """
    # The original body read an unassigned local ``name`` instead of the
    # parameter, so every call raised UnboundLocalError.
    return slugify(x_input)

def rawg_reviews(name):
    """Scrape the review texts shown on a game's RAWG page.

    The title is slugified to build the page URL, the page is downloaded and
    the text of every ``div`` with class ``review-card__text`` is collected.

    Args:
        name: The game title to look up.

    Returns:
        A list with the text of each review card found; empty when the page
        has no review cards or the request fails.
    """
    query = slugify(name)
    try:
        # A timeout keeps a single stalled connection from hanging the scrape.
        response = requests.get(f'https://rawg.io/games/{query}', timeout=30)
    except requests.RequestException as err:
        # One unreachable page should not abort a long scraping run; report it
        # and treat the game as having no reviews.
        print(f'rawg_reviews: request for {query!r} failed: {err}')
        reviews = []
    else:
        soup = BeautifulSoup(response.content, 'html.parser')
        reviews = [x.get_text() for x in soup.find_all('div', {'class': 'review-card__text'})]
    # Pause for a random 1-4 seconds between requests.
    sleep(randint(1, 4))
    return reviews


In [None]:
# Scrape the review texts for every title, in row order, and store them
# alongside the game data.
rawg_refined['reviews'] = list(map(rawg_reviews, rawg_refined['title']))

In [None]:
# Rank the RAWG games from the highest to the lowest Metacritic score and
# save a pickled snapshot of the ranked data (games without reviews included).
rawg_refined.sort_values(by='metacritic_score', ascending=False, inplace=True)
with open('rawg_refined_dataset.p', 'wb') as f:
    pickle.dump(rawg_refined, f)

# Remove any games that do not have any associated reviews. A single boolean
# mask replaces dropping the rows one at a time.
rawg_reviews_data = rawg_refined.reset_index()
has_reviews = rawg_reviews_data['reviews'].map(lambda reviews: reviews != [])
rawg_reviews_data = rawg_reviews_data[has_reviews]

# Split the ranked games into the top 200, the bottom 200 and a random sample
# of up to 200 games from the mid range of scores (67-81 inclusive).
# head/tail return exactly 200 rows, or every row when fewer are available.
rawg_top_200 = rawg_reviews_data.head(200)
rawg_bottom_200 = rawg_reviews_data.tail(200)
# Both bounds must be applied together; filtering in two separate
# assignments keeps only the second condition.
in_mid_range = rawg_reviews_data['metacritic_score'].between(67, 81)
rawg_mid_range = rawg_reviews_data[in_mid_range]
# sample() raises when asked for more rows than exist, so cap the size.
rawg_mid_sample = rawg_mid_range.sample(min(200, len(rawg_mid_range)))
rawg_mid_range.head()

In [None]:
# Write the reviews of the top-ranked games to a text file, one game's list
# of reviews per line. The with-block closes the file even if a write fails.
with open('rawg_top_200_text.txt', 'w', encoding='utf-8') as rawg_top_200_file:
    for item in rawg_top_200['reviews']:
        if len(item) > 0:
            rawg_top_200_file.write(f"{item}\n")



# Write the reviews of the bottom-ranked games to a text file, one game's
# list of reviews per line. The with-block closes the file even if a write fails.
with open('rawg_bottom_200_text.txt', 'w', encoding='utf-8') as rawg_bottom_200_file:
    for item in rawg_bottom_200['reviews']:
        if len(item) > 0:
            rawg_bottom_200_file.write(f"{item}\n")



# Write the reviews of the mid-range sample to a text file, one game's list
# of reviews per line. The with-block closes the file even if a write fails.
with open('rawg_mid_sample.txt', 'w', encoding='utf-8') as mid_sample_file:
    for item in rawg_mid_sample['reviews']:
        if len(item) > 0:
            mid_sample_file.write(f"{item}\n")

In [None]:
# The STEAM Dataset

In addition to the data we have been scraping from the Steam Store application, I have also
downloaded Nik Davis' dataset of Steam Games (uncleaned) from Kaggle (https://www.kaggle.com/nikdavis/steam-store-games).  
Ideally, I would have used the data I collected on my own, but in the interest of time, this dataset should work well too.   
However, it is worth noting that some of the information might be outdated, as this dataset was uploaded in 2019.

In [None]:
# Nik Davis' Steam apps dataset.
steam_data_backup = pd.read_csv('steam_app_data.csv')

# Clean up the dataframe and format it to match the RAWG data. The explicit
# copy makes the subset an independent frame, so the column assignments done
# later by steam_cleaner do not operate on a view of steam_data_backup.
steam_data_plan_b = steam_data_backup[['type', 'name', 'steam_appid', 'release_date', 'developers', 'publishers', 'platforms', 'metacritic', 'reviews', 'genres', 'recommendations']].copy()
steam_data_plan_b.head()


In [None]:
# Steam app data scraped from Steam Store.
# NOTE: pickle.load can execute arbitrary code; only load files that were
# produced by this project.
# Each file is opened in a with-block so its handle is closed after loading.
with open('steam_data_list.p', 'rb') as f:
    steam1 = pickle.load(f)
with open('steam_data_list2.p', 'rb') as f:
    steam2 = pickle.load(f)
with open('steam_data_list3.p', 'rb') as f:
    steam3 = pickle.load(f)
steam_data = steam1 + steam2 + steam3
len(steam_data)

In [None]:
# define function to extract metacritic scores from dict object
import ast

def meta_metascore(x):
    """Extract the Metacritic score from a raw ``metacritic`` cell.

    Args:
        x: Either the string form of a dict that has a ``'score'`` key, an
            already parsed dict, or a missing value (NaN / ``None``).

    Returns:
        The value stored under ``'score'``, or ``np.nan`` when the cell holds
        no dict.
    """
    # isinstance replaces ``type(x) != None``, which is always true and let
    # None through to literal_eval, where it raised.
    if isinstance(x, str):
        x = ast.literal_eval(x)
    if isinstance(x, dict):
        return x['score']
    # np.nan is used because the np.NaN alias no longer exists in NumPy 2.
    return np.nan

# define a function to extract the release date
def get_release_date(x):
    """Extract the release date from a raw ``release_date`` cell.

    Args:
        x: Either the string form of a dict that has a ``'date'`` key, an
            already parsed dict, or a missing value (NaN / ``None``).

    Returns:
        The value stored under ``'date'``, or ``np.nan`` when the cell holds
        no dict.
    """
    if isinstance(x, str):
        # literal_eval only accepts Python literals, so text read from the
        # data file can no longer run arbitrary code as it could with eval().
        x = ast.literal_eval(x)
    if isinstance(x, dict):
        return x['date']
    # np.nan is used because the np.NaN alias no longer exists in NumPy 2.
    return np.nan

# define a function to extract the platforms 
def get_platform(x):
    """List the platforms flagged as supported in a raw ``platforms`` cell.

    Args:
        x: Either the string form of a dict mapping platform names to truthy
            or falsy flags, an already parsed dict, or a missing value
            (NaN / ``None``).

    Returns:
        A list with the keys whose value is truthy, in dict order. When the
        cell holds no dict the list contains a single ``np.nan``.
    """
    # isinstance replaces ``type(x) != None``, which is always true and let
    # None through to literal_eval, where it raised.
    if isinstance(x, str):
        x = ast.literal_eval(x)
    if isinstance(x, dict):
        return [name for name, supported in x.items() if supported]
    # np.nan is used because the np.NaN alias no longer exists in NumPy 2.
    return [np.nan]

# define a function to extract the genres
def get_genres(x):
    """List the genre descriptions found in a raw ``genres`` cell.

    Args:
        x: Either the string form of a list of dicts that each have a
            ``'description'`` key, an already parsed list, or a missing value
            (NaN / ``None``).

    Returns:
        A list with the ``'description'`` of every entry, or ``None`` when the
        cell holds no list.
    """
    # isinstance replaces ``type(x) != None``, which is always true and let
    # None through to literal_eval, where it raised.
    if isinstance(x, str):
        x = ast.literal_eval(x)
    if isinstance(x, (list, tuple)):
        return [entry['description'] for entry in x]
    # Missing genres have always been reported as None by this function.
    return None

# define a function to extract the number of user recommendations
def get_recommendations(x):
    """Extract the number of user recommendations from a raw cell.

    Args:
        x: Either the string form of a dict that has a ``'total'`` key, an
            already parsed dict, or a missing value (NaN / ``None``).

    Returns:
        The value stored under ``'total'``, or ``np.nan`` when the cell holds
        no dict.
    """
    # isinstance replaces ``type(x) != None``, which is always true and let
    # None through to literal_eval, where it raised.
    if isinstance(x, str):
        x = ast.literal_eval(x)
    if isinstance(x, dict):
        return x['total']
    # np.nan is used because the np.NaN alias no longer exists in NumPy 2.
    return np.nan


In [None]:
# assemble formatting functions as single function.
def steam_cleaner(steam_df):
    """Run every column-level parser over a Steam dataframe.

    Each listed column is replaced, in place, by the result of applying its
    parser to every cell.

    Args:
        steam_df: Dataframe with 'release_date', 'platforms', 'metacritic',
            'genres' and 'recommendations' columns.

    Returns:
        The same dataframe object that was passed in, with those columns
        parsed.
    """
    column_parsers = {
        'release_date': get_release_date,
        'platforms': get_platform,
        'metacritic': meta_metascore,
        'genres': get_genres,
        'recommendations': get_recommendations,
    }
    for column, parser in column_parsers.items():
        steam_df[column] = steam_df[column].apply(parser)
    return steam_df

In [None]:
# Parse the raw Steam columns of the backup dataset into plain values.
steam_data_plan_b = steam_data_plan_b.pipe(steam_cleaner)

In [None]:
# Count the missing values in each column.
steam_data_plan_b.isna().sum()
# 26,254 rows do not have associated scores. These games may not be useful for our assessment so we will drop them.
# NOTE(review): dropna() removes rows with a missing value in ANY column, not
# only the score column - confirm that this is intended.
print(steam_data_plan_b.shape)
steam_data_plan_b = steam_data_plan_b.dropna(axis=0, how='any')
steam_data_plan_b.shape


In [None]:
# get steam reviews for the back up data
# # Set up a package to query each of the ids on steam, and scrape reviews for textual analysis.
# import steamreviews

# params = {'language':'english'}
# theGood_Reviews = steamreviews.download_reviews_for_app_id_batch(steam_top_350['steam_appid'])

In [None]:
# params = {'language':'english'}
# theBad_Reviewss = steamreviews.download_reviews_for_app_id_batch(steam_bottom_350['steam_appid'])

In [None]:
# params = {'language':'english'}
# theAlright_Reviewss = steamreviews.download_reviews_for_app_id_batch(steam_middle_350['steam_appid'])