In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px

In [2]:
df_games = pd.read_csv("games-data-cleaned.csv")

In [3]:
df_games.head()

Unnamed: 0,name,platform,r-date,score,user score,developer,genre,players,critics,users,year,month,day,dow,score_diff,genre_list
0,The Legend of Zelda: Ocarina of Time,Nintendo64,"November 23, 1998",99,91.0,Nintendo,"actionadventure,fantasy",1,22,5749,1998,11,23,Mon,8.0,"['fantasy', 'actionadventure']"
1,Tony Hawk's Pro Skater 2,PlayStation,"September 20, 2000",98,74.0,NeversoftEntertainment,"sports,alternative,skateboarding",2,19,647,2000,9,20,Wed,24.0,"['alternative', 'sports', 'skateboarding']"
2,Grand Theft Auto IV,PlayStation3,"April 29, 2008",98,76.0,RockstarNorth,"actionadventure,modern,modern,open-world",1,64,3806,2008,4,29,Tue,22.0,"['modern', 'actionadventure', 'open-world']"
3,SoulCalibur,Dreamcast,"September 8, 1999",98,85.0,Namco,"action,fighting,3d",2,24,324,1999,9,8,Wed,13.0,"['3d', 'fighting', 'action']"
4,Grand Theft Auto IV,Xbox360,"April 29, 2008",98,79.0,RockstarNorth,"actionadventure,modern,modern,open-world",1,86,3364,2008,4,29,Tue,19.0,"['modern', 'actionadventure', 'open-world']"


There is a problem here: the `genre_list` column is now a single string, instead of a list as we wanted from the data cleaning. So, we can quickly redo our process of creating the list.

In [4]:
df_games = df_games.drop('genre_list', axis = 1)

In [5]:
df_games['genre_list'] = df_games['genre'].apply(lambda s: list(set(s.split(','))) )

In [6]:
df_games.head()

Unnamed: 0,name,platform,r-date,score,user score,developer,genre,players,critics,users,year,month,day,dow,score_diff,genre_list
0,The Legend of Zelda: Ocarina of Time,Nintendo64,"November 23, 1998",99,91.0,Nintendo,"actionadventure,fantasy",1,22,5749,1998,11,23,Mon,8.0,"[fantasy, actionadventure]"
1,Tony Hawk's Pro Skater 2,PlayStation,"September 20, 2000",98,74.0,NeversoftEntertainment,"sports,alternative,skateboarding",2,19,647,2000,9,20,Wed,24.0,"[sports, alternative, skateboarding]"
2,Grand Theft Auto IV,PlayStation3,"April 29, 2008",98,76.0,RockstarNorth,"actionadventure,modern,modern,open-world",1,64,3806,2008,4,29,Tue,22.0,"[modern, actionadventure, open-world]"
3,SoulCalibur,Dreamcast,"September 8, 1999",98,85.0,Namco,"action,fighting,3d",2,24,324,1999,9,8,Wed,13.0,"[3d, fighting, action]"
4,Grand Theft Auto IV,Xbox360,"April 29, 2008",98,79.0,RockstarNorth,"actionadventure,modern,modern,open-world",1,86,3364,2008,4,29,Tue,19.0,"[modern, actionadventure, open-world]"


# Simple Recomender - Users based

We will create a simple recommendation list to give the top games based on a metric similar to the **IMDB TOP 250 Movies** score.

Simply using the score of the game is not a great ideia, given it does not consider the popularity of the game (games with very few but passionate users will have a huge user score, but not necessarily a good critic score).

Let's use the IMDB weighted rating formula as a metric:
$$
\begin{equation}
\text Weighted Rating (\bf WR) = \left({{\bf v} \over {\bf v} + {\bf m}} \cdot R\right) + \left({{\bf m} \over {\bf v} + {\bf m}} \cdot C\right)
\end{equation}
$$

Where
* $\rm{v}$ is the number of users;
* $m$ is the minimum users required to be listed in the recommendation list;
* $R$ is the average rating of the game;
* $C$ is the mean score across the whole dataset.

We will begin by creating a dataframe with the necessary columns:

In [7]:
df_sr = df_games[['name','score','user score', 'critics', 'users', 'platform']]
df_sr.head()

Unnamed: 0,name,score,user score,critics,users,platform
0,The Legend of Zelda: Ocarina of Time,99,91.0,22,5749,Nintendo64
1,Tony Hawk's Pro Skater 2,98,74.0,19,647,PlayStation
2,Grand Theft Auto IV,98,76.0,64,3806,PlayStation3
3,SoulCalibur,98,85.0,24,324,Dreamcast
4,Grand Theft Auto IV,98,79.0,86,3364,Xbox360


The value of $C$ can be easily calculated as the `mean` of column **user score**.

In [8]:
C = df_sr['user score'].mean()
print(C)

64.87750780205083


The value of $m$ is a hyperparameter which we can tune later. For now, let's use the cutoff as the 95th percentile. That is, $m$ will be the 0.95 quantile:

In [9]:
m = df_sr['users'].quantile(0.95)
print(m)

659.8499999999985


Now we filter the games bellow the cutoff:

In [10]:
df_cut = df_sr.copy()[df_sr['users'] >= m]

Having our $C$, $m$ and the filtered set, we can write the function to calculate the weigthed rating:

In [11]:
# Function that computes the weighted rating of each game
def w_r(d, m=m, C=C):
    v = d['users']
    R = d['user score']
    # Calculation based on the IMDB formula
    return (v/(v+m) * R) + (m/(m+v) * C)

Now, we can define a new feature column named **rating** to store the result from our IMDB formula.

In [12]:
# Define a new feature 'rating' and calculate its value with `w_r()`
df_cut['rating'] = df_cut.apply(w_r, axis=1)

Finally, we can sort in descending order by this new **rating** column and check the result from our simple recommender:

In [13]:
df_cut = df_cut.sort_values('rating', ascending=False)

df_cut.head(20)

Unnamed: 0,name,score,user score,critics,users,platform,rating
110,The Witcher 3: Wild Hunt,93,94.0,32,17537,PC,92.943967
2270,Ghost of Tsushima,83,92.0,122,17420,PlayStation4,91.010126
69,God of War,94,92.0,118,16298,PlayStation4,90.944632
215,The Witcher 3: Wild Hunt,92,92.0,79,15749,PlayStation4,90.909322
58,The Last of Us Remastered,95,92.0,70,14563,PlayStation4,90.824348
4565,GrimGrimoire,79,98.0,39,2314,PlayStation2,90.650646
53,The Last of Us,95,92.0,98,11982,PlayStation3,90.584323
10608,Diaries of a Spaceport Janitor,69,97.0,7,2495,PC,90.281447
899,Counter-Strike,88,92.0,11,7112,PC,89.697231
22,Half-Life 2,96,91.0,81,10773,PC,89.492333


So, these are the 20 best games by the IMDB rating. Let's round this rating for a better visual.

In [14]:
df_cut['rating'] = round(df_cut['rating'],2)

In [15]:
df_cut.head(20)

Unnamed: 0,name,score,user score,critics,users,platform,rating
110,The Witcher 3: Wild Hunt,93,94.0,32,17537,PC,92.94
2270,Ghost of Tsushima,83,92.0,122,17420,PlayStation4,91.01
69,God of War,94,92.0,118,16298,PlayStation4,90.94
215,The Witcher 3: Wild Hunt,92,92.0,79,15749,PlayStation4,90.91
58,The Last of Us Remastered,95,92.0,70,14563,PlayStation4,90.82
4565,GrimGrimoire,79,98.0,39,2314,PlayStation2,90.65
53,The Last of Us,95,92.0,98,11982,PlayStation3,90.58
10608,Diaries of a Spaceport Janitor,69,97.0,7,2495,PC,90.28
899,Counter-Strike,88,92.0,11,7112,PC,89.7
22,Half-Life 2,96,91.0,81,10773,PC,89.49


In [16]:
# Saving this df to a csv
# df_cut.to_csv('df-simple-recommender.csv', index=False)

# Simple Recomender - Critics Based

We can repeat the whole process, but using the number of critics reviewing the game.

In [17]:
C = df_sr['score'].mean()
print('C: ',C)

# Let's use the 0.9 quantile for the critics...
m = df_sr['critics'].quantile(0.9)
print('m: ', m)

C:  70.38664734730271
m:  47.0


In [18]:
df_cut_cr = df_sr.copy()[df_sr['critics'] >= m]

In [19]:
# Function that computes the weighted rating of each game
def w_r_cr(d, m=m, C=C):
    v = d['critics']
    R = d['score']
    # Calculation based on the IMDB formula
    return (v/(v+m) * R) + (m/(m+v) * C)

In [20]:
# Define a new feature 'rating' and calculate its value with `w_r_cr()`
df_cut_cr['rating'] = df_cut_cr.apply(w_r_cr, axis=1)

df_cut_cr = df_cut_cr.sort_values('rating', ascending=False)

df_cut_cr['rating'] = round(df_cut_cr['rating'],2)

df_cut_cr.head(20)

Unnamed: 0,name,score,user score,critics,users,platform,rating
19,Super Mario Odyssey,97,89.0,113,5546,Switch,89.18
12,The Legend of Zelda: Breath of the Wild,97,86.0,109,15873,Switch,88.98
15,Red Dead Redemption 2,97,84.0,99,14315,PlayStation4,88.43
4,Grand Theft Auto IV,98,79.0,86,3364,Xbox360,88.24
27,Uncharted 2: Among Thieves,96,88.0,105,6716,PlayStation3,88.08
33,Mass Effect 2,96,89.0,98,3503,Xbox360,87.7
6,Super Mario Galaxy 2,97,91.0,87,2521,Wii,87.67
69,God of War,94,92.0,118,16298,PlayStation4,87.27
35,The Elder Scrolls V: Skyrim,96,86.0,89,3954,Xbox360,87.15
25,BioShock,96,89.0,88,2572,Xbox360,87.08


In [21]:
# Saving this df to a csv
# df_cut_cr.to_csv('df-simple-recommender-critics.csv', index=False)

# Content-Based Recommender

In [22]:
df_cb = df_games.copy()

df_cb.head()

Unnamed: 0,name,platform,r-date,score,user score,developer,genre,players,critics,users,year,month,day,dow,score_diff,genre_list
0,The Legend of Zelda: Ocarina of Time,Nintendo64,"November 23, 1998",99,91.0,Nintendo,"actionadventure,fantasy",1,22,5749,1998,11,23,Mon,8.0,"[fantasy, actionadventure]"
1,Tony Hawk's Pro Skater 2,PlayStation,"September 20, 2000",98,74.0,NeversoftEntertainment,"sports,alternative,skateboarding",2,19,647,2000,9,20,Wed,24.0,"[sports, alternative, skateboarding]"
2,Grand Theft Auto IV,PlayStation3,"April 29, 2008",98,76.0,RockstarNorth,"actionadventure,modern,modern,open-world",1,64,3806,2008,4,29,Tue,22.0,"[modern, actionadventure, open-world]"
3,SoulCalibur,Dreamcast,"September 8, 1999",98,85.0,Namco,"action,fighting,3d",2,24,324,1999,9,8,Wed,13.0,"[3d, fighting, action]"
4,Grand Theft Auto IV,Xbox360,"April 29, 2008",98,79.0,RockstarNorth,"actionadventure,modern,modern,open-world",1,86,3364,2008,4,29,Tue,19.0,"[modern, actionadventure, open-world]"


Let's clean some columns for later:

In [23]:
df_cb['platform'] = df_cb['platform'].apply(lambda s: s.lower().replace(' ',''))
df_cb['developer'] = df_cb['developer'].apply(lambda s: s.lower().replace(' ',''))
df_cb.head()

Unnamed: 0,name,platform,r-date,score,user score,developer,genre,players,critics,users,year,month,day,dow,score_diff,genre_list
0,The Legend of Zelda: Ocarina of Time,nintendo64,"November 23, 1998",99,91.0,nintendo,"actionadventure,fantasy",1,22,5749,1998,11,23,Mon,8.0,"[fantasy, actionadventure]"
1,Tony Hawk's Pro Skater 2,playstation,"September 20, 2000",98,74.0,neversoftentertainment,"sports,alternative,skateboarding",2,19,647,2000,9,20,Wed,24.0,"[sports, alternative, skateboarding]"
2,Grand Theft Auto IV,playstation3,"April 29, 2008",98,76.0,rockstarnorth,"actionadventure,modern,modern,open-world",1,64,3806,2008,4,29,Tue,22.0,"[modern, actionadventure, open-world]"
3,SoulCalibur,dreamcast,"September 8, 1999",98,85.0,namco,"action,fighting,3d",2,24,324,1999,9,8,Wed,13.0,"[3d, fighting, action]"
4,Grand Theft Auto IV,xbox360,"April 29, 2008",98,79.0,rockstarnorth,"actionadventure,modern,modern,open-world",1,86,3364,2008,4,29,Tue,19.0,"[modern, actionadventure, open-world]"


As we will use string vectorization later, let's convert year to string:

In [24]:
df_cb['year'] = df_cb['year'].apply(lambda s: str(s) )

Let's modify the players column to include it in the soup bellow:

In [25]:
df_cb['players'] = df_cb['players'].apply(lambda s: str(s)+'players' )

Some games appear in more than one platform, let's rename the ones that do to account for the platform (we want unique names in the **name** column).

In [26]:
df_cb['duplicate'] = df_cb['name'].duplicated(keep=False)

In [27]:
def rename_game(x):
    if x['duplicate']:
        return x['name']+'-'+x['platform']
    else:
        return x['name']   

In [28]:
df_cb['name'] = df_cb.apply(rename_game, axis = 1)

In [29]:
df_cb.head()

Unnamed: 0,name,platform,r-date,score,user score,developer,genre,players,critics,users,year,month,day,dow,score_diff,genre_list,duplicate
0,The Legend of Zelda: Ocarina of Time,nintendo64,"November 23, 1998",99,91.0,nintendo,"actionadventure,fantasy",1players,22,5749,1998,11,23,Mon,8.0,"[fantasy, actionadventure]",False
1,Tony Hawk's Pro Skater 2-playstation,playstation,"September 20, 2000",98,74.0,neversoftentertainment,"sports,alternative,skateboarding",2players,19,647,2000,9,20,Wed,24.0,"[sports, alternative, skateboarding]",True
2,Grand Theft Auto IV-playstation3,playstation3,"April 29, 2008",98,76.0,rockstarnorth,"actionadventure,modern,modern,open-world",1players,64,3806,2008,4,29,Tue,22.0,"[modern, actionadventure, open-world]",True
3,SoulCalibur-dreamcast,dreamcast,"September 8, 1999",98,85.0,namco,"action,fighting,3d",2players,24,324,1999,9,8,Wed,13.0,"[3d, fighting, action]",True
4,Grand Theft Auto IV-xbox360,xbox360,"April 29, 2008",98,79.0,rockstarnorth,"actionadventure,modern,modern,open-world",1players,86,3364,2008,4,29,Tue,19.0,"[modern, actionadventure, open-world]",True


Create a **soup** column merging all content in the features as list columns just created:

In [30]:
def create_soup(x):
    return ' '.join(x['genre_list']) + ' ' + x['platform'] + ' ' + x['developer'] + ' ' + x['year']+ ' ' + x['players']

In [31]:
df_cb['soup'] = df_cb.apply(create_soup, axis = 1)

In [32]:
df_cb['soup'].head(2)

0    fantasy actionadventure nintendo64 nintendo 19...
1    sports alternative skateboarding playstation n...
Name: soup, dtype: object

In [33]:
# Import CountVectorizer and create the count matrix
from sklearn.feature_extraction.text import CountVectorizer

count = CountVectorizer(stop_words='english')
count_matrix = count.fit_transform(df_cb['soup'])

In [34]:
count_matrix.shape

(17944, 4121)

In [35]:
# Compute the Cosine Similarity matrix based on the count_matrix
from sklearn.metrics.pairwise import cosine_similarity

cosine_sim = cosine_similarity(count_matrix, count_matrix)

In [38]:
# Reset index of your main DataFrame and construct reverse mapping
df_cb = df_cb.reset_index()
indices = pd.Series(df_cb.index, index=df_cb['name'])

In [39]:
# Function that takes in game name as input and outputs most similar games
def get_recommendations(name, num_of_recs = 10, cosine_sim=cosine_sim):
    # Get the index of the game that matches the title
    idx = indices[name]

    # Get the pairwsie similarity scores of all games with that games
    sim_scores = list(enumerate(cosine_sim[idx]))

    # Sort the games based on the similarity scores
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)

    # Get the scores of the 10 most similar games
    sim_scores = sim_scores[1:num_of_recs+1]

    # Get the game indices
    game_indices = [i[0] for i in sim_scores]

    # Return the top 10 most similar games
    return df_cb[['name','platform','year']].iloc[game_indices]

Now, we can use our function to get the top 10 recommendations on a given game.
Let's get recommendations for `Super Smash Bros. Ultimate`:

In [40]:
get_recommendations('Super Smash Bros. Ultimate',10, cosine_sim)

Unnamed: 0,name,platform,year
12702,Kirby Fighters 2,switch,2020
988,Dragon Ball FighterZ-switch,switch,2018
4952,BlazBlue: Cross Tag Battle-switch,switch,2018
7138,Blade Strangers,switch,2018
14552,SNK Heroines: Tag Team Frenzy-switch,switch,2018
7452,Pocket Rumble,switch,2018
842,Super Mario Maker 2,switch,2019
1001,Guacamelee! 2-switch,switch,2018
1453,BlazBlue: Central Fiction - Special Edition,switch,2019
4268,Skullgirls: 2nd Encore-switch,switch,2019


In [41]:
# Saving the df as csv
# df_cb.to_csv('games-recommender.csv',index=False)