In [1]:
import os

from matplotlib import pyplot as plt
from datetime import datetime
import seaborn as sns
import pandas as pd
import numpy as np

In [2]:
SEP       = os.path.sep 
ROOT_PATH = SEP.join(os.getcwd().split(SEP)[:-3])
DATA_PATH = f'{ROOT_PATH}/Dataset/EpicStore'

In [3]:
critic_df = pd.read_csv(f'{DATA_PATH}/open_critic.csv')
critic_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 17584 entries, 0 to 17583
Data columns (total 8 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   id          17584 non-null  object 
 1   company     17584 non-null  object 
 2   author      15769 non-null  object 
 3   rating      17053 non-null  float64
 4   comment     17428 non-null  object 
 5   date        17584 non-null  object 
 6   top_critic  17584 non-null  bool   
 7   game_id     17584 non-null  object 
dtypes: bool(1), float64(1), object(6)
memory usage: 978.9+ KB


In [4]:
games_df  = pd.read_csv(f'{DATA_PATH}/games.csv')
games_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 915 entries, 0 to 914
Data columns (total 10 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   id            915 non-null    object
 1   name          915 non-null    object
 2   game_slug     915 non-null    object
 3   price         915 non-null    int64 
 4   release_date  915 non-null    object
 5   platform      783 non-null    object
 6   description   915 non-null    object
 7   developer     712 non-null    object
 8   publisher     707 non-null    object
 9   genres        757 non-null    object
dtypes: int64(1), object(9)
memory usage: 71.6+ KB


In [5]:
games_df.rename(columns = {'id' : 'game_id'}, inplace = True)
df        = pd.merge(left = games_df, right = critic_df, how = 'right', on = 'game_id')

In [17]:
critic_df['purchase_games'] = [len(critic_df[critic_df.author == author]) for author in critic_df.author.unique()]
critic_df.purchase_games

3809

In [6]:
df = df.dropna()
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 11351 entries, 0 to 17581
Data columns (total 17 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   game_id       11351 non-null  object 
 1   name          11351 non-null  object 
 2   game_slug     11351 non-null  object 
 3   price         11351 non-null  int64  
 4   release_date  11351 non-null  object 
 5   platform      11351 non-null  object 
 6   description   11351 non-null  object 
 7   developer     11351 non-null  object 
 8   publisher     11351 non-null  object 
 9   genres        11351 non-null  object 
 10  id            11351 non-null  object 
 11  company       11351 non-null  object 
 12  author        11351 non-null  object 
 13  rating        11351 non-null  float64
 14  comment       11351 non-null  object 
 15  date          11351 non-null  object 
 16  top_critic    11351 non-null  bool   
dtypes: bool(1), float64(1), int64(1), object(14)
memory usage: 1.5+ MB


In [7]:
df.price          = df.price.apply(lambda        x: int(x) / 100)
df.genres         = df.genres.apply(lambda       x: x.split(','))
df.platform       = df.platform.apply(lambda     x: x.split(','))
df.publisher      = df.publisher.apply(lambda    x: x.split(','))
df.release_date   = df.release_date.apply(lambda x: '-'.join(x.split('-')[:2]))
df = df.drop(['game_slug', 'date', 'id'], axis = 1)

In [8]:
df.sample(3).T

Unnamed: 0,15393,15755,2677
game_id,c4920a39dfb74fb2b091fc5eaddcabb1,dfae164155c84664b9040dd802bb2669,bf83aee67dd1475fb6bf2c8563f14b70
name,Neon City Riders: Super-powered Edition,Sherlock Holmes: The Devil's Daughter,Cities: Skylines
price,14.99,29.99,29.99
release_date,2021-08,2021-06,2020-12
platform,[Windows],[Windows],[Windows]
description,Explore a post-cyberpunk decaying city searchi...,Sherlock Holmes: The Devil’s Daughter is a fan...,Cities: Skylines is a modern take on the class...
developer,Mecha Studios,Frogwares,Colossal Order
publisher,[Mecha Studios],[Frogwares],[Paradox Interactive]
genres,"[ACTION, ADVENTURE, OPEN_WORLD]","[ACTION, ADVENTURE, INVESTIGATION]","[STRATEGY, SIMULATION, CITY_BUILDER]"
company,Atomix,Next Gen Base,Press Start


In [11]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 11351 entries, 0 to 17581
Data columns (total 14 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   game_id       11351 non-null  object 
 1   name          11351 non-null  object 
 2   price         11351 non-null  float64
 3   release_date  11351 non-null  object 
 4   platform      11351 non-null  object 
 5   description   11351 non-null  object 
 6   developer     11351 non-null  object 
 7   publisher     11351 non-null  object 
 8   genres        11351 non-null  object 
 9   company       11351 non-null  object 
 10  author        11351 non-null  object 
 11  rating        11351 non-null  float64
 12  comment       11351 non-null  object 
 13  top_critic    11351 non-null  bool   
dtypes: bool(1), float64(2), object(11)
memory usage: 1.2+ MB


In [9]:
user2idx = {user : idx  for idx , user in enumerate(df.author.unique())}
idx2user = {idx  : user for user, idx  in user2idx.items()}

game2idx = {game : idx  for idx , game in enumerate(df.name.unique())}
idx2game = {idx  : game for game, idx  in game2idx.items()}

user_idx = df.author.apply(lambda x: user2idx[x]).values
game_idx = df.name.apply(lambda   x: game2idx[x]).values

In [10]:
n_users  = len(df.author.unique())
n_games  = len(df.name.unique())

zero_mat       = np.zeros(shape = (n_users, n_games))
user_game_pref = zero_mat.copy()
user_game_pref[user_idx, game_idx] = 1

In [12]:
user_game_pref

array([[1., 0., 0., ..., 0., 0., 0.],
       [1., 0., 0., ..., 0., 0., 0.],
       [1., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 1., 0.],
       [0., 0., 0., ..., 0., 0., 1.],
       [0., 0., 0., ..., 0., 0., 1.]])