In [None]:
%load_ext autoreload
%autoreload 2

import pandas as pd
import numpy as np
import sys
from scipy.sparse import csr_matrix
from collections import Counter

folder = "trictrac_database" 

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [7]:
# loading clean databases
avis_clean = pd.read_csv(f'{folder}/avis_clean.csv', header=None, names=["Game id", "User id", "Game name UI", "Username", "Datetime", "Rating", "Comment title", "Comment body"]).drop_duplicates()
users = pd.read_csv(f'{folder}/users.csv', header=None, names = ["Username", "User id"]).drop_duplicates()
avis_clean["Datetime"] = pd.to_datetime(avis_clean["Datetime"])
jeux_clean = pd.read_csv("database_cleaned/jeux_clean.csv")

In [119]:
print("Number of unique rated games", len(np.unique(avis_clean['Game id'])))
print("Number of unique users that gave a rating", len(np.unique(avis_clean['User id'])))
print("NUmber of unique user in users database", len(np.unique(users['User id'])))

Number of unique rated games 10549
Number of unique users that gave a rating 11201
NUmber of unique user in users database 11201


In [181]:
user_game_score = avis_clean[["User id","Game id", "Rating"]]

# weighting user's score by substracting their average rating, test
user_rate_mean = user_game_score[["User id", "Rating"]].groupby("User id").mean().rename(columns={"Rating":"Average rate"}).reset_index()
data_weight_rate = user_game_score.merge(user_rate_mean, on="User id")
data_weight_rate["Rating"] -= data_weight_rate["Average rate"] 

# normalizing the scores between 0-1 : norma with user's min max
user_min_max = data_weight_rate.groupby("User id").agg({"Rating": ['min','max']}).reset_index()
user_min_max.columns = ["User id", "Min", "Max"] # flattening because of multi index on min and max
data_w_min_max = data_weight_rate.merge(user_min_max , on="User id")

#data_w_min_max["Rating"] = (data_w_min_max["Rating"] - data_w_min_max["Min"])/(data_w_min_max["Max"] - data_w_min_max["Min"])

# to avoid NaN values if min == max, set to 0 if it is the case
data_w_min_max["Rating"] = np.where((data_w_min_max["Max"] - data_w_min_max["Min"]) != 0,
(data_w_min_max["Rating"] - data_w_min_max["Min"])/(data_w_min_max["Max"] - data_w_min_max["Min"]), 0)

In [182]:
# users and games id start with index 0: we can use that to map and create a matrix users x games 
# matrix row : user id, col: game id, score given to game by user 

rows = user_game_score["User id"][:]
cols = user_game_score["Game id"][:]
data = data_w_min_max["Rating"][:]

sparse_matrix = csr_matrix((data, (rows, cols))) # sparse matrix
mat = sparse_matrix.toarray()

print(f"Size using array matrix : {sys.getsizeof(mat)} bytes\nSize using sparse matrix : {sys.getsizeof(sparse_matrix)} bytes")
print("Matrix size : ", sparse_matrix.shape)

Size using array matrix : 945274920 bytes
Size using sparse matrix : 48 bytes
Matrix size :  (11201, 10549)


In [183]:
# verify the matrix has correct values
print(len(user_game_score[user_game_score["User id"] == 523]))
np.diff(sparse_matrix.indptr)[523] # number of non zero values

8


8

In [197]:
print(sparse_matrix[345][:])
avis_clean[avis_clean["User id"]== 345]

  (0, 5159)	1.0
  (0, 5948)	0.0


Unnamed: 0,Game id,User id,Game name UI,Username,Datetime,Rating,Comment title,Comment body
485,5948,345,Love Letter,blochjo6,2021-03-16 14:10:50,1.0,Lettre morte,"Love Letter, un jeu minimaliste, japonais, sim..."
23659,5159,345,Le Dilemme du Roi,blochjo6,2021-06-01 11:33:24,2.0,"Oui. Mais alors, vraiment non",A - Préambule Cette revue ne contient pas de s...
