In [93]:
%load_ext autoreload
%autoreload 2

import pandas as pd
import numpy as np
from scipy.sparse import csr_matrix

# Directory from which avis_clean.csv and users.csv are read by the loading cell below.
folder = "trictrac_database" 

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [94]:
# Load the cleaned tables. The review and user CSV files are read without a
# header row, so their column names are supplied explicitly.
avis_columns = [
    "Game id",
    "User id",
    "Game name UI",
    "Username",
    "Datetime",
    "Rating",
    "Comment title",
    "Comment body",
]
avis_clean = pd.read_csv(f"{folder}/avis_clean.csv", header=None, names=avis_columns)
avis_clean = avis_clean.drop_duplicates()

users = pd.read_csv(f"{folder}/users.csv", header=None, names=["Username", "User id"])
users = users.drop_duplicates()

# Parse the review timestamps into datetime values.
avis_clean["Datetime"] = pd.to_datetime(avis_clean["Datetime"])

# The games table is read from a different directory and keeps its own header row.
jeux_clean = pd.read_csv("database_cleaned/jeux_clean.csv")

In [95]:
# Report how many distinct ids appear in each table.
for label, ids in (
    ("Number of unique rated games", avis_clean["Game id"]),
    ("Number of unique users that gave a rating", avis_clean["User id"]),
    ("NUmber of unique user in users database", users["User id"]),
):
    print(label, len(np.unique(ids)))

Number of unique rated games 10549
Number of unique users that gave a rating 11201
NUmber of unique user in users database 11201


In [None]:
def center_score(df: pd.DataFrame) -> pd.DataFrame:
    """
    Center each user's ratings around that user's own mean.

    For a rating x given by user i, the centered value is x - mean(ratings of i).
    This removes the per-user bias between scores (some users rate
    systematically higher or lower than others).

    Parameters
    ----------
    df : pd.DataFrame
        Frame with at least the columns 'User id' and 'Rating'
        (e.g. the avis_clean table). It is not modified.

    Returns
    -------
    pd.DataFrame
        A deep copy of ``df``, with the same index and row order, whose
        'Rating' column holds the centered scores.
    """
    centered = df.copy(deep=True)

    # transform() returns one value per input row, carrying the input's index.
    # A merge would instead produce a fresh 0..n-1 index, and assigning its
    # column back would misalign rows whenever df has a non-default index
    # (for instance after drop_duplicates() or boolean filtering).
    user_mean = centered.groupby("User id")["Rating"].transform("mean")

    centered["Rating"] = centered["Rating"] - user_mean
    return centered

def normalize(df: pd.DataFrame) -> pd.DataFrame:
    """
    Min-max scale each user's ratings using that user's own range.

    For a rating x given by user i, the scaled value is
    (x - min_i) / (max_i - min_i), where min_i and max_i are the lowest and
    highest ratings given by user i.

    Parameters
    ----------
    df : pd.DataFrame
        Frame with at least the columns 'User id' and 'Rating'
        (e.g. the avis_clean table). It is not modified.

    Returns
    -------
    pd.DataFrame
        A deep copy of ``df``, with the same index and row order, whose
        'Rating' column holds the scaled scores. Rows whose scaled value is
        NaN (a user whose ratings are all equal gives 0/0) are set to 0.
    """
    scaled = df.copy(deep=True)

    # transform() keeps the input's index, so the per-user min/max line up
    # row by row with the ratings. Merging and assigning the merged column
    # back would align on a fresh 0..n-1 index and scramble the rows whenever
    # df has a non-default index.
    ratings_by_user = scaled.groupby("User id")["Rating"]
    user_min = ratings_by_user.transform("min")
    user_range = ratings_by_user.transform("max") - user_min

    scaled["Rating"] = (scaled["Rating"] - user_min) / user_range
    scaled["Rating"] = scaled["Rating"].fillna(0)  # NaN from division by 0

    return scaled

In [108]:
# Keep only the reviews written by users who have given at least 10 ratings.
ratings_per_user = avis_clean["User id"].value_counts()
user_has_enough = avis_clean["User id"].map(ratings_per_user) >= 10
filtd_avis = avis_clean[user_has_enough]
print(f"There are {len(np.unique(filtd_avis['User id']))} unique users that have at least 10 ratings")

# Among those reviews, keep the ids of the games that received at least 10 ratings
# (counted within the filtered reviews, so they match the games rated by the kept users).
ratings_per_game = filtd_avis["Game id"].value_counts()
game_has_enough = filtd_avis["Game id"].map(ratings_per_game) >= 10
filtd_jeux_id = filtd_avis.loc[game_has_enough, "Game id"]
print(f"There are {len(np.unique(filtd_jeux_id))} unique games that have at least 10 ratings")

There are 2253 unique users that have at least 10 ratings
There are 2676 unique games that have at least 10 ratings


In [110]:
# Keep only the columns needed to relate users, games and ratings.
user_game_score = avis_clean[["User id","Game id", "Rating", "Username"]]

# Both transformations are computed from the raw ratings. They are stored under
# separate names so the centered result is no longer overwritten and discarded.
centered_scores = center_score(user_game_score)

# `test` holds the per-user min-max scaled ratings; it is the frame displayed
# below and the one whose 'Rating' column feeds the sparse matrix.
test = normalize(user_game_score)
test

Unnamed: 0,User id,Game id,Rating,Username
0,0,6179,0.666667,Monsieur Guillaume
1,1,6179,0.693878,morlockbob
2,2,6179,0.625000,SwatSh
3,3,6179,0.662921,BSI40
4,4,6179,0.736842,Lilly
...,...,...,...,...
142484,2258,10417,0.500000,Abzaron
142485,3138,10417,0.500000,zorglub
142486,160,10417,0.657534,Seb M.
142487,2050,2007,0.600000,sylla


In [92]:
# User ids and game ids both start at 0, so they can be used directly as the
# row and column coordinates of a users x games matrix.
# Row: user id, column: game id, value: the score the user gave to the game.

rows = user_game_score.loc[:, "User id"]
cols = user_game_score.loc[:, "Game id"]
data = test.loc[:, "Rating"]

coordinates = (rows, cols)
sparse_matrix = csr_matrix((data, coordinates)) # sparse matrix
