# Normalizes the matrix of user-item ratings
* The normalizer performs the following steps:
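  * subtracts the baseline predictor (global mean rating + anime bias + user bias) from each rating
  * estimates a variance for each normalized rating, which serves as a confidence measure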

In [1]:
import os
import pickle
import pandas as pd

In [2]:
# Work from the cleaned-data directory so the input pickle can be opened by
# its bare filename. The path is relative to the current working directory.
os.chdir("../../data/cleaned_data")

In [3]:
# Load the user-anime ratings table from the current working directory.
# The context manager closes the file handle as soon as unpickling finishes
# instead of leaving it open for the lifetime of the kernel.
# NOTE(review): unpickling can execute arbitrary code, so this file should
# only ever come from this project's own pipeline.
with open("user_anime_lists.pkl", "rb") as ratings_file:
    df = pickle.load(ratings_file)

In [4]:
# Notebook display of the loaded ratings table (a quick visual sanity check).
df

Unnamed: 0,username,anime_id,my_score
0,karthiga,21,9
1,karthiga,59,7
2,karthiga,74,7
3,karthiga,120,7
4,karthiga,178,7
...,...,...,...
46358317,mini_kaila,4415,9
46358318,mini_kaila,5940,8
46358319,mini_kaila,6500,8
46358320,mini_kaila,7058,8


In [5]:
# Baseline predictor (BLP) for a rating:
#     blp = global mean + anime bias + user bias
# where each bias is the group's mean score minus the global mean.
average_rating = df["my_score"].mean()

# Per-anime offset from the global mean, indexed by anime_id.
anime_stats = pd.DataFrame()
anime_stats["anime_bias"] = (
    df.groupby("anime_id")["my_score"].mean().sub(average_rating)
)

# Per-user offset from the global mean, indexed by username.
user_stats = pd.DataFrame()
user_stats["user_bias"] = (
    df.groupby("username")["my_score"].mean().sub(average_rating)
)

# Attach both biases to every rating row, then derive the baseline prediction
# and the residual ("normalized") score that remains after subtracting it.
df = (
    df.merge(anime_stats, on=["anime_id"])
    .merge(user_stats, on=["username"])
    .assign(
        blp=lambda t: t["anime_bias"] + t["user_bias"] + average_rating,
        normalized_score=lambda t: t["my_score"] - t["blp"],
    )
)

In [6]:
# Estimate the variance of each individual user-item rating.
#
# First compute the variance of the normalized scores within each anime and
# within each user. pandas' var() defaults to the sample variance (ddof=1),
# so a group containing a single rating yields NaN.
normalized_anime_stats = pd.DataFrame()
normalized_anime_stats["normalized_anime_var"] = (
    df.groupby("anime_id")["normalized_score"].var()
)

normalized_user_stats = pd.DataFrame()
normalized_user_stats["normalized_user_var"] = (
    df.groupby("username")["normalized_score"].var()
)

# The variance of user i's rating of anime j is approximated as
#     0.25 * Var(user_i) + 0.25 * Var(anime_j).
# This is a crude guess, but it is good enough.
user_contrib = 0.25
anime_contrib = 0.25

# Attach both group variances to every rating row and blend them.
df = (
    df.merge(normalized_anime_stats, on=["anime_id"])
    .merge(normalized_user_stats, on=["username"])
    .assign(
        normalized_score_var=lambda t: (
            t["normalized_user_var"] * user_contrib
            + t["normalized_anime_var"] * anime_contrib
        )
    )
)

In [7]:
# Build the output table: keep only the identifiers, the normalized score and
# its variance estimate, shorten the two value column names, and discard any
# row that has a missing value in one of these columns.
norm_df = (
    df[["username", "anime_id", "normalized_score", "normalized_score_var"]]
    .rename(columns={"normalized_score": "score", "normalized_score_var": "score_var"})
    .dropna()
)

In [8]:
# Switch to the output directory for processed data, creating it on first run.
# makedirs(exist_ok=True) replaces the exists()/mkdir() pair, so there is no
# window in which the directory can appear between the check and the create.
# The path is relative to the current working directory.
outdir = "../processed_data"
os.makedirs(outdir, exist_ok=True)
os.chdir(outdir)

In [9]:
# Persist the bias tables, the per-anime variance table, and the normalized
# ratings as pickles in the current working directory.
# NOTE(review): normalized_user_stats is not written here — confirm that is
# intentional.
for _filename, _frame in (
    ("user_stats.pkl", user_stats),
    ("anime_stats.pkl", anime_stats),
    ("normalized_anime_stats.pkl", normalized_anime_stats),
    ("user_anime_lists.pkl", norm_df),
):
    _frame.to_pickle(_filename)