In [1]:
import random

import numpy as np
import pandas as pd

In [2]:
# Load the anime catalogue and keep only the id -> title lookup columns.
anime = pd.read_csv("AnimeList.csv").loc[:, ["anime_id", "title"]]

In [3]:
# Load the raw per-user list entries. Later cells read the "username",
# "anime_id" and "my_score" columns from this frame.
df = pd.read_csv("UserAnimeList.csv")

In [4]:
# Distinct usernames and distinct anime ids in the raw data
# (dropna=False so a missing value is counted once, as unique() does).
df["username"].nunique(dropna=False), df["anime_id"].nunique(dropna=False)

(283045, 14478)

In [5]:
# Keep only the three rating columns, and only rows whose score is not 0
# (0 presumably marks "no score given" -- confirm against the dataset docs).
filtered_df = df.loc[df["my_score"].ne(0), ["username", "anime_id", "my_score"]]

In [6]:
# Global mean score; both biases below are measured relative to it.
average_rating = filtered_df["my_score"].mean()

# Per-user bias: the user's mean score minus the global mean.
# One-column frame ("user_bias") indexed by username.
user_bias = (
    filtered_df.groupby("username")["my_score"]
    .mean()
    .sub(average_rating)
    .to_frame("user_bias")
)

# Per-series bias: the series' mean score minus the global mean.
# One-column frame ("anime_bias") indexed by anime_id.
anime_bias = (
    filtered_df.groupby("anime_id")["my_score"]
    .mean()
    .sub(average_rating)
    .to_frame("anime_bias")
)

In [7]:
# Attach the series bias to every rating, then the user bias.
# NOTE: this reassigns filtered_df, so re-running the cell without re-running
# the cells above would merge the bias columns a second time.
filtered_df = pd.merge(
    pd.merge(filtered_df, anime_bias, on="anime_id"),
    user_bias,
    on="username",
)

In [8]:
# Residual score: the raw score with the series bias, the user bias and the
# global mean removed (subtracted in exactly that order).
filtered_df["normalized_score"] = (
    filtered_df["my_score"]
    .sub(filtered_df["anime_bias"])
    .sub(filtered_df["user_bias"])
    .sub(average_rating)
)

In [9]:
# Index the ratings by anime_id so later cells can select series with
# filtered_df.loc[...] and group by the "anime_id" index level.
filtered_df = filtered_df.set_index("anime_id")

In [10]:
# Discard any rating row that still has a missing value in some column.
filtered_df = filtered_df.dropna()

In [11]:
# Preview the ratings table: one row per (anime_id, username) rating with the
# raw score, both biases and the normalized score.
filtered_df

Unnamed: 0_level_0,username,my_score,anime_bias,user_bias,normalized_score
anime_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
21,karthiga,9,0.960559,-0.059903,0.605479
59,karthiga,7,0.040198,-0.059903,-0.474160
74,karthiga,7,0.316277,-0.059903,-0.750239
120,karthiga,7,0.309853,-0.059903,-0.743815
178,karthiga,7,-0.227344,-0.059903,-0.206618
...,...,...,...,...,...
10040,temptemptemp,6,-1.636723,-1.493866,1.636723
12963,cinnamoroller,10,-0.798866,2.506134,0.798866
5143,inactiveX,7,-0.652956,-0.493866,0.652956
5581,omgm,5,-1.857502,-2.493866,1.857502


In [12]:
# anime_id values of the series to find neighbours for. The commented-out
# lines are alternative selections: the 100 series one user scored highest
# (by normalized score), or a different hand-picked list.
# recommendees = list(filtered_df.loc[lambda x: x['username'] == 'Fro116'].sort_values(by='normalized_score').tail(
#    100).index)
recommendees = [290, 601, 849, 5040, 12467]
# recommendees = [1177, 18679, 114, 398, 34280, 5040]

In [13]:
# Self-join on username: every rating of a recommendee series (the "_x"
# columns) is paired with every rating the same user gave to any series
# (the "_y" columns). This is one row per co-rating, so the result is large.
item_subset = pd.merge(
    filtered_df.loc[recommendees].reset_index(),
    filtered_df.reset_index(),
    on="username",
)

In [14]:
# Preview the co-rating pairs ("_x" = recommendee side, "_y" = candidate side).
item_subset

Unnamed: 0,anime_id_x,username,my_score_x,anime_bias_x,user_bias_x,normalized_score_x,anime_id_y,my_score_y,anime_bias_y,user_bias_y,normalized_score_y
0,290,Xinil,8,0.307017,-0.120677,0.319794,21,9,0.960559,-0.120677,0.666253
1,290,Xinil,8,0.307017,-0.120677,0.319794,59,9,0.040198,-0.120677,1.586614
2,290,Xinil,8,0.307017,-0.120677,0.319794,120,9,0.309853,-0.120677,1.316959
3,290,Xinil,8,0.307017,-0.120677,0.319794,210,7,0.219310,-0.120677,-0.592499
4,290,Xinil,8,0.307017,-0.120677,0.319794,249,7,0.139141,-0.120677,-0.512329
...,...,...,...,...,...,...,...,...,...,...,...
52160349,12467,DMH-kun,10,-0.268291,2.006134,0.768291,9790,10,0.349173,2.006134,0.150827
52160350,12467,DMH-kun,10,-0.268291,2.006134,0.768291,12467,10,-0.268291,2.006134,0.768291
52160351,12467,DMH-kun,10,-0.268291,2.006134,0.768291,13093,8,-0.287588,2.006134,-1.212412
52160352,12467,Gear_Steiner,7,-0.268291,-0.493866,0.268291,12467,7,-0.268291,-0.493866,0.268291


In [15]:
# Adjusted-cosine similarity between each recommendee series (x) and every
# series (y) that shares at least one rater with it:
#
#     sum_u(r_ux * r_uy) / (||r_x|| * ||r_y||)
#
# where r is the normalized score. The numerator sums over users who rated
# both series; each norm is taken over all ratings of that series.
#
# Both parts are vectorized group sums rather than a Python lambda per group:
# item_subset has one row per co-rating, so a per-group apply is very slow.
adj_cos_corr_numerator = (
    item_subset["normalized_score_x"]
    .mul(item_subset["normalized_score_y"])
    .groupby([item_subset["anime_id_x"], item_subset["anime_id_y"]])
    .sum()
)
adj_cos_corr_denom = np.sqrt(
    filtered_df["normalized_score"].pow(2).groupby(level="anime_id").sum()
)

# The divisions below align a two-level index with one-level indexes by level
# *name*, so each norm series gets the name of the level it must match.
# rename_axis returns a new object, so adj_cos_corr_denom keeps its own
# "anime_id" index name instead of being renamed through an alias.
x_length = adj_cos_corr_denom.loc[recommendees].rename_axis("anime_id_x")
y_length = adj_cos_corr_denom.rename_axis("anime_id_y")
adj_cos_corr = adj_cos_corr_numerator / x_length / y_length

In [16]:
# Wrap the similarities as a one-column frame named "corr" and drop the
# (x, y) pairs whose similarity is NaN.
adj_cos_corr = pd.DataFrame(adj_cos_corr, columns=["corr"]).dropna()

In [17]:
# Correlation table consumed by the ranking cell below. This is an alias of
# adj_cos_corr, not a copy.
raw_corrs = adj_cos_corr

In [18]:
# Number of co-rating rows behind each (recommendee, candidate) pair.
corr_sizes = item_subset.groupby(["anime_id_x", "anime_id_y"]).size()
corrs = raw_corrs.merge(
    pd.DataFrame(corr_sizes, columns=["Size"]), on=["anime_id_x", "anime_id_y"]
)
# Rank by magnitude, so strongly negative correlations also count as similar.
corrs["similarity"] = corrs["corr"].abs()
# Discard pairs supported by fewer than 10 co-ratings.
corrs = corrs.loc[lambda x: x["Size"] >= 10]
corrs = corrs.sort_values(by="similarity").dropna()
# Remove each recommendee's pairing with itself. errors="ignore" makes this a
# real no-op when a self-pair is absent (for example, it was removed by the
# Size filter above); the default errors="raise" would raise KeyError instead.
corrs = corrs.drop([(x, x) for x in recommendees], errors="ignore")

In [19]:
# Preview the ranked pairs, sorted ascending by similarity (strongest last).
corrs

Unnamed: 0_level_0,Unnamed: 1_level_0,corr,Size,similarity
anime_id_x,anime_id_y,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
12467,8064,-3.242531e-07,162,3.242531e-07
601,15195,-5.050632e-07,544,5.050632e-07
12467,6927,-6.038388e-07,906,6.038388e-07
5040,233,8.232250e-07,423,8.232250e-07
12467,419,-1.081043e-06,1331,1.081043e-06
12467,...,...,...,...
12467,13093,3.079415e-01,5227,3.079415e-01
290,1124,3.662465e-01,1538,3.662465e-01
290,398,4.671594e-01,2847,4.671594e-01
290,397,5.302847e-01,3428,5.302847e-01


In [20]:
# How many recommendee series still have at least one candidate pair left.
len(corrs.index.unique(level="anime_id_x"))

5

In [21]:
# corr_sizes = item_subset.groupby('anime_id').size()
# corrs = raw_corrs.merge(pd.DataFrame(corr_sizes, columns = ['Size']), on='anime_id')
# if use_negative_corr:
#    corrs['similarity'] = corrs['corr'].abs()
# else:
#    corrs['similarity'] = corrs['corr']
# corrs = corrs.loc[lambda x: x['Size'] >= 10]
# corrs = corrs.sort_values(by='similarity').dropna()
# corrs = corrs.drop(recommendee) # Technically not needed because it's a no-op for new series, but it's useful for debugging

In [22]:
# adj_cos_corr_numerator = item_subset.groupby('anime_id_x').apply(
#    lambda x: np.dot(x['normalized_score_x'], x['normalized_score_y']))
# adj_cos_corr_denom = filtered_df.groupby('anime_id_x').apply(
#    lambda x: np.sqrt(np.dot(x['normalized_score'], x['normalized_score'])))
# adj_cos_corr_denom *= adj_cos_corr_denom.loc[recommendee]
# adj_cos_corrs = pd.DataFrame((adj_cos_corr_numerator / adj_cos_corr_denom), columns=['corr'])
# adj_cos_corrs = adj_cos_corrs.dropna()

In [23]:
# item_subset = filtered_df.loc[recommendees].merge(filtered_df.reset_index(), on = 'username')

In [24]:
# pearson_corrs = item_subset.groupby('anime_id').apply(lambda x: x['normalized_score_x'].corr(
#    x['normalized_score_y'], min_periods=10))
# pearson_corrs = pd.DataFrame(pearson_corrs, columns=['corr'])
# pearson_corrs = pearson_corrs.dropna()

In [25]:
# adj_cos_corr_numerator = item_subset.groupby('anime_id').apply(
#    lambda x: np.dot(x['normalized_score_x'], x['normalized_score_y']))
# adj_cos_corr_denom = filtered_df.groupby('anime_id').apply(
#    lambda x: np.sqrt(np.dot(x['normalized_score'], x['normalized_score'])))
# adj_cos_corr_denom *= adj_cos_corr_denom.loc[recommendee]
# adj_cos_corrs = pd.DataFrame((adj_cos_corr_numerator / adj_cos_corr_denom), columns=['corr'])
# adj_cos_corrs = adj_cos_corrs.dropna()

In [26]:
# if correlation_type == 'pearson':
#    raw_corrs = pearson_corrs
# else:
#    raw_corrs = adj_cos_corrs

In [27]:
# corr_sizes = item_subset.groupby('anime_id').size()
# corrs = raw_corrs.merge(pd.DataFrame(corr_sizes, columns = ['Size']), on='anime_id')
# if use_negative_corr:
#    corrs['similarity'] = corrs['corr'].abs()
# else:
#    corrs['similarity'] = corrs['corr']
# corrs = corrs.loc[lambda x: x['Size'] >= 10]
# corrs = corrs.sort_values(by='similarity').dropna()
# corrs = corrs.drop(recommendee) # Technically not needed because it's a no-op for new series, but it's useful for debugging

# What is each recommendee series most similar to?

In [28]:
# Make sure corrs is ordered ascending by similarity; the next cell takes the
# last rows of each group as the closest neighbours. (corrs was already sorted
# this way when it was built, so this is presumably a safeguard.)
corrs = corrs.sort_values(by="similarity")

In [29]:
# For each recommendee, take its three most similar candidates. corrs is
# sorted ascending by similarity, so these are the last three rows of each
# group. Then attach each candidate's title.
similarity = corrs.groupby("anime_id_x").apply(
    lambda x: x.tail(3).merge(
        anime, left_on="anime_id_y", right_on="anime_id", suffixes=("", "_x")
    )
)
# Attach the recommendee's own title (it arrives suffixed as "title_x") and
# use it as the row label.
similarity = (
    similarity.merge(
        anime, left_on="anime_id_x", right_on="anime_id", suffixes=("", "_x")
    )
    .set_index(["title_x"])
    .drop("anime_id_x", axis=1)
)
# Lift the row limit for this one display only. option_context restores the
# previous setting on exit, even if display raises, instead of forcing it to a
# hard-coded value afterwards.
with pd.option_context("display.max_rows", None):
    display(similarity)

Unnamed: 0_level_0,corr,Size,similarity,anime_id,title
title_x,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Seikai no Monshou,0.467159,2847,0.467159,398,Seikai no Senki III
Seikai no Monshou,0.530285,3428,0.530285,397,Seikai no Senki II
Seikai no Monshou,0.592992,4043,0.592992,396,Seikai no Senki
Nekojiru-sou,0.162559,1413,0.162559,1062,Nekojiru Gekijou Jirujiru Original
Nekojiru-sou,0.164607,2564,0.164607,875,Mind Game
Nekojiru-sou,0.18531,1587,0.18531,3326,Inaka Isha
Suzumiya Haruhi no Yuuutsu,0.149973,47096,0.149973,1887,Lucky☆Star
Suzumiya Haruhi no Yuuutsu,0.267415,52013,0.267415,4382,Suzumiya Haruhi no Yuuutsu (2009)
Suzumiya Haruhi no Yuuutsu,0.284227,44123,0.284227,7311,Suzumiya Haruhi no Shoushitsu
One Outs,0.130143,5653,0.130143,10271,Gyakkyou Burai Kaiji: Hakairoku-hen


# What would I rate the series as?

In [30]:
# recommendee_user = 'Fro116'
# num_favorites = 20

In [31]:
# score = (filtered_df.merge(corrs.reset_index(), left_on = 'anime_id', right_on = 'anime_id_y')).dropna()

In [32]:
# neighborhood_size = 20
# recommendee_user_score = score.groupby('anime_id_x').apply(lambda x:
#    x.loc[lambda y: y['username'] == recommendee_user].sort_values(
#        by='similarity')[-neighborhood_size:]
# )

In [33]:
# recommendee_user_score = recommendee_user_score.reset_index(drop=True).set_index('anime_id_x')

In [34]:
# delta = recommendee_user_score.groupby('anime_id_x').apply(lambda x: np.dot(x['normalized_score'], x['corr']))
# weight = recommendee_user_score.groupby('anime_id_x').apply(lambda x: x['corr'].abs().sum())
# count = recommendee_user_score.groupby('anime_id_x')['corr'].size()
# recommendee_means = filtered_df.loc[recommendees].groupby('anime_id')['my_score'].mean()
# recommendee_stds = filtered_df.loc[recommendees].groupby('anime_id')['my_score'].std()
# pred_score = delta * recommendee_stds + recommendee_means