In [1]:
import os
import pickle
import random

import numpy as np
import pandas as pd
from tqdm import tqdm

In [2]:
# Load the anime metadata and keep only the anime_id and title columns.
anime = pd.read_csv("AnimeList.csv").loc[:, ["anime_id", "title"]]

In [3]:
# Load the raw user/anime list; later cells read its username, anime_id
# and my_score columns.
df = pd.read_csv(filepath_or_buffer="UserAnimeList.csv")

In [4]:
# (distinct usernames, distinct anime ids) in the raw list.
tuple(len(df[column].unique()) for column in ("username", "anime_id"))

(283045, 14478)

In [5]:
# Keep the three rating columns and drop rows whose score is 0
# (presumably 0 marks "no score given" -- confirm against the dataset docs).
filtered_df = df.loc[df["my_score"] != 0, ["username", "anime_id", "my_score"]]

In [6]:
# Global mean of all retained scores.
average_rating = filtered_df["my_score"].mean()

# Per-user bias: how far each user's mean score sits from the global mean.
user_bias = (
    filtered_df.groupby("username")["my_score"]
    .mean()
    .sub(average_rating)
    .to_frame("user_bias")
)

# Per-anime bias: how far each anime's mean score sits from the global mean.
anime_bias = (
    filtered_df.groupby("anime_id")["my_score"]
    .mean()
    .sub(average_rating)
    .to_frame("anime_bias")
)

In [7]:
# Attach the anime bias first, then the user bias, to every rating row
# (default inner joins on anime_id and username respectively).
filtered_df = pd.merge(
    pd.merge(filtered_df, anime_bias, on=["anime_id"]),
    user_bias,
    on=["username"],
)

In [8]:
# Residual score after removing the anime bias, the user bias and the global
# mean. The subtraction order is kept so floating-point results are unchanged.
filtered_df["normalized_score"] = (
    filtered_df["my_score"]
    .sub(filtered_df["anime_bias"])
    .sub(filtered_df["user_bias"])
    .sub(average_rating)
)

In [9]:
# Index ratings by anime_id so later cells can select items with .loc.
filtered_df = filtered_df.set_index(keys="anime_id")

In [10]:
# Drop every row that has a missing value in any column.
filtered_df = filtered_df.dropna(axis="index", how="any")

In [11]:
# Display the prepared ratings frame (indexed by anime_id).
# NOTE(review): the rendered output below contains usernames from the
# dataset -- consider clearing it before sharing the notebook.
filtered_df

Unnamed: 0_level_0,username,my_score,anime_bias,user_bias,normalized_score
anime_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
21,karthiga,9,0.960559,-0.059903,0.605479
59,karthiga,7,0.040198,-0.059903,-0.474160
74,karthiga,7,0.316277,-0.059903,-0.750239
120,karthiga,7,0.309853,-0.059903,-0.743815
178,karthiga,7,-0.227344,-0.059903,-0.206618
...,...,...,...,...,...
10040,temptemptemp,6,-1.636723,-1.493866,1.636723
12963,cinnamoroller,10,-0.798866,2.506134,0.798866
5143,inactiveX,7,-0.652956,-0.493866,0.652956
5581,omgm,5,-1.857502,-2.493866,1.857502


In [51]:
def get_corrs(recommendees):
    """Compute adjusted-cosine similarities between the given items and all items.

    Reads the module-level ``filtered_df``, which must be indexed by
    ``anime_id`` and contain ``username`` and ``normalized_score`` columns
    without missing values.

    For a requested item ``x`` and any item ``y`` the similarity is::

        sum_u(score[u, x] * score[u, y]) / (norm(x) * norm(y))

    where the sum runs over users who rated both items and ``norm(i)`` is the
    Euclidean norm of *all* normalized scores given to item ``i``.

    Parameters
    ----------
    recommendees : array-like
        Anime ids (labels of ``filtered_df``'s index) to compare against
        every item.

    Returns
    -------
    pandas.DataFrame
        A single ``corr`` column indexed by ``(anime_id_x, anime_id_y)``,
        where ``anime_id_x`` is a requested item. Rows are sorted by
        ascending ``abs(corr)``; pairs whose value is undefined (NaN) are
        dropped.
    """
    pair_keys = ["anime_id_x", "anime_id_y"]

    # Only these two columns feed the similarity; dropping the others before
    # the self-join keeps the (very large) merged frame as small as possible.
    ratings = filtered_df[["username", "normalized_score"]]

    # Pair every rating of a requested item (_x) with every rating made by
    # the same user (_y).
    item_subset = (
        ratings.loc[recommendees]
        .reset_index()
        .merge(ratings.reset_index(), on="username")
    )

    # Numerator: per item pair, the sum of products of normalized scores.
    # A vectorized grouped sum replaces a Python-level lambda per group.
    products = item_subset["normalized_score_x"] * item_subset["normalized_score_y"]
    adj_cos_corr_numerator = products.groupby(
        [item_subset[key] for key in pair_keys]
    ).sum()

    # Denominator: Euclidean norm of each item's normalized-score vector.
    item_norms = np.sqrt(
        (ratings["normalized_score"] ** 2).groupby(level="anime_id").sum()
    )

    # Naming each norm index after one level of the numerator's MultiIndex
    # makes pandas broadcast the division along that level.
    x_length = item_norms.loc[recommendees].rename_axis("anime_id_x")
    y_length = item_norms.rename_axis("anime_id_y")

    adj_cos_corr = adj_cos_corr_numerator / x_length / y_length
    corrs = adj_cos_corr.rename("corr").to_frame().dropna()

    # Order by strength of association, regardless of sign.
    corrs = corrs.assign(similarity=corrs["corr"].abs()).sort_values(by="similarity")
    return corrs[["corr"]]

In [35]:
# Distinct anime ids present in the ratings index, in ascending order.
anime_ids = sorted(set(filtered_df.index))

In [52]:
# Store correlations on disk. We need to do this in chunks
# for memory reasons (roughly 10 anime per chunk).
os.makedirs("item_correlations", exist_ok=True)  # to_pickle does not create it

# Integer section count; at least one so a short id list does not raise.
n_chunks = max(1, len(anime_ids) // 10)
chunks = np.array_split(anime_ids, n_chunks)

# Wrap the list (not the enumerate generator) so tqdm knows the total and
# can show a progress bar with an ETA.
for i, chunk in enumerate(tqdm(chunks)):
    corr = get_corrs(chunk)
    corr.to_pickle(f"item_correlations/{i}.pkl")

1139it [28:02:27, 88.63s/it] 


In [14]:
# Load the per-chunk correlation frames back from disk.
# Only "<number>.pkl" files are read: the combined "correlations.pkl" is
# written into this same directory, and loading it here on a re-run would
# duplicate every row. Files are visited in numeric order so the result is
# deterministic (os.listdir order is arbitrary).
# NOTE(review): the recorded outputs show 1139 chunks written but 1405 files
# loaded, so the directory may hold stale chunks from an earlier run --
# confirm before trusting the concatenated result.
corr_dfs = []
chunk_files = sorted(
    (
        name
        for name in os.listdir("item_correlations")
        if name.endswith(".pkl") and name[: -len(".pkl")].isdecimal()
    ),
    key=lambda name: int(name[: -len(".pkl")]),
)
for item in tqdm(chunk_files):
    path = os.path.join("item_correlations", item)
    # Pickle is only safe here because these files were produced by this
    # notebook; never unpickle files from an untrusted source.
    with open(path, "rb") as chunk_file:
        corr_dfs.append(pickle.load(chunk_file))

100%|██████████| 1405/1405 [00:08<00:00, 163.66it/s]


In [15]:
# Stack all per-chunk correlation frames row-wise into one frame.
corr_df = pd.concat(objs=corr_dfs, axis=0)

In [16]:
# Persist the combined correlations next to the per-chunk files.
corr_df.to_pickle("item_correlations/correlations.pkl")