In [1]:
import ast
import functools
import os
import pickle
import random

import numpy as np
import pandas as pd
import scipy.stats as st
import statsmodels.formula.api as smf
from scipy.special import gamma
from tqdm import tqdm


@functools.wraps(smf.ols)
def lm(*args, **kwargs):
    # R-style convenience wrapper: construct the formula-based OLS model from the
    # given arguments and fit it immediately, returning the fitted results object.
    # functools.wraps copies smf.ols's metadata (name, __doc__, ...) onto this
    # wrapper, which is why the explanation lives in comments, not a docstring.
    model = smf.ols(*args, **kwargs)
    return model.fit()

In [2]:
# Move into the processed-data directory; the relative paths in later cells are
# resolved against it. The starting directory is remembered the first time this
# cell runs, so re-running the cell lands in the same place instead of applying
# the relative "../../" hop again from wherever the previous run left us.
_NOTEBOOK_DIR = globals().get("_NOTEBOOK_DIR", os.getcwd())
os.chdir(os.path.join(_NOTEBOOK_DIR, "../../data/processed_data"))

In [3]:
# Username this notebook is run for: it is interpolated into the per-user pickle
# path and used to filter the combined list frame in later cells.
recommendee = "Fro116"

In [4]:
# Load the cleaned catalogue, keeping only the id, title and type columns.
# NOTE(review): `anime` is rebound from ../sandbox/anime.csv in a later cell (In [6]),
# so this frame is replaced before the recommendation parsing runs.
anime = pd.read_csv("../cleaned_data/anime.csv").loc[:, ["anime_id", "title", "type"]]

In [5]:
# Load the shared user/anime list frame and the recommendee's own list.
# Context managers make sure both file handles are closed after reading.
# NOTE(review): pickle.load can execute arbitrary code; only load trusted files.
with open("user_anime_lists.pkl", "rb") as f:
    df = pickle.load(f)
with open(f"../../data/recommendations/{recommendee}/user_anime_list.pkl", "rb") as f:
    user_df = pickle.load(f)
# Drop any rows already present for the usernames in user_df, then append
# user_df so those users' rows come only from the per-user file.
df = pd.concat(
    [df.loc[lambda frame: ~frame.username.isin(user_df.username)], user_df],
    ignore_index=True,
)

In [6]:
# Rebinds `anime` to the sandbox export, replacing the frame loaded from
# ../cleaned_data in an earlier cell. The recommendation-parsing loop further
# down reads the "anime_id" and "recommendations" columns from this frame.
anime = pd.read_csv("../sandbox/anime.csv")

In [7]:
# Spot-check a single record (anime_id 1177) to see which columns the sandbox
# file provides and what their raw values look like.
anime.set_index("anime_id").loc[1177]

title                                                        Alien 9
medium                                                           ova
related_anime                                                     []
recommendations    [{'anime_id': 30, 'num_recommendations': 6}, {...
genres               ['Sci-Fi', 'Horror', 'Psychological', 'School']
source                                                         manga
num_episodes                                                       4
status                                               finished_airing
Name: 1177, dtype: object

In [8]:
# Flatten every title's recommendation list into one edge table with columns
# (source, num_recommendations, target): "target" is the row's own anime_id and
# "source" is the anime_id found inside that row's recommendation entry.
# The column stores Python-literal strings, so parse it with ast.literal_eval,
# which only accepts literals, rather than eval, which would execute whatever
# expression the CSV happens to contain.
rec_records = []
for target_id, raw_recs in tqdm(
    zip(anime["anime_id"], anime["recommendations"]), total=len(anime)
):
    for rec in ast.literal_eval(raw_recs):
        # Keep every key of the entry and tag it with the title it was listed under.
        rec_records.append({**rec, "target": target_id})
# Build the frame once instead of concatenating one small DataFrame per title.
rec_df = (
    pd.DataFrame.from_records(rec_records)
    .rename({"anime_id": "source"}, axis=1)
    .astype(int)
)

100%|██████████| 17871/17871 [00:15<00:00, 1158.43it/s]


In [9]:
# Same edges with the two endpoint columns swapped (source <-> target).
rec_df_flipped = rec_df.rename(columns={"source": "target", "target": "source"})

In [10]:
# Stack both orientations of every edge, then collapse duplicate
# (source, target) pairs by summing their remaining numeric columns.
rec_df_full = (
    pd.concat([rec_df, rec_df_flipped], ignore_index=True)
    .groupby(["source", "target"])
    .sum()
    .reset_index()
)

In [11]:
# Rebind user_df to the recommendee's rows taken from the combined frame.
user_df = df.loc[df["username"] == recommendee]

In [12]:
# Join the user's list against rec_df: each entry of the user's list is matched
# to the rec_df rows whose "source" equals that entry's anime_id. "source" is
# then dropped and "target" renamed to "anime_id", so the resulting "anime_id"
# column holds the title at the other end of each recommendation edge.
# NOTE(review): left_on names the index level created by set_index, not a column;
# the later groupby("anime_id") presumably relies on that level not surviving
# the merge — verify against the pandas merge docs before refactoring.
# NOTE(review): this joins against rec_df; the symmetrised rec_df_full built in
# an earlier cell is not used here — confirm that is intended.
user_recs = (
    user_df.set_index("anime_id")
    .merge(rec_df, left_on=["anime_id"], right_on=["source"])
    .drop("source", axis=1)
    .rename({"target": "anime_id"}, axis=1)
)

In [41]:
# Exponent applied to the recommendation counts before they are used as weights.
α = 1


def _weighted_score(group):
    # Weighted mean of the user's scores within one anime_id group, with
    # weights num_recommendations ** α.
    weights = group["num_recommendations"] ** α
    return np.dot(group["score"], weights) / weights.sum()


pred_scores = user_recs.groupby("anime_id").apply(_weighted_score)

In [42]:
# Index the user's list by anime_id and attach the predictions; pandas aligns
# pred_scores on that index, so titles without a prediction get NaN.
pred_df = user_df.set_index("anime_id").assign(pred_score=pred_scores)

In [43]:
# (rows with no missing values, total rows). dropna() considers every column,
# not only pred_score.
len(pred_df.dropna()), len(pred_df)

(298, 349)

In [44]:
# Regress the user's score on the predicted score via the lm helper and print
# the fit summary; the "+ 0" term in the formula fits the model without an intercept.
# NOTE(review): fillna(0) replaces every NaN in pred_df with 0, so rows that have
# no prediction enter the fit with pred_score == 0 — confirm this is intended
# rather than dropping those rows.
print(lm("score ~ pred_score + 0", pred_df.fillna(0)).summary())

                                 OLS Regression Results                                
Dep. Variable:                  score   R-squared (uncentered):                   0.014
Model:                            OLS   Adj. R-squared (uncentered):              0.012
Method:                 Least Squares   F-statistic:                              5.099
Date:                Mon, 24 May 2021   Prob (F-statistic):                      0.0246
Time:                        09:50:47   Log-Likelihood:                         -679.76
No. Observations:                 349   AIC:                                      1362.
Df Residuals:                     348   BIC:                                      1365.
Df Model:                           1                                                  
Covariance Type:            nonrobust                                                  
                 coef    std err          t      P>|t|      [0.025      0.975]
-----------------------------------------

In [45]:
# Inspect one row (anime_id 634) to compare its score with its pred_score.
pred_df.loc[634]

username        Fro116
score         2.768929
score_var     2.786994
pred_score    0.158903
Name: 634, dtype: object