In [1]:
import functools
import os
import pickle
import random

import numpy as np
import pandas as pd
import scipy.stats as st
import seaborn as sns
import statsmodels.formula.api as smf
from matplotlib import pyplot as plt
from tqdm import tqdm


@functools.wraps(smf.ols)
def lm(*args, **kwargs):
    return smf.ols(*args, **kwargs).fit()

In [2]:
# Change this to get recommendations for a different user
recommendee = "Fro116"

In [3]:
os.chdir("../../data")

In [4]:
@functools.lru_cache()
def get_data():
    raw_df = pd.read_csv("UserAnimeList.csv")
    filtered_df = raw_df[["username", "anime_id", "my_score"]].loc[
        lambda x: x["my_score"] != 0
    ]

    def read_xml(file, username):
        import xml.etree.ElementTree as ET

        xml_data = open(file, "r").read()  # Read file
        root = ET.XML(xml_data)  # Parse XML

        data = []
        cols = []
        for i, child in enumerate(root):
            data.append([subchild.text for subchild in child])
            cols.append(child.tag)
        new_list = pd.DataFrame(data).T
        new_list.columns = cols

        df = (
            new_list.loc[[0, 9]]
            .T.dropna()
            .rename({0: "anime_id", 9: "my_score"}, axis=1)
        )
        df["username"] = username
        df["anime_id"] = df["anime_id"].astype(int)
        df["my_score"] = df["my_score"].astype(int)
        df["username"] = df["username"].astype(str)
        df = df.loc[lambda x: x["my_score"] != 0]
        df = df.reset_index(drop=True)
        return df

    def add_user(full_df, xml_file, username):
        user_df = read_xml(xml_file, username)
        without_user = full_df.loc[lambda x: x["username"] != username]
        return pd.concat([without_user, user_df], ignore_index=True)

    filtered_df = add_user(filtered_df, "user_profiles/Fro116.xml", "Fro116")
    average_rating = filtered_df["my_score"].mean()
    user_bias = (
        pd.DataFrame(filtered_df.groupby("username")["my_score"].mean()).rename(
            {"my_score": "user_bias"}, axis=1
        )
        - average_rating
    )
    anime_bias = (
        pd.DataFrame(filtered_df.groupby("anime_id")["my_score"].mean()).rename(
            {"my_score": "anime_bias"}, axis=1
        )
        - average_rating
    )

    filtered_df = filtered_df.merge(anime_bias, on=["anime_id"]).merge(
        user_bias, on=["username"]
    )
    filtered_df["blp"] = (
        filtered_df["anime_bias"] + filtered_df["user_bias"] + average_rating
    )
    filtered_df["normalized_score"] = filtered_df["my_score"] - filtered_df["blp"]
    filtered_df = filtered_df.set_index("username")
    filtered_df = filtered_df.dropna()
    return filtered_df

In [5]:
def get_delta(score):
    return score.groupby("anime_id").apply(
        lambda x: np.dot(x["normalized_score"], x["corr"]) / x["corr"].abs().sum()
    )

In [6]:
def get_delta_variance(score):
    # The following formulae are used to compute the variance of the delta. Delta
    # is a weighted sum of the form δ = Σ(s_i * w_i) / (Σw_i), where s_i is
    # a vector scores and w_i is the weight.
    #
    # By linearity, it suffices to compute (s_i * w_i) / (Σw_i). We assume that
    # Var(s_i) is the same as the variance over all items s_i has rated). We treat
    # w_i as a random variable with mean w_i and variance corr['corr_var']
    #
    # The variance for (w_i) / (Σw_i) can be estimated by doing a Taylor Approximation.
    # See equation 20 of https://www.stat.cmu.edu/~hseltman/files/ratio.pdf. The
    # formula for the ratio of two correlated variables R,S is
    # Var(R/S) = E[R]^2/E[S]^2(Var[R]/E[R]^2 - 2Cov(R,S)/(E[R]E[S]) + Var[S]/E[S]^2)
    #
    # Lastly we take the product distribution of s_i and (w_i) / (Σw_i).
    def correction_factor(x):
        return (
            1
            + x["corr_var"] / (x["corr"] ** 2)
            - 2 * x["corr_var"] / (x["corr"].abs().sum() * x["corr"].abs())
            + x["corr_var"].sum() / (x["corr"].abs().sum() ** 2)
        )

    delta_var = score.groupby("anime_id").apply(
        lambda x: np.sum(
            x["normalized_score_var"] * x["corr"] ** 2 * correction_factor(x)
        )
        / (x["corr"].abs().sum() ** 2)
    )

    # if the var < 0, then the ratio distribution approximation failed,
    # usually because sample size is too small
    delta_var.loc[lambda x: x < 0] = np.inf

    # The above is a biased estimator of the variance. To unbias the estimator,
    # we need to apply a Bessel-like correction. See the formula in
    # (https://stats.stackexchange.com/questions/47325/bias-correction-in-weighted-variance)
    bias_correction = (
        score.set_index("anime_id")
        .loc[score.groupby("anime_id").size() > 1]
        .groupby("anime_id")
        .apply(
            lambda x: (x["corr"].abs().sum() ** 2)
            / (x["corr"].abs().sum() ** 2 - (x["corr"] ** 2).sum())
        )
    )
    delta_var *= bias_correction
    return delta_var

In [7]:
def get_deltas(is_df, anime_ids, recommendee, neighborhood_size, score_fn):
    # get the neighborhood for each item
    score = score_fn(is_df, recommendee, neighborhood_size)

    # extract model features
    pred_df = pd.DataFrame()
    pred_df["delta"] = get_delta(score)
    pred_df["delta_var"] = get_delta_variance(score)
    pred_df = pred_df.loc[lambda x: x.index.isin(anime_ids)]

    # fill in missing predictions with nan
    for anime_id in set(anime_ids) - set(pred_df.index):
        pred_df = pred_df.append(pd.Series(name=anime_id, dtype=float))
    return pred_df

In [8]:
@functools.lru_cache()
def get_item_corrs():
    corrs = pickle.load(open("item_correlations/correlations.pkl", "rb"))
    corrs["similarity"] = corrs["corr"].abs()
    corrs = corrs.dropna()
    corrs = corrs.loc[
        lambda x: x.index.get_level_values("anime_id_x")
        != x.index.get_level_values("anime_id_y")
    ]
    corrs = corrs.sort_values(by="similarity")
    return corrs


def get_item_scores(df, recommendee, neighborhood_size):
    corrs = get_item_corrs()
    corrs = corrs.groupby("anime_id_x").tail(neighborhood_size)
    score = df.loc[recommendee].merge(
        corrs.reset_index("anime_id_x"), left_on="anime_id", right_on="anime_id_y",
    )

    user_var = (
        pd.DataFrame(df.groupby("username")["normalized_score"].var())
        .rename({"normalized_score": "user_var"}, axis=1)
        .dropna()
    )
    score["normalized_score_var"] = user_var.loc[recommendee].squeeze()
    score = score.drop("anime_id", axis=1).rename({"anime_id_x": "anime_id"}, axis=1)

    return score

In [9]:
def get_user_corrs(df, recommendee):
    user_subset = df.loc[[recommendee]].merge(df.reset_index(), on="anime_id")
    corr_numerator = user_subset.groupby("username").apply(
        lambda x: np.dot(x["normalized_score_x"], x["normalized_score_y"])
    )
    corr_denom = df.groupby("username").apply(
        lambda x: np.sqrt(np.dot(x["normalized_score"], x["normalized_score"]))
    )
    corr_denom *= corr_denom.loc[recommendee]
    corrs = pd.DataFrame((corr_numerator / corr_denom), columns=["corr"])
    corrs["similarity"] = corrs["corr"].abs()
    corrs["corr_size"] = user_subset.groupby("username").size()
    corrs = corrs.drop(recommendee)    
    corrs = corrs.dropna()
    return corrs


def get_user_scores(df, recommendee, neighborhood_size):
    corrs = get_user_corrs(df, recommendee)

    # We assume variance is the same as the variance for pearson correlation.
    # see https://www.jstor.org/stable/2277400?seq=1
    corrs = corrs.loc[lambda x: x["corr_size"] > 2]
    corrs["corr_var"] = (1 - corrs["corr"] * corrs["corr"]) ** 2 / (
        corrs["corr_size"] - 2
    )
    corrs = corrs.sort_values(by="similarity").dropna()[-neighborhood_size:]

    score = (df.merge(pd.DataFrame(corrs), on="username")).dropna()

    user_var = (
        pd.DataFrame(df.groupby("username")["normalized_score"].var())
        .rename({"normalized_score": "normalized_score_var"}, axis=1)
        .dropna()
    )
    score = score.merge(user_var, on="username")

    return score

In [10]:
def store_deltas(recommendee, neighborhood_size, score_fn, delta_name):
    df = get_data()

    # compute cross-validated deltas
    oos_pred_dfs = []
    K = len(df.loc[recommendee])
    np.random.seed(1)
    splits = np.array_split(df.loc[recommendee].sample(frac=1), K)
    for split in tqdm(splits):
        oos_df = split
        is_df = df.loc[
            lambda x: ~(
                (x.index.get_level_values("username") == recommendee)
                & x.anime_id.isin(oos_df.anime_id)
            )
        ]
        oos_pred_df = get_deltas(
            is_df=is_df,
            anime_ids=list(oos_df.anime_id),
            recommendee=recommendee,
            neighborhood_size=neighborhood_size,
            score_fn=score_fn,
        )
        oos_pred_dfs.append(oos_pred_df)
    oos_pred_df = pd.concat(oos_pred_dfs)

    # compute deltas over the full data
    is_pred_df = get_deltas(
        is_df=df,
        anime_ids=list(df.anime_id),
        recommendee=recommendee,
        neighborhood_size=neighborhood_size,
        score_fn=score_fn,
    )

    # store deltas
    outdir = f"deltas/{recommendee}"
    if not os.path.exists(outdir):
        os.mkdir(outdir)
    oos_pred_df.to_pickle(os.path.join(outdir, f"{delta_name}_oos.pkl"))
    is_pred_df.to_pickle(os.path.join(outdir, f"{delta_name}_is.pkl"))

In [11]:
def store_baselines(recommendee):
    df = get_data()
    df.loc[recommendee][['anime_id', 'normalized_score']].to_pickle(f'deltas/{recommendee}/recommendee.pkl')
    
    average_rating = df["my_score"].mean()
    user_bias = (
        pd.DataFrame(df.groupby("username")["my_score"].mean()).rename(
            {"my_score": "user_bias"}, axis=1
        )
        - average_rating
    )
    anime_bias = (
        pd.DataFrame(df.groupby("anime_id")["my_score"].mean()).rename(
            {"my_score": "anime_bias"}, axis=1
        )
        - average_rating
    )
    blp = anime_bias + user_bias.loc[recommendee].squeeze() + average_rating
    blp = blp.rename({'anime_bias': 'blp'}, axis=1)    
    blp.to_pickle(f'deltas/{recommendee}/blp.pkl')

In [12]:
store_deltas(
    recommendee=recommendee,
    neighborhood_size=64,
    score_fn=get_item_scores,
    delta_name="item",
)

 49%|████▉     | 185/375 [3:42:25<3:48:26, 72.14s/it]  
ERROR:root:Internal Python error in the inspect module.
Below is the traceback from this internal error.



Traceback (most recent call last):
  File "/Users/kundan/opt/anaconda3/lib/python3.8/site-packages/IPython/core/interactiveshell.py", line 3418, in run_code
    exec(code_obj, self.user_global_ns, self.user_ns)
  File "<ipython-input-12-1e51958603d8>", line 1, in <module>
    store_deltas(
  File "<ipython-input-10-a7aeca2855b1>", line 17, in store_deltas
    oos_pred_df = get_deltas(
  File "<ipython-input-7-ef7d89086c96>", line 8, in get_deltas
    pred_df["delta_var"] = get_delta_variance(score)
  File "<ipython-input-6-3eb50c9cedee>", line 39, in get_delta_variance
    score.set_index("anime_id")
  File "/Users/kundan/opt/anaconda3/lib/python3.8/site-packages/pandas/core/groupby/groupby.py", line 859, in apply
    result = self._python_apply_general(f, self._selected_obj)
  File "/Users/kundan/opt/anaconda3/lib/python3.8/site-packages/pandas/core/groupby/groupby.py", line 892, in _python_apply_general
    keys, values, mutated = self.grouper.apply(f, data, self.axis)
  File "/Users

TypeError: object of type 'NoneType' has no len()

In [None]:
store_deltas(
    recommendee=recommendee,
    neighborhood_size=8192,
    score_fn=get_user_scores,
    delta_name="user",
)

 19%|█▉        | 71/375 [2:10:21<9:09:06, 108.38s/it] 

In [None]:
store_baselines(recommendee=recommendee)

In [None]:
# TODO figure out how to make loocv faster