In [1]:
import functools
import os
import pickle
import random

import numpy as np
import pandas as pd
import scipy.stats as st
import seaborn as sns
import statsmodels.formula.api as smf
from matplotlib import pyplot as plt


@functools.wraps(smf.ols)
def lm(*args, **kwargs):
    # Convenience wrapper: forward every argument to smf.ols and return the
    # fitted results instead of the unfitted model. functools.wraps copies
    # smf.ols's metadata (name, docstring) onto this function.
    ols_model = smf.ols(*args, **kwargs)
    return ols_model.fit()

In [2]:
# Seed NumPy's global random number generator with a fixed value so that
# random operations drawing from it give the same result on every run.
np.random.seed(0)

In [3]:
# Move the working directory up one level; the relative paths used in the
# rest of the notebook are resolved against it.
# NOTE(review): this is not idempotent — re-running the cell moves up again.
os.chdir("..")

In [4]:
# Load the complete user/anime list CSV into memory.
# NOTE(review): the next cell only uses username, anime_id and my_score;
# passing usecols=[...] here would presumably cut load time and memory — confirm
# that no other cell needs the remaining columns first.
raw_df = pd.read_csv("UserAnimeList.csv")

KeyboardInterrupt: 

In [None]:
# Keep only the three columns used below and drop every row whose my_score is 0.
filtered_df = raw_df.loc[
    raw_df["my_score"] != 0, ["username", "anime_id", "my_score"]
]

In [None]:
def read_xml(file, username):
    """Load one user's scored entries from an XML list file.

    Every direct child of the XML root is treated as one record. The text of
    the record's sub-element at position 0 is read as the anime id and the
    text of the sub-element at position 9 as the score. Records lacking
    either value, and records whose score is 0, are dropped.

    Parameters
    ----------
    file : str or path-like
        Path of the XML file to read.
    username : str
        Value written to the ``username`` column of every returned row.

    Returns
    -------
    pandas.DataFrame
        Columns ``anime_id`` (int), ``my_score`` (int) and ``username`` (str),
        with a fresh 0..n-1 index.

    Raises
    ------
    KeyError
        If no record has at least ten sub-elements.
    """
    import xml.etree.ElementTree as ET

    # ET.parse opens and closes the file itself (no leaked handle) and decodes
    # it with the encoding declared in the XML rather than the locale default.
    root = ET.parse(file).getroot()

    # One row per record, one column per sub-element position; shorter records
    # are padded with missing values by the DataFrame constructor.
    records = [[field.text for field in record] for record in root]
    fields = pd.DataFrame(records)

    df = fields[[0, 9]].dropna().rename(columns={0: "anime_id", 9: "my_score"})
    df["username"] = username
    df["anime_id"] = df["anime_id"].astype(int)
    df["my_score"] = df["my_score"].astype(int)
    df["username"] = df["username"].astype(str)
    # Drop entries whose score is 0.
    df = df.loc[df["my_score"] != 0]
    return df.reset_index(drop=True)


def add_user(full_df, xml_file, username):
    """Return full_df with username's rows replaced by those read from xml_file.

    Any existing rows of ``full_df`` whose ``username`` equals ``username``
    are removed, the rows parsed by ``read_xml`` are appended, and the result
    gets a fresh 0..n-1 index. ``full_df`` itself is not modified.
    """
    new_rows = read_xml(xml_file, username)
    is_other_user = full_df["username"] != username
    return pd.concat([full_df.loc[is_other_user], new_rows], ignore_index=True)

In [None]:
# Replace any existing rows for user "Fro116" with the entries read from
# that user's XML file.
filtered_df = add_user(filtered_df, "user_profiles/Fro116.xml", "Fro116")

In [None]:
# Global mean score; both bias tables below are measured relative to it.
average_rating = filtered_df["my_score"].mean()

# Per-user bias: each user's mean score minus the global mean
# (one-column frame indexed by username).
user_bias = (
    filtered_df.groupby("username")["my_score"].mean().to_frame("user_bias")
    - average_rating
)

# Per-anime bias: each anime's mean score minus the global mean
# (one-column frame indexed by anime_id).
anime_bias = (
    filtered_df.groupby("anime_id")["my_score"].mean().to_frame("anime_bias")
    - average_rating
)

In [None]:
# Attach the anime and user bias to every rating, subtract both biases and the
# global mean from the raw score, then index by username and drop incomplete rows.
filtered_df = (
    filtered_df.merge(anime_bias, on=["anime_id"])
    .merge(user_bias, on=["username"])
    .assign(
        normalized_score=lambda x: (
            x["my_score"] - x["anime_bias"] - x["user_bias"] - average_rating
        )
    )
    # Same values under a second column name.
    .assign(orig_normalized_score=lambda x: x["normalized_score"])
    .set_index("username")
    .dropna()
)

In [None]:
def prepare_prediction(recommendee, neighborhood):
    """Build the per-anime prediction features for one user.

    Parameters
    ----------
    recommendee : label looked up in the module-level ``user_bias`` table.
    neighborhood : DataFrame with ``anime_id``, ``normalized_score`` and
        ``corr`` columns.

    Returns
    -------
    pandas.DataFrame indexed by anime_id with two columns:
        ``delta`` — for each anime, the dot product of ``normalized_score``
        and ``corr`` divided by the sum of ``|corr|``;
        ``blp`` — ``anime_bias`` plus the recommendee's user bias plus
        ``average_rating`` (all read from module-level variables).
    Rows with a missing value in either column are dropped.
    """

    def weighted_delta(group):
        # Correlation-weighted sum of normalized scores, scaled by total |corr|.
        weighted_sum = np.dot(group["normalized_score"], group["corr"])
        return weighted_sum / group["corr"].abs().sum()

    features = pd.DataFrame()
    features["delta"] = neighborhood.groupby("anime_id").apply(weighted_delta)
    features["blp"] = (
        anime_bias + user_bias.loc[recommendee].squeeze() + average_rating
    )
    return features.dropna()

In [None]:
def get_squared_error(df, pred_df, recommendee):
    """Sum of squared prediction errors over the recommendee's rated items.

    Parameters
    ----------
    df : DataFrame indexed by username with ``anime_id`` and ``my_score``
        columns.
    pred_df : DataFrame whose index is named ``anime_id`` and which has a
        ``score`` column holding the predictions.
    recommendee : index label of the user to evaluate.

    Returns
    -------
    The sum of ``(my_score - score) ** 2`` over every anime that appears both
    in the recommendee's rows of ``df`` and in ``pred_df``.
    """
    # Selecting with a list always yields a DataFrame; df.loc[recommendee]
    # collapses to a Series when the user has exactly one row, which turns
    # .anime_id into a scalar and breaks the steps below.
    actual = df.loc[[recommendee]]
    recommendee_df = pred_df.loc[pred_df.index.intersection(actual["anime_id"])]
    recommendee_df = recommendee_df.merge(
        actual.set_index("anime_id")["my_score"], on="anime_id"
    )
    errors = recommendee_df["my_score"] - recommendee_df["score"]
    return np.dot(errors, errors)

In [None]:
# Preview the first rows of the table after normalization.
filtered_df.head()

In [None]:
def compute_accuracy_metrics(
    is_df, oos_df, recommendee, neighborhood_sizes, full_neighborhoods
):
    """Evaluate the item-neighborhood predictor for several neighborhood sizes.

    For each size, a per-anime ``delta`` feature is built from the
    recommendee's in-sample ratings and the stored item correlations, a
    no-intercept linear model ``target ~ delta`` is fitted on the
    recommendee's in-sample shows, and coverage and RMSE are measured both
    in-sample and on the held-out rows.

    Parameters
    ----------
    is_df : DataFrame indexed by username with ``anime_id``, ``my_score``,
        ``normalized_score`` and ``orig_normalized_score`` columns
        (in-sample ratings).
    oos_df : DataFrame with the same layout holding the recommendee's
        held-out ratings.
    recommendee : index label of the user being evaluated.
    neighborhood_sizes : iterable of ints, the neighborhood sizes to try.
    full_neighborhoods : bool. If False, each anime's correlation list is cut
        to its ``neighborhood_size`` most similar items before it is joined
        with the recommendee's ratings. If True, the cut is applied after the
        join, i.e. among items the recommendee has rated.

    Returns
    -------
    pandas.DataFrame with one row per neighborhood size and the columns
    ``neighborhood_size``, ``full_neighborhoods``, ``is_rmse``,
    ``is_coverage``, ``oos_rmse`` and ``oos_coverage``, all stored as floats.
    Items without a prediction contribute their squared
    ``orig_normalized_score`` to the RMSE.
    """
    # NOTE(review): pickle.load can execute arbitrary code; only load this
    # file from a trusted source.
    with open("item_correlations/correlations.pkl", "rb") as corr_file:
        all_corrs = pickle.load(corr_file)
    all_corrs["similarity"] = all_corrs["corr"].abs()
    all_corrs = all_corrs.dropna()
    # Discard each item's correlation with itself.
    all_corrs = all_corrs.loc[
        lambda x: x.index.get_level_values("anime_id_x")
        != x.index.get_level_values("anime_id_y")
    ]

    anime_var = (
        pd.DataFrame(is_df.groupby("anime_id")["normalized_score"].var())
        .rename({"normalized_score": "anime_var"}, axis=1)
        .dropna()
    )
    user_var = (
        pd.DataFrame(is_df.groupby("username")["normalized_score"].var())
        .rename({"normalized_score": "user_var"}, axis=1)
        .dropna()
    )

    # Collect one plain dict per neighborhood size and build the frame once at
    # the end; DataFrame.append was removed in pandas 2.0 and re-copies the
    # whole table on every call.
    records = []
    for neighborhood_size in neighborhood_sizes:
        # extract model features
        corrs = all_corrs.copy()
        if full_neighborhoods:
            corrs = corrs.groupby("anime_id_x").apply(
                lambda x: x.sort_values(by="similarity")
            )
        else:
            # Keep only the neighborhood_size most similar items per anime.
            corrs = corrs.groupby("anime_id_x").apply(
                lambda x: x.sort_values(by="similarity")[-neighborhood_size:]
            )
        # Drops the first index level — presumably the group key that
        # groupby().apply() prepended; confirm against the pickle's index.
        corrs.index = corrs.index.droplevel()

        score = is_df.loc[recommendee].merge(
            corrs.reset_index("anime_id_x"), left_on="anime_id", right_on="anime_id_y"
        )
        # Use the recommendee passed in rather than a hard-coded username.
        score["user_var"] = user_var.loc[recommendee].squeeze()
        score = score.merge(anime_var, on="anime_id")
        # From here on "anime_id" is the anime being predicted, not the rated one.
        score = score.drop("anime_id", axis=1).rename(
            {"anime_id_x": "anime_id"}, axis=1
        )

        if full_neighborhoods:
            # Cut to neighborhood_size only after restricting to rated items.
            score = (
                score.groupby("anime_id")
                .apply(lambda x: x.sort_values(by="similarity")[-neighborhood_size:])
                .reset_index(drop=True)
            )
        pred_df = prepare_prediction(recommendee, score)

        # train linear model
        recommendee_seen_shows = is_df.loc[recommendee].merge(pred_df, on=["anime_id"])
        recommendee_seen_shows["target"] = (
            recommendee_seen_shows["my_score"] - recommendee_seen_shows["blp"]
        )
        model = lm("target ~ delta + 0", recommendee_seen_shows)

        # inference
        pred_df["score"] = model.predict(pred_df) + pred_df["blp"]
        is_pred_df = pred_df.loc[
            lambda x: x.index.isin(is_df.loc[recommendee].anime_id)
        ]
        oos_pred_df = pred_df.loc[lambda x: x.index.isin(oos_df.anime_id)]

        # compute coverage
        is_coverage = len(is_pred_df) / len(is_df.loc[recommendee])
        oos_coverage = len(oos_pred_df) / len(oos_df)

        # compute rmse
        missing_is = is_df.loc[recommendee].loc[
            lambda x: ~x.anime_id.isin(is_pred_df.index)
            & ~x.anime_id.isin(oos_df.anime_id)
        ]
        missing_oos = oos_df.loc[lambda x: ~x.anime_id.isin(oos_pred_df.index)]
        is_se = get_squared_error(is_df, is_pred_df, recommendee)
        oos_se = get_squared_error(oos_df, oos_pred_df, recommendee)
        missing_is_se = np.dot(
            missing_is["orig_normalized_score"], missing_is["orig_normalized_score"]
        )
        missing_oos_se = np.dot(
            missing_oos["orig_normalized_score"], missing_oos["orig_normalized_score"]
        )
        is_rmse = np.sqrt((is_se + missing_is_se) / len(is_df.loc[recommendee]))
        oos_rmse = np.sqrt((oos_se + missing_oos_se) / len(oos_df))
        records.append(
            {
                "neighborhood_size": neighborhood_size,
                "full_neighborhoods": full_neighborhoods,
                "is_rmse": is_rmse,
                "is_coverage": is_coverage,
                "oos_rmse": oos_rmse,
                "oos_coverage": oos_coverage,
            }
        )

    # Every column is cast to float so the boolean flag is stored as 0.0/1.0
    # and can be selected numerically, e.g. .xs(0, level="full_neighborhoods").
    return pd.DataFrame(records).astype(float)

In [None]:
%%time
from tqdm import tqdm

recommendee = "Fro116"
K = 10  # number of cross-validation folds
base = np.sqrt(2)  # growth factor between consecutive neighborhood sizes

errors_by_neighborhood_size = []
max_size = len(filtered_df.anime_id.unique())
# Geometric grid of neighborhood sizes (powers of `base`, truncated to ints),
# plus the number of distinct anime as the largest size.
neighborhood_sizes = [
    int(base ** i) for i in range(int(np.log(max_size) / np.log(base)) + 1)
] + [max_size]
# Truncation creates duplicates for small exponents; keep each size once.
neighborhood_sizes = sorted(set(neighborhood_sizes))
# Shuffle the recommendee's ratings and cut them into K folds.
splits = np.array_split(filtered_df.loc[recommendee].sample(frac=1), K)
for split in splits:
    display(split.head())

for split in tqdm(splits):
    oos_df = split
    # In-sample data: everything except the recommendee's held-out ratings.
    is_df = filtered_df.loc[
        lambda x: ~(
            (x.index.get_level_values("username") == recommendee)
            & x.anime_id.isin(oos_df.anime_id)
        )
    ]
    # Evaluate both neighborhood modes on the same fold. The user is passed
    # via `recommendee` so the folds and the evaluation cannot disagree.
    for full_neighborhoods in (False, True):
        # we take copies as a safety precaution
        errors_by_neighborhood_size.append(
            compute_accuracy_metrics(
                is_df.copy(),
                oos_df.copy(),
                recommendee,
                neighborhood_sizes,
                full_neighborhoods,
            )
        )

In [None]:
# Stack every metrics table collected in the loop above into one frame
# with a fresh 0..n-1 index.
allerrors = pd.concat(errors_by_neighborhood_size, ignore_index=True)

In [None]:
# Display the combined metrics table.
allerrors

In [None]:
# Reshape to one row per (neighborhood_size, full_neighborhoods, metric):
# the metric's name goes into "variable" and its number into "value".
# NOTE(review): despite its name, wide_data holds the long-format table.
wide_data = pd.melt(allerrors, ["neighborhood_size", "full_neighborhoods"])

In [None]:
# Prefix the metric names of full-neighborhood runs with "full_" so they are
# distinguishable from the other runs. Rows that already carry the prefix are
# skipped, so re-running this cell does not produce "full_full_..." names.
needs_prefix = (wide_data["full_neighborhoods"] == True) & ~wide_data[  # noqa: E712
    "variable"
].str.startswith("full_")
wide_data.loc[needs_prefix, "variable"] = (
    "full_" + wide_data.loc[needs_prefix, "variable"]
)

In [None]:
# Coverage metrics against neighborhood size, one line per metric, log-scaled x axis.
plt.figure(figsize=(20, 10))
_ = sns.lineplot(
    data=wide_data[wide_data["variable"].str.contains("coverage")],
    x="neighborhood_size",
    y="value",
    hue="variable",
).set(title="Prediction Coverage", xscale="log")

In [None]:
# RMSE metrics against neighborhood size, one line per metric, log-scaled x axis.
plt.figure(figsize=(20, 10))
_ = sns.lineplot(
    data=wide_data[wide_data["variable"].str.contains("rmse")],
    x="neighborhood_size",
    y="value",
    hue="variable",
).set(title="Root Mean Squared Error", xscale="log")

In [None]:
# Raise pandas' display limits so the table below is shown in full.
pd.set_option("display.max_rows", 500)
pd.set_option("display.max_columns", 500)
pd.set_option("display.width", 1000)

# Average each metric per (full_neighborhoods, neighborhood_size) setting and
# show the ten settings with the lowest out-of-sample RMSE.
(
    allerrors.groupby(["full_neighborhoods", "neighborhood_size"])
    .mean()
    .sort_values(by="oos_rmse")
    .head(10)
)

In [None]:
# My takeaway from this is that it doesn't matter whether or not we normalize variance.
# For simplicity, let's forgo normalization.

In [None]:
# Raise pandas' display limits so the table below is shown in full.
pd.set_option("display.max_rows", 500)
pd.set_option("display.max_columns", 500)
pd.set_option("display.width", 1000)

# Same averaged table, restricted to full_neighborhoods == 0: the ten
# neighborhood sizes with the lowest out-of-sample RMSE.
(
    allerrors.groupby(["full_neighborhoods", "neighborhood_size"])
    .mean()
    .xs(0, level="full_neighborhoods")
    .sort_values(by="oos_rmse")
    .head(10)
)

In [None]:
# Raise pandas' display limits so the table below is shown in full.
pd.set_option("display.max_rows", 500)
pd.set_option("display.max_columns", 500)
pd.set_option("display.width", 1000)

# Same averaged table, restricted to full_neighborhoods == 1: the ten
# neighborhood sizes with the lowest out-of-sample RMSE.
(
    allerrors.groupby(["full_neighborhoods", "neighborhood_size"])
    .mean()
    .xs(1, level="full_neighborhoods")
    .sort_values(by="oos_rmse")
    .head(10)
)