# Item Correlations
* Computes the item correlation matrix and saves it to disk
* The correlation between two items is given by the adjusted cosine correlation of user scores.

In [1]:
import functools
import os
import pickle
import random
import re

import numpy as np
import pandas as pd
from tqdm import tqdm

In [2]:
# Switch to the processed-data directory so the relative file names used in
# the following cells resolve there.
# NOTE(review): the path is relative to the current working directory, so
# running this cell a second time moves two more levels up - confirm intended.
os.chdir("../../data/processed_data")

In [3]:
# Load the pickled user lists and keep only the columns needed to correlate
# items. The context manager closes the file handle as soon as the load is
# done instead of leaving it to the garbage collector.
with open("user_anime_lists.pkl", "rb") as f:
    df = pickle.load(f)
df = df[["anime_id", "username", "score"]]

In [4]:
@functools.lru_cache()
def adj_cos_corr_denominator():
    """Return the Euclidean norm of the scores of every anime.

    The module-level ``df`` is grouped by ``anime_id`` and, for each group,
    ``sqrt(score . score)`` is computed. The function takes no arguments, so
    the ``lru_cache`` wrapper evaluates it once and hands back the same object
    on every later call; it is not recomputed if ``df`` changes afterwards.

    Returns:
        The per-``anime_id`` result of the groupby-apply (one norm per anime).
    """

    def _score_norm(group):
        scores = group["score"]
        return np.sqrt(np.dot(scores, scores))

    per_anime = df.groupby("anime_id")
    return per_anime.apply(_score_norm)


def get_corrs(df, anime_ids):
    """Correlate every anime in ``anime_ids`` with all anime sharing a rater.

    For each pair ``(x, y)`` with ``x`` in ``anime_ids`` the correlation is the
    sum of ``score_x * score_y`` over the users who scored both, divided by the
    two per-anime score norms from ``adj_cos_corr_denominator()``.

    NOTE(review): no per-user mean is subtracted here; presumably the scores
    are already centred upstream - confirm.

    Args:
        df: Ratings with ``anime_id``, ``username`` and ``score`` columns
            (``anime_id`` may alternatively be the index).
        anime_ids: Ids used as the ``x`` side of every pair.

    Returns:
        DataFrame indexed by ``(anime_id_x, anime_id_y)`` with the columns
        ``corr``, ``corr_var`` and ``size`` (number of shared raters). Pairs
        with two or fewer shared raters and rows containing NaN are dropped.

    Note:
        The norms come from the cached ``adj_cos_corr_denominator()``, which
        reads the module-level ``df`` rather than the ``df`` argument.
    """
    if "anime_id" not in df.columns:
        # Backward compatible with callers that pass a frame indexed by anime_id.
        df = df.reset_index()
    ratings = df[["anime_id", "username", "score"]]

    # Select rows by the anime_id *column*. ``df.loc[anime_ids]`` would treat
    # the ids as row labels, which is only right if the index holds anime ids.
    subset = ratings[ratings["anime_id"].isin(anime_ids)]
    item_subset = subset.merge(ratings, on="username")

    # compute the adjusted cosine correlation
    item_subset["adj_cor_numerator"] = item_subset["score_x"] * item_subset["score_y"]
    pairs = item_subset.groupby(["anime_id_x", "anime_id_y"])
    adj_cos_corr_numerator = pairs["adj_cor_numerator"].sum()

    # The index names let the division align on the matching MultiIndex level.
    # rename_axis returns new objects, so the cached norms are never mutated.
    norms = adj_cos_corr_denominator()
    x_length = norms.loc[anime_ids].rename_axis("anime_id_x")
    y_length = norms.rename_axis("anime_id_y")
    adj_cos_corr = adj_cos_corr_numerator / x_length / y_length
    # to_frame names the column explicitly, whatever the series' own name is.
    adj_cos_corr = adj_cos_corr.to_frame("corr").dropna()

    # We approximate the variance as the variance for pearson correlation.
    # see https://www.jstor.org/stable/2277400?seq=1
    corr_sizes = pairs.size().to_frame("size")
    corrs = adj_cos_corr.merge(corr_sizes, on=["anime_id_x", "anime_id_y"])
    # The variance formula divides by (size - 2), so smaller samples are dropped;
    # copy() so the new column is written to an independent frame.
    corrs = corrs.loc[corrs["size"] > 2].copy()
    corrs["corr_var"] = (1 - corrs["corr"] ** 2) ** 2 / (corrs["size"] - 2)
    corrs = corrs.dropna()
    return corrs[["corr", "corr_var", "size"]]

In [5]:
# Create the output directory that receives one pickle per chunk.
# exist_ok=True replaces the check-then-create pair, so the cell can be re-run
# and there is no window between the existence check and the mkdir.
outdir = "item_correlations"
os.makedirs(outdir, exist_ok=True)

In [6]:
# Store correlations on disk. We need to do this in chunks to prevent running out of memory
# Note: The first iterations take longer than later iterations because of cache warming
# Note: increasing parallelism will make the program run faster but will use more memory
anime_ids = sorted(set(df["anime_id"]))
parallelism = 50  # approximate number of anime handled per chunk
# Always ask for at least one chunk: with fewer than `parallelism` ids the
# quotient is 0 and np.array_split raises on 0 sections.
num_chunks = max(1, len(anime_ids) // parallelism)
chunks = np.array_split(anime_ids, num_chunks)
for i, chunk in tqdm(enumerate(chunks), total=len(chunks)):
    # one pickle per chunk, named by its position in `chunks`
    get_corrs(df, chunk).to_pickle(os.path.join(outdir, f"{i}.pkl"))

100%|████████████████████████████████████████████████████████████████████████████████| 316/316 [31:28<00:00,  5.98s/it]


In [21]:
# combine all chunks into a single file
# Chunk files are named "<digits>.pkl". fullmatch with an escaped dot rejects
# look-alikes such as "12.pkl.bak" that the unanchored pattern let through.
chunk_name = re.compile(r"[0-9]+\.pkl")
corr_dfs = []
# sorted() makes the concatenation order reproducible; os.listdir returns
# entries in arbitrary order.
for item in tqdm(sorted(os.listdir(outdir))):
    if not chunk_name.fullmatch(item):
        print(f"Found unrecognized file {item} in {outdir}")
        continue
    path = os.path.join(outdir, item)
    # close each chunk file as soon as it has been read
    with open(path, "rb") as f:
        corr_dfs.append(pickle.load(f))
corr_df = pd.concat(corr_dfs)

100%|██████████████████████████████████████████████████████████████████████████████| 316/316 [00:00<00:00, 1395.80it/s]


In [22]:
# Write the combined correlation table to a single pickle file; the relative
# name resolves against the current working directory.
corr_df.to_pickle("item_correlations.pkl")