# Item Correlations
* Computes the correlation matrix of items and saves it to disk
* The correlation between item i and item j is given by the adjusted cosine correlation of all user scores.

In [1]:
import os
import pickle
import random

import numpy as np
import pandas as pd
from tqdm import tqdm

In [2]:
# Work from the processed-data directory so the relative file names used below
# (input pickle, per-chunk output directory, combined output) resolve there.
# NOTE: the path is relative to the current working directory, so running this
# cell a second time changes directory again.
os.chdir("../../data/processed_data")

In [3]:
# Load the table of user scores (later cells use its "anime_id", "username" and
# "score" columns). The context manager closes the file handle as soon as
# unpickling finishes instead of leaving it open for the rest of the session.
with open("user_anime_lists.pkl", "rb") as f:
    df = pickle.load(f)

In [4]:
# Index the rows by anime id; get_corrs relies on this for its
# `df.loc[anime_ids]` lookup and for grouping scores by "anime_id".
df = df.set_index("anime_id")

In [5]:
def _nan_strict_group_sum(grouped):
    """Sum each group, yielding NaN for any group that contains a NaN.

    ``GroupBy.sum`` skips missing values whereas a dot product propagates them;
    masking the groups whose non-null count is short of their size keeps the
    propagating behaviour.
    """
    return grouped.sum().where(grouped.count() == grouped.size())


def get_corrs(anime_ids):
    """Correlate the given items with every item in the module-level ``df``.

    ``df`` must be indexed by ``anime_id`` and carry ``username`` and ``score``
    columns.
    NOTE(review): the cosine is taken over ``score`` as stored; for this to be
    the *adjusted* cosine the scores are presumably already centred upstream.

    Args:
        anime_ids: Labels (present in ``df.index``) of the items to correlate
            against all items.

    Returns:
        DataFrame with one row per ``(anime_id_x, anime_id_y)`` pair and the
        columns ``corr`` (cosine correlation), ``corr_var`` (approximate
        variance of ``corr``) and ``size`` (number of users who scored both
        items). Only pairs shared by more than two users are kept, and rows
        are sorted by ascending ``|corr|``.

    Raises:
        KeyError: If an id in ``anime_ids`` is not in ``df.index``.
    """
    # Pair every score of the requested items with every score the same user
    # gave to any item (including the requested item itself).
    item_subset = (
        df.loc[anime_ids].reset_index().merge(df.reset_index(), on="username")
    )

    # compute the adjusted cosine correlation
    # Numerator: per-pair dot product of the two score vectors. Multiplying the
    # columns and summing per group stays in compiled code instead of calling
    # a Python lambda once per pair; float64 avoids narrow-integer overflow.
    item_subset["product"] = (
        item_subset["score_x"].astype("float64") * item_subset["score_y"]
    )
    by_pair = item_subset.groupby(["anime_id_x", "anime_id_y"])
    adj_cos_corr_numerator = _nan_strict_group_sum(by_pair["product"])

    # Denominator: Euclidean norm of each item's scores over all of its users.
    squared_scores = df["score"].astype("float64") ** 2
    score_norms = np.sqrt(
        _nan_strict_group_sum(squared_scores.groupby(level="anime_id"))
    )
    # Name each norm vector after the index level it must align with, so the
    # divisions below broadcast over the matching level of the pair index.
    x_length = score_norms.loc[anime_ids].rename_axis("anime_id_x")
    y_length = score_norms.rename_axis("anime_id_y")
    adj_cos_corr = adj_cos_corr_numerator / x_length / y_length
    # to_frame names the column explicitly regardless of the Series' own name.
    adj_cos_corr = adj_cos_corr.to_frame("corr").dropna()

    # We approximate the variance as the variance for pearson correlation.
    # see https://www.jstor.org/stable/2277400?seq=1
    corr_sizes = by_pair.size()
    corrs = adj_cos_corr.merge(
        corr_sizes.to_frame("size"), on=["anime_id_x", "anime_id_y"]
    )
    # The variance formula divides by (size - 2), so it needs size > 2. Copy so
    # the column assignments below act on an independent frame.
    corrs = corrs.loc[corrs["size"] > 2].copy()
    corrs["corr_var"] = (1 - corrs["corr"] * corrs["corr"]) ** 2 / (corrs["size"] - 2)

    # sorting by |correlation| is useful for downstream tasks
    corrs["similarity"] = corrs["corr"].abs()
    corrs = corrs.sort_values(by="similarity").dropna()
    return corrs[["corr", "corr_var", "size"]]

In [6]:
# create the output directory (a no-op when it already exists, with no
# check-then-create window in between)
outdir = "item_correlations"
os.makedirs(outdir, exist_ok=True)

In [7]:
# Unique item ids in ascending order, split into chunks of roughly 10 ids.
# array_split needs a positive integer section count, so use floor division
# and fall back to a single chunk when there are fewer than 10 items.
anime_ids = sorted(set(df.index))
chunks = np.array_split(anime_ids, max(1, len(anime_ids) // 10))

In [8]:
# Store correlations on disk. We need to do this in chunks for memory reasons
# The first iterations take ~3 minutes to complete
# Note: each chunk can be computed in parallel
# The chunk list is rebuilt here from `df`; the section count is floored to an
# integer and kept at >= 1 so array_split also works with fewer than 10 items.
anime_ids = sorted(set(df.index))
chunks = np.array_split(anime_ids, max(1, len(anime_ids) // 10))
# Raise `offset` to skip chunks that are already on disk; output file names
# always carry the absolute chunk index.
offset = 0
remaining = chunks[offset:]
# Passing `total` lets the progress bar show completion and an ETA, which a
# bare enumerate object cannot provide.
for chunk_idx, chunk in tqdm(
    enumerate(remaining, start=offset), total=len(remaining)
):
    corr = get_corrs(chunk)
    corr.to_pickle(os.path.join(outdir, f"{chunk_idx}.pkl"))

336it [4:03:41, 43.52s/it]  


In [11]:
# combine all chunks into a single file
# Only the per-chunk pickles are read (any other entry in the directory is
# skipped), in sorted name order so the row order of the result is
# reproducible; os.listdir returns entries in arbitrary order.
chunk_files = sorted(name for name in os.listdir(outdir) if name.endswith(".pkl"))
corr_dfs = []
for item in tqdm(chunk_files):
    path = os.path.join(outdir, item)
    # Close each file as soon as it has been unpickled.
    with open(path, "rb") as f:
        corr_dfs.append(pickle.load(f))
corr_df = pd.concat(corr_dfs)

100%|██████████| 1386/1386 [00:12<00:00, 112.60it/s]


In [13]:
# Persist the combined correlation table as a single pickle in the current
# working directory.
corr_df.to_pickle("item_correlations.pkl")