# Recommendations Graph
* We construct a graph of recommendations
* The vertices are anime ids and the number of edges between anime_i and anime_j is the number of recommendations between them
* We get the adjacency matrix of this graph, normalize it, compute powers, and store it to disk

In [1]:
import os
import pickle
import random

import numpy as np
import pandas as pd
from tqdm import tqdm
import scipy.sparse

In [2]:
# Switch the working directory to the processed-data folder so that the
# relative paths used by the following cells resolve against it.
os.chdir("../../data/processed_data")

In [3]:
# Load the cleaned anime table. The cells below read its "anime_id" and
# "recommendations" columns.
anime = pd.read_csv("../cleaned_data/anime.csv")

In [4]:
# Flatten every anime's recommendation list into one edge-list frame.
# Each "recommendations" cell is a stringified list of records; a record's
# "anime_id" becomes the edge source, and the "anime_id" of the row that
# owns the list becomes the edge target.
# NOTE(review): eval() executes whatever expression is stored in the CSV, so
# this is only safe while anime.csv is a trusted file. If the column holds
# nothing but plain literals, ast.literal_eval would be a safer parser.
rec_dfs = []
# Walk only the two columns that are needed; iterrows() would materialize a
# full Series for each of the rows.
for anime_id, recs in tqdm(
    zip(anime["anime_id"], anime["recommendations"]), total=len(anime)
):
    anime_recs = pd.DataFrame.from_records(eval(recs))
    anime_recs["target"] = anime_id
    rec_dfs.append(anime_recs.rename(columns={"anime_id": "source"}))

if rec_dfs:
    rec_df = pd.concat(rec_dfs, ignore_index=True).astype(int)
else:
    # pd.concat raises ValueError when given no frames at all, so produce an
    # empty edge list with the expected columns instead of crashing.
    rec_df = pd.DataFrame(
        {"source": [], "target": [], "num_recommendations": []}
    ).astype(int)

100%|██████████| 17871/17871 [00:19<00:00, 917.49it/s] 


In [5]:
# When no recommendations were collected, replace the frame with an empty one
# that still carries the three columns the later cells expect.
if rec_df.empty:
    rec_df = pd.DataFrame({'source': [], 'target': [], 'num_recommendations': []})

In [6]:
# Persist the raw edge list before it is symmetrized and normalized below.
rec_df.to_pickle("mal_user_recs_raw.pkl")

In [7]:
def make_symmetric(rec_df):
    """Return an undirected version of the edge list.

    Every edge is duplicated with "source" and "target" swapped, and rows that
    share the same (source, target) pair are then summed, so the weight of
    (a, b) equals the weight of (b, a) in the result.

    Args:
        rec_df: DataFrame with "source" and "target" columns plus the value
            column(s) to be summed. It is not modified.

    Returns:
        A new DataFrame with one row per (source, target) pair, ordered by
        those keys, with a fresh integer index.
    """
    mirrored = rec_df.rename(columns={"source": "target", "target": "source"})
    both_directions = pd.concat([rec_df, mirrored], ignore_index=True)
    summed = both_directions.groupby(["source", "target"]).sum()
    return summed.reset_index()

In [8]:
def normalize_edges(df):
    """Scale each edge weight by the total weight of both of its endpoints.

    The new weight of an edge is
    ``weight / (total weight leaving its source) / (total weight entering its target)``.

    Args:
        df: DataFrame with "source", "target" and "weight" columns. It is not
            modified.

    Returns:
        A new DataFrame with the same columns and the rescaled "weight".
    """
    # Attach the per-source total to every row.
    source_totals = df.groupby('source')['weight'].sum().rename('source_weight')
    df = df.merge(source_totals, on='source')

    # Attach the per-target total to every row.
    target_totals = df.groupby('target')['weight'].sum().rename('target_weight')
    df = df.merge(target_totals, on='target')

    df['weight'] = df['weight'].div(df['source_weight']).div(df['target_weight'])
    return df.drop(columns=['source_weight', 'target_weight'])

In [9]:
def to_csr_mat(df):
    """Convert an edge list into a scipy CSR sparse matrix.

    Args:
        df: DataFrame with integer "source" (row index) and "target" (column
            index) columns and a "weight" column holding the entry values.

    Returns:
        A ``scipy.sparse.csr_matrix``. For a non-empty frame the shape is left
        for scipy to infer from the indices; an empty frame yields an empty
        1x1 matrix.
    """
    coords = (df['source'], df['target'])
    # With no rows there are no indices to infer a shape from, so pin it.
    shape = (1, 1) if df.empty else None
    return scipy.sparse.csr_matrix((df['weight'], coords), shape=shape)

In [10]:
# Use the recommendation count as the edge weight, then make the graph
# undirected and normalize the weights.
rec_df = (
    rec_df.assign(weight=rec_df["num_recommendations"])
    .drop(columns="num_recommendations")
    .pipe(make_symmetric)
    .pipe(normalize_edges)
)

In [11]:
# Save the current graph, then replace it with its (re-normalized) square.
# NOTE(review): the matrix is squared on every pass, so the file named
# pow_{i} holds the graph after i - 1 successive squarings rather than the
# i-th matrix power -- confirm this is the intended meaning of "power".
max_power = 10
for i in tqdm(range(1, max_power)):
    rec_df.to_pickle(f"mal_user_recs_pow_{i}.pkl")
    # Total weight of the graph *before* squaring; used to rescale afterwards.
    weight_sum = rec_df['weight'].sum()
    adj_mat = to_csr_mat(rec_df)
    adj_mat = adj_mat @ adj_mat
    # COO form exposes the row / col / data arrays needed for an edge list.
    mat = adj_mat.tocoo()
    rec_df = pd.DataFrame({'source': mat.row, 'target': mat.col, 'weight': mat.data})
    # Drop self-loops introduced by squaring.
    rec_df = rec_df[rec_df['source'] != rec_df['target']]
    rec_df = normalize_edges(make_symmetric(rec_df))
    rec_df['weight'] = rec_df['weight'] / weight_sum
rec_df.to_pickle(f"mal_user_recs_pow_{max_power}.pkl")

100%|██████████| 9/9 [55:36<00:00, 370.70s/it]
