# Recommendations Graph
* We construct a graph of recommendations
* The vertices are anime ids and the number of edges between anime_i and anime_j is the number of recommendations between them
* We get the adjacency matrix of this graph, normalize it, compute powers, and store it to disk

In [1]:
import ast
import os
import pickle
import random

import numpy as np
import pandas as pd
import scipy.sparse
from tqdm import tqdm

In [2]:
# Change the working directory so that every relative path used below
# (reads and pickle writes) resolves against ../../data/processed_data.
os.chdir("../../data/processed_data")

In [3]:
# Load the anime table; the path is relative to the working directory set by
# the os.chdir call earlier in the notebook.
anime = pd.read_csv("../cleaned_data/anime.csv")

In [4]:
# Show the loaded frame as a quick sanity check of its columns and row count.
anime

Unnamed: 0,anime_id,title,genres,medium,related_anime,recommendations
0,1,Cowboy Bebop,"['Action', 'Adventure', 'Comedy', 'Drama', 'Sc...",tv,"[{'anime_id': 5, 'relation': 'side_story'}, {'...","[{'anime_id': 205, 'num_recommendations': 102}..."
1,5,Cowboy Bebop: Tengoku no Tobira,"['Action', 'Drama', 'Mystery', 'Sci-Fi', 'Space']",movie,"[{'anime_id': 1, 'relation': 'parent_story'}]","[{'anime_id': 4106, 'num_recommendations': 3},..."
2,6,Trigun,"['Action', 'Sci-Fi', 'Adventure', 'Comedy', 'D...",tv,"[{'anime_id': 4106, 'relation': 'side_story'}]","[{'anime_id': 1, 'num_recommendations': 73}, {..."
3,7,Witch Hunter Robin,"['Action', 'Mystery', 'Police', 'Supernatural'...",tv,[],"[{'anime_id': 2025, 'num_recommendations': 12}..."
4,8,Bouken Ou Beet,"['Adventure', 'Fantasy', 'Shounen', 'Supernatu...",tv,"[{'anime_id': 1123, 'relation': 'sequel'}]","[{'anime_id': 136, 'num_recommendations': 2}, ..."
...,...,...,...,...,...,...
17866,48992,Aru Nihon no Ekaki Shounen,"['Comedy', 'Dementia']",movie,[],[]
17867,48997,Fantasy Bishoujo Juniku Ojisan to,"['Adventure', 'Comedy', 'Fantasy']",tv,[],[]
17868,49014,Daisan no Shinzou,['Music'],music,[],"[{'anime_id': 30771, 'num_recommendations': 1}..."
17869,49020,Jashin Shoukan: Inran Kyonyuu Oyako Ikenie Gis...,['Hentai'],ova,[],[]


In [5]:
# Build the raw edge list: one row per (source, target) recommendation pair.
# Each row's "recommendations" cell is a stringified list of dicts with the keys
# 'anime_id' and 'num_recommendations'.
rec_dfs = []
for i, row in tqdm(anime.iterrows(), total=len(anime)):
    # Parse with ast.literal_eval rather than eval: it only accepts Python
    # literals, so text coming from the CSV can never execute arbitrary code.
    recs = ast.literal_eval(row["recommendations"])
    rec_df = pd.DataFrame.from_records(recs)
    # The anime owning this row is the edge target; the recommended anime_id
    # becomes the edge source.
    rec_df["target"] = row["anime_id"]
    rec_df = rec_df.rename({"anime_id": "source"}, axis=1)
    rec_dfs.append(rec_df)
# Stack the per-anime frames and cast every column to int.
rec_df = pd.concat(rec_dfs, ignore_index=True).astype(int)

100%|███████████████████████████████████████████████████████████████████████████| 17871/17871 [00:18<00:00, 989.72it/s]


In [6]:
# When no recommendations were collected, substitute an empty frame that still
# carries the three expected columns so the later cells can run.
if rec_df.empty:
    rec_df = pd.DataFrame(
        {column: [] for column in ('source', 'target', 'num_recommendations')}
    )

In [7]:
# Persist the raw edge list (before the weight rename, symmetrization and
# normalization done in later cells) into the current working directory.
rec_df.to_pickle("mal_user_recs_raw.pkl")

In [8]:
def make_symmetric(rec_df):
    """Return an undirected version of an edge list.

    Every row is mirrored (its source and target swapped) and appended to the
    original rows; rows that then share the same (source, target) pair are
    collapsed by summing their remaining columns.

    Parameters
    ----------
    rec_df : pandas.DataFrame
        Edge list with 'source' and 'target' columns plus value columns.

    Returns
    -------
    pandas.DataFrame
        One row per (source, target) pair, ordered by the groupby keys, with
        'source' and 'target' restored as regular columns.
    """
    swap_endpoints = {"source": "target", "target": "source"}
    mirrored = rec_df.rename(columns=swap_endpoints)
    both_directions = pd.concat([rec_df, mirrored], ignore_index=True)
    return both_directions.groupby(["source", "target"]).sum().reset_index()

In [9]:
def normalize_edges(df):
    """Scale each edge weight by the total weight of both of its endpoints.

    For every row the new weight is
    ``weight / sum(weights sharing its source) / sum(weights sharing its target)``.

    Parameters
    ----------
    df : pandas.DataFrame
        Edge list with 'source', 'target' and 'weight' columns.

    Returns
    -------
    pandas.DataFrame
        A new frame with the rescaled 'weight'; the helper total columns are
        removed before returning.
    """
    per_source_total = df.groupby('source')['weight'].sum().rename('source_weight')
    df = df.merge(per_source_total, on='source')
    # The per-target totals are taken from the frame produced by the first merge,
    # using the still-unscaled weights.
    per_target_total = df.groupby('target')['weight'].sum().rename('target_weight')
    df = df.merge(per_target_total, on='target')
    df['weight'] = df['weight'].div(df['source_weight']).div(df['target_weight'])
    return df.drop(columns=['source_weight', 'target_weight'])

In [10]:
def to_csr_mat(df):
    """Convert an edge list into a square sparse adjacency matrix.

    Parameters
    ----------
    df : pandas.DataFrame
        Edge list with integer 'source' and 'target' columns and a 'weight'
        column. Entry (source, target) of the result holds the weight.

    Returns
    -------
    scipy.sparse.csr_matrix
        An n x n matrix where n is one more than the largest id found in
        either endpoint column, or 1 x 1 when ``df`` is empty.
    """
    if df.empty:
        # handle empty dataframes gracefully
        n = 1
    else:
        # Size both axes from the largest id in EITHER column. Letting scipy
        # infer the shape per axis gives a non-square matrix whenever the
        # largest id appears only as a source or only as a target, and such a
        # matrix cannot be multiplied by itself.
        n = int(max(df['source'].max(), df['target'].max())) + 1
    return scipy.sparse.csr_matrix(
        (df['weight'], (df['source'], df['target'])), shape=(n, n)
    )

In [11]:
# Treat the recommendation count as the edge weight, then make the edge list
# undirected and normalize each weight by the totals of its endpoints.
rec_df = (
    rec_df.rename(columns={"num_recommendations": "weight"})
    .pipe(make_symmetric)
    .pipe(normalize_edges)
)

In [None]:
# Repeatedly square the normalized adjacency matrix, saving the edge list that
# exists at the start of every pass as mal_user_recs_pow_{i}.pkl.
# NOTE(review): every pass multiplies the current matrix by itself, so file i is
# the result of i-1 successive squarings, not of i-1 multiplications by the
# original matrix -- confirm this is the intended meaning of "pow".
max_power = 10
for i in tqdm(range(1, max_power)):
    rec_df.to_pickle(f"mal_user_recs_pow_{i}.pkl")    
    # Total weight before squaring; used to rescale the new weights below.
    weight_sum = rec_df['weight'].sum()
    adj_mat = to_csr_mat(rec_df)
    adj_mat = adj_mat @ adj_mat
    # Convert to COO to read the squared matrix back as (row, col, value) triples.
    mat = adj_mat.tocoo() 
    rec_df = pd.DataFrame({'source': mat.row, 'target': mat.col, 'weight': mat.data})
    # Drop the diagonal (self-loops) produced by the squaring.
    rec_df = rec_df.loc[lambda x: x['source'] != x['target']]
    rec_df = make_symmetric(rec_df)    
    rec_df = normalize_edges(rec_df)
    rec_df['weight'] /= weight_sum
# The loop saves files 1 .. max_power-1; the result of the last pass is saved here.
rec_df.to_pickle(f"mal_user_recs_pow_{max_power}.pkl")    

 67%|███████████████████████████████████████████████████████▎                           | 6/9 [46:38<32:46, 655.52s/it]