# MAL Recommendations Graph
* We construct an undirected graph of recommendations
* The vertices are anime ids and weight of the edge between $a_i$ and $a_j$ is the number of recommendations between them
* We get the adjacency matrix of this graph and store it to disk

In [1]:
import numpy as np
import pandas as pd
from tqdm import tqdm

In [2]:
def parse_recommendations():
    # compute the recommendations graph
    anime = pd.read_csv("../../data/raw_data/anime.csv")
    if "recommendations" not in anime.columns:
        return pd.DataFrame.from_dict(
            {"source": [], "target": [], "num_recommendations": []}
        )
    rec_dfs = []
    for i, row in tqdm(anime.iterrows(), total=len(anime)):
        rec_df = pd.DataFrame.from_records(eval(row["recommendations"]))
        rec_df["target"] = row["anime_id"]
        rec_df = rec_df.rename({"anime_id": "source"}, axis=1)
        rec_dfs.append(rec_df)
    rec_df = pd.concat(rec_dfs, ignore_index=True).astype(int)

    # relabel anime ids
    anime_to_uid = pd.read_csv("../../data/processed_data/anime_to_uid.csv")
    anime_to_uid = anime_to_uid.set_index("anime_id").to_dict()["uid"]
    rec_df = rec_df.loc[
        lambda x: x.source.isin(list(anime_to_uid)) & x.target.isin(list(anime_to_uid))
    ]
    rec_df["source"].replace(anime_to_uid, inplace=True)
    rec_df["target"].replace(anime_to_uid, inplace=True)
    return rec_df

In [3]:
rec_df = parse_recommendations()

100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 19119/19119 [00:41<00:00, 465.34it/s]


In [4]:
# Ensure that num_recommendations from x -> y is equal to num_recommendations from y -> x
# This should be almost always be true in the original data, whenever x -> y and y -> x are both present
# Exceptions occur if a recommendation was made during the scraping process
# Its possible for x -> y to be defined but not y -> x. This is because of MAL clipping entries
# with not enough recommendations
def make_symmetric(rec_df):
    rec_df_flipped = rec_df.rename({"source": "target", "target": "source"}, axis=1)
    rec_df_full = pd.concat([rec_df, rec_df_flipped], ignore_index=True)
    rec_df_full = rec_df_full.groupby(["source", "target"]).max().reset_index()
    rec_df = rec_df_full
    return rec_df

In [5]:
make_symmetric(rec_df).to_csv("../../data/processed_data/mal_recs.csv", index=False)

## Verify Symmetry Assumptions

In [6]:
def get_num_symmetry_exceptions(rec_df):
    # return the number of times that num_recommendations from x -> y
    # is not equal to num_recommendations from y -> x, whenever both are
    # defined
    exceptions = 0
    for i in tqdm(range(len(rec_df))):
        s = rec_df.iloc[i]["source"]
        t = rec_df.iloc[i]["target"]
        nr = rec_df.iloc[i]["num_recommendations"]
        transpose = rec_df.loc[lambda x: (x.source == t) & (x.target == s)]
        if len(transpose) > 0:
            assert len(transpose) == 1
            nrt = transpose.iloc[0]["num_recommendations"]
            if nr != nrt:
                exceptions += 1
    return exceptions

In [7]:
# This should be a small number
get_num_symmetry_exceptions(rec_df)

100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 33893/33893 [00:59<00:00, 573.27it/s]


2

In [8]:
assert get_num_symmetry_exceptions(make_symmetric(rec_df)) == 0

100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 49380/49380 [01:19<00:00, 624.45it/s]
