# Related Anime Graph
* We make an undirected graph of anime, where each anime is a vertex
* Two vertices are connected if one anime is related to the other, where valid relations include sequels, prequels, etc.
* We then compute the connected components and store the membership list

In [1]:
import os

import igraph as ig
import pandas as pd
from tqdm import tqdm

In [2]:
# Work from the processed-data directory; the relative paths used in the
# following cells are resolved against it.
os.chdir("../../data/processed_data")

In [3]:
# Load the cleaned anime table from the sibling `cleaned_data` directory.
anime = pd.read_csv("../cleaned_data/anime.csv")

In [4]:
def get_related_series(data, anime_id):
    """Build the edge list of anime related to ``anime_id``.

    Args:
        data: String that evaluates to a sequence of dicts, each with an
            ``"anime_id"`` and a ``"relation"`` key.
        anime_id: Identifier written to the ``source`` column of every row.

    Returns:
        A DataFrame with columns ``source``, ``target`` and ``relationship``,
        one row per entry in ``data``; an empty frame with the same columns
        when ``data`` holds no entries.
    """
    columns = ["source", "target", "relationship"]

    # NOTE(review): eval() executes arbitrary code. It is kept because the
    # exact serialisation of `data` is not visible here; if the column only
    # ever holds plain literals, ast.literal_eval is the safer choice.
    related = eval(data)
    if not related:
        return pd.DataFrame({name: [] for name in columns}, dtype=int)

    # Build the frame in a single step: DataFrame.append was removed in
    # pandas 2.0 and copied the whole frame on every call.
    rows = [
        {
            "source": anime_id,
            "target": entry["anime_id"],
            "relationship": entry["relation"],
        }
        for entry in related
    ]
    return pd.DataFrame(rows, columns=columns)

In [5]:
# Collect one edge frame per anime, then stack them into a single edge list.
# Iterating the two columns directly avoids materialising a row Series twice
# per anime through .iloc; `total` keeps the progress bar sized, since zip()
# has no length.
all_related_series = []
for anime_id, related_anime in tqdm(
    zip(anime["anime_id"], anime["related_anime"]), total=len(anime)
):
    related_series = get_related_series(related_anime, anime_id)
    all_related_series.append(related_series)
related_series_df = pd.concat(all_related_series, ignore_index=True)
# Ensure both id columns are integers after concatenation.
related_series_df[["source", "target"]] = related_series_df[
    ["source", "target"]
].astype(int)

100%|██████████| 14478/14478 [00:44<00:00, 325.18it/s]


In [6]:
# Show the distinct relation labels present in the edge list.
related_series_df["relationship"].unique()

array(['Side story', 'Summary', 'Parent story', 'Sequel', 'Prequel',
       'Character', 'Alternative version', 'Other', 'Spin-off',
       'Alternative setting', 'Full story'], dtype=object)

In [7]:
# Relation labels that are kept as edges when the graph is built.
valid_relations = ["Sequel", "Prequel", "Parent story", "Alternative setting"]

In [8]:
# Edge-only graph restricted to the accepted relation labels. `g` is assigned
# again in the following cell, which also supplies an explicit vertex list.
is_valid_relation = related_series_df["relationship"].isin(valid_relations)
g = ig.Graph.DataFrame(related_series_df[is_valid_relation])

In [9]:
# Keep only the edges whose relation label is accepted.
edges = related_series_df.loc[lambda x: x["relationship"].isin(valid_relations)]
# Vertex list: every anime in the table plus every relation target, so that
# anime without a valid edge still appear in the graph.
vertices = pd.DataFrame.from_dict(
    {"anime_id": list(set(related_series_df["target"]) | set(anime["anime_id"]))}
)
# `source`/`target` hold anime ids, not 0-based vertex indices. python-igraph
# 0.10 changed the default to use_vids=True, so request name-based matching
# explicitly; the first column of `vertices` then becomes the "name" vertex
# attribute that is read back later.
g = ig.Graph.DataFrame(edges=edges, vertices=vertices, use_vids=False)

In [10]:
# Weakly connected components: edge direction is ignored when grouping.
clusters = g.components(mode="weak")

In [11]:
# Report how many vertices and how many components were found.
clusters.summary()

'Clustering with 14527 elements and 10074 clusters'

In [12]:
# Component index of every vertex, listed in vertex order.
cluster = clusters.membership

In [13]:
# Read the "name" attribute of every vertex, in vertex order.
vertices = clusters.graph.vs["name"]

In [14]:
# Pair each vertex name (anime id) with the index of its component.
series = pd.DataFrame({"anime_id": vertices, "series_id": cluster})

In [15]:
# Two anime that share a series_id are in the same connected component,
# i.e. they are related through a chain of valid relations.
series

Unnamed: 0,anime_id,series_id
0,0,0
1,1,1
2,5,1
3,6,2
4,7,3
...,...,...
14522,37905,10071
14523,37908,10072
14524,37914,10073
14525,37915,7359


In [16]:
# Persist the anime_id -> series_id table; the relative path resolves against
# the working directory set at the top of the notebook.
series.to_pickle("related_anime_graph.pkl")