In [1]:
import ast
import os
import time

import pandas as pd
import requests
from tqdm import tqdm

import igraph as ig

In [2]:
# Switch the working directory so the relative file names used below resolve
# against ../../data/.
# NOTE(review): the path is relative, so re-running this cell moves the working
# directory a second time — run it once per kernel session.
os.chdir("../../data/")

In [3]:
# Load the anime table; later cells read its "anime_id" and "related" columns.
anime = pd.read_csv("AnimeList.csv")

In [4]:
def get_related_series(data, anime_id):
    """Build the edge list of anime-to-anime relations for a single title.

    Parameters
    ----------
    data : str
        Python-literal text of the title's relations: a mapping from
        relationship name (e.g. "Sequel") to a list of dicts that each carry
        at least "type" and "mal_id". An empty literal (e.g. "[]" or "{}")
        means the title has no relations. A non-string value (such as the NaN
        pandas uses for a missing cell) is treated the same way.
    anime_id : int
        Identifier of the title the relations belong to; used as the
        "source" of every edge.

    Returns
    -------
    pandas.DataFrame
        One row per related entry whose "type" is "anime", with columns
        "source", "target" and "relationship". The frame is empty (but still
        has those three columns) when there is nothing to report.
    """
    columns = ["source", "target", "relationship"]

    # Missing cells are not strings and cannot be parsed by literal_eval.
    if not isinstance(data, str):
        return pd.DataFrame(columns=columns)

    related = ast.literal_eval(data)
    if not related:
        return pd.DataFrame(columns=columns)

    # Collect plain records and build the frame once: DataFrame.append was
    # removed in pandas 2.0 and copied the whole frame on every call.
    records = [
        {"source": anime_id, "target": item["mal_id"], "relationship": key}
        for key, val in related.items()
        for item in val
        if item["type"] == "anime"
    ]
    return pd.DataFrame(records, columns=columns)

In [5]:
# Build one edge frame per title, then stack them into a single edge list.
# The two columns are iterated directly: anime.iloc[i] would materialise a
# whole row Series twice per title just to read two values from it.
all_related_series = []
for anime_id, related in tqdm(zip(anime["anime_id"], anime["related"]), total=len(anime)):
    all_related_series.append(get_related_series(related, anime_id))

# Ids are cast to int after concatenation because the per-title frames (and
# the empty ones in particular) do not reliably carry an integer dtype.
related_series_df = pd.concat(all_related_series, ignore_index=True).astype(
    {"source": int, "target": int}
)

100%|██████████| 14478/14478 [00:45<00:00, 315.84it/s]


In [6]:
# Preview the first rows of the edge list.
related_series_df.head()

Unnamed: 0,source,target,relationship
0,11013,13403,Sequel
1,2104,4163,Sequel
2,2104,5199,Side story
3,2104,28385,Alternative version
4,5262,2923,Prequel


In [7]:
# Spot check: every edge that has anime 36791 at either end.
related_series_df[related_series_df[['source', 'target']].eq(36791).any(axis=1)]

Unnamed: 0,source,target,relationship
4909,36790,36791,Sequel
7617,36791,36790,Prequel
15815,37894,36791,Prequel


In [8]:
# List the distinct relationship labels present in the edge list.
related_series_df['relationship'].unique()

array(['Sequel', 'Side story', 'Alternative version', 'Prequel',
       'Summary', 'Other', 'Spin-off', 'Alternative setting', 'Character',
       'Parent story', 'Full story'], dtype=object)

In [18]:
# Display the complete edge list.
related_series_df

Unnamed: 0,source,target,relationship
0,11013,13403,Sequel
1,2104,4163,Sequel
2,2104,5199,Side story
3,2104,28385,Alternative version
4,5262,2923,Prequel
...,...,...,...
15848,28569,26345,Prequel
15849,26089,26087,Other
15850,37897,36816,Prequel
15851,37908,5507,Other


In [9]:
# Relationship labels that are kept as edges when the graph is built in the
# cells below; every other label is filtered out there.
valid_relations = ['Sequel', 'Prequel']

In [10]:
# Build a graph from the edges whose relationship is in valid_relations.
# NOTE(review): `g` is assigned again in the next cell, which also supplies an
# explicit vertex table — this first construction looks superseded; confirm.
g = ig.Graph.DataFrame(related_series_df.loc[lambda x: x['relationship'].isin(valid_relations)])

In [31]:
# Edges: only the rows whose relationship label is one of valid_relations.
edges = related_series_df[related_series_df['relationship'].isin(valid_relations)]

# Vertex table: every id that is the target of any relation (regardless of
# label) together with every id listed in the anime table.
vertices = pd.DataFrame(
    {'anime_id': list(set(related_series_df['target']) | set(anime['anime_id']))}
)

g = ig.Graph.DataFrame(edges=edges, vertices=vertices)

In [32]:
# Weakly connected components of g; each component is used as one "series"
# when the anime_id -> series_id table is assembled below.
clusters = g.components(mode='weak')

In [33]:
# Report how many elements and clusters the clustering contains.
clusters.summary()

'Clustering with 14527 elements and 11992 clusters'

In [34]:
# Membership list of the clustering; it is paired positionally with the
# vertex names when the series table is built below.
cluster = clusters.membership

In [35]:
# "name" attribute of every vertex, in vertex order.
# NOTE: this rebinds `vertices`, which held the vertex DataFrame until now.
vertices = clusters.graph.vs['name']

In [36]:
# Pair each vertex name (stored as anime_id) with the cluster it belongs to
# (stored as series_id).
series = pd.DataFrame({'anime_id': vertices, 'series_id': cluster})

In [37]:
# Display the anime_id / series_id table.
series

Unnamed: 0,anime_id,series_id
0,0,0
1,1,1
2,5,2
3,6,3
4,7,4
...,...,...
14522,37905,11988
14523,37908,11989
14524,37914,11990
14525,37915,8883
