# Importing datasets
* Converts data obtained from the MyAnimeList API into a shared format

In [1]:
import ast
import os
import pickle

import pandas as pd

In [2]:
# Directory the raw input files are read from; the path is relative to the
# notebook's working directory.
source_dir = "../../data/mal/"

In [3]:
# Destination directory for the converted files. os.makedirs with
# exist_ok=True also creates missing parent directories and does not fail when
# the directory already exists, so this cell is safe to re-run and works on a
# fresh checkout where "../../data" has not been created yet.
outdir = "../../data/cleaned_data"
os.makedirs(outdir, exist_ok=True)

## Get anime facts

In [5]:
# Load the raw anime facts table.
anime_facts_path = os.path.join(source_dir, "anime_facts/anime.csv")
anime = pd.read_csv(anime_facts_path)

In [6]:
# Preview the freshly loaded frame.
anime

Unnamed: 0.1,Unnamed: 0,anime_id,title,genres,source,related_anime
0,0,1,Cowboy Bebop,"['Action', 'Adventure', 'Comedy', 'Drama', 'Sc...",original,"[{'anime_id': 5, 'relation': 'side_story'}, {'..."
1,1,5,Cowboy Bebop: Tengoku no Tobira,"['Action', 'Drama', 'Mystery', 'Sci-Fi', 'Space']",original,"[{'anime_id': 1, 'relation': 'parent_story'}]"
2,2,6,Trigun,"['Action', 'Sci-Fi', 'Adventure', 'Comedy', 'D...",manga,"[{'anime_id': 4106, 'relation': 'side_story'}]"
3,3,7,Witch Hunter Robin,"['Action', 'Mystery', 'Police', 'Supernatural'...",original,[]
4,4,8,Bouken Ou Beet,"['Adventure', 'Fantasy', 'Shounen', 'Supernatu...",manga,"[{'anime_id': 1123, 'relation': 'sequel'}]"
...,...,...,...,...,...,...
22546,22546,49002,Milky party THE ANIMATION,"['Ecchi', 'Hentai']",digital_manga,[]
22547,22547,49003,TabiHani,"['Action', 'Comedy', 'Drama', 'Ecchi', 'Fantas...",original,[]
22548,22548,49004,QUI,['Music'],original,[]
22549,22549,49005,27th,['Music'],original,[]


In [5]:
# Keep only the columns used by the shared format and rename them to their
# shared-format names.
# NOTE(review): the frame preview displayed earlier in this notebook lists
# `genres`, `source` and `related_anime` rather than `genre`, `type` and
# `related`, so either that output or this selection is stale — confirm
# against the current anime.csv before relying on this cell.
anime = anime.loc[
    :, ["anime_id", "title", "genre", "type", "related"]
].rename(columns={"genre": "genres", "related": "related_anime"})

In [6]:
def process_related(related):
    """Flatten a serialized relation mapping into a list of anime links.

    Args:
        related: String containing a Python literal for a dict that maps a
            relation name to a list of entry dicts. Each entry must provide
            the keys "mal_id" and "type".

    Returns:
        A list of ``{"anime_id": ..., "relation": ...}`` dicts, one for every
        entry whose "type" equals "anime", in the iteration order of the
        mapping.

    Raises:
        ValueError, SyntaxError: If ``related`` is not a valid Python literal.
        KeyError: If an entry lacks "mal_id" or "type".
    """
    # The text originates from an external data dump, so it is parsed with
    # ast.literal_eval (literals only) instead of eval, which would execute
    # any expression embedded in the data.
    relations = ast.literal_eval(related)
    return [
        {"anime_id": entry["mal_id"], "relation": relation}
        for relation, entries in relations.items()
        for entry in entries
        if entry["type"] == "anime"
    ]

In [7]:
# Replace each serialized relation mapping with its flattened list of links.
flattened_relations = anime["related_anime"].apply(process_related)
anime["related_anime"] = flattened_relations

In [8]:
# Order the rows by anime id before exporting.
anime = anime.sort_values("anime_id")

In [9]:
# Write the converted anime table without the pandas index column.
anime_out_path = os.path.join(outdir, "anime.csv")
anime.to_csv(anime_out_path, index=False)

## Get users

In [4]:
# Read the header-less username file (presumably one username per line —
# confirm against the file). dtype=str together with keep_default_na=False
# keeps every name exactly as written: with pandas' defaults, names such as
# "NA", "null" or "None" are turned into NaN and all-digit names may be parsed
# as numbers (dropping leading zeros).
users = pd.read_csv(
    os.path.join(source_dir, "user_facts/usernames.txt"),
    header=None,
    dtype=str,
    keep_default_na=False,
).rename(columns={0: "username"})

In [5]:
# Export only the username column, without the pandas index.
usernames_out_path = os.path.join(outdir, "usernames.csv")
users.loc[:, ["username"]].to_csv(usernames_out_path, index=False)

## Get user lists

In [6]:
# Loading the full user list table is slow — expect this to run for several minutes.
user_anime_list_path = os.path.join(source_dir, "user_anime_facts/user_anime_list.csv")
anime_lists = pd.read_csv(user_anime_list_path)

In [7]:
# Keep the three exported columns and drop every row whose score equals 0
# (presumably "not scored" — confirm against the API docs), then renumber the
# remaining rows from zero.
has_nonzero_score = anime_lists["my_score"] != 0
anime_lists = anime_lists.loc[
    has_nonzero_score, ["username", "anime_id", "my_score"]
].reset_index(drop=True)

In [8]:
# Store a pickled copy of the cleaned lists.
anime_lists_pickle_path = os.path.join(outdir, "user_anime_lists.pkl")
anime_lists.to_pickle(anime_lists_pickle_path)

In [9]:
# Writing the CSV copy is slow as well — expect several minutes.
anime_lists_csv_path = os.path.join(outdir, "user_anime_lists.csv")
anime_lists.to_csv(anime_lists_csv_path, index=False)

## Write source

In [10]:
# Record the dataset's provenance next to the exported files.
source_note_path = os.path.join(outdir, "source.txt")
with open(source_note_path, "w") as source_file:
    source_file.write("Dataset obtained using the MyAnimeList API")