# Assigns a unique id for each user and item
* The ids are guaranteed to be consecutive integers starting at 0
* We drop users and items that have no ratings

In [1]:
import csv
import os

import pandas as pd
from tqdm import tqdm

In [2]:
# Directory holding the raw input CSVs; the path is relative, so it resolves
# against the current working directory.
source_dir = "../../data/raw_data"

In [3]:
# Directory that receives the id mappings and the re-keyed ratings file.
outdir = "../../data/processed_data"
# makedirs also creates missing parent directories, and exist_ok=True makes
# re-running this cell a no-op without a racy exists()-then-mkdir check.
os.makedirs(outdir, exist_ok=True)

In [4]:
# Load the raw ratings table.
# With pandas' default NA parsing, usernames such as "NA", "null" or "nan"
# would be read as NaN and could then never match the literal text that is
# looked up when the same file is re-read line by line further down. Only the
# empty string is therefore treated as a missing value.
user_anime_lists = pd.read_csv(
    os.path.join(source_dir, "user_anime_lists.csv"),
    keep_default_na=False,
    na_values=[""],
)

In [5]:
# Assign a dense uid to every anime_id that appears in at least one rating.
# Series.unique() returns values in order of first appearance, so the mapping
# is reproducible for a given input file (unlike iterating a Python set, whose
# order is unspecified) and avoids boxing every row into a Python object.
items = pd.DataFrame()
items["anime_id"] = user_anime_lists["anime_id"].unique()
# The freshly built frame has a 0..n-1 index, which supplies consecutive uids.
items["uid"] = items.index
items.to_csv(os.path.join(outdir, "anime_to_uid.csv"), index=False)

In [6]:
# Assign a dense uid to every username that appears in at least one rating.
# String hashing is randomised per interpreter process, so list(set(...))
# would hand out different uids on every run; Series.unique() keeps
# first-appearance order and makes the mapping reproducible for a given file.
usernames = pd.DataFrame()
usernames["username"] = user_anime_lists["username"].unique()
# The freshly built frame has a 0..n-1 index, which supplies consecutive uids.
usernames["uid"] = usernames.index
usernames.to_csv(os.path.join(outdir, "username_to_uid.csv"), index=False)

In [7]:
# In-memory lookup tables from the original identifiers to their uids.
anime_to_uid = dict(zip(items["anime_id"], items["uid"]))
username_to_uid = dict(zip(usernames["username"], usernames["uid"]))

In [8]:
# Rewrite the raw ratings file with usernames and anime ids replaced by their
# uids. The header row is copied through and my_score is passed along as-is.
# Both files are opened as UTF-8 explicitly: pandas decoded the same input as
# UTF-8 when the lookup tables were built, so the platform default must not
# be allowed to decode usernames differently here.
with open(
    os.path.join(source_dir, "user_anime_lists.csv"), newline="", encoding="utf-8"
) as in_file, open(
    os.path.join(outdir, "user_anime_lists.csv"), "w", encoding="utf-8"
) as out_file:
    header_line = in_file.readline()
    if header_line:
        # newline="" (required by the csv module) leaves line endings
        # untranslated, so normalise the header's terminator by hand.
        out_file.write(header_line.rstrip("\r\n") + "\n")
    # csv.reader copes with quoted fields and embedded commas, which a bare
    # split(",") would mis-split.
    for row in tqdm(csv.reader(in_file)):
        if not row:
            # Skip blank lines instead of failing on the unpack below.
            continue
        # A row with any other number of fields still raises ValueError.
        username, anime_id, my_score = row
        username_uid = username_to_uid[username]
        anime_uid = anime_to_uid[int(anime_id)]
        out_file.write(f"{username_uid},{anime_uid},{my_score}\n")

85391079it [02:37, 540893.95it/s]
