# Assign a unique integer id to each user and item
* The ids are consecutive integers starting at 0, handed out in shuffled order
* Users and items that appear in neither the explicit nor the implicit lists get no id and are dropped

In [1]:
import csv
import os
import random

import pandas as pd
from tqdm import tqdm

In [2]:
# Directory holding the raw input CSVs; the read calls below join file names onto it.
source_dir = "../../data/raw_data"

In [3]:
# Directory that receives the id tables and the re-keyed rating files.
outdir = "../../data/processed_data"
# makedirs(exist_ok=True) creates missing parent directories and avoids the
# check-then-create race of os.path.exists() followed by os.mkdir().
os.makedirs(outdir, exist_ok=True)

In [4]:
# Load the explicit and implicit lists and stack them into one frame with a
# fresh 0..n-1 index. keep_default_na=False makes pandas keep empty fields and
# strings such as "NA" as literal text instead of converting them to NaN.
user_anime_lists = pd.concat(
    [
        pd.read_csv(os.path.join(source_dir, name), keep_default_na=False)
        for name in ("user_explicit_lists.csv", "user_implicit_lists.csv")
    ],
    ignore_index=True,
)

In [5]:
# Build the anime_id -> uid table.
# Sorting the distinct ids gives a well-defined order before the shuffle; the
# shuffle itself is unseeded in the visible cells, so uids differ between runs.
shuffled_items = sorted(set(user_anime_lists["anime_id"]))
random.shuffle(shuffled_items)

# The uid of an item is its row position after shuffling (0, 1, 2, ...).
items = pd.DataFrame({"anime_id": shuffled_items})
items["uid"] = items.index
items.to_csv(os.path.join(outdir, "anime_to_uid.csv"), index=False)

In [6]:
# Build the username -> uid table.
# Sorting the distinct names gives a well-defined order before the shuffle; the
# shuffle itself is unseeded in the visible cells, so uids differ between runs.
shuffled_usernames = sorted(set(user_anime_lists["username"]))
random.shuffle(shuffled_usernames)

# The uid of a user is its row position after shuffling (0, 1, 2, ...).
usernames = pd.DataFrame({"username": shuffled_usernames})
usernames["uid"] = usernames.index
usernames.to_csv(os.path.join(outdir, "username_to_uid.csv"), index=False)

In [7]:
# Plain dict lookups (raw id -> uid) used when re-keying the rating files.
anime_to_uid = dict(zip(items["anime_id"], items["uid"]))
username_to_uid = dict(zip(usernames["username"], usernames["uid"]))

In [8]:
def process(file: str) -> None:
    """Rewrite one raw ratings CSV with usernames and anime ids replaced by uids.

    Reads ``file`` from ``source_dir`` and writes a file with the same name to
    ``outdir``. The first input row is treated as a header and discarded; every
    following non-blank row must have exactly three fields (username, anime id,
    score). The output header is always ``user,item,rating`` and the score
    field is copied through unchanged as text.

    Args:
        file: CSV file name, relative to both ``source_dir`` and ``outdir``.

    Raises:
        KeyError: If a username or anime id is missing from
            ``username_to_uid`` / ``anime_to_uid``.
        ValueError: If a non-blank row does not have exactly three fields, or
            its anime id is not an integer.
    """
    in_path = os.path.join(source_dir, file)
    out_path = os.path.join(outdir, file)
    # Read as UTF-8 (the pandas.read_csv default used to build the id tables)
    # rather than the locale encoding; newline="" is what the csv module
    # expects so that it can handle line endings and quoted fields itself.
    with open(in_path, "r", encoding="utf-8", newline="") as in_file, open(
        out_path, "w", encoding="utf-8"
    ) as out_file:
        # csv.reader un-quotes fields the same way pandas did when the lookup
        # keys were collected, so quoted usernames still match their uid.
        reader = csv.reader(in_file)
        if next(reader, None) is None:
            # Empty input (no header line): leave the output file empty.
            return
        out_file.write("user,item,rating\n")
        for row in tqdm(reader):
            if not row:
                # Blank line; pandas skips these by default as well.
                continue
            username, anime_id, my_score = row
            username_uid = username_to_uid[username]
            anime_uid = anime_to_uid[int(anime_id)]
            out_file.write(f"{username_uid},{anime_uid},{my_score}\n")

In [9]:
# Re-key the explicit list file; the result is written under outdir with the same name.
process("user_explicit_lists.csv")

277779930it [02:03, 2254395.97it/s]


In [10]:
# Re-key the implicit list file; the result is written under outdir with the same name.
process("user_implicit_lists.csv")

113729114it [00:52, 2162321.61it/s]
