# Relabel IDs
* Anonymizes the dataset by assigning a unique id for each user and item
* The ids are consecutive integers starting at 0, assigned in random order
* We drop users and items that have no ratings

In [None]:
import os
import random

import pandas as pd
from tqdm import tqdm

In [None]:
# Directory holding the processed dataset files used below.
outdir = "../../data/processed_data"
# makedirs also creates missing parent directories, and exist_ok=True avoids
# the check-then-create race of a separate os.path.exists() test.
os.makedirs(outdir, exist_ok=True)

In [None]:
# keep_default_na=False stops pandas from turning usernames such as "NA" or
# "null" into NaN. dtype=str on the username column keeps numeric-looking
# usernames (e.g. "007") as the exact strings that appear in the file, so they
# sort together with the other usernames and match the raw text fields that
# are looked up when the file is rewritten.
user_anime_lists = pd.read_csv(
    os.path.join(outdir, "user_anime_list.csv"),
    keep_default_na=False,
    dtype={"username": str},
)

In [None]:
# Give every distinct anime id a new uid: the distinct ids are sorted,
# shuffled, and then numbered by their position (0, 1, 2, ...).
shuffled_items = sorted(set(user_anime_lists["animeid"]))
random.shuffle(shuffled_items)
items = pd.DataFrame(
    {"animeid": shuffled_items, "uid": range(len(shuffled_items))}
)
# Persist the mapping so the original ids can be recovered later.
items.to_csv(os.path.join(outdir, "anime_to_uid.csv"), index=False)

In [None]:
# Give every distinct username a new uid: the distinct names are sorted,
# shuffled, and then numbered by their position (0, 1, 2, ...).
shuffled_usernames = sorted(set(user_anime_lists["username"]))
random.shuffle(shuffled_usernames)
usernames = pd.DataFrame(
    {"username": shuffled_usernames, "uid": range(len(shuffled_usernames))}
)
# Persist the mapping so the original usernames can be recovered later.
usernames.to_csv(os.path.join(outdir, "username_to_uid.csv"), index=False)

In [None]:
# Lookup tables from each original identifier to its anonymized uid.
anime_to_uid = dict(zip(items["animeid"], items["uid"]))
username_to_uid = dict(zip(usernames["username"], usernames["uid"]))

In [None]:
def process(file):
    """Rewrite a CSV in ``outdir`` in place with anonymized user and item ids.

    The header line is copied verbatim. On every other line the first field is
    replaced via ``username_to_uid`` and the second field (parsed as an int)
    via ``anime_to_uid``; the remaining fields are passed through unchanged.

    Output goes to ``<file>~`` first and is moved over the original only after
    every line has been converted, so the original is left untouched if a
    lookup fails part-way through.

    NOTE: lines are split on "," directly, so this assumes no field contains a
    quoted comma.

    Args:
        file: Name of the CSV file, relative to ``outdir``.

    Raises:
        KeyError: If a username or anime id is missing from the lookup tables.
        ValueError: If the second field of a data line is not an integer.
    """
    file = os.path.join(outdir, file)
    tmp_file = file + "~"
    # Explicit UTF-8 (the pandas.read_csv default) keeps the rewrite
    # independent of the platform's locale encoding.
    with open(file, "r", encoding="utf-8") as in_file, open(
        tmp_file, "w", encoding="utf-8"
    ) as out_file:
        # Header passes through unchanged; readline() returns "" for an empty
        # file, in which case nothing is written.
        out_file.write(in_file.readline())
        for line in tqdm(in_file):
            fields = line.strip().split(",")
            fields[0] = str(username_to_uid[fields[0]])
            fields[1] = str(anime_to_uid[int(fields[1])])
            out_file.write(",".join(fields) + "\n")
    # os.replace overwrites an existing destination on every platform, whereas
    # os.rename raises on Windows when the target already exists.
    os.replace(tmp_file, file)

In [None]:
# Rewrite the ratings file in place, replacing usernames and anime ids with
# their uids.
process("user_anime_list.csv")