# Relabel IDs
* Anonymizes the dataset by assigning a unique id for each user and item
* The ids are consecutive integers starting at 0, assigned in random order
* We drop users and items that have no ratings

In [None]:
import os
import random

import pandas as pd
from tqdm import tqdm

In [None]:
# Directory holding the processed CSV files; the cells below read their inputs
# from it and write their outputs to it.
outdir = "../../data/processed_data"
# makedirs (rather than mkdir) also creates any missing parent directories, and
# exist_ok=True removes the check-then-create race of a separate exists() test.
os.makedirs(outdir, exist_ok=True)

In [None]:
def get_unique_values(media, col):
    """Collect the distinct values of one column of a user list file.

    Reads ``user_{media}_list.csv`` from ``outdir``. The first line is treated
    as the header and is used to locate ``col``; every following line is split
    on commas and its ``col`` field is recorded.

    Returns a list of the distinct values, in no particular order.

    Raises ValueError if ``col`` is not in the header, and IndexError if a
    data line has too few fields.
    """
    path = os.path.join(outdir, f"user_{media}_list.csv")
    seen = set()
    col_idx = None
    with open(path, "r") as handle:
        for row_num, raw in enumerate(tqdm(handle)):
            cells = raw.strip().split(",")
            if row_num == 0:
                # Header row: only used to find the column position.
                col_idx = cells.index(col)
            else:
                seen.add(cells[col_idx])
    return list(seen)

In [None]:
def shuffle_media_ids(media):
    """Assign a random uid to every item id found in the ``media`` list file.

    The distinct ``{media}id`` values are shuffled and numbered by their
    position after the shuffle (0, 1, 2, ...). The resulting table is saved to
    ``{media}_to_uid.csv`` in ``outdir``.

    Returns a dict mapping each original item id (as read from the file, i.e.
    a string) to its uid.
    """
    id_col = f"{media}id"
    raw_ids = get_unique_values(media, id_col)
    random.shuffle(raw_ids)

    table = pd.DataFrame({id_col: raw_ids})
    # The default index is the row position, which serves as the new id.
    table["uid"] = table.index
    mapping_path = os.path.join(outdir, f"{media}_to_uid.csv")
    table.to_csv(mapping_path, index=False)

    # Row position == uid, so enumerating the shuffled list gives the mapping.
    return {raw_id: uid for uid, raw_id in enumerate(raw_ids)}

In [None]:
def shuffle_usernames():
    """Assign a random uid to every username found in either list file.

    Usernames from the anime and manga list files are merged, shuffled, and
    numbered by their position after the shuffle (0, 1, 2, ...). The resulting
    table is saved to ``username_to_uid.csv`` in ``outdir``.

    Returns a dict mapping each username to its uid.
    """
    anime_users = set(get_unique_values("anime", "username"))
    manga_users = set(get_unique_values("manga", "username"))
    # A user may appear in both files; the union keeps a single entry.
    names = list(anime_users | manga_users)
    random.shuffle(names)

    table = pd.DataFrame({"username": names})
    # The default index is the row position, which serves as the new id.
    table["uid"] = table.index
    mapping_path = os.path.join(outdir, "username_to_uid.csv")
    table.to_csv(mapping_path, index=False)

    # Row position == uid, so enumerating the shuffled list gives the mapping.
    return {name: uid for uid, name in enumerate(names)}

In [None]:
def process(file, media, username_to_uid, media_to_uid):
    """Rewrite a list file in place with usernames and item ids replaced by uids.

    Args:
        file: Name of the CSV file inside ``outdir``.
        media: Media type; the item column is named ``{media}id``.
        username_to_uid: Mapping from username to its uid.
        media_to_uid: Mapping from original item id (string) to its uid.

    The header line is copied unchanged. Every other line is split on commas,
    its ``username`` and ``{media}id`` fields are replaced, and the line is
    written back. Output goes to a temporary ``<file>~`` first and replaces the
    original only after every line has been written.

    Raises ValueError if a required column is missing from the header, and
    KeyError if a username or item id is not present in its mapping.
    """
    path = os.path.join(outdir, file)
    tmp_path = path + "~"
    with open(path, "r") as in_file, open(tmp_path, "w") as out_file:
        header = next(in_file, None)
        if header is not None:
            # Parse the header once to locate both columns.
            columns = header.strip().split(",")
            user_col = columns.index("username")
            item_col = columns.index(f"{media}id")
            out_file.write(header)
        for line in tqdm(in_file):
            fields = line.strip().split(",")
            fields[user_col] = str(username_to_uid[fields[user_col]])
            fields[item_col] = str(media_to_uid[fields[item_col]])
            out_file.write(",".join(fields) + "\n")
    # os.replace overwrites an existing destination on every platform, whereas
    # os.rename raises on Windows when the destination already exists.
    os.replace(tmp_path, path)

In [None]:
# Build the username mapping first: shuffle_usernames() reads the raw usernames
# from both list files, so it has to run before process() overwrites any of them.
# NOTE: process() replaces the list files in place, so re-running this cell would
# relabel the already-relabelled ids and overwrite the saved mapping files.
username_to_uid = shuffle_usernames()
for media in ["anime", "manga"]:
    # Each media type gets its own item-id mapping, built from its own list file.
    media_to_uid = shuffle_media_ids(media)
    # Rewrite the list file with usernames and item ids replaced by their uids.
    process(f"user_{media}_list.csv", media, username_to_uid, media_to_uid)