In [None]:
import datetime
import glob
import os
from functools import cache

import pandas as pd
from filelock import FileLock
from tqdm import tqdm
from transformers import pipeline

## Mappings

In [None]:
# Column names, in order, of every CSV row written by `process`; `process_line`
# looks each of these names up in the parsed fields and joins them in this order.
OUTPUT_HEADER = [
    "source", "medium", "userid", "mediaid",
    "status", "rating",
    "updated_at", "created_at", "started_at", "finished_at",
    "update_order", "progress", "repeat_count", "priority",
    "sentiment", "sentiment_score",
    "owned",
]

In [None]:
# Integer code assigned to each source name.
SOURCE_MAP = {
    "mal": 0,
    "anilist": 1,
    "kitsu": 2,
    "animeplanet": 3,
}

In [None]:
# Integer code assigned to each list status name, listed from highest to lowest code.
STATUS_MAP = dict(
    rewatching=7,
    completed=6,
    currently_watching=5,
    planned=4,
    on_hold=3,
    dropped=2,
    wont_watch=1,
    none=0,
)

In [None]:
# Integer code assigned to each medium name.
MEDIUM_MAP = {
    "manga": 0,
    "anime": 1,
}

In [None]:
def parse_int(x, map={}, allow_neg=False):
    """Translate `x` through `map` if it is a key there, otherwise parse it as an int.

    Args:
        x: value to convert; anything accepted by `int()` unless it is a key of `map`.
        map: lookup table consulted first; a hit is returned as-is, without any
            sign check. The default is an empty table that is never modified here.
        allow_neg: when false, a parsed negative value fails an assertion.

    Returns:
        `map[x]` when `x` is a key of `map`, else `int(x)`.

    Raises:
        ValueError / TypeError: propagated from `int(x)`.
        AssertionError: if the parsed value is negative and `allow_neg` is false.
    """
    if x not in map:
        value = int(x)
        assert allow_neg or value >= 0
        return value
    return map[x]

In [None]:
def filter_negative_ts(x):
    """Return "0" for any timestamp string containing a '-', else the string unchanged."""
    # The API sometimes returns a negative timestamp. TODO fix upstream
    return "0" if "-" in x else x

In [None]:
@cache
def get_media_progress(medium):
    """Load the per-item totals for `medium` from `<MEDIA_DIR>/<medium>.csv`.

    Results are memoized per `medium` by `functools.cache`, so each CSV is read
    at most once per process.

    Returns:
        For "anime": {"episodes": {anime_id: num_episodes}}.
        For "manga": {"volumes": {manga_id: num_volumes},
                      "chapters": {manga_id: num_chapters}}.

    Raises:
        AssertionError: for any other medium (after its CSV was read successfully).
    """
    table = pd.read_csv(os.path.join(MEDIA_DIR, f"{medium}.csv"))
    id_column = f"{medium}_id"

    def totals(column):
        # Dict from media id to the value stored in `column`.
        return table.set_index(id_column)[column].to_dict()

    if medium == "anime":
        return {"episodes": totals("num_episodes")}
    if medium == "manga":
        return {
            "volumes": totals("num_volumes"),
            "chapters": totals("num_chapters"),
        }
    assert False


def get_completion(x, xmax):
    """Return `x / xmax` capped at 1.0, or 0.0 when `xmax` equals 0."""
    if xmax != 0:
        return min(1.0, x / xmax)
    return 0.0


def get_progress(medium, uid, progress, progress_volumes):
    """Return the completion fraction of media item `uid`.

    Totals come from `get_media_progress(medium)`; an id missing from a table is
    treated as having a total of 0, which `get_completion` maps to 0.0.

    Args:
        medium: "anime" or "manga".
        uid: media id used as the key into the totals tables.
        progress: episodes (anime) or chapters (manga) consumed.
        progress_volumes: volumes consumed; only used for manga.

    Returns:
        For anime, the episode completion; for manga, the larger of the chapter
        and volume completions.
    """
    totals = get_media_progress(medium)
    if medium == "manga":
        by_chapter = get_completion(progress, totals["chapters"].get(uid, 0))
        by_volume = get_completion(progress_volumes, totals["volumes"].get(uid, 0))
        return max(by_chapter, by_volume)
    if medium == "anime":
        return get_completion(progress, totals["episodes"].get(uid, 0))

In [None]:
# Integer code assigned to each sentiment label; "none" is the label that
# `compute_sentiments` assigns to the empty string.
SENTIMENT_MAP = dict(
    positive=3,
    neutral=2,
    negative=1,
    none=0,
)


def compute_sentiments(texts):
    """Run sentiment analysis over `texts` and return the results keyed by text.

    Args:
        texts: collection of strings. It is not modified: a length-sorted copy is
            fed to the model, so a set (such as the "texts" entry returned by
            `preprocess`) works as well as a list.

    Returns:
        dict mapping each text to {"sentiment": <model label>, "score": <model score>}.
        When `texts` is non-empty the empty string is always mapped to
        {"sentiment": "none", "score": 0}, overriding any model output for it.
        When `texts` is empty an empty dict is returned and no model is loaded.
    """
    sentiments = {}
    if not texts:
        return sentiments
    # Sort a copy rather than calling texts.sort(): that would reorder the caller's
    # list in place and fails outright on sets. Ordering by length is presumably
    # meant to keep each batch's inputs similar in size.
    ordered = sorted(texts, key=len)
    logger.info(f"Performing sentiment analysis on {len(ordered)} texts")
    # Hold the "gpu.lock" file lock while the model is loaded and run.
    lock = FileLock("gpu.lock")
    with lock:
        # TODO finetune and calibrate this model on domain data
        modelname = "cardiffnlp/twitter-xlm-roberta-base-sentiment"
        model = pipeline(model=modelname, truncation=True, device="cuda")
        model.tokenizer.model_max_length = 512
        outputs = model(ordered, batch_size=16)
    for text, output in zip(ordered, outputs):
        sentiments[text] = {
            "sentiment": output["label"],
            "score": output["score"],
        }
    # The empty text never carries a sentiment, whatever the model returned for it.
    sentiments[""] = {
        "sentiment": "none",
        "score": 0,
    }
    return sentiments

In [None]:
def process_score(score):
    """Parse `score` as a float and return it when it lies in [0, 10].

    Anything outside that range (NaN included, since it fails both comparisons)
    is logged as a warning and replaced with 0.

    Raises:
        ValueError / TypeError: propagated from `float(score)`.
    """
    score = float(score)
    if 0 <= score <= 10:
        return score
    logger.warning(f"invalid score {score}, replacing with 0")
    return 0

## Source parsing

In [None]:
def preprocess(input_fn, header_fields, text_fields):
    """Sanitize the CSV at `input_fn` in place and collect its free-text values.

    The first line is replaced by the canonical header when it does not match it
    exactly. Every later line whose comma-separated field count differs from
    `len(header_fields)` is dropped with a warning. The cleaned content is written
    to `input_fn + "~"`, which then replaces the original file.

    Args:
        input_fn: path of the CSV file to sanitize; it is rewritten in place.
        header_fields: expected column names, in order.
        text_fields: names from `header_fields` whose values should be collected.

    Returns:
        dict with "lines" (number of lines kept, header included) and "texts"
        (set of the distinct values found in the `text_fields` columns).

    Raises:
        ValueError: if an entry of `text_fields` is not in `header_fields`.
    """
    logger.info(f"Sanitizing entries in {input_fn}")
    total_lines = 0
    total_texts = set()
    correct_header = ",".join(header_fields) + "\n"
    # Resolve the text columns once instead of once per input line.
    text_columns = [header_fields.index(tf) for tf in text_fields]

    output_fn = input_fn + "~"
    with open(input_fn, "r") as in_file, open(output_fn, "w") as out_file:
        for lineno, line in enumerate(tqdm(in_file)):
            if lineno == 0:
                if line != correct_header:
                    logger.warning(
                        f"Replacing malformed header line {line.strip()} "
                        f"with correct header {correct_header.strip()}"
                    )
                    line = correct_header
                out_file.write(line)
                total_lines += 1
                continue
            # Plain comma split: a value that itself contains a comma makes the
            # line look like it has extra fields, so such lines are dropped too.
            fields = line.strip().split(",")
            if len(fields) != len(header_fields):
                logger.warning(
                    f"Deleting malformed line in {input_fn}: {line.strip()}"
                )
                continue
            total_texts.update(fields[column] for column in text_columns)
            out_file.write(line)
            total_lines += 1
    # Swap the files only after both handles are closed; replacing a file that is
    # still open fails on Windows.
    os.replace(output_fn, input_fn)
    return {
        "lines": total_lines,
        "texts": total_texts,
    }

In [None]:
def process_line(line, metadata):
    """Convert one input CSV line into one output CSV line.

    `parse_fields(line, metadata)` must return a mapping with exactly one entry
    per name in `OUTPUT_HEADER`; the values are stringified and joined with
    commas in `OUTPUT_HEADER` order.

    Raises:
        Exception: whatever `parse_fields` raised, after printing the offending line.
        AssertionError: if the number of parsed fields differs from the header length.
    """
    try:
        parsed = parse_fields(line, metadata)
    except Exception as err:
        print(f"Error: could not parse {line}")
        raise err
    assert len(parsed) == len(OUTPUT_HEADER)
    columns = [str(parsed[name]) for name in OUTPUT_HEADER]
    return ",".join(columns)

In [None]:
def process(infile, outfile, metadata):
    """Rewrite `infile` into `outfile` in the `OUTPUT_HEADER` format.

    The first input line is treated as a header and replaced by `OUTPUT_HEADER`;
    every other line is stripped and converted by `process_line`.

    Args:
        infile: path of the CSV to read.
        outfile: path of the CSV to write (overwritten).
        metadata: passed through to `process_line`; its "lines" entry is used as
            the progress-bar total.
    """
    logger.info(f"processing entries in {infile}")
    with open(infile, "r") as in_file, open(outfile, "w") as out_file:
        rows = tqdm(in_file, total=metadata["lines"])
        for index, line in enumerate(rows):
            if index == 0:
                out_file.write(",".join(OUTPUT_HEADER) + "\n")
            else:
                out_file.write(process_line(line.strip(), metadata) + "\n")

In [None]:
def import_notebook(nb):
    """Execute the notebook at path `nb` with IPython's `%run`, from its own directory.

    The working directory is switched to the directory part of `nb` for the
    duration of the run and restored afterwards, even if the run raises.
    Because of the `%run` line magic this only works under IPython/Jupyter.
    """
    cwd = os.getcwd()
    try:
        # NOTE(review): os.path.dirname returns "" for a bare filename and
        # os.chdir("") raises, so `nb` needs a directory part (e.g. "./X.ipynb").
        os.chdir(os.path.dirname(nb))
        script = os.path.basename(nb)
        # `$script` is expanded by IPython to the value of the local variable above.
        %run $script
    finally:
        # Always return to the directory we started in.
        os.chdir(cwd)

In [None]:
# Run the notebook named after SOURCE with its first letter capitalized
# (e.g. "./Mal.ipynb" if SOURCE is "mal"), located in the current directory.
# NOTE(review): presumably that notebook defines `parse_fields`, which
# `process_line` calls — confirm against the per-source notebooks.
import_notebook(f"./{SOURCE.capitalize()}.ipynb")