# Combine AnimeLists
* Merges data from MAL and AniList and deduplicates users

In [None]:
import datetime
import logging
import os

import pandas as pd
from tqdm import tqdm

In [None]:
# Directory holding the per-source input CSVs (user_anime_list.<source>.csv);
# the combined user_anime_list.csv is written to the same directory.
outdir = "../../data/raw_data"

In [None]:
# logging: a named logger at DEBUG level whose only handler is a
# StreamHandler using a "name:level:timestamp: message" layout
formatter = logging.Formatter(
    fmt="%(name)s:%(levelname)s:%(asctime)s: %(message)s",
    datefmt="%Y-%m-%d %H:%M:%S",
)

stream = logging.StreamHandler()
stream.setFormatter(formatter)

logger = logging.getLogger("CombineAnimeLists")
logger.setLevel(logging.DEBUG)
logger.addHandler(stream)

In [None]:
def get_user_to_timestamp(source):
    """Find the greatest timestamp recorded for each user in one source.

    Reads ``user_anime_list.<source>.csv`` from ``outdir``, skips the first
    line as a header, and splits every other line on commas. Column 0 is
    lowercased and used as the username; column 3 is used as the timestamp.

    Args:
        source: Source name used to build the input file name.

    Returns:
        dict mapping lowercased username to the maximum timestamp seen for
        that user. Timestamps are kept as the raw strings read from the
        file, so the maximum is taken by string comparison.
    """
    path = os.path.join(outdir, f"user_anime_list.{source}.csv")
    latest = {}
    with open(path, "r") as in_file:
        for line_number, row in enumerate(tqdm(in_file)):
            if line_number == 0:
                # the first line is the CSV header, not a record
                continue
            columns = row.strip().split(",")
            user = columns[0].lower()
            stamp = columns[3]
            previous = latest.get(user)
            latest[user] = stamp if previous is None else max(previous, stamp)
    return latest

In [None]:
def get_user_to_source(sources):
    """Associate each user with their most recently updated source.

    Args:
        sources: Sequence of source names. Each one is passed to
            ``get_user_to_timestamp`` to load that source's per-user
            timestamps.

    Returns:
        dict mapping username to the name of the source whose timestamp for
        that user is greatest. On a tie the source listed earlier in
        ``sources`` is kept, because a later source only replaces an earlier
        one when its timestamp is strictly greater.
    """
    timestamps = [get_user_to_timestamp(x) for x in sources]
    # username -> (greatest timestamp seen so far, source it came from)
    user_to_source_ts = {}
    for source, ts in zip(sources, timestamps):
        for u in tqdm(ts):
            best = user_to_source_ts.get(u)
            if best is None:
                user_to_source_ts[u] = (ts[u], source)
            elif ts[u] > best[0]:
                # Compare against the stored timestamp (index 0 of the
                # tuple), not the stored source name (index 1).
                print("OVERWRITING", u, source)
                user_to_source_ts[u] = (ts[u], source)
    return {u: source for u, (_, source) in user_to_source_ts.items()}

In [None]:
def combine_sources(sources):
    """Write one combined list file holding each user's preferred source.

    Every ``user_anime_list.<source>.csv`` in ``outdir`` is streamed into
    ``user_anime_list.csv`` in the same directory. The header line is copied
    from the first source only. A data line is copied only when the source
    being read is the one ``get_user_to_source`` selected for that line's
    user (column 0, lowercased).

    Args:
        sources: Sequence of source names, in the order they are written.
    """
    preferred = get_user_to_source(sources)
    destination = os.path.join(outdir, "user_anime_list.csv")
    with open(destination, "w") as out_file:
        for position, source in enumerate(sources):
            path = os.path.join(outdir, f"user_anime_list.{source}.csv")
            with open(path, "r") as in_file:
                for line_number, row in enumerate(tqdm(in_file)):
                    if line_number == 0:
                        # keep a single header: the one from the first source
                        if position == 0:
                            out_file.write(row)
                        continue
                    user = row.strip().split(",")[0].lower()
                    if preferred[user] == source:
                        out_file.write(row)

In [None]:
# Build the combined user_anime_list.csv from the MAL and AniList files;
# MAL is listed first, so the output header is copied from the MAL file.
combine_sources(["MAL", "AniList"])