# Get Media
* Fetches details and relations for every anime and manga id found in the user lists and stores them in `data/{SOURCE}/media_facts`

In [None]:
import glob
import logging
import os
import random
import sys
import time
import traceback
from hashlib import sha256

import pandas as pd
from filelock import FileLock
from tqdm import tqdm

In [None]:
# Run parameters. NOTE(review): these look like defaults that a notebook
# runner overrides per job -- confirm how they are injected.
# Index of the slice of media ids this run handles and the total number of
# slices; ids are assigned to slices by hash modulo NUM_PARTITIONS.
PARTITION = 0
NUM_PARTITIONS = 1
# Passed to api_setup.load_proxies to pick the proxies used by this run.
PROXY_NUMBER = 0
NUM_PROXIES = 1
# Which site to scrape: "mal", "anilist", "kitsu" or "animeplanet".
# The empty default is rejected by the API-selection cell below.
SOURCE = ""

# Setup

In [None]:
def configure_logging(logfile):
    """Send root-logger output to a size-capped rotating log file.

    Removes every handler currently attached to the root logger, sets the
    root level to INFO and attaches a single ``RotatingFileHandler`` that
    writes to *logfile* (``maxBytes=1000000``, one backup file).

    Args:
        logfile: Path of the log file to write.
    """
    # ``import logging`` does not load the ``logging.handlers`` submodule, so
    # the handler class is imported explicitly instead of relying on some
    # other module having imported it first.
    from logging.handlers import RotatingFileHandler

    logger = logging.getLogger()
    logger.handlers.clear()
    logger.setLevel(logging.INFO)
    formatter = logging.Formatter(
        "%(name)s:%(levelname)s:%(asctime)s: %(message)s", datefmt="%Y-%m-%d %H:%M:%S"
    )
    # NOTE(review): the "w+" mode is kept from the original call; the stdlib
    # handler appears to force append mode whenever maxBytes > 0 -- confirm.
    handler = RotatingFileHandler(logfile, "w+", maxBytes=1000000, backupCount=1)
    handler.setFormatter(formatter)
    logger.addHandler(handler)

In [None]:
# Make the sibling ``API`` package importable from the notebook's directory.
sys.path.append("..")
from API import anilist_api, animeplanet_api, api_setup, kitsu_api, mal_web_api

# Each supported SOURCE maps to (api module, name of the id column used to
# look up media ids in that source's CSV files).
_SOURCE_CONFIG = {
    "mal": (mal_web_api, "uid"),
    "anilist": (anilist_api, "anilistid"),
    "kitsu": (kitsu_api, "kitsuid"),
    "animeplanet": (animeplanet_api, "url"),
}
if SOURCE not in _SOURCE_CONFIG:
    # Raise explicitly instead of ``assert False``: asserts are stripped under
    # ``python -O`` and give no hint about which value was rejected.
    raise ValueError(
        f"Unsupported SOURCE {SOURCE!r}; expected one of {sorted(_SOURCE_CONFIG)}"
    )
api, IDCOL = _SOURCE_CONFIG[SOURCE]
PROXIES = api_setup.load_proxies(PROXY_NUMBER, NUM_PROXIES)
# NOTE(review): the meaning of the second argument (1) is defined by the API
# module's make_session -- confirm there.
SESSION = api.make_session(PROXIES, 1)

In [None]:
# Work from the per-source output directory so that every later relative path
# (log file, CSV outputs, lock file) resolves against it.
data_path = f"../../../data/{SOURCE}/media_facts"
# exist_ok avoids the check-then-create race when several partitions start at
# the same time; makedirs also creates missing parent directories.
os.makedirs(data_path, exist_ok=True)
os.chdir(data_path)

In [None]:
# One log file per partition; the relative path resolves against the current
# working directory, which was switched to data_path above.
configure_logging(f"get_media.{PARTITION}.log")

In [None]:
# One lock file per SOURCE, located two directories above the working
# directory; it is used to decorate merge() so its file updates are serialized.
LOCK_FILE = f"../../{SOURCE}_media.lock"
# NOTE(review): timeout=-1 presumably means "wait for the lock indefinitely" --
# confirm against the filelock documentation.
LOCK = FileLock(LOCK_FILE, timeout=-1)

# Collect data

In [None]:
def get_media_ids(input_fn):
    """Collect the distinct values of the ``IDCOL`` column from a CSV file.

    The first row is treated as the header and is used to locate the column
    named by the module-level ``IDCOL``. Rows are parsed with the stdlib
    ``csv`` reader, so quoted fields (including ones that contain commas) are
    handled correctly instead of being split on every comma.

    Args:
        input_fn: Path of the CSV file to read.

    Returns:
        A set of id strings; empty when the file has no rows at all.

    Raises:
        ValueError: If the header does not contain ``IDCOL``.
        IndexError: If a non-blank row has fewer columns than the header
            position of ``IDCOL``.
    """
    import csv  # function-scope import keeps this block self-contained

    item_ids = set()
    # newline="" lets the csv module do its own line-ending handling.
    with open(input_fn, "r", newline="") as in_file:
        reader = csv.reader(in_file)
        header = next(reader, None)
        if header is None:
            return item_ids
        idx = [column.strip() for column in header].index(IDCOL)
        for row in tqdm(reader):
            if not row:
                # csv yields an empty list for blank lines; nothing to record
                continue
            item_ids.add(row[idx].strip())
    return item_ids

In [None]:
def portable_hash(x):
    """Return a process-independent integer hash of ``str(x)``.

    The value is the SHA-256 digest of the UTF-8 encoded string form of *x*,
    read as one big-endian unsigned integer, so it is identical across runs
    and machines.
    """
    digest = sha256(str(x).encode("utf-8")).digest()
    return int.from_bytes(digest, "big")

In [None]:
def get_media_partition(media):
    """Return the sorted media ids that belong to this run's partition.

    Reads every ``../user_media_facts/user_{media}_list.*.csv`` file, unions
    the ids found by ``get_media_ids`` and keeps those whose
    ``portable_hash`` modulo ``NUM_PARTITIONS`` equals ``PARTITION``.

    If no ids are found, the function logs a message, sleeps 600 seconds and
    scans again; it only returns once the result is non-empty. The retry is a
    loop rather than a recursive call so that a long wait cannot exhaust the
    interpreter's recursion limit.

    Args:
        media: Medium name substituted into the file pattern and log message.

    Returns:
        A non-empty sorted list of ids.
    """
    retry_secs = 600
    while True:
        found = set()
        for file in glob.glob(f"../user_media_facts/user_{media}_list.*.csv"):
            found |= get_media_ids(file)
        media_ids = sorted(
            x for x in found if portable_hash(x) % NUM_PARTITIONS == PARTITION
        )
        if media_ids:
            return media_ids
        logging.info(
            f"Could not find {media} media_ids. Retrying in {retry_secs} seconds"
        )
        time.sleep(retry_secs)

In [None]:
@LOCK
def merge(fn, df, idcol):
    """Merge freshly fetched rows into the CSV file *fn* under the file lock.

    New rows are stamped with the current time in an ``accessed_at`` column.
    Rows already saved in *fn* are kept only when they are at most 30 days old
    and their *idcol* value does not appear in *df*, so fresh data always wins
    and stale data eventually drops out. The result is written to ``fn + "~"``
    and then moved over *fn*.

    Args:
        fn: Path of the CSV file to update.
        df: DataFrame of new rows; it is not modified. Nothing is written when
            it is empty.
        idcol: Column used to decide which saved rows are superseded.
    """
    logging.info(f"Saving {len(df)} entries to {fn}")
    if df.empty:
        return
    ts_col = "accessed_at"
    # Single clock reading for both the stamp and the staleness cutoff.
    now = time.time()
    # assign() returns a new frame, leaving the caller's DataFrame untouched.
    df = df.assign(**{ts_col: int(now)})
    # set column dtypes
    intcols = ["uid", "anilistid", "kitsuid"]
    if SOURCE != "animeplanet":
        intcols += ["source_id", "target_id"]
    for col in intcols:
        if col in df.columns:
            df[col] = df[col].astype(int)
    if os.path.exists(fn):
        # if the api is down, then fallback to the last good state
        saved_state = pd.read_csv(fn, dtype={col: int for col in intcols})
        max_fallback_secs = 30 * 24 * 3600
        saved_state = saved_state.query(f"{ts_col} > {now - max_fallback_secs}")
        saved_state = saved_state.loc[lambda x: ~x[idcol].isin(df[idcol])]
    else:
        saved_state = pd.DataFrame()
    df = pd.concat([df, saved_state])
    df.to_csv(fn + "~", index=False)
    # os.replace overwrites an existing target on every platform, whereas
    # os.rename raises on Windows when fn already exists.
    os.replace(fn + "~", fn)

In [None]:
# Fetch facts for every media id in this partition, one medium at a time, and
# periodically merge the accumulated rows into the per-partition CSV files.
try:
    mediums = ["anime", "manga"]
    for medium in mediums:
        media_ids = get_media_partition(medium)
        # Visit ids in random order on each run.
        random.shuffle(media_ids)
        logging.info(f"Collecting data for {len(media_ids)} {medium} entries")
        details = []
        relations = []
        details_fn = f"{medium}.{PARTITION}.csv"
        relations_fn = f"{medium}_relations.{PARTITION}.csv"
        for i, x in enumerate(tqdm(media_ids)):
            d, r = api.get_media_facts(SESSION, x, medium)
            details.append(d)
            relations.append(r)
            # Flush after the first item, then every 500 items, and always
            # after the last one so nothing is left unsaved.
            if i % 500 == 0 or i == len(media_ids) - 1:
                merge(details_fn, pd.concat(details), IDCOL)
                merge(relations_fn, pd.concat(relations), "source_id")
                details = []
                relations = []
        # The final iteration always flushes, so both buffers must be empty.
        assert not details and not relations
except Exception as e:
    # Record the failure at ERROR level so it stands out from progress logs.
    logging.error(traceback.format_exc(limit=10))
    logging.error(str(e))
    # Keep the process alive indefinitely after a failure instead of exiting.
    # NOTE(review): presumably to stop a supervisor from restarting the job in
    # a tight loop -- confirm.
    while True:
        time.sleep(3600)