# Get Media
* We store metadata in `data/mal/media_facts`

In [None]:
import glob
import logging
import os
import random
import time
import traceback
from hashlib import sha256

from filelock import FileLock
from tqdm import tqdm

In [None]:
# Index of the partition handled by this run: ids are kept when
# `portable_hash(id) % NUM_PARTITIONS == PARTITION`, and the value is also
# part of the log and output file names.
PARTITION = 0
# Total number of partitions the media ids are split into.
NUM_PARTITIONS = 1
# Not referenced in the visible cells; presumably read by the imported
# MalWebApi notebook to pick a proxy -- TODO confirm.
PROXY_NUMBER = 0

# Setup

In [None]:
def import_notebook(nb):
    cwd = os.getcwd()
    try:
        os.chdir(os.path.dirname(nb))
        script = os.path.basename(nb)
        %run $script
    finally:
        os.chdir(cwd)

In [None]:
# Run the MAL web API notebook. It presumably provides `get_media_facts` and
# `pd`, which are used below but not defined in the visible cells -- TODO confirm.
import_notebook("../API/MalWebApi.ipynb")

In [None]:
# Output directory of this notebook. The working directory is switched to it,
# so every relative path used afterwards is resolved from here.
data_path = "../../../data/mal/media_facts"
# `exist_ok=True` removes the check-then-create race when several runs start
# at the same time, and `makedirs` also creates missing parent folders.
os.makedirs(data_path, exist_ok=True)
os.chdir(data_path)

In [None]:
# Logger of this notebook; records go to a per-partition log file.
name = "get_media"
logger = logging.getLogger(name)
logger.setLevel(logging.DEBUG)
formatter = logging.Formatter(
    "%(name)s:%(levelname)s:%(asctime)s: %(message)s", datefmt="%Y-%m-%d %H:%M:%S"
)
# `logging.getLogger` hands back the same logger object for the same name, so
# re-running this cell would otherwise stack one more handler on it and write
# every record several times. Detach and close handlers left by earlier runs.
for stale in list(logger.handlers):
    logger.removeHandler(stale)
    stale.close()
for stream in [
    # "w+" truncates the log file each time the handler is created.
    logging.FileHandler(f"{name}.{PARTITION}.log", "w+"),
]:
    stream.setFormatter(formatter)
    logger.addHandler(stream)

In [None]:
# Path of the lock file behind `LOCK`; it is relative, so it is resolved from
# the current working directory.
LOCK_FILE = "../../mal_media.lock"
# timeout=-1: wait for the lock without a time limit.
LOCK = FileLock(LOCK_FILE, timeout=-1)

# Collect data

In [None]:
def get_media_ids(input_fn):
    """Collect the distinct integer ids stored in the `uid` column of a CSV file.

    Args:
        input_fn: Path to a CSV file whose header row contains a `uid` column.

    Returns:
        A set with every `uid` value converted to `int`; empty when the file
        has no rows.

    Raises:
        ValueError: If the header has no `uid` column or a value is not an
            integer.
    """
    import csv  # local import so the file-level import cell stays untouched

    item_ids = set()
    # newline="" lets the csv module deal with line endings itself.
    with open(input_fn, "r", newline="") as in_file:
        # csv.reader understands quoted fields, so a comma inside a quoted
        # value no longer shifts the column positions the way a plain
        # `split(",")` does.
        reader = csv.reader(in_file)
        header = next(reader, None)
        if header is None:
            # Empty file: nothing to collect.
            return item_ids
        idx = [column.strip() for column in header].index("uid")
        for row in tqdm(reader):
            if not row:
                # Blank line: carries no id.
                continue
            item_ids.add(int(row[idx]))
    return item_ids

In [None]:
def portable_hash(x):
    """Return a deterministic integer hash of `x`.

    The value is the SHA-256 digest of `str(x)` (UTF-8 encoded) read as a
    big-endian unsigned integer, so it is the same in every process.
    """
    digest = sha256(str(x).encode("utf-8")).digest()
    return int.from_bytes(digest, byteorder="big")

In [None]:
def get_media_partition(media):
    """Return the sorted media ids that belong to this run's partition.

    Ids are gathered from every `../user_media_facts/user_{media}_list.*.csv`
    file and kept when `portable_hash(id) % NUM_PARTITIONS == PARTITION`.
    When no id is found the function logs a message, sleeps and scans again,
    so it only ever returns a non-empty list.

    Args:
        media: Medium name inserted into the input file pattern.

    Returns:
        A non-empty, sorted list of integer ids.
    """
    timeout = 600
    # Retry in a loop instead of recursing: every recursive retry added a
    # stack frame, so a long enough wait ended in RecursionError.
    while True:
        media_ids = set()
        for file in glob.glob(f"../user_media_facts/user_{media}_list.*.csv"):
            media_ids |= get_media_ids(file)
        media_ids = sorted(
            x for x in media_ids if portable_hash(x) % NUM_PARTITIONS == PARTITION
        )
        if media_ids:
            return media_ids
        logger.info(f"Could not find {media} media_ids. Retrying in {timeout} seconds")
        time.sleep(timeout)

In [None]:
@LOCK
def merge(fn, df, idcol):
    """Merge freshly fetched rows into the CSV file `fn` while holding `LOCK`.

    Every row of `df` is stamped with the current time in an `accessed_at`
    column. Rows already stored in `fn` are kept only when their
    `accessed_at` is less than 30 days old. New and stored rows are then
    grouped by `idcol`, with the new rows taking precedence. Nothing is
    written when `df` is empty.

    Args:
        fn: Path of the CSV file to update.
        df: DataFrame holding the new rows; it is not modified.
        idcol: Name of the column that identifies a row.
    """
    logger.info(f"Saving {len(df)} entries to {fn}")
    if df.empty:
        return
    ts_col = "accessed_at"
    # Take the time once so the stamp and the cutoff below agree.
    now = time.time()
    # `assign` returns a new frame, so the caller's DataFrame stays untouched.
    df = df.assign(**{ts_col: int(now)})
    frames = [df]
    if os.path.exists(fn):
        # if the api is down, then fallback to the last good state
        saved_state = pd.read_csv(fn)
        max_fallback_secs = 30 * 24 * 3600
        saved_state = saved_state[saved_state[ts_col] > now - max_fallback_secs]
        frames.append(saved_state)
    df = pd.concat(frames)
    # New rows come first, so `first()` prefers them and, per column, falls
    # back to the stored value only where the new one is missing.
    # NOTE(review): this leaves a single row per `idcol` value. The relations
    # file is merged with idcol="source_id", so several relations of the same
    # source would collapse into one row -- confirm that is intended.
    df = df.groupby(idcol).first().reset_index()
    for x in ["source_id", "target_id", idcol]:
        if x in df.columns:
            df[x] = df[x].astype(int)
    # Write to a temporary file and swap it in. `os.replace` overwrites an
    # existing destination on every platform, whereas `os.rename` fails on
    # Windows when `fn` already exists.
    df.to_csv(fn + "~", index=False)
    os.replace(fn + "~", fn)

In [None]:
# Main collection loop: keeps cycling over both mediums until an error stops it.
try:
    mediums = ["anime", "manga"]
    random.shuffle(mediums)
    while True:
        for medium in mediums:
            media_ids = get_media_partition(medium)
            random.shuffle(media_ids)
            logger.info(f"Collecting data for {len(media_ids)} {medium} entries")
            details = []
            relations = []
            details_fn = f"{medium}.{PARTITION}.csv"
            relations_fn = f"{medium}_relations.{PARTITION}.csv"
            last_index = len(media_ids) - 1
            for i, x in enumerate(tqdm(media_ids)):
                # `get_media_facts` is not defined in the visible cells;
                # presumably it comes from the imported API notebook and
                # returns a (details, relations) pair of DataFrames -- TODO confirm.
                d, r = get_media_facts(x, medium)
                details.append(d)
                relations.append(r)
                # Flush every 500 ids and after the last one. The condition is
                # also true for i == 0, so the first flush holds a single entry.
                if i % 500 == 0 or i == last_index:
                    merge(details_fn, pd.concat(details), f"{medium}_id")
                    merge(relations_fn, pd.concat(relations), "source_id")
                    details = []
                    relations = []
            # Sanity check: the final iteration must have flushed everything.
            assert not details and not relations
except Exception:
    # Still best-effort (no re-raise), but recorded once at ERROR level;
    # `logger.exception` appends the traceback and the exception message.
    logger.exception("Media collection stopped by an unhandled error")