# Get Media
* Fetches details and relations for manga and anime entries and stores them in `data/mal/media_facts`

In [None]:
import csv
import glob
import logging
import os
import time
from hashlib import sha256

import pandas as pd
from tqdm import tqdm

In [None]:
# Index of the partition handled by this run; get_media_partition keeps only the
# ids whose hash modulo NUM_PARTITIONS equals this value. It is also embedded in
# the log and output file names.
PARTITION = 0
# Total number of partitions the media ids are divided into.
NUM_PARTITIONS = 1
# NOTE(review): PROXY_NUMBER and TOKEN_NUMBER are not referenced in the visible
# cells; presumably they are read by the MalApi notebook — confirm.
PROXY_NUMBER = 0
TOKEN_NUMBER = 0

# Setup

In [None]:
def import_notebook(nb):
    cwd = os.getcwd()
    try:
        os.chdir(os.path.dirname(nb))
        script = os.path.basename(nb)
        %run $script
    finally:
        os.chdir(cwd)

In [None]:
# Run the MalApi notebook. NOTE(review): get_media_details and
# get_media_relations used further down are not defined in this notebook, so
# they presumably come from here — confirm.
import_notebook("../API/MalApi.ipynb")

In [None]:
# All output files and logs below are written relative to this directory, and
# the input lists are read from its sibling `../user_media_facts`.
data_path = "../../../data/mal/media_facts"
# exist_ok avoids the check-then-create race when several partition runs start
# at the same time; makedirs also creates missing parent directories.
os.makedirs(data_path, exist_ok=True)
os.chdir(data_path)

In [None]:
# Log everything (DEBUG and above) to a per-partition file in the current
# directory; the file is truncated each time this cell runs.
name = "get_media_relations"
logger = logging.getLogger(name)
logger.setLevel(logging.DEBUG)
formatter = logging.Formatter(
    fmt="%(name)s:%(levelname)s:%(asctime)s: %(message)s",
    datefmt="%Y-%m-%d %H:%M:%S",
)
stream = logging.FileHandler(f"{name}.{PARTITION}.log", mode="w+")
stream.setFormatter(formatter)
logger.addHandler(stream)

# Collect data

In [None]:
def get_media_ids(input_fn):
    """Return the set of integer ids found in the `uid` column of a CSV file.

    Args:
        input_fn: Path to a comma-separated file whose first row is a header
            containing a `uid` column.

    Returns:
        A set with every `uid` value converted to int. The set is empty when
        the file contains no rows at all.

    Raises:
        ValueError: If the header has no `uid` column or a value is not an
            integer.
        IndexError: If a data row has fewer fields than the `uid` position.
    """
    # newline="" lets the csv module handle line endings itself, so quoted
    # fields containing commas or newlines are parsed correctly.
    with open(input_fn, "r", newline="") as in_file:
        reader = csv.reader(in_file)
        header = next(reader, None)
        if header is None:
            return set()
        idx = [column.strip() for column in header].index("uid")
        # Blank lines parse to an empty row; skip them instead of crashing.
        return {int(row[idx]) for row in tqdm(reader) if row}

In [None]:
def portable_hash(x):
    """Return the SHA-256 digest of `str(x)` (UTF-8 encoded) as an integer.

    The result depends only on `str(x)`, so it is identical in every process
    and on every run.
    """
    digest = sha256(str(x).encode("utf-8")).digest()
    # Big-endian interpretation matches parsing the hex digest in base 16.
    return int.from_bytes(digest, "big")

In [None]:
def get_media_partition(media):
    """Return the sorted ids of `media` entries assigned to this partition.

    Ids are gathered from every `../user_media_facts/user_{media}_list.*.csv`
    file and kept when `portable_hash(id) % NUM_PARTITIONS == PARTITION`.
    If no id is found, the function waits and scans again until at least one
    id is available, so it never returns an empty list.

    Args:
        media: Medium name used in the input file pattern (the caller passes
            "manga" and "anime").

    Returns:
        A non-empty, ascending list of integer ids.
    """
    timeout = 600
    # Loop rather than recurse: the wait can last arbitrarily long, and
    # recursion would eventually exceed the interpreter's recursion limit.
    while True:
        media_ids = set()
        for file in glob.glob(f"../user_media_facts/user_{media}_list.*.csv"):
            media_ids |= get_media_ids(file)
        partition_ids = sorted(
            x for x in media_ids if portable_hash(x) % NUM_PARTITIONS == PARTITION
        )
        if partition_ids:
            return partition_ids
        logger.info(f"Could not find {media} media_ids. Retrying in {timeout} seconds")
        # Actually wait for the interval announced in the log message.
        time.sleep(timeout)

In [None]:
# Repeatedly fetch details and relations for every manga and anime id in this
# partition and write each result set to `{output_name}.{PARTITION}.csv`.
# The outer loop has no exit condition; it stops only when an exception occurs.
try:
    while True:
        for medium in ["manga", "anime"]:
            media_ids = get_media_partition(medium)
            logger.info(f"Collecting data for {len(media_ids)} {medium} entries")
            # A separate loop variable keeps the module-level `name` (the
            # logger name) from being overwritten.
            for output_name, fn in zip(
                [medium, f"{medium}_relations"],
                [get_media_details, get_media_relations],
            ):
                frames = [fn(x, medium) for x in tqdm(media_ids)]
                # A plain list has no `to_csv`; combine the per-id results first.
                # NOTE(review): assumes each call returns a pandas DataFrame —
                # confirm against the MalApi notebook.
                data = pd.concat(frames, ignore_index=True)
                data.to_csv(f"{output_name}.{PARTITION}.csv", index=False)
                logger.info(f"Wrote data for {len(data)} {output_name} entries")
except Exception as e:
    # Record the traceback at ERROR level so a failed run is diagnosable.
    logger.exception(str(e))