# Process Data
* Various data cleaning tasks

In [None]:
import datetime
import glob
import os
import shutil
import subprocess
import time

import numpy as np
import papermill as pm

In [None]:
# Directory under which run_notebook places the executed copy of each notebook.
data_path = "../../data/papermill/process_data"
# exist_ok=True already makes this a no-op when the directory is present, so a
# separate (race-prone) os.path.exists check is unnecessary.
os.makedirs(data_path, exist_ok=True)

# Helpers for spawning notebooks

In [None]:
def run_background_notebook(input, output, parameters):
    """Launch the ``papermill`` CLI on a notebook without waiting for it.

    Args:
        input: Path of the notebook to execute.
        output: Path handed to papermill as the output notebook.
        parameters: Mapping of parameter names to values; each entry is passed
            on the command line as ``-p <name> <str(value)>``.

    Returns:
        The ``subprocess.Popen`` handle of the spawned process. Its stdout and
        stderr are sent to ``subprocess.DEVNULL``.
    """
    param_flags = [
        token
        for name, value in parameters.items()
        for token in ("-p", name, str(value))
    ]
    return subprocess.Popen(
        ["papermill", input, output, *param_flags],
        stdout=subprocess.DEVNULL,
        stderr=subprocess.DEVNULL,
    )

In [None]:
def run_notebook(input, output=None, parameters=None, background=False):
    """Execute ``<input>.ipynb`` with papermill, writing under ``data_path``.

    Args:
        input: Notebook path without the ``.ipynb`` suffix.
        output: Name (without ``.ipynb``) of the executed copy inside
            ``data_path``; defaults to ``input``.
        parameters: Optional mapping of papermill parameters.
        background: When true, spawn the run via ``run_background_notebook``
            instead of executing it in-process.

    Returns:
        The result of ``run_background_notebook`` when ``background`` is true,
        otherwise the result of ``pm.execute_notebook``.
    """
    source_nb = f"{input}.ipynb"
    stem = input if output is None else output
    target_nb = os.path.join(data_path, f"{stem}.ipynb")
    nb_params = dict() if parameters is None else parameters
    if not background:
        return pm.execute_notebook(source_nb, target_nb, parameters=nb_params)
    return run_background_notebook(source_nb, target_nb, nb_params)

In [None]:
def run_notebooks(script):
    """Run ``script`` once for "manga" and then once for "anime".

    Each run gets the media type as its ``media`` parameter and is saved under
    the name ``<script><Media>`` (media type capitalized).
    """
    for media_type in ("manga", "anime"):
        run_notebook(
            script,
            f"{script}{media_type.capitalize()}",
            {"media": media_type},
        )

In [None]:
def run_parts(script, parts, params=None):
    """Run ``script`` once per part as concurrent background processes.

    Args:
        script: Notebook name (without ``.ipynb``) forwarded to
            ``run_notebook``.
        parts: Sized iterable of part identifiers; each one becomes the
            ``part`` parameter of its own process.
        params: Optional extra parameters merged into every run. On a key
            collision the entry from ``params`` replaces ``part``.

    Raises:
        subprocess.CalledProcessError: If any process exits with a non-zero
            status. It is raised only after every process has exited and
            reports the first failing process in launch order.
    """
    start_time = time.time()
    print(f"spawning {len(parts)} processes")
    procs = []
    for part in parts:
        run_params = {"part": part}
        if params is not None:
            run_params |= params
        # The parameter values are joined into the output name, so runs with
        # different parameters get different output files.
        suffix = ".".join(str(value) for value in run_params.values())
        procs.append(
            run_notebook(
                script,
                output=f"{script}.{suffix}",
                parameters=run_params,
                background=True,
            )
        )
        # Stagger the launches by one second.
        time.sleep(1)
    # Wait for every child before reporting a failure, so that no process is
    # still running in the background when the exception propagates.
    for proc in procs:
        proc.wait()
    for proc in procs:
        if proc.returncode != 0:
            raise subprocess.CalledProcessError(proc.returncode, proc.args)
    print(f"processes finished in {int(round(time.time() - start_time))} seconds")

# Run notebooks

In [None]:
# Number of parts; the cells below pass part ids 0..num_parts-1 to run_parts.
num_parts = 16

In [None]:
# Execute ProcessMedia.ipynb once with media="manga" and once with media="anime".
run_notebooks("ProcessMedia")

In [None]:
# Run PruneMediaLists for every part, in 4 batches processed one after another;
# the parts within a batch run as concurrent background processes.
for part_batch in np.array_split(range(num_parts), 4):
    run_parts("PruneMediaLists", part_batch)

In [None]:
# Execute MapUids.ipynb in the foreground. The trailing semicolon presumably
# keeps the notebook front end from echoing the returned value as cell output.
run_notebook("MapUids");

In [None]:
# Run ProcessMediaLists for all parts at once, one background process per part.
run_parts("ProcessMediaLists", range(num_parts))

In [None]:
# Run RelatedMedia for every (media, target_media) combination.
# run_notebook appends ".ipynb" to the output name itself, so the name is
# passed without an extension; passing it with one yields "*.ipynb.ipynb".
for media in ["manga", "anime"]:
    for target_media in ["manga", "anime"]:
        run_notebook(
            "RelatedMedia",
            f"RelatedMedia{media.capitalize()}{target_media.capitalize()}",
            {"media": media, "target_media": target_media},
        )

In [None]:
# Run KnowledgeCutoff for all parts at once, one background process per part.
run_parts("KnowledgeCutoff", range(num_parts))

In [None]:
# For each media type, run GenerateSplits for every part in 4 batches that are
# processed one after another.
for media in ["anime", "manga"]:
    part_batches = np.array_split(list(range(num_parts)), 4)
    for part_batch in part_batches:
        run_parts("GenerateSplits", part_batch, {"media": media})

In [None]:
# Execute CompressSplits.ipynb in the foreground. The trailing semicolon keeps
# the notebook front end from echoing the returned notebook object as cell
# output, matching the MapUids cell.
run_notebook("CompressSplits");

In [None]:
# Reclaim disk space: delete the raw data directory, then every file matching
# each of the patterns below (globbed one pattern at a time, in order).
shutil.rmtree("../../data/raw_data")
for pattern in (
    "../../data/splits/*.h5",
    "../../data/processed_data/*prune*.csv",
    "../../data/processed_data/user_*_list.*.csv",
):
    for path in glob.glob(pattern):
        os.remove(path)

In [None]:
# Final marker: when the notebook is run top to bottom, this prints only after
# all preceding cells finished without raising.
print("Success!")