# Import Datasets
* Parses the data from different websites and converts it into a unified format

In [None]:
import glob
import os
import shutil
import subprocess
import time

import papermill as pm

In [None]:
# Directory that run_notebook() writes the executed notebook copies into.
data_path = "../../data/papermill/import_datasets"
# exist_ok=True already makes this a no-op when the directory is present, so a
# separate os.path.exists() check (and its check-then-create race) is not needed.
os.makedirs(data_path, exist_ok=True)

In [None]:
def run_background_notebook(input, output, parameters):
    """Start the ``papermill`` command-line tool as a child process.

    Args:
        input: Path of the notebook to execute.
        output: Path the executed notebook is written to.
        parameters: Mapping of parameter name to value; every entry is passed
            as ``-p <name> <str(value)>`` on the command line.

    Returns:
        The ``subprocess.Popen`` handle of the started process. The call does
        not wait for the process to finish.
    """
    command = ["papermill", input, output]
    for name, value in parameters.items():
        command.extend(("-p", name, str(value)))
    # stdout and stderr are discarded, so the exit status is the only signal
    # the caller gets back from the child process.
    return subprocess.Popen(
        command,
        stdout=subprocess.DEVNULL,
        stderr=subprocess.DEVNULL,
    )

In [None]:
def run_notebook(input, output=None, parameters=None, background=False):
    """Execute a notebook and store the executed copy under ``data_path``.

    Args:
        input: Path of the notebook to execute.
        output: File name of the executed copy, relative to ``data_path``.
            Defaults to the value of ``input``.
        parameters: Mapping of parameters handed to the notebook; ``None``
            is treated as an empty mapping.
        background: When true, the notebook is started through
            ``run_background_notebook`` and its result is returned at once;
            otherwise ``pm.execute_notebook`` is called directly.

    Returns:
        Whatever ``run_background_notebook`` (background mode) or
        ``pm.execute_notebook`` (foreground mode) returns.
    """
    target = os.path.join(data_path, input if output is None else output)
    if parameters is None:
        parameters = {}
    if background:
        return run_background_notebook(input, target, parameters)
    return pm.execute_notebook(input, target, parameters=parameters)

In [None]:
# Value forwarded as the "parts" parameter to the notebooks executed below.
# NOTE(review): presumably the number of shards the media data is split
# into — confirm against ImportMedia.ipynb.
parts = 16
# Media types the import steps below iterate over.
ALL_MEDIUMS = [
    "manga",
    "anime",
]

In [None]:
# Run ImportMedia.ipynb once per medium. Each run gets its own output notebook
# (same "<Notebook>.<qualifier>.ipynb" scheme the ImportLists runs use), so the
# executed copy for one medium is not overwritten by the run for the next one.
for medium in ALL_MEDIUMS:
    run_notebook(
        "ImportMedia.ipynb",
        output=f"ImportMedia.{medium}.ipynb",
        parameters={"medium": medium, "parts": parts},
    )

In [None]:
# Execute AnimeplanetMappings.ipynb in the foreground with no parameters.
# NOTE(review): the trailing semicolon presumably keeps the notebook from
# echoing the call's return value as cell output — keep it if so.
run_notebook("AnimeplanetMappings.ipynb");

In [None]:
# For every (source, medium) pair, run one ImportLists.ipynb process per list
# file found on disk, all in parallel, and wait for the whole batch to finish
# before moving on to the next pair.
for source in ["mal", "anilist", "kitsu", "animeplanet"]:
    for medium in ALL_MEDIUMS:
        # The number of matching CSV files decides how many processes to run.
        # NOTE(review): assumes the files are numbered 0..num_parts-1, since
        # the loop index is what gets passed as PART — confirm.
        num_parts = len(
            glob.glob(f"../../data/{source}/user_media_facts/user_{medium}_list.*.csv")
        )
        start_time = time.time()
        print(f"spawning {num_parts} {source} {medium} processes")
        procs = []
        for part in range(num_parts):
            # NOTE(review): presumably staggers the process launches — confirm.
            time.sleep(1)
            procs.append(
                run_notebook(
                    "ImportLists.ipynb",
                    output=f"ImportLists.{source}.{medium}.{part}.ipynb",
                    parameters={"MEDIUM": medium, "SOURCE": source, "PART": part},
                    background=True,
                )
            )
        # Wait for *every* process before reporting a failure. Raising on the
        # first non-zero exit would leave the later processes running and
        # unwaited after the cell has already failed.
        failed = [p for p in procs if p.wait() != 0]
        if failed:
            first_failure = failed[0]
            raise subprocess.CalledProcessError(
                first_failure.returncode, first_failure.args
            )
        print(f"processes finished in {int(round(time.time() - start_time))} seconds")

In [None]:
# Execute CombineMediaLists.ipynb in the foreground, passing it the same
# "parts" value that was given to ImportMedia.ipynb.
# NOTE(review): the trailing semicolon presumably keeps the notebook from
# echoing the call's return value as cell output — keep it if so.
run_notebook("CombineMediaLists.ipynb", parameters={"parts": parts});

In [None]:
# End-of-pipeline marker.
print("Success!")