# Import Lists
* Parses media lists from different websites and converts them into a unified format

In [None]:
import glob
import os
import shutil
import subprocess
import time

import papermill as pm

In [None]:
# Directory that receives the executed copy of every notebook run from here.
data_path = "../../../data/papermill/import_datasets/import_lists"
# `exist_ok=True` already makes this call a no-op when the directory exists, so
# no separate existence check is needed (and none can race with another
# process). If the path exists but is not a directory, this now fails here
# instead of later when a notebook is written.
os.makedirs(data_path, exist_ok=True)


def run_background_notebook(input, output, parameters):
    """Start the ``papermill`` CLI in a child process without waiting for it.

    Args:
        input: Path of the notebook to execute.
        output: Path where papermill writes the executed notebook.
        parameters: Mapping of parameter names to values; each entry is passed
            as ``-p <name> <str(value)>`` on the command line.

    Returns:
        The ``subprocess.Popen`` handle of the spawned process. Its stdout and
        stderr are discarded.
    """
    param_flags = [
        token
        for name, value in parameters.items()
        for token in ("-p", name, str(value))
    ]
    command = ["papermill", input, output, *param_flags]
    return subprocess.Popen(
        command,
        stdout=subprocess.DEVNULL,
        stderr=subprocess.DEVNULL,
    )


def run_notebook(input, output=None, parameters=None, background=False):
    """Execute a notebook with papermill and store the result under ``data_path``.

    Args:
        input: Path of the notebook to execute.
        output: Name of the executed copy, joined onto ``data_path``. Defaults
            to ``input`` when omitted.
        parameters: Mapping of papermill parameters; ``None`` means none.
        background: When true, spawn the run via ``run_background_notebook``
            and return immediately instead of executing in-process.

    Returns:
        The ``subprocess.Popen`` handle when ``background`` is true, otherwise
        the value returned by ``pm.execute_notebook``.
    """
    target = os.path.join(data_path, input if output is None else output)
    params = {} if parameters is None else parameters
    if not background:
        return pm.execute_notebook(input, target, parameters=params)
    return run_background_notebook(input, target, params)

In [None]:
# Import lists in parallel: for each dataset whose raw data directory exists,
# ImportLists.ipynb is launched NUM_PARTS times in the background, each run
# receiving a distinct PART index.
NUM_PARTS = 8
for dataset in ["training", "streaming", "test"]:
    if not os.path.exists(f"../../../data/raw_{dataset}_data"):
        continue
    start_time = time.time()
    print(f"spawning {NUM_PARTS} processes")
    procs = []
    try:
        for part in range(NUM_PARTS):
            # Launches are staggered by one second each.
            time.sleep(1)
            procs.append(
                run_notebook(
                    "ImportLists.ipynb",
                    output=f"ImportLists.{dataset}.{part}.ipynb",
                    parameters={
                        "PART": part,
                        "NUM_PARTS": NUM_PARTS,
                        "DATASET": dataset,
                    },
                    background=True,
                )
            )
        # Wait for the children in launch order and fail on the first one that
        # exits with a non-zero status.
        for p in procs:
            p.wait()
            if p.returncode != 0:
                raise subprocess.CalledProcessError(p.returncode, p.args)
    finally:
        # If a child failed (or the cell was interrupted), stop every child that
        # is still running so no orphaned papermill processes are left behind.
        # After a fully successful run all children have exited and this loop
        # does nothing.
        for p in procs:
            if p.poll() is None:
                p.terminate()
                p.wait()
    print(f"processes finished in {int(round(time.time() - start_time))} seconds")

In [None]:
# Execute TrainingData.ipynb only when the training data directory exists.
if os.path.exists("../../../data/training_data"):
    run_notebook("TrainingData.ipynb")

In [None]:
# Execute TestData.ipynb once for each of the streaming and test datasets
# whose data directory exists, saving each run under a dataset-specific name.
for dataset in ("streaming", "test"):
    if os.path.exists(f"../../../data/{dataset}_data"):
        run_notebook(
            "TestData.ipynb",
            output=f"{dataset.capitalize()}Data.ipynb",
            parameters=dict(dataset=dataset),
        )

In [None]:
# Execute CausalData.ipynb only when the test data directory exists.
if os.path.exists("../../../data/test_data"):
    run_notebook("CausalData.ipynb")

In [None]:
# Execute Finalize.ipynb unconditionally. The trailing semicolon keeps Jupyter
# from echoing the call's return value as the cell output.
run_notebook("Finalize.ipynb");