# Web Scripts
* This notebook will run indefinitely. You must manually kill this notebook after a sufficient dataset has been collected.

In [None]:
import glob
import json
import os
import subprocess
import time

from ratelimit import limits, sleep_and_retry

In [None]:
# Working directory at the time this cell runs; spawn_notebook prefixes it to
# the notebook filename to build the path handed to papermill.
cwd = os.getcwd()

In [None]:
def get_proxies(source, task):
    """Look up which proxies are assigned to ``task`` for ``source``.

    The proxy map is a text file of comma-separated lines whose first field
    is a source name and whose second field is a task name. Blank lines are
    ignored. Lines for ``source`` are numbered 0, 1, 2, ... in file order.

    Args:
        source: Source name compared against the first field of each line.
        task: Task name compared against the second field of each line.

    Returns:
        A tuple ``(proxies, num_proxies, shared_ip_ratelimit)``:

        * ``proxies`` -- the numbers of the ``source`` lines whose task
          matches ``task``.
        * ``num_proxies`` -- how many lines belong to ``source`` in total.
        * ``shared_ip_ratelimit`` -- ``False`` when the proxy map exists.

        When the proxy map file does not exist the result is
        ``([0], 1, True)``.

    Raises:
        IndexError: If a line for ``source`` has no second field.
    """
    proxyfn = "../../../environment/proxies/proxy_map.txt"
    if not os.path.exists(proxyfn):
        # No proxy map: fall back to one proxy slot and flag the shared IP.
        return [0], 1, True

    with open(proxyfn) as f:
        rows = [row.strip().split(",") for row in f if row.strip()]

    # Only lines for this source take part in the numbering.
    source_rows = [fields for fields in rows if fields[0] == source]
    proxies = [
        number for number, fields in enumerate(source_rows) if fields[1] == task
    ]
    return proxies, len(source_rows), False

In [None]:
@sleep_and_retry
@limits(calls=1, period=5)
def spawn_notebook(script, partition, num_partitions, proxy, num_proxies, params):
    """Start one papermill run of ``script`` as a child process.

    The decorators throttle this function to one call per 5-second period;
    a call made too early sleeps and is retried rather than failing.

    Args:
        script: Notebook name without the ``.ipynb`` suffix, resolved
            relative to ``cwd``.
        partition: Value passed as the ``PARTITION`` parameter.
        num_partitions: Value passed as the ``NUM_PARTITIONS`` parameter.
        proxy: Value passed as the ``PROXY_NUMBER`` parameter.
        num_proxies: Value passed as the ``NUM_PROXIES`` parameter.
        params: Mapping of extra parameter names to values; each is appended
            as an additional ``-p name value`` triple.

    Returns:
        The ``subprocess.Popen`` handle of the launched papermill process.
    """
    fixed_params = (
        ("PARTITION", partition),
        ("NUM_PARTITIONS", num_partitions),
        ("PROXY_NUMBER", proxy),
        ("NUM_PROXIES", num_proxies),
    )
    # The executed notebook is written to /dev/null, i.e. not kept.
    cmdlist = ["papermill", f"{cwd}/{script}.ipynb", "/dev/null"]
    # Fixed parameters come first, then the caller's extras in dict order.
    for name, value in (*fixed_params, *params.items()):
        cmdlist.extend(["-p", name, str(value)])
    print(cmdlist)
    return subprocess.Popen(cmdlist)

In [None]:
# Source names whose "lists" proxies are looked up below.
ALL_SOURCES = [
    "animeplanet",
    "kitsu",
    "anilist",
    "mal",
]
# Maps each source to the get_proxies(...) result for its "lists" task.
# NOTE(review): PROXIES is not referenced anywhere else in the visible part
# of this notebook -- confirm whether it is still needed.
PROXIES = {name: get_proxies(name, "lists") for name in ALL_SOURCES}
# Registry of running child processes, keyed by the JSON-encoded argument
# tuple that was used to spawn each one.
procs = {}

In [None]:
def spawn_notebooks(script, source, task, params):
    """Spawn one notebook process per proxy assigned to ``source``/``task``.

    Each spawned process is registered in the module-level ``procs`` dict
    under the JSON encoding of its argument tuple, so the same arguments can
    be recovered later to restart it.

    Args:
        script: Notebook name forwarded to ``spawn_notebook``.
        source: Source name used to look up proxies via ``get_proxies``.
        task: Task name used to look up proxies via ``get_proxies``.
        params: Mapping of extra notebook parameters. It is not modified;
            a copy is extended and forwarded instead.
    """
    proxies, num_proxies, shared_ip_ratelimit = get_proxies(source, task)
    # Copy first so the RATELIMIT_MULT entry added below never leaks into a
    # dict the caller still holds (and might reuse for another call).
    params = dict(params)
    if shared_ip_ratelimit:
        params["RATELIMIT_MULT"] = "3"
    # The position within `proxies` is the partition number; the list length
    # is the partition count.
    for partition, proxy in enumerate(proxies):
        args = (script, partition, len(proxies), proxy, num_proxies, params)
        procs[json.dumps(args)] = spawn_notebook(*args)

In [None]:
# Run GetMedia with SOURCE=mal on the proxies listed for "malweb" / "media".
spawn_notebooks(
    script="GetMedia",
    source="malweb",
    task="media",
    params={"SOURCE": "mal"},
)

In [None]:
# Run GetMedia with SOURCE=anilist on the proxies listed for "anilist" / "media".
spawn_notebooks(
    script="GetMedia",
    source="anilist",
    task="media",
    params={"SOURCE": "anilist"},
)

In [None]:
# Run GetMedia with SOURCE=kitsu on the proxies listed for "kitsu" / "media".
spawn_notebooks(
    script="GetMedia",
    source="kitsu",
    task="media",
    params={"SOURCE": "kitsu"},
)

In [None]:
# Run GetMedia with SOURCE=animeplanet on the proxies listed for
# "animeplanet" / "media".
spawn_notebooks(
    script="GetMedia",
    source="animeplanet",
    task="media",
    params={"SOURCE": "animeplanet"},
)

In [None]:
# Run GetMalUsersFromId on the proxies listed for "malweb" / "id", with no
# extra notebook parameters.
spawn_notebooks(
    script="GetMalUsersFromId",
    source="malweb",
    task="id",
    params={},
)

In [None]:
# Run GetAnimeplanetUsersFromRecent on the proxies listed for
# "animeplanet" / "recent", with no extra notebook parameters.
spawn_notebooks(
    script="GetAnimeplanetUsersFromRecent",
    source="animeplanet",
    task="recent",
    params={},
)

In [None]:
# Run GetAnimeplanetUsersFromFriends on the proxies listed for
# "animeplanet" / "friends", with no extra notebook parameters.
spawn_notebooks(
    script="GetAnimeplanetUsersFromFriends",
    source="animeplanet",
    task="friends",
    params={},
)

In [None]:
# Supervisor loop: never exits on its own. Once a second, check every
# registered child process and restart any that has terminated.
while True:
    time.sleep(1)
    for key, proc in procs.items():
        still_running = proc.poll() is None
        if still_running:
            continue
        # The dict key is the JSON-encoded argument list, so decoding it
        # yields exactly the arguments needed to launch a replacement.
        # Only the value of an existing key is replaced, so the dict does
        # not change size while it is being iterated.
        procs[key] = spawn_notebook(*json.loads(key))