In [None]:
import sys
# Put the parent directory on the import path; presumably that is where the
# local `scraper` module imported below lives — TODO confirm. Note that ".."
# is resolved against the current working directory.
sys.path.append("..")

In [None]:
import json
from pathlib import Path
import scraper
from concurrent.futures import ThreadPoolExecutor, as_completed

In [None]:
# Example series / match IDs used for the sanity check below.
series, match = 1336037, 1336043

# Sanity test: fetch one match and its series, then show the slug of each.
match_deets = scraper.get_match_detail(series, match)
series_deets = scraper.get_series_meta(series)

match_slug = match_deets["match"]["slug"]
print(match_slug)
series_slug = series_deets["slug"]
print(series_slug)

In [None]:
# The series API lets us select most of the series IDs; however, it seems to be buggy.
# Some series (mostly major mixed-format ones, e.g. The Ashes) can only be retrieved by passing in gibberish.

# Format selector handed to the scraper, plus the matching human-readable name.
format_type, format_name = scraper.T20I, "T20I"

ids = scraper.get_series_ids(format_type)
ids

In [None]:
# save as we go
def save_series(id, base_path):
    """Fetch metadata for one series and write it to disk, one folder per season.

    For every season returned by ``scraper.get_seasons_meta`` this writes
    ``season_meta.json`` and ``series_meta.json`` into
    ``<base_path>/<year>/<series id>-<series slug>/``. A series whose
    ``classId`` values include none of 1, 2 or 3 is skipped entirely.

    Args:
        id: Series identifier passed through to the scraper. (The name shadows
            the builtin ``id``; it is kept so existing callers keep working.)
        base_path: Root output directory, as a ``str`` or ``pathlib.Path``.

    Returns:
        None. Progress and failures are reported with ``print``. A season that
        cannot be saved is logged and skipped so the remaining seasons are
        still attempted; a ``json.JSONDecodeError`` raised outside the
        per-season save (e.g. by the scraper calls) is logged as "not json".
    """
    base_path = Path(base_path)
    # Class IDs we keep — presumably the international formats; TODO confirm
    # against the scraper / API documentation.
    wanted_classes = {"1", "2", "3"}
    try:
        series_meta = scraper.get_series_meta(id)
        series_classes = {str(x) for x in series_meta["classId"]}
        if not (wanted_classes & series_classes):
            return

        for season_meta in scraper.get_seasons_meta(id):
            # Reset per season so the error message below never refers to an
            # unbound name or to the previous season's folder.
            season_path = None
            try:
                season_path = (
                    base_path
                    / str(season_meta["year"])
                    / "{0[id]}-{0[slug]}".format(series_meta)
                )
                season_path.mkdir(exist_ok=True, parents=True)
                print(season_path)

                for file_name, payload in (
                    ("season_meta.json", season_meta),
                    ("series_meta.json", series_meta),
                ):
                    target = season_path / file_name
                    # ensure_ascii=False emits raw non-ASCII characters, so the
                    # file encoding must be pinned instead of relying on the
                    # platform default.
                    with open(target, "w", encoding="utf-8") as f:
                        json.dump(payload, f, ensure_ascii=False, indent=4)
                    print("saved {}".format(target))

            except Exception as exc:
                # Best effort: report and move on to the next season, but let
                # KeyboardInterrupt / SystemExit propagate.
                label = (
                    season_path
                    if season_path is not None
                    else "a season of series {}".format(id)
                )
                print("could not save {}: {!r}".format(label, exc))

    except json.JSONDecodeError:
        print("not json")

In [None]:
data_dir = Path("../data")
format_dir = data_dir / format_name

# Avoid re-downloading: collect the IDs of every series that already has a
# saved series_meta.json. Folders are named "<id>-<slug>", so the ID is the part
# before the first hyphen (assumes IDs themselves contain no hyphen).
# The scan starts at data_dir because that is the root save_series writes to
# below; being recursive, it also covers anything stored under format_dir.
done_ids = {
    meta_path.parent.name.split("-", 1)[0]
    for meta_path in data_dir.rglob("*-*/series_meta.json")
}
not_done = [x for x in ids if str(x) not in done_ids]
print(len(not_done))

# Work through the backlog in chunks so progress is reported regularly.
chunk_size = 100
for i in range(0, len(not_done), chunk_size):
    end_i = i + chunk_size
    chunk = not_done[i:end_i]
    with ThreadPoolExecutor() as pool:
        futures = {
            pool.submit(save_series, series_id, data_dir): series_id
            for series_id in chunk
        }
        # Retrieve every result so an exception raised inside a worker thread
        # is reported instead of being silently dropped with the future.
        for future in as_completed(futures):
            try:
                future.result()
            except Exception as exc:
                print("series {} failed: {!r}".format(futures[future], exc))

    # The last chunk may be short; report the real number of series processed.
    print("done {}".format(min(end_i, len(not_done))))