In [1]:
import sys
# Put the parent directory on the import path so modules located there can be
# imported by later cells (presumably where the local `scraper` module lives -- TODO confirm).
sys.path.append("..")

In [2]:
import json
from pathlib import Path
import scraper
from concurrent.futures import ThreadPoolExecutor, as_completed

In [3]:
# Known series / match ID pair used as a smoke test for the scraper.
series = 1336037
match = 1336043

# sanity test: fetch one match and its series, then print their slugs.
# The recorded output for these IDs is "england-vs-australia-1st-test" and
# "australia-in-england-2023".
match_deets = scraper.get_match_detail(series, match)
series_deets = scraper.get_series_meta(series)
print(match_deets["match"]["slug"])
print(series_deets["slug"])

england-vs-australia-1st-test
australia-in-england-2023


In [7]:
# NOTE: the series API lets us select most series IDs by format, but it seems to be buggy:
# some series (mostly major mixed-format ones, e.g. The Ashes) are only returned when a
# gibberish value is passed in (presumably in place of the format type -- TODO confirm).

# format_type selects which series IDs to request; format_name is the label used
# further down to build the per-format directory under the data folder.
format_type = scraper.T20I
format_name = "T20I"
ids = scraper.get_series_ids(format_type)
# Bare trailing expression so the notebook displays the fetched IDs
# (they appear as strings in the recorded output).
ids

['19283',
 '19321',
 '19731',
 '19605',
 '19305',
 '19586',
 '19308',
 '19679',
 '19636',
 '19405',
 '19239',
 '19311',
 '19296',
 '19241',
 '19568',
 '19585',
 '19616',
 '19578',
 '19507',
 '19480',
 '19545',
 '19583',
 '19567',
 '19314',
 '19555',
 '19895',
 '19896',
 '19631',
 '19632',
 '19599',
 '19709',
 '19711',
 '19779',
 '19707',
 '19836',
 '19838',
 '19837',
 '19814',
 '19756',
 '19316',
 '19485',
 '19751',
 '19651',
 '19780',
 '19743',
 '19807',
 '19281',
 '19772',
 '19766',
 '19767',
 '20878',
 '21163',
 '21116',
 '21070',
 '21022',
 '20742',
 '20894',
 '20744',
 '20931',
 '21047',
 '20923',
 '20933',
 '20999',
 '1333912',
 '20948',
 '20958',
 '20944',
 '20771',
 '20627',
 '20677',
 '20606',
 '20895',
 '20773',
 '20919',
 '20319',
 '19452',
 '19384',
 '19487',
 '19466',
 '19471',
 '19422',
 '19332',
 '19368',
 '19372',
 '19377',
 '19389',
 '19060',
 '19401',
 '19382',
 '19301',
 '19325',
 '19287',
 '19276',
 '19176',
 '19263',
 '19254',
 '18776',
 '19202',
 '19149',
 '19172'

In [5]:
# save as we go
def save_series(id, base_path):
    """Fetch one series' metadata and write it to disk, one directory per season.

    The series is only saved when its ``classId`` values include "1", "2" or "3"
    (presumably the international format classes -- TODO confirm against the
    scraper). For every season returned by ``scraper.get_seasons_meta`` a
    directory ``<base_path>/<year>/<series id>-<series slug>`` is created and
    two files are written into it: ``season_meta.json`` and ``series_meta.json``.

    Saving is best-effort per season: a season that cannot be saved is reported
    on stdout and the remaining seasons are still processed.

    Args:
        id: Series identifier passed straight to the scraper. (The name shadows
            the ``id`` builtin; it is kept so existing callers keep working.)
        base_path: Root output directory, as a ``str`` or path-like object.

    Returns:
        None. Progress and failures are reported with ``print``.
    """
    base_path = Path(base_path)
    # Class IDs that qualify a series for saving; compared as strings because
    # the values in the metadata are stringified before the comparison.
    wanted_class_ids = {"1", "2", "3"}
    try:
        series_meta = scraper.get_series_meta(id)
        series_class_ids = {str(x) for x in series_meta["classId"]}
        if series_class_ids & wanted_class_ids:
            seasons_meta = scraper.get_seasons_meta(id)
            for season_meta in seasons_meta:
                # Reset per iteration so the failure message below can never
                # report the previous season's path (or hit an unbound name).
                season_path = None
                try:
                    season_path = base_path / str(season_meta["year"]) / "{0[id]}-{0[slug]}".format(series_meta)
                    season_path.mkdir(exist_ok=True, parents=True)
                    print(season_path)
                    outputs = (
                        ("season_meta.json", season_meta),
                        ("series_meta.json", series_meta),
                    )
                    for file_name, payload in outputs:
                        # ensure_ascii=False emits raw non-ASCII characters, so the
                        # file must be opened as UTF-8 instead of the locale default.
                        with open(season_path / file_name, "w", encoding="utf-8") as f:
                            json.dump(payload, f, ensure_ascii=False, indent=4)
                        print("saved {}".format(season_path / file_name))

                except Exception as exc:
                    # Best-effort: report and move on to the next season.
                    # KeyboardInterrupt / SystemExit are deliberately not caught.
                    failed = season_path if season_path is not None else "season of series {}".format(id)
                    print("could not save {}: {!r}".format(failed, exc))

    except json.JSONDecodeError:
        # Presumably raised by the scraper when the response body is not JSON.
        print("not json")

In [8]:
data_dir = Path("../data")
format_dir = data_dir / format_name
# NOTE(review): format_dir is never used below -- save_series is given data_dir.
# The recorded output paths contain a format-like subdirectory, so format_dir may
# have been the intended target. Left unchanged to keep files where they land today.

# bit hacky but to avoid re-downloading: a series counts as done when any
# "<id>-<slug>/series_meta.json" exists under data_dir. The tree is walked once
# and the ID prefix of each directory name collected, instead of one walk per ID
# (assumes series IDs contain no "-").
done_ids = {
    meta_file.parent.name.split("-", 1)[0]
    for meta_file in data_dir.rglob("*-*/series_meta.json")
}
not_done = [x for x in ids if str(x) not in done_ids]
print(len(not_done))

# Work in chunks so a progress line is printed after each batch of downloads.
chunk_size = 100
for i in range(0, len(not_done), chunk_size):
    # Clamp so the progress line on the last, shorter chunk is accurate.
    end_i = min(i + chunk_size, len(not_done))
    chunk = not_done[i:end_i]
    with ThreadPoolExecutor() as pool:
        futures = {
            pool.submit(save_series, series_id, data_dir): series_id
            for series_id in chunk
        }
        # Collect every result: an exception raised inside a worker is stored on
        # its future and would otherwise vanish without a trace.
        for future in as_completed(futures):
            try:
                future.result()
            except Exception as exc:
                print("series {} failed: {!r}".format(futures[future], exc))

    print("done {}".format(end_i))

510
../data/yoza/2019/19405-afg-v-wi-t20is-2019-20../data/yoza/2019/19308-west-indies-in-ind-t20is-2019-20
../data/yoza/2020/19321-india-in-nz-t20is-2019-20

saved ../data/yoza/2020/19321-india-in-nz-t20is-2019-20/season_meta.json
saved ../data/yoza/2019/19308-west-indies-in-ind-t20is-2019-20/season_meta.json
saved ../data/yoza/2019/19405-afg-v-wi-t20is-2019-20/season_meta.json
saved ../data/yoza/2019/19308-west-indies-in-ind-t20is-2019-20/series_meta.json
saved ../data/yoza/2020/19321-india-in-nz-t20is-2019-20/series_meta.json
saved ../data/yoza/2019/19405-afg-v-wi-t20is-2019-20/series_meta.json
../data/yoza/2020/19605-ireland-in-wi-t20is-2019-20
saved ../data/yoza/2020/19605-ireland-in-wi-t20is-2019-20/season_meta.json
saved ../data/yoza/2020/19605-ireland-in-wi-t20is-2019-20/series_meta.json
../data/yoza/2019/19311-bangladesh-in-ind-t20is-2019-20
saved ../data/yoza/2019/19311-bangladesh-in-ind-t20is-2019-20/season_meta.json
saved ../data/yoza/2019/19311-bangladesh-in-ind-t20is-2019-

KeyboardInterrupt: 

../data/yoza/2021/20154-south-africa-in-west-indies-t20is-2021
saved ../data/yoza/2021/20154-south-africa-in-west-indies-t20is-2021/season_meta.json
saved ../data/yoza/2021/20154-south-africa-in-west-indies-t20is-2021/series_meta.json
../data/yoza/2022/20207-sofia-twenty20-2022
saved ../data/yoza/2022/20207-sofia-twenty20-2022/season_meta.json
../data/yoza/2021/20054-uganda-tour-of-namibia-t20is-2021
../data/yoza/2021/20029-afghanistan-v-zimbabwe-t20i-series-2020-21
saved ../data/yoza/2021/20054-uganda-tour-of-namibia-t20is-2021/season_meta.json
saved ../data/yoza/2022/20207-sofia-twenty20-2022/series_meta.json
saved ../data/yoza/2021/20029-afghanistan-v-zimbabwe-t20i-series-2020-21/season_meta.json
saved ../data/yoza/2021/20054-uganda-tour-of-namibia-t20is-2021/series_meta.json
saved ../data/yoza/2021/20029-afghanistan-v-zimbabwe-t20i-series-2020-21/series_meta.json
../data/yoza/2023/20185-central-europe-cup-2023
saved ../data/yoza/2023/20185-central-europe-cup-2023/season_meta.json
.