In [None]:
import json
import math

from crossref.restful import Etiquette, Works
from ratelimit import limits, sleep_and_retry
from tqdm.auto import tqdm

from helpers import (
    CALLS,
    PERIOD,
    bronze_dir,
    tool_email,
    tool_name,
    tool_url,
    tool_version,
)

In [None]:
# Identify this tool to the Crossref API; all four values are imported from `helpers`.
my_etiquette = Etiquette(
    application_name=tool_name,
    application_version=tool_version,
    application_url=tool_url,
    contact_email=tool_email,
)
# Works client configured with the etiquette above.
works = Works(etiquette=my_etiquette)


@sleep_and_retry
@limits(calls=CALLS, period=PERIOD)
def fetch_sample(batch_size=100):
    """Request one random sample of works from the module-level `works` client.

    Calls are throttled to at most ``CALLS`` per ``PERIOD`` by `limits`;
    `sleep_and_retry` is stacked on top of it.

    Args:
        batch_size: Number of records to ask for in this single request.

    Returns:
        Whatever ``works.sample(...)`` returns; callers iterate over it to
        obtain the individual records.
    """
    # NOTE(review): if `works.sample` returns a lazy query object, the HTTP
    # request only happens when the caller iterates, i.e. outside this
    # throttled call — confirm against the crossref client.
    sampled = works.sample(sample_size=batch_size)
    return sampled


def calculate_batches(total_size, batch_size):
    """Return how many batches of ``batch_size`` are needed to cover ``total_size``.

    The result is the ceiling of ``total_size / batch_size``.

    Args:
        total_size: Total number of items wanted.
        batch_size: Maximum number of items per batch.

    Returns:
        The batch count as an ``int``.

    Raises:
        ZeroDivisionError: If ``batch_size`` is zero.
    """
    if isinstance(total_size, int) and isinstance(batch_size, int):
        # Exact integer ceiling division: avoids the float round-trip, which
        # silently loses precision once the operands exceed 2**53.
        return -(-total_size // batch_size)
    # Non-integer inputs keep the original float-based behaviour.
    return math.ceil(total_size / batch_size)


def download_samples(sample_size: int, batch_size: int, output_file):
    """Download ``sample_size`` sampled records and write them as JSON Lines.

    Records are fetched in batches through `fetch_sample` and written one JSON
    document per line. The final batch is trimmed so that no more than
    ``sample_size`` records are requested in total, even when ``sample_size``
    is not a multiple of ``batch_size``.

    Args:
        sample_size: Total number of records to request.
        batch_size: Maximum number of records per request.
            NOTE(review): the Crossref `sample` parameter is reportedly capped
            at 100 per request — confirm callers stay within that.
        output_file: Path of the file to write; an existing file is overwritten.

    Raises:
        ZeroDivisionError: If ``batch_size`` is zero (raised before the output
            file is touched).
    """
    # Work out the batch count first so invalid input cannot truncate an
    # existing output file.
    num_batches = calculate_batches(sample_size, batch_size)
    remaining = sample_size

    # json.dumps emits ASCII by default; the explicit encoding just keeps the
    # file independent of the platform's locale.
    with open(output_file, "w", encoding="utf-8") as f:
        for _ in tqdm(range(num_batches), desc="Downloading samples"):
            # The last batch may be smaller than batch_size.
            current_size = min(batch_size, remaining)
            response = fetch_sample(batch_size=current_size)
            for item in response:
                f.write(json.dumps(item) + "\n")
            remaining -= current_size

In [None]:
# Total number of records to download, and how many to request per call.
sample_size = 100_000
batch_size = 100

# The output file name embeds the requested sample size.
output_file = bronze_dir / f"sample_{sample_size}.jsonl"
download_samples(
    sample_size,
    batch_size,
    output_file,
)

Downloading samples:   0%|          | 0/1000 [00:00<?, ?it/s]