In [1]:
import json

import polars as pl
from crossref.restful import Etiquette, Works
from helpers import (
    CALLS,
    PERIOD,
    bronze_dir,
    silver_dir,
    tool_email,
    tool_name,
    tool_url,
    tool_version,
)
from ratelimit import limits, sleep_and_retry
from tqdm.auto import tqdm

In [7]:
# Inputs for this notebook: the article sample from the bronze layer, plus the
# references table and the reference-metadata JSONL from the silver layer.
articles_path = bronze_dir / "sample_10000.jsonl"
references_path = silver_dir / "references.csv"
ref_metadata_path = silver_dir / "ref_metadata.jsonl"

# ignore_errors=True is passed to every reader, exactly as before.
articles = pl.read_ndjson(articles_path, ignore_errors=True)
references = pl.read_csv(references_path, ignore_errors=True)
ref_metadata = pl.read_ndjson(ref_metadata_path, ignore_errors=True)

In [14]:
# Crossref client used by fetch_work; the etiquette carries this tool's
# name, version, URL and contact e-mail taken from `helpers`.
my_etiquette = Etiquette(
    application_name=tool_name,
    application_version=tool_version,
    application_url=tool_url,
    contact_email=tool_email,
)
works = Works(etiquette=my_etiquette)


@sleep_and_retry
@limits(calls=CALLS, period=PERIOD)
def fetch_work(dois):
    """Build a Crossref works query for a batch of DOIs.

    The call is throttled by ``limits(calls=CALLS, period=PERIOD)`` and
    wrapped in ``sleep_and_retry``.

    Args:
        dois: Iterable of DOI values; each one is rendered as ``doi:<value>``
            and the pieces are joined with commas into one filter string.

    Returns:
        Whatever ``works.filter(...).select(...)`` returns, restricted to the
        ``DOI``, ``type`` and ``published`` fields. The caller iterates over it.
    """
    # NOTE(review): every entry is already prefixed with "doi:" here and the
    # whole string is passed as the `doi` filter value — confirm against the
    # crossrefapi docs that this does not double-prefix the first DOI.
    doi_filter = ",".join(map("doi:{}".format, dois))
    matching = works.filter(doi=doi_filter)
    return matching.select("DOI, type, published")


def retrieve_doi_info(dois, output_file, batch_size):
    """Fetch metadata for ``dois`` in batches and append it to a JSONL file.

    Every work yielded by ``fetch_work`` is serialised with ``json.dumps`` and
    written as one line. A batch that raises is reported with ``print`` and
    skipped, so the remaining batches are still attempted (best effort).
    Works written before a failure inside a batch are kept.

    Args:
        dois: Sequence of DOIs; it must support ``len`` and slicing
            (for example a list).
        output_file: Path of the file to append to; created if it is missing.
        batch_size: Number of DOIs passed to each ``fetch_work`` call.
            Must be at least 1.

    Raises:
        ValueError: If ``batch_size`` is smaller than 1.
    """
    # Fail early with a clear message: a zero step makes range() raise an
    # opaque error, and a negative step would silently process nothing.
    if batch_size < 1:
        raise ValueError(f"batch_size must be a positive integer, got {batch_size}")

    # Explicit encoding so the file does not depend on the platform locale.
    with open(output_file, "a", encoding="utf-8") as f:
        for i in tqdm(range(0, len(dois), batch_size), desc="Retrieving DOIs"):
            batch = dois[i : i + batch_size]
            try:
                for work in fetch_work(batch):
                    f.write(json.dumps(work) + "\n")
            except Exception as e:
                # Deliberate best effort: report the batch and move on.
                print(f"Error retrieving DOIs {batch}: {e}")
            finally:
                # Persist finished records so a long, rate-limited run does
                # not lose them if the kernel dies before the file is closed.
                f.flush()

In [None]:
# Every distinct, non-null DOI in the `cited` column of the references table.
all_dois = references.get_column("cited").drop_nulls().unique()

# DOI names are case-insensitive, so membership in the already-fetched metadata
# is tested on a lower-cased key. An exact-case comparison would treat a DOI
# stored with different letter case as missing and re-query (and re-append) it
# on every run.
known_dois = {
    doi.lower() for doi in ref_metadata.get_column("DOI").drop_nulls().to_list()
}
dois_to_query = {doi for doi in all_dois.to_list() if doi.lower() not in known_dois}

len(all_dois), len(ref_metadata), len(dois_to_query)

In [16]:
# Append the metadata for the still-missing DOIs to the silver-layer JSONL file.
BATCH_SIZE = 50  # number of DOIs handed to each fetch_work call
output_file = silver_dir / "ref_metadata.jsonl"

retrieve_doi_info(
    dois=list(dois_to_query),
    output_file=output_file,
    batch_size=BATCH_SIZE,
)

Retrieving DOIs:   0%|          | 0/567 [00:00<?, ?it/s]

Error retrieving DOIs ['10.1016/S0040-4039(00)84645-6', '10.1023/A:1007707430416', '10.1177/0272989X9901900216', '10.1016/0006-291X(86)90562-0', '10.1523/JNEUROSCI.1013-11.2011', '10.1007/BFb0048317', '10.1007/BF00032124', '10.1109/TPDS.2010.183', '10.1016/S0926-860X(02)00127-8', '10.1016/0006-291X(92)91634-3', '10.1152/ajpheart.1991.261.6.H1706', '10.1039/D3BM00043E', '10.1017.2018', '10.1016/S0140-6736(01)06102-5', '10.1007/BF03161327', '10.1097/MIB.0000000000000709.', '10.1007/BF01769869', '10.1061/(ASCE)0733-9429(1997)123:4(315)', '10.1061/JRCEA4.0001390', '10.1016/jealeco.2012.11007', '10.1161/CIRCULATIONAHA.111.065391', '10.1016/S0165-0327(02)00332-4', '10.1103/PhysRevLett.108.235502', '10.1017/S0006323196005014', '10.1200/JCO.18.00358', '10.1016/S1055-3290(06)60214-8', '10.1016/0278-6915(94)00145-E', '10.1109/MMM.2017.2759558', '10.1103/PhysRevLett.45.935', '10.1023/A:1017590425924', '10.1371/JOURNAL.PMED.1002885', '10.1128/IAI.66.9.4374-4381.1998', '10.1016/S0044-8486(01)00698-