In [136]:
import requests
import pandas as pd
import time
from itertools import islice

def chunk_list(iterable, size):
    """Yield successive chunks from the iterable of the specified size."""
    it = iter(iterable)
    while True:
        chunk = list(islice(it, size))
        if not chunk:
            break
        yield chunk

def get_publication_data(dois, retries=3, delay=5):
    """
    Fetch publication data from OpenAlex API using multiple DOIs with retry logic.

    Args:
        dois (list): List of DOIs.
        retries (int): Number of times to retry on failure.
        delay (int): Delay in seconds between retries.

    Returns:
        list: List of dictionaries containing publication metadata.
    """
    all_results = []
    for chunk in chunk_list(dois, 50):
        pipe_separated_dois = "|".join(chunk)
        url = f"https://api.openalex.org/works?filter=doi:{pipe_separated_dois}&per-page=50"
        print(f"Fetching -- {url}")
        
        for attempt in range(retries):
            try:
                response = requests.get(url, timeout=10)
                print(response)
                if response.status_code == 200:
                    # 'results' will contain a list of publication objects
                    all_results.extend(response.json().get("results", []))
                    break
                else:
                    print(
                        f"Error {response.status_code} fetching DOIs, "
                        f"retrying... ({attempt+1}/{retries})"
                    )
            except requests.exceptions.RequestException as e:
                print(f"Request error: {e}, retrying... ({attempt+1}/{retries})")
            
            time.sleep(delay)

    return all_results

def extract_grants(publication):
    """
    Safely extracts grant information from the publication metadata.

    Returns an empty list if the 'grants' field is not a list or has invalid structure.
    """
    grants_extracted = []
    grants_data = publication.get("grants", [])

    # If grants_data is not a list, return empty list
    if not isinstance(grants_data, list):
        return grants_extracted

    # Process each item in the grants list
    for grant in grants_data:
        # If the individual grant is not a dictionary, skip it
        if not isinstance(grant, dict):
            continue

        # The 'funder' field may also be a string in rare cases, so check
        funder_info = grant.get("funder", {})
        if not isinstance(funder_info, dict):
            continue

        # Now safely extract needed fields
        grants_extracted.append({
            "funder": funder_info.get("display_name", None),
            "funder_id": funder_info.get("id", None),
            "award_id": grant.get("award_id", None),
        })

    return grants_extracted

def extract_keywords(publication):
    """
    Extract keywords from the publication metadata as a list of display names.
    """
    kw_data = publication.get('keywords', [])
    if not isinstance(kw_data, list):
        return []
    
    keywords_list = []
    for kw in kw_data:
        if isinstance(kw, dict):
            display_name = kw.get('display_name')
            if display_name:
                keywords_list.append(display_name)
    return keywords_list

def extract_publication_data(publication_data):
    """
    Extracts core metadata (e.g., author details, ORCID ID, affiliation, country, 
    author position, is_corresponding, raw affiliation strings, 
    DOI, publication date, open access status, keywords, grants, etc.) 
    from OpenAlex publication dictionaries.

    Args:
        publication_data (list): List of dictionaries containing publication metadata.

    Returns:
        list: List of dictionaries with extracted information.
    """
    rows = []
    for publication in publication_data:
        # Some works may have no 'authorships' at all; handle that gracefully
        authorships = publication.get('authorships', [])
        if not isinstance(authorships, list):
            continue

        # Extract the keywords once per publication
        keywords_list = extract_keywords(publication)

        for author in authorships:
            # Institutions can be multiple
            institution_info = author.get('institutions')
            affiliations = []
            countries = []
            if institution_info and isinstance(institution_info, list):
                for inst in institution_info:
                    if isinstance(inst, dict):
                        # gather affiliation
                        aff = inst.get('display_name')
                        if aff:
                            affiliations.append(aff)
                        # gather country codes if needed
                        ctry = inst.get('country_code')
                        if ctry:
                            countries.append(ctry)

            # Construct the row of metadata
            row = {
                'id': publication.get('id', None),
                'title': publication.get('title', None),
                'display_name': publication.get('display_name', None),

                # Author-level fields
                'author_full_name': author.get('author', {}).get('display_name', None),
                'orcid_id': author.get('author', {}).get('orcid', None),
                'author_position': author.get('author_position', None),
                'is_corresponding': author.get('is_corresponding', None),
                'raw_affiliation_strings': author.get('raw_affiliation_strings', None),

                # Possibly multiple affiliations
                'affiliations': affiliations,
                'countries': countries,

                # Publication-level fields
                'doi': publication.get('doi', None),
                'publication_date': publication.get('publication_date', None),
                'publication_year': publication.get('publication_year', None),
                'type': publication.get('type', None),
                'language': publication.get('language', None),

                # Open access fields from the 'open_access' object
                'open_access': publication.get('open_access', {}).get('is_oa', None),
                'open_access_status': publication.get('open_access', {}).get('oa_status', None),
                'open_access_url': publication.get('open_access', {}).get('oa_url', None),

                # Additional fields
                'cited_by_count': publication.get('cited_by_count', None),

                # Keywords (same for each author in the same publication)
                'keywords': keywords_list,

                # Grants
                'grants': extract_grants(publication)
            }
            rows.append(row)

    return rows


In [137]:
dois = [
    "10.1038/s41591-019-0726-6",
    "10.7189/jogh.09.020318",
    "10.1055/s-0039-1677903",
    "10.1007/s11042-023-16029-x",
    "10.3390/medicina56030141",
    "10.1016/j.ijinfomgt.2021.102387",
    "10.48550/arXiv.1507.05259",
    "10.48550/arXiv.2207.07068",
    "10.3390/sci6010003",
    "10.48550/arXiv.2206.02237",
    "10.3233/SW-223041",
    "10.48550/arXiv.2001.09762",
    "10.1007/s00146-022-01494-z",
    "10.1371/journal.pdig.0000022",
    "10.3390/su15054604",
    "10.2196/36388",
    "10.48550/arXiv.2007.08100",
    "10.24963/ijcai.2017/654",
    "10.1145/3278721.3278764",
    "10.1145/3616865",
    "10.48550/arXiv.2107.06641",
    "10.48550/arXiv.2207.03277",
    "10.1109/ICSE43902.2021.00129",
    "10.1145/3631326",
    "10.48550/arXiv.2102.03054",
    "10.48550/arXiv.2103.06503",
    "10.48550/arXiv.1801.07593",
    "10.1145/3338906.3338937",
    "10.1145/3540250.3549093",
    "10.1145/3468264.3468565",
    "10.1016/j.ipm.2021.102642",
    "10.48550/arXiv.1703.06856",
    "10.1146/annurev-statistics-042720-125902",
    "10.1145/3468507.3468511",
    "10.1016/S2589-7500(20)30292-2",
    "10.1016/j.xcrm.2022.100622",
    "10.1016/j.artmed.2023.102607",
    "10.1259/bjr.20220878",
    "10.1038/s41379-022-01163-y",
    "10.1038/s41551-022-00898-y",
    "10.1145/3269206.3272027",
    "10.48550/arXiv.2110.00530",
    "10.48550/arXiv.1908.09635",
    "10.1145/3494672",
    "10.1145/3269206.3272027",
    "10.1145/3269206.3272027",
    "10.48550/arXiv.1812.11118",
    "10.48550/arXiv.2109.14376",
    "10.1145/3616865",
    "10.1145/3397271.3401051",
    "10.2139/ssrn.2477899",
    "10.48550/arXiv.1707.09457",
    "10.48550/arXiv.1412.3756",
    "10.2139/ssrn.3446944",
    "10.48550/arXiv.2205.13619",
    "10.1145/3652891",
    "10.48550/arXiv.2209.10117",
    "10.1145/3404835.3462966",
    "10.1007/978-3-031-56069-9_46",
    "10.3390/info13100459",
    "10.1016/j.csbj.2020.05.017",
    "10.48550/arXiv.2202.01711",
    "10.48550/arXiv.2005.13755",
    "10.1109/ASE51524.2021.9678568",
    "10.48550/arXiv.2206.04101",
    "10.1109/ICSE43902.2021.00129",
    "10.1145/3524491.3527308",
    "10.48550/arXiv.2003.10354",
    "10.48550/arXiv.2005.12379",
    "10.48550/arXiv.2106.06054",
    "10.48550/arXiv.2105.12195",
    "10.1145/3292500.3332280",
    "10.1016/j.neucom.2021.09.081",
    "10.48550/arXiv.2203.11852",
    "10.48550/arXiv.2001.09784",
    "10.48550/arXiv.2205.05396",
    "10.48550/arXiv.2003.04549",
    "10.48550/arXiv.1906.00066"
]
data = get_publication_data(dois)
records = extract_publication_data(data)

Fetching -- https://api.openalex.org/works?filter=doi:10.1038/s41591-019-0726-6|10.7189/jogh.09.020318|10.1055/s-0039-1677903|10.1007/s11042-023-16029-x|10.3390/medicina56030141|10.1016/j.ijinfomgt.2021.102387|10.48550/arXiv.1507.05259|10.48550/arXiv.2207.07068|10.3390/sci6010003|10.48550/arXiv.2206.02237|10.3233/SW-223041|10.48550/arXiv.2001.09762|10.1007/s00146-022-01494-z|10.1371/journal.pdig.0000022|10.3390/su15054604|10.2196/36388|10.48550/arXiv.2007.08100|10.24963/ijcai.2017/654|10.1145/3278721.3278764|10.1145/3616865|10.48550/arXiv.2107.06641|10.48550/arXiv.2207.03277|10.1109/ICSE43902.2021.00129|10.1145/3631326|10.48550/arXiv.2102.03054|10.48550/arXiv.2103.06503|10.48550/arXiv.1801.07593|10.1145/3338906.3338937|10.1145/3540250.3549093|10.1145/3468264.3468565|10.1016/j.ipm.2021.102642|10.48550/arXiv.1703.06856|10.1146/annurev-statistics-042720-125902|10.1145/3468507.3468511|10.1016/S2589-7500(20)30292-2|10.1016/j.xcrm.2022.100622|10.1016/j.artmed.2023.102607|10.1259/bjr.20220878

In [138]:
df = pd.DataFrame(records)

In [139]:
df

Unnamed: 0,id,title,display_name,author_full_name,orcid_id,author_position,is_corresponding,raw_affiliation_strings,affiliations,countries,...,publication_date,publication_year,type,language,open_access,open_access_status,open_access_url,cited_by_count,keywords,grants
0,https://openalex.org/W2753845591,Counterfactual Fairness,Counterfactual Fairness,Matt J. Kusner,,first,False,[The Alan Turing Institute and University of W...,"[The Alan Turing Institute, University of Warw...","[GB, GB]",...,2017-01-01,2017,preprint,en,True,green,https://arxiv.org/abs/1703.06856,897,[Intuition],[]
1,https://openalex.org/W2753845591,Counterfactual Fairness,Counterfactual Fairness,Joshua R. Loftus,https://orcid.org/0000-0002-2905-1632,middle,False,[New York University and The Alan Turing Insti...,[],[],...,2017-01-01,2017,preprint,en,True,green,https://arxiv.org/abs/1703.06856,897,[Intuition],[]
2,https://openalex.org/W2753845591,Counterfactual Fairness,Counterfactual Fairness,Chris Russell,https://orcid.org/0000-0003-1665-1759,middle,False,[The Alan Turing Institute and University of S...,"[The Alan Turing Institute, University of Surrey]","[GB, GB]",...,2017-01-01,2017,preprint,en,True,green,https://arxiv.org/abs/1703.06856,897,[Intuition],[]
3,https://openalex.org/W2753845591,Counterfactual Fairness,Counterfactual Fairness,Ricardo Silva,https://orcid.org/0000-0002-6502-9563,last,False,[The Alan Turing Institute and University Coll...,"[The Alan Turing Institute, University College...","[GB, GB]",...,2017-01-01,2017,preprint,en,True,green,https://arxiv.org/abs/1703.06856,897,[Intuition],[]
4,https://openalex.org/W3121368818,Approval of artificial intelligence and machin...,Approval of artificial intelligence and machin...,Urs J. Muehlematter,https://orcid.org/0000-0003-3423-4633,first,False,[Institute of Diagnostic and Interventional Ra...,"[University Hospital of Zurich, University of ...","[CH, CH]",...,2021-01-19,2021,review,en,True,gold,http://www.thelancet.com/article/S258975002030...,426,[],[]
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
329,https://openalex.org/W3012104127,Slice Tuner: A Selective Data Acquisition Fram...,Slice Tuner: A Selective Data Acquisition Fram...,Ki Hyun Tae,,first,False,"[KAIST,Daejeon,Republic of Korea]",[Korea Advanced Institute of Science and Techn...,[KR],...,2020-01-01,2020,preprint,en,True,green,https://arxiv.org/abs/2003.04549,0,"[Tuner, Crowdsourcing]",[]
330,https://openalex.org/W3012104127,Slice Tuner: A Selective Data Acquisition Fram...,Slice Tuner: A Selective Data Acquisition Fram...,Steven Euijong Whang,https://orcid.org/0000-0001-6419-931X,last,False,"[KAIST,Daejeon,Republic of Korea]",[Korea Advanced Institute of Science and Techn...,[KR],...,2020-01-01,2020,preprint,en,True,green,https://arxiv.org/abs/2003.04549,0,"[Tuner, Crowdsourcing]",[]
331,https://openalex.org/W4281666526,What-is and How-to for Fairness in Machine Lea...,What-is and How-to for Fairness in Machine Lea...,Zeyu Tang,https://orcid.org/0000-0002-4423-4728,first,False,[],[],[],...,2022-01-01,2022,preprint,en,True,green,https://arxiv.org/abs/2206.04101,0,"[Flow chart, Reflection]",[]
332,https://openalex.org/W4281666526,What-is and How-to for Fairness in Machine Lea...,What-is and How-to for Fairness in Machine Lea...,Jiji Zhang,https://orcid.org/0000-0003-0684-2084,middle,False,[],[],[],...,2022-01-01,2022,preprint,en,True,green,https://arxiv.org/abs/2206.04101,0,"[Flow chart, Reflection]",[]


In [140]:
df.tail(50)

Unnamed: 0,id,title,display_name,author_full_name,orcid_id,author_position,is_corresponding,raw_affiliation_strings,affiliations,countries,...,publication_date,publication_year,type,language,open_access,open_access_status,open_access_url,cited_by_count,keywords,grants
284,https://openalex.org/W4297899430,A Comprehensive Survey on Trustworthy Recommen...,A Comprehensive Survey on Trustworthy Recommen...,Yiqi Wang,https://orcid.org/0000-0002-9657-3617,middle,False,[],[],[],...,2022-01-01,2022,preprint,en,True,green,https://arxiv.org/abs/2209.10117,11,"[Trustworthiness, Robustness]",[]
285,https://openalex.org/W4297899430,A Comprehensive Survey on Trustworthy Recommen...,A Comprehensive Survey on Trustworthy Recommen...,Xu Han,https://orcid.org/0000-0002-8967-4372,middle,False,[],[],[],...,2022-01-01,2022,preprint,en,True,green,https://arxiv.org/abs/2209.10117,11,"[Trustworthiness, Robustness]",[]
286,https://openalex.org/W4297899430,A Comprehensive Survey on Trustworthy Recommen...,A Comprehensive Survey on Trustworthy Recommen...,Lei Chen,https://orcid.org/0000-0002-8257-5806,middle,False,[],[],[],...,2022-01-01,2022,preprint,en,True,green,https://arxiv.org/abs/2209.10117,11,"[Trustworthiness, Robustness]",[]
287,https://openalex.org/W4297899430,A Comprehensive Survey on Trustworthy Recommen...,A Comprehensive Survey on Trustworthy Recommen...,Qing Li,https://orcid.org/0000-0003-3370-471X,last,False,[],[],[],...,2022-01-01,2022,preprint,en,True,green,https://arxiv.org/abs/2209.10117,11,"[Trustworthiness, Robustness]",[]
288,https://openalex.org/W4322507603,A Survey on Fairness for Machine Learning on G...,A Survey on Fairness for Machine Learning on G...,Manvi Choudhary,,first,False,[Laboratoire Hubert Curien],[Laboratoire Hubert Curien],[FR],...,2022-01-01,2022,preprint,en,True,green,https://arxiv.org/abs/2205.05396,11,[],[]
289,https://openalex.org/W4322507603,A Survey on Fairness for Machine Learning on G...,A Survey on Fairness for Machine Learning on G...,Charlotte Laclau,https://orcid.org/0000-0002-7389-3191,middle,False,[Laboratoire Hubert Curien],[Laboratoire Hubert Curien],[FR],...,2022-01-01,2022,preprint,en,True,green,https://arxiv.org/abs/2205.05396,11,[],[]
290,https://openalex.org/W4322507603,A Survey on Fairness for Machine Learning on G...,A Survey on Fairness for Machine Learning on G...,Christine Largeron,https://orcid.org/0000-0003-1059-4095,last,False,[Laboratoire Hubert Curien],[Laboratoire Hubert Curien],[FR],...,2022-01-01,2022,preprint,en,True,green,https://arxiv.org/abs/2205.05396,11,[],[]
291,https://openalex.org/W3028820554,Review of Mathematical frameworks for Fairness...,Review of Mathematical frameworks for Fairness...,Eustasio del Barrio,https://orcid.org/0000-0003-3764-5411,first,False,[],[],[],...,2020-01-01,2020,preprint,en,True,green,https://arxiv.org/abs/2005.13755,17,[],[]
292,https://openalex.org/W3028820554,Review of Mathematical frameworks for Fairness...,Review of Mathematical frameworks for Fairness...,Paula Gordaliza,https://orcid.org/0000-0002-0455-1200,middle,False,[],[],[],...,2020-01-01,2020,preprint,en,True,green,https://arxiv.org/abs/2005.13755,17,[],[]
293,https://openalex.org/W3028820554,Review of Mathematical frameworks for Fairness...,Review of Mathematical frameworks for Fairness...,Jean-Michel Loubès,https://orcid.org/0000-0002-1252-2960,last,False,[Institut de Mathématiques de Toulouse UMR5219],[Institut de Mathématiques de Toulouse],[FR],...,2020-01-01,2020,preprint,en,True,green,https://arxiv.org/abs/2005.13755,17,[],[]


In [142]:
if not df.empty: 
    df.to_csv("publication_data.csv", index=False)  # Save to CSV
else:
    print("No data extracted.")