In [111]:
import requests
import pandas as pd
import time

def get_authors_and_affiliations(doi, retries=3, delay=5):
    """
    Fetch publication data from OpenAlex API using DOI with retry logic.

    Args:
        doi (str): DOI of the publication.
        retries (int): Number of times to retry on failure.
        delay (int): Delay in seconds between retries.

    Returns:
        dict: JSON response containing publication metadata or an empty dictionary if an error occurs.
    """
    url = f"https://api.openalex.org/works/https://doi.org/{doi}"
    print(f"Fetching -- {url}")
    
    for attempt in range(retries):
        try:
            response = requests.get(url, timeout=10)
            print(response)
            if response.status_code == 200:
                return response.json()
            else:
                print(f"Error {response.status_code} fetching DOI {doi}, retrying... ({attempt+1}/{retries})")
        except requests.exceptions.RequestException as e:
            print(f"Request error: {e}, retrying... ({attempt+1}/{retries})")
            continue
        
        time.sleep(delay)
    
    print(f"Failed to fetch data for DOI {doi} after {retries} attempts.")
    return {}

def extract_publication_data(publication_data):
    """
    Extracts author details, ORCID ID, affiliation, country, DOI, publication date, publication year, 
    open access status, and type from a given OpenAlex publication dictionary.

    Args:
        publication_data (dict): Dictionary containing publication metadata.

    Returns:
        list: List of dictionaries with extracted information.
    """
    rows = []
    for author in publication_data.get('authorships', []):
        row = {
            'author_full_name': author['author']['display_name'],
            'orcid_id': author['author'].get('orcid', None),
            'affiliation': author['institutions'][0]['display_name'] if author.get('institutions') else None,
            'country': author['institutions'][0].get('country_code', None) if author.get('institutions') else None,
            'doi': publication_data.get('doi', None),
            'publication_date': publication_data.get('publication_date', None),
            'publication_year': publication_data.get('publication_year', None),
            'open_access': publication_data.get('primary_location', {}).get('is_oa', None),
            'type': publication_data.get('type', None)
        }
        rows.append(row)

    return rows

dois = [
    "10.1038/s41591-019-0726-6",
    "10.7189/jogh.09.020318",
    "10.1055/s-0039-1677903",
    "10.1007/s11042-023-16029-x",
    "10.3390/medicina56030141",
    "10.1016/j.ijinfomgt.2021.102387",
    "10.48550/arXiv.1507.05259",
    "10.48550/arXiv.2207.07068",
    "10.3390/sci6010003",
    "10.48550/arXiv.2206.02237",
    "10.3233/SW-223041",
    "10.48550/arXiv.2001.09762",
    "10.1007/s00146-022-01494-z",
    "10.1371/journal.pdig.0000022",
    "10.3390/su15054604",
    "10.2196/36388",
    "10.48550/arXiv.2007.08100",
    "10.24963/ijcai.2017/654",
    "10.1145/3278721.3278764",
    "10.1145/3616865",
    "10.48550/arXiv.2107.06641",
    "10.48550/arXiv.2207.03277",
    "10.1109/ICSE43902.2021.00129",
    "10.1145/3631326",
    "10.48550/arXiv.2102.03054",
    "10.48550/arXiv.2103.06503",
    "10.48550/arXiv.1801.07593",
    "10.1145/3338906.3338937",
    "10.1145/3540250.3549093",
    "10.1145/3468264.3468565",
    "10.1016/j.ipm.2021.102642",
    "10.48550/arXiv.1703.06856",
    "10.1146/annurev-statistics-042720-125902",
    "10.1145/3468507.3468511",
    "10.1016/S2589-7500(20)30292-2",
    "10.1016/j.xcrm.2022.100622",
    "10.1016/j.artmed.2023.102607",
    "10.1259/bjr.20220878",
    "10.1038/s41379-022-01163-y",
    "10.1038/s41551-022-00898-y",
    "10.1145/3269206.3272027",
    "10.48550/arXiv.2110.00530",
    "10.48550/arXiv.1908.09635",
    "10.1145/3494672",
    "10.1145/3269206.3272027",
    "10.1145/3269206.3272027",
    "10.48550/arXiv.1812.11118",
    "10.48550/arXiv.2109.14376",
    "10.1145/3616865",
    "10.1145/3397271.3401051",
    "10.2139/ssrn.2477899",
    "10.48550/arXiv.1707.09457",
    "10.48550/arXiv.1412.3756",
    "10.2139/ssrn.3446944",
    "10.48550/arXiv.2205.13619",
    "10.1145/3652891",
    "10.48550/arXiv.2209.10117",
    "10.1145/3404835.3462966",
    "10.1007/978-3-031-56069-9_46",
    "10.3390/info13100459",
    "10.1016/j.csbj.2020.05.017",
    "10.48550/arXiv.2202.01711",
    "10.48550/arXiv.2005.13755",
    "10.1109/ASE51524.2021.9678568",
    "10.48550/arXiv.2206.04101",
    "10.1109/ICSE43902.2021.00129",
    "10.1145/3524491.3527308",
    "10.48550/arXiv.2003.10354",
    "10.48550/arXiv.2005.12379",
    "10.48550/arXiv.2106.06054",
    "10.48550/arXiv.2105.12195",
    "10.1145/3292500.3332280",
    "10.1016/j.neucom.2021.09.081",
    "10.48550/arXiv.2203.11852",
    "10.48550/arXiv.2001.09784",
    "10.48550/arXiv.2205.05396",
    "10.48550/arXiv.2003.04549",
    "10.48550/arXiv.1906.00066"
]

In [112]:
# Fetch and process publication data for each DOI
all_rows = []
for doi in dois:
    publication_data = get_authors_and_affiliations(doi)
    if publication_data:
        extracted_data = extract_publication_data(publication_data)
        all_rows.extend(extracted_data)

# Convert to DataFrame
df = pd.DataFrame(all_rows)

# Display the DataFrame or save it to a file
if not df.empty:
    print(df)  # Print to console
    df.to_csv("publication_data.csv", index=False)  # Save to CSV
else:
    print("No data extracted.")
# https://api.openalex.org/works/https://doi.org/10.48550/arxiv.2003.04549

Fetching -- https://api.openalex.org/works/https://doi.org/10.1038/s41591-019-0726-6
<Response [200]>
Fetching -- https://api.openalex.org/works/https://doi.org/10.7189/jogh.09.020318
<Response [200]>
Fetching -- https://api.openalex.org/works/https://doi.org/10.1055/s-0039-1677903
<Response [200]>
Fetching -- https://api.openalex.org/works/https://doi.org/10.1007/s11042-023-16029-x
<Response [200]>
Fetching -- https://api.openalex.org/works/https://doi.org/10.3390/medicina56030141
<Response [200]>
Fetching -- https://api.openalex.org/works/https://doi.org/10.1016/j.ijinfomgt.2021.102387
<Response [200]>
Fetching -- https://api.openalex.org/works/https://doi.org/10.48550/arXiv.1507.05259
<Response [200]>
Fetching -- https://api.openalex.org/works/https://doi.org/10.48550/arXiv.2207.07068
<Response [200]>
Fetching -- https://api.openalex.org/works/https://doi.org/10.3390/sci6010003
<Response [200]>
Fetching -- https://api.openalex.org/works/https://doi.org/10.48550/arXiv.2206.02237
<Res

<Response [200]>
Fetching -- https://api.openalex.org/works/https://doi.org/10.48550/arXiv.2003.10354
<Response [404]>
Error 404 fetching DOI 10.48550/arXiv.2003.10354, retrying... (1/3)
<Response [404]>
Error 404 fetching DOI 10.48550/arXiv.2003.10354, retrying... (2/3)
<Response [404]>
Error 404 fetching DOI 10.48550/arXiv.2003.10354, retrying... (3/3)
Failed to fetch data for DOI 10.48550/arXiv.2003.10354 after 3 attempts.
Fetching -- https://api.openalex.org/works/https://doi.org/10.48550/arXiv.2005.12379
<Response [404]>
Error 404 fetching DOI 10.48550/arXiv.2005.12379, retrying... (1/3)
<Response [404]>
Error 404 fetching DOI 10.48550/arXiv.2005.12379, retrying... (2/3)
<Response [404]>
Error 404 fetching DOI 10.48550/arXiv.2005.12379, retrying... (3/3)
Failed to fetch data for DOI 10.48550/arXiv.2005.12379 after 3 attempts.
Fetching -- https://api.openalex.org/works/https://doi.org/10.48550/arXiv.2106.06054
<Response [404]>
Error 404 fetching DOI 10.48550/arXiv.2106.06054, retry

In [113]:
df

Unnamed: 0,author_full_name,orcid_id,affiliation,country,doi,publication_date,publication_year,open_access,type
0,Jenna Wiens,https://orcid.org/0000-0002-1057-7722,University of Michigan–Ann Arbor,US,https://doi.org/10.1038/s41591-019-0726-6,2020-01-01,2020,False,article
1,W. Nicholson Price,https://orcid.org/0000-0003-0729-290X,University of Michigan–Ann Arbor,US,https://doi.org/10.1038/s41591-019-0726-6,2020-01-01,2020,False,article
2,Michael W. Sjoding,https://orcid.org/0000-0002-0535-9659,University of Michigan–Ann Arbor,US,https://doi.org/10.1038/s41591-019-0726-6,2020-01-01,2020,False,article
3,Trishan Panch,https://orcid.org/0000-0002-6554-061X,Harvard University,US,https://doi.org/10.7189/jogh.09.020318,2019-11-24,2019,True,review
4,Heather Mattie,https://orcid.org/0000-0002-1263-2537,Harvard University,US,https://doi.org/10.7189/jogh.09.020318,2019-11-24,2019,True,review
...,...,...,...,...,...,...,...,...,...
341,Ki Hyun Tae,,Korea Advanced Institute of Science and Techno...,KR,https://doi.org/10.48550/arxiv.2003.04549,2020-01-01,2020,True,preprint
342,Steven Euijong Whang,https://orcid.org/0000-0001-6419-931X,Korea Advanced Institute of Science and Techno...,KR,https://doi.org/10.48550/arxiv.2003.04549,2020-01-01,2020,True,preprint
343,Dennis Wei,https://orcid.org/0000-0002-6510-1537,,,https://doi.org/10.48550/arxiv.1906.00066,2019-01-01,2019,True,preprint
344,Karthikeyan Natesan Ramamurthy,https://orcid.org/0000-0002-6021-5930,,,https://doi.org/10.48550/arxiv.1906.00066,2019-01-01,2019,True,preprint
