In [2]:
# Example DOI list
# Entries may carry a resolver prefix such as "doi.org/"; clean_doi() strips
# it before the DOIs are placed in the API filter.
dois = [
    "doi.org/10.1038/s41591-019-0726-6",
]

In [None]:
import requests
import pandas as pd
import time
from itertools import islice


# Split a list into chunks of specified size
def chunk_list(iterable, size):
    """Lazily yield consecutive lists of at most ``size`` items from ``iterable``.

    The last list may be shorter than ``size``; an empty input yields nothing.
    """
    source = iter(iterable)
    # iter(callable, sentinel) keeps pulling slices until an empty list comes
    # back, which is what islice produces once the source is exhausted.
    yield from iter(lambda: list(islice(source, size)), [])


# Clean DOI strings by removing common prefixes
def clean_doi(doi):
    """Return ``doi`` without surrounding whitespace or a leading resolver/label prefix.

    At most one prefix is removed. Prefixes are matched case-insensitively
    (``HTTPS://DOI.ORG/`` and ``Doi:`` are handled like their lowercase forms),
    and whitespace left behind by a label such as ``"doi: 10.1/x"`` is stripped.
    The letter case of the DOI itself is left untouched.

    Args:
        doi: DOI string, optionally prefixed with a doi.org / dx.doi.org URL
            (with or without scheme) or a ``doi:`` label.

    Returns:
        The bare DOI string.
    """
    # All lowercase: candidates are lowercased before comparison.
    prefixes_to_remove = (
        "https://doi.org/",
        "http://doi.org/",
        "https://dx.doi.org/",
        "http://dx.doi.org/",
        "dx.doi.org/",
        "doi.org/",
        "doi:",
    )

    cleaned_doi = doi.strip()
    for prefix in prefixes_to_remove:
        # Compare only the leading slice so the cut position always matches
        # the text that was actually tested.
        if cleaned_doi[: len(prefix)].lower() == prefix:
            return cleaned_doi[len(prefix) :].strip()

    return cleaned_doi


# Fetch publication metadata from OpenAlex API with retry mechanism
def get_publication_data(dois, retries=3, delay=5):
    """Fetch work records for ``dois`` from the OpenAlex ``/works`` endpoint.

    DOIs are cleaned with ``clean_doi`` and requested in chunks of 50, joined
    with ``|`` inside a single ``filter=doi:`` query. Each chunk is attempted up
    to ``retries`` times; a chunk that never succeeds is reported and skipped.

    Args:
        dois: iterable of DOI strings (prefixes are allowed).
        retries: maximum number of attempts per chunk.
        delay: seconds to wait between two attempts of the same chunk.

    Returns:
        A list with the ``results`` entries of every successful response.
    """
    all_results = []
    cleaned_dois = [clean_doi(doi) for doi in dois]
    print(f"Cleaned DOIs: {cleaned_dois[:5]}...")

    for chunk in chunk_list(cleaned_dois, 50):
        pipe_separated_dois = "|".join(chunk)
        url = f"https://api.openalex.org/works?filter=doi:{pipe_separated_dois}&per-page=50"
        print(f"Fetching -- {url}")

        for attempt in range(retries):
            try:
                response = requests.get(url, timeout=10)
                print(f"Response: {response.status_code}")
                if response.status_code == 200:
                    # "results" may be present but null; treat that as empty.
                    results = response.json().get("results") or []
                    print(f"Found {len(results)} publications in this chunk")
                    all_results.extend(results)
                    break
                print(
                    f"Error {response.status_code} fetching DOIs, retrying... ({attempt+1}/{retries})"
                )
            except (requests.exceptions.RequestException, ValueError) as e:
                # ValueError covers a 200 response whose body is not valid JSON.
                print(f"Request error: {e}, retrying... ({attempt+1}/{retries})")

            # Only wait when another attempt will actually follow.
            if attempt < retries - 1:
                time.sleep(delay)
        else:
            # Reached only when no attempt hit `break`, i.e. the chunk failed.
            print(
                f"Giving up on this chunk of {len(chunk)} DOIs after {retries} attempts"
            )

    print(f"Total publications found: {len(all_results)}")
    return all_results


# Extract grant information from publication data safely
def extract_grants(publication):
    """Return the grants of ``publication`` as a list of flat dicts.

    Two entry layouts are accepted:

    * flat - ``funder`` is the funder ID string and the name is in
      ``funder_display_name`` (the layout described in the OpenAlex Work
      object documentation);
    * nested - ``funder`` is a dict holding ``id`` and ``display_name``.

    Entries that are not dicts, or whose ``funder`` is neither a dict nor a
    string, are skipped. A missing ``funder`` key yields a row of ``None``s.

    Args:
        publication: work dict; ``grants`` may be absent or not a list.

    Returns:
        A list of ``{"funder", "funder_id", "award_id"}`` dicts (possibly empty).
    """
    grants_data = publication.get("grants", [])
    if not isinstance(grants_data, list):
        return []

    grants_extracted = []
    for grant in grants_data:
        if not isinstance(grant, dict):
            continue

        funder_info = grant.get("funder", {})
        if isinstance(funder_info, dict):
            funder_name = funder_info.get("display_name", None)
            if funder_name is None:
                funder_name = grant.get("funder_display_name", None)
            funder_id = funder_info.get("id", None)
        elif isinstance(funder_info, str):
            # Flat layout: the "funder" value itself is the identifier.
            funder_name = grant.get("funder_display_name", None)
            funder_id = funder_info
        else:
            continue

        grants_extracted.append(
            {
                "funder": funder_name,
                "funder_id": funder_id,
                "award_id": grant.get("award_id", None),
            }
        )

    return grants_extracted


# Extract keywords as a list of display names
def extract_keywords(publication):
    """Return the non-empty ``display_name`` of every keyword entry, in order.

    Anything other than a list under ``keywords`` gives an empty list, and
    entries that are not dicts are ignored.
    """
    entries = publication.get("keywords", [])
    if not isinstance(entries, list):
        return []

    names = (entry.get("display_name") for entry in entries if isinstance(entry, dict))
    return [name for name in names if name]


# Country code to country name mapping.
# Looked up with an institution's "country_code" value in
# extract_publication_data; a code that is not listed here is used as-is.
COUNTRY_CODE_TO_NAME = {
    "US": "USA",
    "GB": "United Kingdom",
    "CA": "Canada",
    # ... (other mappings)
    "NP": "Nepal",
}


# Extract relevant metadata from publication list
def extract_publication_data(publication_data):
    """Flatten work records into one row dict per publication.

    Args:
        publication_data: iterable of work dicts.

    Returns:
        A list of row dicts. ``all_authors``, ``all_affiliations`` and
        ``all_countries`` are "; "-joined strings holding one slot per
        authorship entry (empty when the value is unknown), so the n-th slot
        of each string refers to the same author. ``grants`` holds the
        "; "-joined funder names returned by ``extract_grants``.
    """
    publication_rows = []

    for publication in publication_data:
        authorships = publication.get("authorships")
        if not isinstance(authorships, list):
            # Keep the publication (title, DOI, ...) even when the author data
            # is missing or malformed instead of dropping the whole record.
            authorships = []

        all_authors = []
        all_affiliations = []
        all_countries = []
        for authorship in authorships:
            if not isinstance(authorship, dict):
                continue
            name, affiliation, country = _authorship_fields(authorship)
            all_authors.append(name)
            all_affiliations.append(affiliation)
            all_countries.append(country)

        # "open_access" can be present but null, so .get(key, {}) is not enough.
        open_access = publication.get("open_access")
        if not isinstance(open_access, dict):
            open_access = {}

        funder_names = [
            grant["funder"]
            for grant in extract_grants(publication)
            if grant.get("funder")
        ]

        publication_rows.append(
            {
                "id": publication.get("id", None),
                "title": publication.get("title", None),
                "display_name": publication.get("display_name", None),
                "all_authors": "; ".join(all_authors),
                "all_affiliations": "; ".join(all_affiliations),
                "all_countries": "; ".join(all_countries),
                "doi": publication.get("doi", None),
                "publication_date": publication.get("publication_date", None),
                "publication_year": publication.get("publication_year", None),
                "type": publication.get("type", None),
                "language": publication.get("language", None),
                "open_access": open_access.get("is_oa", None),
                "open_access_status": open_access.get("oa_status", None),
                "open_access_url": open_access.get("oa_url", None),
                "cited_by_count": publication.get("cited_by_count", None),
                "keywords": "; ".join(extract_keywords(publication)),
                "grants": "; ".join(funder_names),
            }
        )

    return publication_rows


# Return (name, affiliation, country) strings for one authorship dict
def _authorship_fields(authorship):
    """Return ``(name, affiliation, country)`` for one authorship entry.

    Each element is ``""`` when unknown. Affiliation and country come from the
    first listed institution only; the country code is translated through
    ``COUNTRY_CODE_TO_NAME`` and kept verbatim when it has no mapping.
    """
    author_info = authorship.get("author")
    name = author_info.get("display_name") if isinstance(author_info, dict) else None

    affiliation = ""
    country = ""
    institutions = authorship.get("institutions")
    if isinstance(institutions, list) and institutions:
        primary_inst = institutions[0]
        if isinstance(primary_inst, dict):
            affiliation = primary_inst.get("display_name") or ""
            ctry_code = primary_inst.get("country_code")
            if ctry_code:
                country = COUNTRY_CODE_TO_NAME.get(ctry_code, ctry_code)

    return name or "", affiliation, country


# Order DataFrame according to original DOI list and handle missing entries
def order_by_doi_sequence(df, original_dois):
    """Return the rows of ``df`` ordered like ``original_dois``, padding gaps.

    Every requested DOI with no row in ``df`` gets a placeholder row whose
    fields are ``"N/A"`` and whose ``doi`` is ``https://doi.org/<doi>``. Rows
    whose DOI was not requested are placed after all requested ones. DOIs are
    compared case-insensitively, because the letter case in ``df`` may differ
    from what the caller typed. ``df`` itself is never modified.

    Args:
        df: DataFrame with a ``doi`` column (may be completely empty).
        original_dois: DOI strings in the desired order; prefixes are allowed.

    Returns:
        A new DataFrame with a fresh 0..n-1 index.
    """
    columns = (
        "id",
        "title",
        "display_name",
        "all_authors",
        "all_affiliations",
        "all_countries",
        "doi",
        "publication_date",
        "publication_year",
        "type",
        "language",
        "open_access",
        "open_access_status",
        "open_access_url",
        "cited_by_count",
        "keywords",
        "grants",
    )

    def make_na_row(doi):
        row = dict.fromkeys(columns, "N/A")
        row["doi"] = f"https://doi.org/{doi}"
        return row

    def doi_key(value):
        # Missing values (None / NaN) and other non-strings never match a DOI.
        if not isinstance(value, str):
            return ""
        return clean_doi(value).lower()

    cleaned_original_dois = [clean_doi(doi) for doi in original_dois]

    if df.empty:
        return pd.DataFrame([make_na_row(doi) for doi in cleaned_original_dois])

    found_dois = {doi_key(value) for value in df["doi"]}
    na_rows = [
        make_na_row(doi)
        for doi in cleaned_original_dois
        if doi.lower() not in found_dois
    ]

    df_with_na = (
        pd.concat([df, pd.DataFrame(na_rows)], ignore_index=True) if na_rows else df
    )

    doi_to_order = {doi.lower(): i for i, doi in enumerate(cleaned_original_dois)}
    doi_order = df_with_na["doi"].apply(
        lambda x: doi_to_order.get(doi_key(x), 999999)
    )

    # assign() works on a copy, so the caller's frame never gains the helper
    # column; mergesort is stable, so rows with equal rank keep their order.
    return (
        df_with_na.assign(doi_order=doi_order)
        .sort_values("doi_order", kind="mergesort")
        .drop(columns=["doi_order"])
        .reset_index(drop=True)
    )


# Fetch publication data
print("Starting data extraction...")
data = get_publication_data(dois)
print(f"Processing {len(data)} publications...")

if not data:
    print("No data extracted. Adding N/A rows...")
    df_final = order_by_doi_sequence(pd.DataFrame(), dois)
else:
    records = extract_publication_data(data)
    df = pd.DataFrame(records)

    # Keep only the first row seen for each work id.
    seen_ids = set()
    indices_to_keep = []
    for i, pub_id in enumerate(df["id"]):
        if pub_id not in seen_ids:
            seen_ids.add(pub_id)
            indices_to_keep.append(i)

    df_deduplicated = df.iloc[indices_to_keep].reset_index(drop=True)

    df_final = order_by_doi_sequence(df_deduplicated, dois)

# Save results to CSV
# One variable feeds both the write and the message so the reported filename
# is always the file that was actually written.
output_path = "publication_data_enhanced_(1).csv"
df_final.to_csv(output_path, index=False)
print(f"Data saved to {output_path}")
print(df_final[["title", "all_authors", "all_affiliations", "all_countries"]].head(3))
print(f"Final dataset: {len(df_final)} publications (including any N/A rows)")

Starting data extraction...
Cleaned DOIs: ['10.1038/s41591-019-0726-6']...
Fetching -- https://api.openalex.org/works?filter=doi:10.1038/s41591-019-0726-6&per-page=50
Response: 200
Found 1 publications in this chunk
Total publications found: 1
Processing 1 publications...
Data saved to publication_data_enhanced_(93-94).csv
                                               title  \
0  Diagnosing bias in data-driven algorithms for ...   

                                         all_authors  \
0  Jenna Wiens; W. Nicholson Price; Michael W. Sj...   

                                    all_affiliations  all_countries  
0  University of Michigan; University of Michigan...  USA; USA; USA  
Final dataset: 1 publications (including any N/A rows)


In [6]:
# Rebuild the DataFrame from `records`, which the extraction cell above
# assigns only when the API returned data.
df = pd.DataFrame(records)

In [8]:
# Display the last 10 rows of df (fewer if the frame is shorter).
df.tail(10)

Unnamed: 0,id,title,display_name,all_authors,all_affiliations,all_countries,doi,publication_date,publication_year,type,language,open_access,open_access_status,open_access_url,cited_by_count,keywords,grants
0,https://openalex.org/W4388757737,Intentional Biases in LLM Responses,Intentional Biases in LLM Responses,Nicklaus Badyal; Derek Jacoby; Yvonne Coady,University of Victoria; University of Victoria...,Canada; Canada; Canada,https://doi.org/10.1109/uemcon59035.2023.10316060,2023-10-12,2023,article,en,True,green,https://arxiv.org/pdf/2311.07611,4,Viewpoints; Persona; Supervisor,


In [None]:
### EXTRACT WITH ABSTRACT

In [6]:
import requests
import pandas as pd
import time
from itertools import islice


# Split a list into chunks of specified size
def chunk_list(iterable, size):
    """Lazily yield consecutive lists of at most ``size`` items from ``iterable``.

    The last list may be shorter than ``size``; an empty input yields nothing.
    """
    source = iter(iterable)
    # iter(callable, sentinel) keeps pulling slices until an empty list comes
    # back, which is what islice produces once the source is exhausted.
    yield from iter(lambda: list(islice(source, size)), [])


# Clean DOI strings by removing common prefixes
def clean_doi(doi):
    """Return ``doi`` without surrounding whitespace or a leading resolver/label prefix.

    At most one prefix is removed. Prefixes are matched case-insensitively
    (``HTTPS://DOI.ORG/`` and ``Doi:`` are handled like their lowercase forms),
    and whitespace left behind by a label such as ``"doi: 10.1/x"`` is stripped.
    The letter case of the DOI itself is left untouched.

    Args:
        doi: DOI string, optionally prefixed with a doi.org / dx.doi.org URL
            (with or without scheme) or a ``doi:`` label.

    Returns:
        The bare DOI string.
    """
    # All lowercase: candidates are lowercased before comparison.
    prefixes_to_remove = (
        "https://doi.org/",
        "http://doi.org/",
        "https://dx.doi.org/",
        "http://dx.doi.org/",
        "dx.doi.org/",
        "doi.org/",
        "doi:",
    )

    cleaned_doi = doi.strip()
    for prefix in prefixes_to_remove:
        # Compare only the leading slice so the cut position always matches
        # the text that was actually tested.
        if cleaned_doi[: len(prefix)].lower() == prefix:
            return cleaned_doi[len(prefix) :].strip()

    return cleaned_doi


# Fetch publication metadata from OpenAlex API with retry mechanism
def get_publication_data(dois, retries=3, delay=5):
    """Fetch work records for ``dois`` from the OpenAlex ``/works`` endpoint.

    DOIs are cleaned with ``clean_doi`` and requested in chunks of 50, joined
    with ``|`` inside a single ``filter=doi:`` query. Each chunk is attempted up
    to ``retries`` times; a chunk that never succeeds is reported and skipped.

    Args:
        dois: iterable of DOI strings (prefixes are allowed).
        retries: maximum number of attempts per chunk.
        delay: seconds to wait between two attempts of the same chunk.

    Returns:
        A list with the ``results`` entries of every successful response.
    """
    all_results = []
    cleaned_dois = [clean_doi(doi) for doi in dois]
    print(f"Cleaned DOIs: {cleaned_dois[:5]}...")

    for chunk in chunk_list(cleaned_dois, 50):
        pipe_separated_dois = "|".join(chunk)
        url = f"https://api.openalex.org/works?filter=doi:{pipe_separated_dois}&per-page=50"
        print(f"Fetching -- {url}")

        for attempt in range(retries):
            try:
                response = requests.get(url, timeout=10)
                print(f"Response: {response.status_code}")
                if response.status_code == 200:
                    # "results" may be present but null; treat that as empty.
                    results = response.json().get("results") or []
                    print(f"Found {len(results)} publications in this chunk")
                    all_results.extend(results)
                    break
                print(
                    f"Error {response.status_code} fetching DOIs, retrying... ({attempt+1}/{retries})"
                )
            except (requests.exceptions.RequestException, ValueError) as e:
                # ValueError covers a 200 response whose body is not valid JSON.
                print(f"Request error: {e}, retrying... ({attempt+1}/{retries})")

            # Only wait when another attempt will actually follow.
            if attempt < retries - 1:
                time.sleep(delay)
        else:
            # Reached only when no attempt hit `break`, i.e. the chunk failed.
            print(
                f"Giving up on this chunk of {len(chunk)} DOIs after {retries} attempts"
            )

    print(f"Total publications found: {len(all_results)}")
    return all_results


# Extract grant information from publication data safely
def extract_grants(publication):
    """Return the grants of ``publication`` as a list of flat dicts.

    Two entry layouts are accepted:

    * flat - ``funder`` is the funder ID string and the name is in
      ``funder_display_name`` (the layout described in the OpenAlex Work
      object documentation);
    * nested - ``funder`` is a dict holding ``id`` and ``display_name``.

    Entries that are not dicts, or whose ``funder`` is neither a dict nor a
    string, are skipped. A missing ``funder`` key yields a row of ``None``s.

    Args:
        publication: work dict; ``grants`` may be absent or not a list.

    Returns:
        A list of ``{"funder", "funder_id", "award_id"}`` dicts (possibly empty).
    """
    grants_data = publication.get("grants", [])
    if not isinstance(grants_data, list):
        return []

    grants_extracted = []
    for grant in grants_data:
        if not isinstance(grant, dict):
            continue

        funder_info = grant.get("funder", {})
        if isinstance(funder_info, dict):
            funder_name = funder_info.get("display_name", None)
            if funder_name is None:
                funder_name = grant.get("funder_display_name", None)
            funder_id = funder_info.get("id", None)
        elif isinstance(funder_info, str):
            # Flat layout: the "funder" value itself is the identifier.
            funder_name = grant.get("funder_display_name", None)
            funder_id = funder_info
        else:
            continue

        grants_extracted.append(
            {
                "funder": funder_name,
                "funder_id": funder_id,
                "award_id": grant.get("award_id", None),
            }
        )

    return grants_extracted


# Extract keywords as a list of display names
def extract_keywords(publication):
    """Return the non-empty ``display_name`` of every keyword entry, in order.

    Anything other than a list under ``keywords`` gives an empty list, and
    entries that are not dicts are ignored.
    """
    entries = publication.get("keywords", [])
    if not isinstance(entries, list):
        return []

    names = (entry.get("display_name") for entry in entries if isinstance(entry, dict))
    return [name for name in names if name]


# Extract abstract from publication data
def extract_abstract(publication):
    """Return the abstract of ``publication`` as plain text, or ``""`` if absent.

    Lookup order:

    1. ``abstract`` as a string (returned stripped);
    2. ``abstract`` as a dict - the first non-empty value among the keys
       ``text``, ``content``, ``abstract``, ``summary``;
    3. ``abstract_inverted_index`` - a ``{word: [positions]}`` mapping, which
       is how the OpenAlex Work object documentation says abstracts are
       delivered; the words are placed back at their positions and joined
       with single spaces.

    Args:
        publication: work dict.

    Returns:
        The abstract text, or an empty string when none can be recovered.
    """
    abstract_data = publication.get("abstract", None)

    if isinstance(abstract_data, str):
        return abstract_data.strip()

    if isinstance(abstract_data, dict):
        # Check for common abstract fields
        for field in ["text", "content", "abstract", "summary"]:
            if field in abstract_data and abstract_data[field]:
                return str(abstract_data[field]).strip()

    inverted_index = publication.get("abstract_inverted_index", None)
    if isinstance(inverted_index, dict):
        positioned_words = []
        for word, positions in inverted_index.items():
            if not isinstance(positions, (list, tuple)):
                continue
            for position in positions:
                if isinstance(position, int):
                    positioned_words.append((position, word))
        # Sort by position only, so the original word order is restored.
        positioned_words.sort(key=lambda item: item[0])
        return " ".join(word for _, word in positioned_words).strip()

    return ""


# Country code to country name mapping.
# Looked up with an institution's "country_code" value in
# extract_publication_data; a code that is not listed here is used as-is.
COUNTRY_CODE_TO_NAME = {
    "US": "USA",
    "GB": "United Kingdom",
    "CA": "Canada",
    # ... (other mappings)
    "NP": "Nepal",
}


# Extract relevant metadata from publication list
def extract_publication_data(publication_data):
    """Flatten work records into one row dict per publication.

    Args:
        publication_data: iterable of work dicts.

    Returns:
        A list of row dicts. ``all_authors``, ``all_affiliations`` and
        ``all_countries`` are "; "-joined strings holding one slot per
        authorship entry (empty when the value is unknown), so the n-th slot
        of each string refers to the same author. ``grants`` holds the
        "; "-joined funder names returned by ``extract_grants`` and
        ``abstract`` the text returned by ``extract_abstract``.
    """
    publication_rows = []

    for publication in publication_data:
        authorships = publication.get("authorships")
        if not isinstance(authorships, list):
            # Keep the publication (title, DOI, ...) even when the author data
            # is missing or malformed instead of dropping the whole record.
            authorships = []

        all_authors = []
        all_affiliations = []
        all_countries = []
        for authorship in authorships:
            if not isinstance(authorship, dict):
                continue
            name, affiliation, country = _authorship_fields(authorship)
            all_authors.append(name)
            all_affiliations.append(affiliation)
            all_countries.append(country)

        # "open_access" can be present but null, so .get(key, {}) is not enough.
        open_access = publication.get("open_access")
        if not isinstance(open_access, dict):
            open_access = {}

        funder_names = [
            grant["funder"]
            for grant in extract_grants(publication)
            if grant.get("funder")
        ]

        publication_rows.append(
            {
                "id": publication.get("id", None),
                "title": publication.get("title", None),
                "display_name": publication.get("display_name", None),
                "all_authors": "; ".join(all_authors),
                "all_affiliations": "; ".join(all_affiliations),
                "all_countries": "; ".join(all_countries),
                "doi": publication.get("doi", None),
                "publication_date": publication.get("publication_date", None),
                "publication_year": publication.get("publication_year", None),
                "type": publication.get("type", None),
                "language": publication.get("language", None),
                "open_access": open_access.get("is_oa", None),
                "open_access_status": open_access.get("oa_status", None),
                "open_access_url": open_access.get("oa_url", None),
                "cited_by_count": publication.get("cited_by_count", None),
                "keywords": "; ".join(extract_keywords(publication)),
                "grants": "; ".join(funder_names),
                "abstract": extract_abstract(publication),  # abstract column last
            }
        )

    return publication_rows


# Return (name, affiliation, country) strings for one authorship dict
def _authorship_fields(authorship):
    """Return ``(name, affiliation, country)`` for one authorship entry.

    Each element is ``""`` when unknown. Affiliation and country come from the
    first listed institution only; the country code is translated through
    ``COUNTRY_CODE_TO_NAME`` and kept verbatim when it has no mapping.
    """
    author_info = authorship.get("author")
    name = author_info.get("display_name") if isinstance(author_info, dict) else None

    affiliation = ""
    country = ""
    institutions = authorship.get("institutions")
    if isinstance(institutions, list) and institutions:
        primary_inst = institutions[0]
        if isinstance(primary_inst, dict):
            affiliation = primary_inst.get("display_name") or ""
            ctry_code = primary_inst.get("country_code")
            if ctry_code:
                country = COUNTRY_CODE_TO_NAME.get(ctry_code, ctry_code)

    return name or "", affiliation, country


# Order DataFrame according to original DOI list and handle missing entries
def order_by_doi_sequence(df, original_dois):
    """Return the rows of ``df`` ordered like ``original_dois``, padding gaps.

    Every requested DOI with no row in ``df`` gets a placeholder row whose
    fields (including ``abstract``) are ``"N/A"`` and whose ``doi`` is
    ``https://doi.org/<doi>``. Rows whose DOI was not requested are placed
    after all requested ones. DOIs are compared case-insensitively, because
    the letter case in ``df`` may differ from what the caller typed. ``df``
    itself is never modified.

    Args:
        df: DataFrame with a ``doi`` column (may be completely empty).
        original_dois: DOI strings in the desired order; prefixes are allowed.

    Returns:
        A new DataFrame with a fresh 0..n-1 index.
    """
    columns = (
        "id",
        "title",
        "display_name",
        "all_authors",
        "all_affiliations",
        "all_countries",
        "doi",
        "publication_date",
        "publication_year",
        "type",
        "language",
        "open_access",
        "open_access_status",
        "open_access_url",
        "cited_by_count",
        "keywords",
        "grants",
        "abstract",
    )

    def make_na_row(doi):
        row = dict.fromkeys(columns, "N/A")
        row["doi"] = f"https://doi.org/{doi}"
        return row

    def doi_key(value):
        # Missing values (None / NaN) and other non-strings never match a DOI.
        if not isinstance(value, str):
            return ""
        return clean_doi(value).lower()

    cleaned_original_dois = [clean_doi(doi) for doi in original_dois]

    if df.empty:
        return pd.DataFrame([make_na_row(doi) for doi in cleaned_original_dois])

    found_dois = {doi_key(value) for value in df["doi"]}
    na_rows = [
        make_na_row(doi)
        for doi in cleaned_original_dois
        if doi.lower() not in found_dois
    ]

    df_with_na = (
        pd.concat([df, pd.DataFrame(na_rows)], ignore_index=True) if na_rows else df
    )

    doi_to_order = {doi.lower(): i for i, doi in enumerate(cleaned_original_dois)}
    doi_order = df_with_na["doi"].apply(
        lambda x: doi_to_order.get(doi_key(x), 999999)
    )

    # assign() works on a copy, so the caller's frame never gains the helper
    # column; mergesort is stable, so rows with equal rank keep their order.
    return (
        df_with_na.assign(doi_order=doi_order)
        .sort_values("doi_order", kind="mergesort")
        .drop(columns=["doi_order"])
        .reset_index(drop=True)
    )


# Example usage (you'll need to define your DOIs list):
# dois = ["10.1038/nature12373", "10.1126/science.1234567"]  # Add your DOIs here

# Fetch publication data
print("Starting data extraction...")
data = get_publication_data(dois)
print(f"Processing {len(data)} publications...")

if not data:
    print("No data extracted. Adding N/A rows...")
    df_final = order_by_doi_sequence(pd.DataFrame(), dois)
else:
    records = extract_publication_data(data)
    df = pd.DataFrame(records)

    # Keep only the first row seen for each work id.
    seen_ids = set()
    indices_to_keep = []
    for i, pub_id in enumerate(df["id"]):
        if pub_id not in seen_ids:
            seen_ids.add(pub_id)
            indices_to_keep.append(i)

    df_deduplicated = df.iloc[indices_to_keep].reset_index(drop=True)

    df_final = order_by_doi_sequence(df_deduplicated, dois)

# Save results to CSV
# One variable feeds both the write and the message so the reported filename
# is always the file that was actually written.
output_path = "abstract_publication_data.csv"
df_final.to_csv(output_path, index=False)
print(f"Data saved to {output_path}")
print(
    df_final[
        ["title", "all_authors", "all_affiliations", "all_countries", "abstract"]
    ].head(3)
)
print(f"Final dataset: {len(df_final)} publications (including any N/A rows)")

Starting data extraction...
Cleaned DOIs: ['10.1038/s41591-019-0726-6']...
Fetching -- https://api.openalex.org/works?filter=doi:10.1038/s41591-019-0726-6&per-page=50
Response: 200
Found 1 publications in this chunk
Total publications found: 1
Processing 1 publications...
Data saved to publication_data_enhanced_(93-94).csv
                                               title  \
0  Diagnosing bias in data-driven algorithms for ...   

                                         all_authors  \
0  Jenna Wiens; W. Nicholson Price; Michael W. Sj...   

                                    all_affiliations  all_countries abstract  
0  University of Michigan; University of Michigan...  USA; USA; USA           
Final dataset: 1 publications (including any N/A rows)
