In [2]:
import pandas as pd
from scholarly import scholarly
from habanero import Crossref
import os
import time

# Get article information from Google Scholar (author, year, title), then use Crossref to look up the DOI from that information

In [3]:
def get_doi_from_query(query, email="your_email@example.com"):
    """Look up a DOI URL for a free-text bibliographic reference.

    The reference is first searched on Google Scholar; the author(s), year and
    title of the first hit are then sent to Crossref, and the DOI of Crossref's
    top-ranked match is returned. The match is NOT verified against the
    original reference, so the result should be treated as a best guess.

    Args:
        query: Free-text reference passed to ``scholarly.search_pubs``.
        email: Contact address passed to ``Crossref(mailto=...)``.

    Returns:
        The DOI as a ``https://doi.org/...`` URL, or ``None`` when either
        service fails or returns no usable result (a message is printed).
    """
    # Step 1: Search on Google Scholar
    try:
        search_results = scholarly.search_pubs(query, patents=False)
        # Retrieve the first result
        first_result = next(search_results)
    except StopIteration:
        print("No results found on Google Scholar.")
        return None
    except Exception as e:
        # Best-effort, like the Crossref step: one failed lookup must not
        # abort a batch run.
        print("Unable to query Google Scholar:", e)
        return None

    # Extract useful fields (author, year, title)
    bib = first_result.get("bib") or {}
    authors = bib.get("author") or []
    if isinstance(authors, str):
        # A plain string must be used as-is; joining it would interleave
        # ", " between every character.
        author_text = authors
    else:
        author_text = ", ".join(str(author) for author in authors)
    pub_year = str(bib.get("pub_year") or "")
    title = bib.get("title") or ""

    # Construct a query based on author(s), year, and the title in quotes,
    # skipping missing fields; fall back to the raw query if nothing is left.
    query_parts = [author_text, pub_year]
    if title:
        query_parts.append(f"\"{title}\"")
    crossref_query = " ".join(part for part in query_parts if part) or str(query)

    # Step 2: Query Crossref
    try:
        cr = Crossref(mailto=email)
        # Limit to 1 result and request only the DOI field
        response = cr.works(query=crossref_query, limit=1, select="DOI")
        items = response["message"]["items"]
    except Exception as e:
        print("Unable to retrieve DOI from Crossref:", e)
        return None

    if not items or not items[0].get("DOI"):
        print(f"No DOI found on Crossref for {query}")
        return None

    # Return the full DOI URL rather than the bare identifier
    doi_url = "https://doi.org/" + items[0]["DOI"]
    print(f"DOI found for {query}: {doi_url}")
    return doi_url


# Get DOI and save to references_with_doi.csv

In [None]:
# Input sheet (references lacking a DOI) and output sheet (same rows plus a DOI column)
INPUT_CSV = "../3_ocr_results_clean_to_references/references_without_doi.csv"
OUTPUT_CSV = "references_with_doi.csv"
# Pause between two lookups, in seconds
REQUEST_DELAY_SECONDS = 10

# Import the sheet: resume from a previous run when the output file exists,
# otherwise start from the input sheet
resuming = os.path.exists(OUTPUT_CSV)
df = pd.read_csv(OUTPUT_CSV if resuming else INPUT_CSV)

# Add a column with the DOI. Missing values must be None/NaN (not ""), because
# the loop below uses pd.isna/pd.notna to decide which rows still need a lookup.
# The object dtype lets an all-empty column read back from CSV accept strings.
if 'DOI' not in df.columns:
    df['DOI'] = None
df['DOI'] = df['DOI'].astype(object)

# Create the references_with_doi.csv file if it doesn't exist
if resuming:
    print("File already exists")
else:
    df.to_csv(OUTPUT_CSV, index=False)
    print("File created")

# Complete the references_with_doi.csv file with the DOI
for index in df.index:
    reference = df.at[index, 'Reference']
    if pd.notna(df.at[index, 'DOI']):
        print(f"DOI already found for {reference}")
        continue
    if pd.isna(reference):
        # Nothing to search for on this row
        continue
    print(f"Searching DOI for {reference}")
    df.at[index, 'DOI'] = get_doi_from_query(reference)
    # Save after every lookup so an interrupted run can be resumed
    df.to_csv(OUTPUT_CSV, index=False)
    # Wait between requests
    time.sleep(REQUEST_DELAY_SECONDS)