In [4]:
import pandas as pd
from scholarly import scholarly
import requests
import os
import time
from urllib.parse import quote
import json


In [5]:
def clean_filename(title):
    """Clean the filename from invalid characters"""
    return "".join(c for c in title if c.isalnum() or c in (" ", "-", "_")).rstrip()


def get_paper_info(citation):
    """Search paper on Google Scholar and get first result"""
    try:
        # Search for the paper
        search_query = scholarly.search_pubs(citation)
        paper = next(search_query)

        return {
            "title": paper.get("title"),
            "url": paper.get("pub_url"),
            "year": paper.get("year"),
            "doi": paper.get("doi", ""),
            "citation": citation,
        }
    except Exception as e:
        print(f"Error searching for {citation}: {str(e)}")
        return None


In [None]:
# Soit DOI retourné par google scholar, soit DOI recherché sur la page de l'article


In [6]:
article_title = "K.J. Nelson et al. Journal of the electrochemical society, 165, 2018"

print(get_paper_info(article_title))

{'title': None, 'url': 'https://iopscience.iop.org/article/10.1149/2.0041803jes/meta', 'year': None, 'doi': '', 'citation': 'K.J. Nelson et al. Journal of the electrochemical society, 165, 2018'}


In [None]:


def main():
    # Read the Excel file
    df = pd.read_excel("ocr_results.xlsx")

    # Create papers directory if it doesn't exist
    if not os.path.exists("papers"):
        os.makedirs("papers")

    # Dictionary to store paper information
    papers_info = []

    # Process each citation
    for citation in df["Citation"]:
        print(f"\nProcessing: {citation}")

        # Get paper info from Google Scholar
        paper_info = get_paper_info(citation)

        if paper_info:
            papers_info.append(paper_info)
            print(f"Found: {paper_info['title']}")

        # Sleep to avoid hitting rate limits
        time.sleep(2)

    # Save paper information to JSON
    with open("papers_info.json", "w") as f:
        json.dump(papers_info, f, indent=2)

    print("\nPaper information has been saved to papers_info.json")


if __name__ == "__main__":
    main()