In [None]:
# Install required libraries
!pip install scholarly pandas



In [None]:
from scholarly import scholarly
import pandas as pd

In [None]:
from scholarly import scholarly

class ScholarScraper:
    """Find Google Scholar profiles for named authors and list their recent publications.

    Candidate profiles come from ``scholarly.search_author``. A candidate is
    accepted when its affiliation contains the institution name (compared
    case- and punctuation-insensitively) or when its e-mail field contains one
    of the configured e-mail fragments.
    """

    def __init__(self, institution, author_names, emails=None):
        """
        Args:
            institution (str): Institution name to look for in each profile's affiliation.
            author_names (iterable of str): Names to search for.
            emails (str | iterable of str | None): E-mail address or domain fragments
                that also identify a matching profile. A single string is accepted
                and treated as a one-element list.
        """
        self.institution = institution
        self.author_names = author_names
        # A bare string would otherwise be iterated one character at a time.
        if isinstance(emails, str):
            emails = [emails]
        # Drop empty fragments: '' is a substring of everything and would
        # make every profile "match".
        self.emails = [fragment for fragment in (emails or []) if fragment]
        self.scholar_ids_dict = {}

    @staticmethod
    def _normalize(text):
        """Casefold *text* and turn every run of non-alphanumerics into a single space."""
        cleaned = "".join(
            ch if ch.isalnum() else " " for ch in str(text or "").casefold()
        )
        return " ".join(cleaned.split())

    @staticmethod
    def _author_email(author):
        """Return the e-mail text of a profile record, or '' when it has none.

        The original code read the 'email' key; scholarly's author records are
        documented to carry 'email_domain', so both are consulted.
        """
        return author.get('email') or author.get('email_domain') or ''

    def _email_matches(self, author):
        """Return True when any configured e-mail fragment occurs in the profile's e-mail."""
        email = self._author_email(author).lower()
        return any(fragment.lower() in email for fragment in self.emails)

    def _is_match(self, author):
        """Return True when the profile matches the institution or one of the e-mails."""
        wanted = self._normalize(self.institution)
        if wanted in self._normalize(author.get('affiliation')):
            return True
        return self._email_matches(author)

    def _find_matching_author(self, candidates):
        """Return the first matching profile from *candidates*, or None if there is none."""
        return next((author for author in candidates if self._is_match(author)), None)

    @staticmethod
    def _filter_publications(publications, start_year, end_year):
        """Return ``{'title', 'year'}`` dicts for publications dated within the inclusive range.

        Entries without a usable numeric 'pub_year' are skipped. The year may be
        given as a string or as an int.
        """
        selected = []
        for pub in publications:
            bib = pub.get('bib', {})
            year_text = str(bib.get('pub_year', '')).strip()
            if not year_text.isdecimal():
                continue
            year = int(year_text)
            if start_year <= year <= end_year:
                selected.append({'title': bib.get('title', ''), 'year': year})
        return selected

    def get_scholar_ids_by_institution_and_email(self):
        """
        Fetch Google Scholar IDs for authors from a specified institution or email.

        Returns:
            dict: A dictionary with author names as keys and Scholar IDs as values.
                  The value is None when no matching profile was found.
        """
        for name in self.author_names:
            print(f"Searching for: {name}")

            match = self._find_matching_author(scholarly.search_author(name))

            if match is None:
                print(f"No match found for {name} at {self.institution} or with the provided emails")
                # None marks "no Scholar ID"; it must not be overwritten with
                # the ID of some unrelated search result.
                self.scholar_ids_dict[name] = None
                continue

            print(f"Match found: {match.get('name')} at {match.get('affiliation')}")
            if self._email_matches(match):
                print(f"Email match found: {self._author_email(match)}")
            self.scholar_ids_dict[name] = match.get('scholar_id')

        return self.scholar_ids_dict

    def get_recent_publications(self, scholar_ids_dict, start_year=2022, end_year=2024):
        """
        Fetch the publications of each author within a year range using `scholarly`.

        Args:
            scholar_ids_dict (dict): Author name -> Scholar ID (or None).
            start_year (int): First year to include. Defaults to 2022.
            end_year (int): Last year to include. Defaults to 2024.

        Returns:
            dict: A dictionary where each key is an author's name, and the value is either
                  a list of publications in the range (each a dict with 'title' and 'year')
                  or a string explaining why no list is available.
        """
        results = {}

        for name, scholar_id in scholar_ids_dict.items():
            if scholar_id is None:
                results[name] = "No Scholar ID available"
                continue

            print(f"Fetching publications for {name} (Scholar ID: {scholar_id})")
            try:
                author = scholarly.search_author_id(scholar_id)
                author = scholarly.fill(author)  # Fetches detailed data, including publications
                publications = self._filter_publications(
                    author['publications'], start_year, end_year
                )
            except Exception as e:
                # Best effort: one failing profile must not abort the whole run.
                print(f"Error fetching data for {name}: {e}")
                results[name] = "Error fetching data"
                continue

            results[name] = publications if publications else "No publications found in the specified range"

        return results

    def scrape(self):
        """
        Combines the functionality of fetching Scholar IDs and publications into one process.

        Returns:
            dict: 'scholar_ids' maps each author to a Scholar ID (or None);
                  'recent_publications' maps each author to the result of
                  get_recent_publications with its default 2022-2024 range.
        """
        scholar_ids = self.get_scholar_ids_by_institution_and_email()
        recent_publications = self.get_recent_publications(scholar_ids)

        return {
            'scholar_ids': scholar_ids,
            'recent_publications': recent_publications,
        }

    def display_results(self, results):
        """
        Display the results in a user-friendly format.

        Args:
            results (dict): The dictionary returned by scrape(), with the keys
                'scholar_ids' and 'recent_publications'.
        """
        print("\nScholar IDs:")
        for name, scholar_id in results['scholar_ids'].items():
            print(f"{name}: {scholar_id}")

        print("\nRecent Publications (2022-2024):")
        for name, pubs in results['recent_publications'].items():
            print(f"\n{name}'s Publications:")
            if isinstance(pubs, str):
                # A string is a status message, not a list of publications.
                print(pubs)
                continue
            for pub in pubs:
                print(f"  Title: {pub['title']}, Year: {pub['year']}")


In [None]:
# Example usage: the institution to match and the faculty names to search for.
institution_name = "University of Illinois at Urbana Champaign"

# `author_names` and `faculty_names_illinois` refer to the same list object.
faculty_names_illinois = [
    "Imad Al-Qadi",
    "Bassem O. Andrawes",
    "Christopher P. L. Barkan",
    "Ana P. Barros",
    "Rahim F. Benekohal",
    "Ximing Cai",
    "Eun Jeong Cha",
    "R. D. Cusick",
    "C. Armando Duarte",
    "J. Riley Edwards",
    "Ahmed Elbanna",
    "Nora El-Gohary",
    "Khaled A. El-Rayes",
    "Rosa M. Espinosa Marzal",
    "Larry A. Fahnestock",
    "Marcelo H. Garcia",
    "Paolo Gardoni",
    "Nishant Garg",
    "Mani Golparvar-Fard",
    "Jeremy Guest",
    "Ramez M. Hajj",
    "Youssef M. A. Hashash",
    "Hannah M. Horowitz",
    "Houtan Jebelli",
    "Nick Jones",
    "Megan Konar",
    "Eleftheria Kontou",
    "Praveen Kumar",
    "James M. LaFave",
    "Lewis J. Lehe",
    "Franklin T. Lombardo",
    "Oscar Lopez-Pamies",
    "Roman Makhnenko",
    "Arif Masud",
    "Megan L. Matthews",
    "Ran Mei",
    "Hadi Meidani",
    "Gholamreza Mesri",
    "Helen Nguyen",
    "Scott M. Olson",
    "Yanfeng Ouyang",
    "John S. Popovics",
    "Jeffery R. Roesler",
    "Murugesu Sivapalan",
    "B. F. Spencer, Jr",
    "Timothy D. Stark",
    "Ashlynn Stillwell",
    "Ann Sychterz",
    "Alireza Talebpour",
    "Alexandre Tartakovsky",
    "Christopher Tessum",
    "Rafael O. Tinoco",
    "Erol Tutumluer",
    "Vishal Verma",
    "Na Wei",
    "Jinhui Yan",
    "X. Shelly Zhang",
    "Lei Zhao",
]
author_names = faculty_names_illinois





# E-mail domains to check, given as a list with one entry per domain.
emails = ["illinois.edu"]

# Create an instance of the ScholarScraper class
scraper = ScholarScraper(institution_name, author_names, emails)

# Scrape both Scholar IDs and recent publications
results = scraper.scrape()

# Display the results using the display_results method
scraper.display_results(results)


Searching for: Imad Al-Qadi
No match found for Imad Al-Qadi at University of Illinois at Urbana Champaign or with the provided emails
Searching for: Bassem O. Andrawes
No match found for Bassem O. Andrawes at University of Illinois at Urbana Champaign or with the provided emails
Searching for: Christopher P. L. Barkan
No match found for Christopher P. L. Barkan at University of Illinois at Urbana Champaign or with the provided emails
Searching for: Ana P. Barros
No match found for Ana P. Barros at University of Illinois at Urbana Champaign or with the provided emails
Searching for: Rahim F. Benekohal
No match found for Rahim F. Benekohal at University of Illinois at Urbana Champaign or with the provided emails
Searching for: Ximing Cai
No match found for Ximing Cai at University of Illinois at Urbana Champaign or with the provided emails
Searching for: Eun Jeong Cha
No match found for Eun Jeong Cha at University of Illinois at Urbana Champaign or with the provided emails
Searching for: