## Data gathering: Google Scholar

In [None]:
from scholarly import scholarly
import pandas as pd

# Load the author roster from the CSV file into a DataFrame.
file_path = 'name.csv'
df_authors = pd.read_csv(filepath_or_buffer=file_path)

# Reformat the names from the CSV file to "FirstName LastName"
def reformat_name(name):
    """Turn a "LastName, FirstName" string into "FirstName LastName".

    The input is split on commas. Only when that yields exactly two pieces
    are they stripped of surrounding whitespace and swapped; any other input
    (no comma, or more than one comma) is returned unchanged.
    """
    pieces = name.split(',')
    if len(pieces) != 2:
        return name
    last_name, first_name = (piece.strip() for piece in pieces)
    return f"{first_name} {last_name}"

# Build the "FirstName LastName" version of the first CSV column and store it
# as a new column on the DataFrame.
formatted_names = df_authors.iloc[:, 0].apply(reformat_name)
df_authors['Formatted Names'] = formatted_names

# Plain Python list of the reformatted author names, in file order
authors_all = formatted_names.tolist()

def divide_chunks(lst, n):
    """Yield consecutive slices of ``lst``, each holding at most ``n`` items.

    The final slice is shorter when ``len(lst)`` is not a multiple of ``n``;
    an empty ``lst`` yields nothing.
    """
    yield from (
        lst[start:start + n]
        for start in range(0, len(lst), n)
    )
# 1-based number of the first chunk to process; authors belonging to earlier
# chunks are skipped by the slice below.
chunk_start = 31
# Split the remaining authors into chunks of 30
author_chunks = [
    *divide_chunks(authors_all[30 * (chunk_start - 1):], 30)
]

# Column layout of every per-chunk CSV. Passing it to the DataFrame explicitly
# guarantees that a chunk with no matching papers is still written with the
# Author/Title/Abstract/Year header row, so every chunk file has the same
# schema when the files are read back later.
PAPER_COLUMNS = ['Author', 'Title', 'Abstract', 'Year']

# Process each chunk of authors and write one CSV per chunk. ``idx`` is the
# chunk number used in the output file name.
for idx, authors in enumerate(author_chunks, start=chunk_start):
    # Rows collected for the current chunk
    paper_data = []

    # Search Google Scholar for each author and retrieve their papers
    for n, author in enumerate(authors):
        search_query = scholarly.search_author(author)
        author_obj = next(search_query, None)  # First search hit, or None
        print(n)  # Progress indicator: position of the author in the chunk
        if not author_obj:
            continue  # No Google Scholar profile found for this name

        author_filled = scholarly.fill(author_obj)  # Retrieve full author details
        for paper in author_filled['publications']:
            bib = paper.get('bib', {})
            title = bib.get('title', 'No title available')

            # Papers without a year, or with a non-numeric one, are skipped
            raw_year = bib.get('pub_year', None)
            if raw_year is None:
                continue
            try:
                year = int(raw_year)
            except ValueError:
                continue

            # Only include papers from 2022 to 2024
            if not 2022 <= year <= 2024:
                continue

            # The publication stub may not carry an abstract, so start from a
            # placeholder and overwrite it if the filled record provides one.
            abstract = bib.get('abstract', 'No abstract available')
            paper_filled = scholarly.fill(paper)
            filled_bib = paper_filled.get('bib', {})
            if 'abstract' in filled_bib:
                abstract = filled_bib['abstract']

            paper_data.append({
                'Author': author,
                'Title': title,
                'Abstract': abstract,
                'Year': year
            })

    # Convert to a DataFrame with a fixed column layout (see PAPER_COLUMNS)
    df_papers = pd.DataFrame(paper_data, columns=PAPER_COLUMNS)

    # Save the data to a CSV file for the current chunk
    output_csv_path = f'author_papers_2022_2024_part_{idx}.csv'
    df_papers.to_csv(output_csv_path, index=False)

    # Output the location of the saved CSV file for the current chunk
    print(f"Papers data for part {idx} saved to: {output_csv_path}")


## Combine all the data

In [None]:
import os
import pandas as pd

# Names of the per-chunk CSV files that may exist (parts 1 through 62)
chunk_files = [
    f'author_papers_2022_2024_part_{i}.csv'
    for i in range(1, 63)
]

# DataFrames successfully loaded from the chunk files
dataframes = []

# Load every chunk file that is present; report the ones that are missing or
# cannot be read and carry on with the rest.
for file in chunk_files:
    if not os.path.exists(file):
        print(f"File not found: {file}. Skipping...")
        continue
    try:
        df = pd.read_csv(file)
    except Exception as e:
        print(f"Error reading {file}: {e}")
    else:
        dataframes.append(df)

# Concatenate whatever was loaded into a single CSV
if not dataframes:
    print("No files were found to combine.")
else:
    df_combined = pd.concat(dataframes, ignore_index=True)
    df_combined.to_csv('all_author_papers_2022_2024.csv', index=False)
    print("All chunks combined and saved to all_author_papers_2022_2024.csv")
