# In [None]:
import requests
from bs4 import BeautifulSoup
import time
import pandas as pd

def fetch_google_scholar_results(start):
    """Fetch one page of Google Scholar results for the fixed ESG query.

    Args:
        start: Offset of the first result to request; sent unchanged as the
            ``start`` query parameter.

    Returns:
        A list with one dict per result container found on the page. A dict
        has a "Title" key when the container holds an ``<h3>`` with a link,
        and an "Authors" key (the full text of the ``gs_a`` line) when that
        element is present. An empty list is returned when the request fails
        or the response status is not 200; the problem is printed in both
        cases so the caller can keep crawling the remaining pages.
    """
    base_url = "https://scholar.google.com/scholar"
    query_params = {
        # Plain spaces on purpose: requests URL-encodes the params itself, so
        # a literal "+" would reach the server as "%2B" instead of a space.
        "q": "esg and economic growth",
        "hl": "en",
        "as_sdt": "0,47",
        "as_ylo": "2023",
        "start": start,
    }

    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36"
    }

    try:
        # Without a timeout a stalled connection would block the crawl forever.
        response = requests.get(
            base_url, params=query_params, headers=headers, timeout=30
        )
    except requests.RequestException as exc:
        # Treat network failures like a bad status: report and return nothing.
        print("Error", exc)
        return []

    if response.status_code != 200:
        print("Error", response.status_code)
        return []

    soup = BeautifulSoup(response.content, "html.parser")

    results = []  # One dict per result container
    # A CSS selector matches the three classes regardless of their order or of
    # additional classes on the element; find_all(class_="a b c") only matches
    # that exact attribute string.
    for div in soup.select("div.gs_r.gs_or.gs_scl"):
        result = {}

        h3 = div.find("h3")
        a = h3.find("a") if h3 else None
        if a:
            result["Title"] = a.get_text()

        gs_a = div.find("div", class_="gs_a")
        if gs_a:
            result["Authors"] = gs_a.get_text()

        # Appended even when empty, so each container yields exactly one entry.
        results.append(result)

    return results


# Store results for all pages
all_results = []
# Result offsets 0, 10, ..., 100: eleven pages of ten results each.
page_offsets = range(0, 110, 10)
for i in page_offsets:
    results = fetch_google_scholar_results(i)
    all_results.extend(results)
    # Report progress right away instead of after the delay.
    print("Already got {} results".format(len(all_results)))
    # Pause between requests to avoid hammering the server; there is nothing
    # left to wait for after the last page.
    if i != page_offsets[-1]:
        time.sleep(40)

# Build the table with a fixed schema so the 'Authors' column exists even when
# the crawl returned nothing (e.g. every request was rejected); otherwise the
# extraction below fails with KeyError. dtype=object keeps the .str accessor
# usable when a column contains no strings at all.
df = pd.DataFrame(all_results, columns=["Title", "Authors"], dtype=object)

# Split the byline at the publication year: group 1 is the first standalone
# four-digit number, group 2 is everything that follows it on the line.
# The word boundaries stop the match from starting inside a longer number.
date_publisher_pattern = r'\b(\d{4})\b(.*)'
# Whitespace, commas and hyphens/dashes left dangling at either end of a piece
# once the byline has been cut around the year.
edge_separator_pattern = r'^[\s,\-\u2013\u2014]+|[\s,\-\u2013\u2014]+$'

byline = df['Authors']
extracted = byline.str.extract(date_publisher_pattern)

# Assign the extracted results to new columns
df['Date'] = extracted[0]  # four-digit year
# Text after the year, without the separator that preceded it
df['Publisher'] = extracted[1].str.replace(edge_separator_pattern, '', regex=True)

# Keep only the part of the byline that precedes the year in the Authors
# column; rows without a year keep their text unchanged apart from trimming.
df['Authors'] = (
    byline.str.replace(date_publisher_pattern, '', regex=True)
    .str.replace(edge_separator_pattern, '', regex=True)
)

# Write the DataFrame to an Excel file
df.to_excel("google_scholar_results_with_date_publishers.xlsx", index=False)

# Output all results
for result in all_results:
    print(result)