In [1]:
import os
# Route all HTTP(S) traffic of this kernel through the local proxy.
# The scheme is spelled out because proxy URLs without one are not handled
# consistently across requests/urllib3 versions. The proxy itself speaks plain
# HTTP for both variables (HTTPS targets are tunnelled through it).
os.environ["http_proxy"] = "http://127.0.0.1:7890"
os.environ["https_proxy"] = "http://127.0.0.1:7890"

In [2]:
import requests
from bs4 import BeautifulSoup
import time
import pandas as pd

In [3]:
from urllib.parse import urljoin

def fetch_google_scholar_results(inquiry, as_ylo, start, hl="zh-CN", max_retries=3, timeout=30):
    """Fetch one page of Google Scholar search results and parse every entry.

    Args:
        inquiry: Search text, sent as the ``q`` query parameter.
        as_ylo: Earliest publication year, sent as ``as_ylo``.
        start: Offset of the first result to return, sent as ``start``.
        hl: Interface language code, sent as ``hl``.
        max_retries: How many times to repeat the search request after an
            HTTP 429 answer before giving up.
        timeout: Timeout in seconds applied to every HTTP request.

    Returns:
        A list with one dict per result entry found on the page. A dict holds
        whichever of the keys "Title", "href", "basic_info",
        "concise_abstract" and "full_abstract" could be extracted. The list is
        empty when the search request does not end with status 200.

    Raises:
        requests.exceptions.RequestException: If the search request itself
            fails (connection error, timeout, ...). Failures of the secondary
            per-entry request are only printed.
    """
    base_url = "https://scholar.google.com/scholar"
    query_params = {
        "q": inquiry,
        "hl": hl,
        "as_sdt": "0,5",
        "as_ylo": as_ylo,
        "start": start
    }

    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36"
    }

    attempt = 0
    while True:
        response = requests.get(base_url, params=query_params, headers=headers, timeout=timeout)
        if response.status_code != 429 or attempt >= max_retries:
            break
        # Honour the server's Retry-After; without one, back off 1s, 2s, 4s, ...
        retry_after = _scholar_retry_delay(response.headers.get("Retry-After"), 2 ** attempt)
        print(f"Too many requests. Retrying after {retry_after} seconds...")
        time.sleep(retry_after)
        attempt += 1

    results = []
    if response.status_code == 200:
        soup = BeautifulSoup(response.content, "html.parser")
        for div in soup.find_all("div", class_="gs_r gs_or gs_scl"):
            results.append(_parse_scholar_result(div, headers, timeout))
    elif response.status_code == 429:
        print(f"Too many requests. Giving up after {max_retries} retries.")
    else:
        print("Error", response.status_code)
    return results


def _scholar_retry_delay(header_value, default):
    """Return a Retry-After value given as whole seconds, else ``default``."""
    try:
        return max(0, int(header_value))
    except (TypeError, ValueError):
        # Header missing, or given in the HTTP-date form.
        return default


def _parse_scholar_result(div, headers, timeout):
    """Extract title, link, byline, snippet and full abstract from one result div."""
    result = {}

    h3 = div.find("h3")
    a = h3.find("a") if h3 else None
    if a:
        result["Title"] = a.get_text()
        result["href"] = a.get('href')

    gs_a = div.find("div", class_="gs_a")
    if gs_a:
        result["basic_info"] = gs_a.get_text()

    gs_rs = div.find("div", class_="gs_rs")
    if gs_rs:
        result["concise_abstract"] = gs_rs.get_text()

    gs_or_nvi = div.find("a", class_="gs_or_nvi")
    if gs_or_nvi:
        # A failed secondary fetch must not discard the fields collected above,
        # so the helper reports the problem and simply yields None.
        full_abstract = _fetch_snapshot_abstract(gs_or_nvi.get('href'), headers, timeout)
        if full_abstract is not None:
            result["full_abstract"] = full_abstract

    return result


def _fetch_snapshot_abstract(snapshot_href, headers, timeout):
    """Return the text of the ``articleAbstract`` div of the linked page, or None."""
    if not snapshot_href or snapshot_href.lower().startswith("javascript:"):
        return None

    # Relative hrefs make requests raise MissingSchema; urljoin resolves them
    # against the Scholar host and leaves absolute URLs untouched.
    snapshot_url = urljoin("https://scholar.google.com", snapshot_href)
    try:
        second_response = requests.get(snapshot_url, headers=headers, timeout=timeout)
        second_response.raise_for_status()
    except requests.exceptions.RequestException as e:
        print(f"Error occurred while fetching secondary URL: {e}")
        return None

    second_soup = BeautifulSoup(second_response.content, "html.parser")
    article_abstract_div = second_soup.find("div", id="articleAbstract")
    if article_abstract_div is None:
        return None
    return article_abstract_div.get_text()

In [4]:
# Search settings.
# NOTE(review): the query has no separators between the words - confirm that
# "educationalpsychologydefinition" is really the intended search text.
q = "educationalpsychologydefinition"
as_ylo = 2022

# Result offsets to request. Scholar returns 10 results per page, so the step
# is 10; the stop value is the exclusive upper bound of the offset
# (10 -> first page only, 110 -> the first eleven pages).
page_starts = range(0, 10, 10)

# Store results for all pages
all_results = []
for page_number, i in enumerate(page_starts):
    if page_number > 0:
        # Pause between consecutive requests only; waiting after the last
        # page would just delay the cell.
        time.sleep(40)
    results = fetch_google_scholar_results(q, as_ylo, i)
    all_results.extend(results)
    print("Already got {} results".format(len(all_results)))

df = pd.DataFrame(all_results)

# # Use regular expressions to extract the date and the content after the date and save it as a new column
# date_publisher_pattern = r'(\d{4})(.*)'  # Add a capture group, with the date and what comes after the date in parentheses
# extracted = df['Authors'].str.extract(date_publisher_pattern)

# # Assign the extracted results to a new column
# df['Date'] = extracted[0]  # date column
# df['Publisher'] = extracted[1].str.strip()  # Remove first and last Spaces for content after the date and save as publisher column

# # Delete the date and publisher in the Authors column
# df['Authors'] = df['Authors'].str.replace(date_publisher_pattern, '', regex=True).str.strip()

# # Write the DataFrame to an Excel file
# df.to_excel("google_scholar_results_with_date_publishers.xlsx", index=False)

# # Output all results
# for result in all_results:
#     print(result)

Too many requests. Retrying after 1 seconds...
Already got 0 results


In [5]:
# Preview the first five rows of the scraped results.
df.head(5)

Unnamed: 0,basic_info,concise_abstract
0,SNS Page - Language,


In [None]:
import requests
import time

def make_request_with_retry(url, max_retries=5, timeout=30):
    """GET ``url``, repeating the request while the server answers HTTP 429.

    Args:
        url: Address to request.
        max_retries: Maximum number of requests to send.
        timeout: Timeout in seconds passed to every request.

    Returns:
        The first response whose status code is not 429.

    Raises:
        Exception: With the message "Max retries exceeded" when every attempt
            was answered with 429 (or ``max_retries`` is not positive).
    """
    for attempt in range(1, max_retries + 1):
        response = requests.get(url, timeout=timeout)
        if response.status_code != 429:
            return response
        if attempt == max_retries:
            # No attempt left, so waiting again would only delay the error.
            break
        retry_after = _retry_delay_from_header(response.headers.get("Retry-After"))
        print(f"Too many requests. Retrying after {retry_after} seconds...")
        time.sleep(retry_after)
    raise Exception("Max retries exceeded")


def _retry_delay_from_header(header_value, default=1):
    """Turn a Retry-After header value into a non-negative number of seconds."""
    import math
    from datetime import datetime, timezone
    from email.utils import parsedate_to_datetime

    if header_value is None:
        return default
    text = str(header_value).strip()

    # Form 1: delay in whole seconds.
    try:
        return max(0, int(text))
    except ValueError:
        pass

    # Form 2: an HTTP-date naming the moment at which to retry.
    try:
        retry_at = parsedate_to_datetime(text)
    except (TypeError, ValueError):
        return default
    if retry_at is None:
        return default
    if retry_at.tzinfo is None:
        retry_at = retry_at.replace(tzinfo=timezone.utc)
    remaining = (retry_at - datetime.now(timezone.utc)).total_seconds()
    return max(0, math.ceil(remaining))

# Demo: request a placeholder URL through the retry helper and print the raw
# response body (bytes).
url = "http://example.com"
response = make_request_with_retry(url)
print(response.content)

# 数据持久化

In [None]:
import pandas as pd
import os

# Path of the CSV file used as persistent storage.
# NOTE(review): "googel" looks like a typo for "google"; the name is left
# unchanged so that a file written by earlier runs is still found.
persist_file = 'data/data_crawled/googel_scholar.csv'
persist_columns = ['Title', 'href', 'basic_info', 'concise_abstract', 'full_abstract']

# Initialise the persistent storage.
if not os.path.exists(persist_file):
    # First run: create the target directory (to_csv does not create missing
    # parent directories) and write an empty table with the expected columns.
    os.makedirs(os.path.dirname(persist_file) or '.', exist_ok=True)
    df_persist = pd.DataFrame(columns=persist_columns)
    df_persist.to_csv(persist_file, index=False)
else:
    # Later runs: load what has been stored so far.
    df_persist = pd.read_csv(persist_file)

#######################################################################################
q = "全球胜任力培养"
as_ylo = 2020

# Store results for all pages. The range walks over result offsets in steps of
# 10 (one Scholar page); the stop value 10 limits the crawl to the first page.
all_results = []
for i in range(0, 10, 10):
    results = fetch_google_scholar_results(q, as_ylo, i)
    all_results.extend(results)
    print("Already got {} results".format(len(all_results)))

# Passing the column list guarantees that 'Title' exists even when nothing (or
# nothing with a title) was scraped, so the dropna below cannot raise KeyError.
df_new_results = pd.DataFrame(all_results, columns=persist_columns)
df_new_results = df_new_results.dropna(subset=['Title'])  # drop rows without a title
#######################################################################################

# Keep only rows whose title is not stored yet, and keep a single copy of any
# title that occurs more than once in the freshly scraped batch.
is_already_stored = df_new_results['Title'].isin(df_persist['Title'])
df_new_unique = df_new_results[~is_already_stored].drop_duplicates(subset=['Title'])

# Update the persistent storage when there is something new.
if not df_new_unique.empty:
    df_persist = pd.concat([df_persist, df_new_unique], ignore_index=True)
    df_persist.to_csv(persist_file, index=False)
    print(f"Added {len(df_new_unique)} new entries to the persistent storage.")
else:
    print("No new entries to add.")

# 打印更新后的持久化存储内容
# print(df_persist)

In [None]:
# Preview the first ten newly scraped rows that have a title.
df_new_results.head(10)

# 总代码

In [None]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import os

# Location of the CSV file that backs the persistent storage.
persist_file = 'google_scholar.csv'

# Load the stored table when the file is already there; otherwise start from
# an empty table with the expected columns and write it out straight away.
if os.path.exists(persist_file):
    df_persist = pd.read_csv(persist_file)
else:
    df_persist = pd.DataFrame(
        columns=['Title', 'href', 'basic_info', 'concise_abstract', 'full_abstract']
    )
    df_persist.to_csv(persist_file, index=False)

In [None]:
# Preview the first three rows of the persistent table.
df_persist.head(3)

In [None]:
def fetch_google_scholar_results(q, as_ylo, start, hl="zh-CN"):
    """Fetch one Google Scholar result page and persist the entries not stored yet.

    Reads and replaces the global ``df_persist`` and rewrites the CSV file
    named by the global ``persist_file`` on every call, then prints a summary.

    Args:
        q: Search text; join several terms with "+".
        as_ylo: Earliest publication year of the articles.
        start: Offset of the first result. Scholar shows 10 papers per page,
            so use a multiple of 10 (0 for the first page).
        hl: Interface language; defaults to Chinese ("zh-CN"), "en" is also
            possible.

    Returns:
        A list of dicts for the entries of this page whose title is not yet in
        the persistent storage. Entries without a title link are included with
        "Title" and "href" set to None (they are not persisted).

    Raises:
        requests.exceptions.RequestException: If the search request itself
            fails. Failures of the secondary per-entry request are only printed.
    """
    global df_persist
    # Imported here because this section's import cell does not provide it.
    from urllib.parse import urljoin

    base_url = "https://scholar.google.com/scholar"
    query_params = {
        "q": q,
        "hl": hl,
        "as_sdt": "0,5",
        "as_ylo": as_ylo,
        "start": start
    }

    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36"
    }
    timeout = 30  # seconds, for every HTTP request made below

    results = []   # everything returned to the caller
    new_rows = []  # the subset that gets appended to the persistent table
    total_articles = 0
    already_exists_count = 0
    title_none_count = 0
    added_count = 0

    # One set lookup per result instead of scanning the whole column each
    # time; it is extended below so duplicates inside a page are caught too.
    known_titles = set(df_persist['Title'].dropna())

    response = requests.get(base_url, params=query_params, headers=headers, timeout=timeout)
    if response.status_code == 200:
        soup = BeautifulSoup(response.content, "html.parser")

        for div in soup.find_all("div", class_="gs_r gs_or gs_scl"):
            total_articles += 1
            result = {}

            # Article title and link.
            h3 = div.find("h3")
            a = h3.find("a") if h3 else None
            if a:
                title = a.get_text()
                if title in known_titles:
                    already_exists_count += 1
                    continue  # already stored: skip this entry entirely
                result["Title"] = title
                result["href"] = a.get('href')
            else:
                title_none_count += 1
                result["Title"] = None
                result["href"] = None

            # Byline: authors, venue, year, site.
            gs_a = div.find("div", class_="gs_a")
            if gs_a:
                result["basic_info"] = gs_a.get_text()

            # Abstract snippet shown on the result page.
            gs_rs = div.find("div", class_="gs_rs")
            if gs_rs:
                result["concise_abstract"] = gs_rs.get_text()

            # Full abstract from the linked snapshot page.
            gs_or_nvi = div.find("a", class_="gs_or_nvi")
            snapshot = gs_or_nvi.get('href') if gs_or_nvi else None
            if snapshot and snapshot != "javascript:void(0)":
                # Resolve relative links; absolute URLs pass through unchanged.
                snapshot_url = urljoin(base_url, snapshot)
                try:
                    second_response = requests.get(snapshot_url, headers=headers, timeout=timeout)
                except requests.exceptions.RequestException as e:
                    # One unreachable page must not abort the whole crawl.
                    print(f"An error occurred while crawling the secondary URL! {e}")
                else:
                    if second_response.status_code == 200:
                        second_soup = BeautifulSoup(second_response.content, "html.parser")
                        article_abstract_div = second_soup.find("div", id="articleAbstract")
                        if article_abstract_div is not None:
                            result["full_abstract"] = article_abstract_div.get_text()
                    else:
                        print("An error occurred while crawling the secondary URL!", second_response.status_code)

            results.append(result)
            if result["Title"] is not None:
                known_titles.add(result["Title"])
                new_rows.append(result)
                added_count += 1
    else:
        print("Error", response.status_code)

    if new_rows:
        # Append all new rows at once. Assigning to .loc[len(df)] row by row
        # is quadratic and can overwrite an existing row when the index has gaps.
        new_frame = pd.DataFrame(new_rows, columns=df_persist.columns)
        if df_persist.empty:
            df_persist = new_frame
        else:
            df_persist = pd.concat([df_persist, new_frame], ignore_index=True)

    # Drop rows without a title and keep the index contiguous.
    df_persist = df_persist.dropna(subset=['Title']).reset_index(drop=True)
    df_persist.to_csv(persist_file, index=False)  # save the updated table
    print(f"Total articles found: {total_articles}")
    print(f"{already_exists_count} entries already existed in the persistent storage.")
    print(f"{title_none_count} articles had no title.")
    print(f"Actually added {added_count} new entries to the persistent storage.")
    return results

In [None]:
# Example call: fetch the first result page (start=0) for this query,
# restricted to articles from 2023 onwards.
results = fetch_google_scholar_results("全球胜任力培养", 2023, 0)