In [3]:
"""Retrieve the first 10 results from each search"""
import csv
import requests
from bs4 import BeautifulSoup

phrase_list1 = ['machine learning', 'artificial intelligence', 'deep learning', 'reinforcement learning', 'neural network']
phrase_list2 = ['venous thrombosis', 'venous thromboembolism', 'pulmonary embolism', 'deep vein thrombosis']

# Seconds to wait for a response before giving up, so a stalled connection
# cannot hang the whole run.
REQUEST_TIMEOUT = 30


def _element_text(parent, tag, css_class):
    """Return the stripped text of the first matching child of *parent*, or ''."""
    element = parent.find(tag, {'class': css_class})
    return element.text.strip() if element is not None else ''


# Create a CSV file to store the results; one row is written per search hit.
with open('scholar_results.csv', 'w', newline='', encoding='utf-8') as csvfile:
    writer = csv.writer(csvfile)
    writer.writerow(['Query1', 'Query2', 'Title', 'Authors', 'Year', 'Abstract', 'Citations', 'URL'])

    # Query every (phrase1, phrase2) combination of the two lists.
    for phrase1 in phrase_list1:
        for phrase2 in phrase_list2:
            # Both phrases are quoted in the query; '+' separates them.
            search_query = f'https://scholar.google.com/scholar?q="{phrase1}"+"{phrase2}"'

            # NOTE(review): the cell header says "first 10 results", but this
            # loop keeps requesting pages until one comes back without any
            # result items -- confirm which behaviour is intended.
            page_number = 0
            while True:
                # The `start` offset advances by 10 per page.
                page_url = f"{search_query}&start={page_number * 10}"
                response = requests.get(page_url, timeout=REQUEST_TIMEOUT)

                # A non-success status would otherwise look like "no more
                # results"; report it so a truncated CSV is not mistaken for
                # a complete one.
                if not response.ok:
                    print(f"Request for {phrase1!r} + {phrase2!r} failed with "
                          f"HTTP {response.status_code}; skipping remaining pages")
                    break

                soup = BeautifulSoup(response.content, 'html.parser')
                search_results = soup.find_all('div', {'class': 'gs_ri'})

                # A page without result items ends the pagination for this query.
                if not search_results:
                    break

                for result in search_results:
                    title_element = result.find('h3', {'class': 'gs_rt'})
                    title = title_element.text.strip() if title_element is not None else ''

                    # The link is looked up inside the title heading; guard
                    # against a result that has no heading at all.
                    link = title_element.find('a') if title_element is not None else None
                    url = link.get('href', '') if link is not None else ''

                    # Use the loop variables for the query columns: this cell
                    # never defines `query1`/`query2`.
                    writer.writerow([
                        phrase1,
                        phrase2,
                        title,
                        _element_text(result, 'div', 'gs_a'),
                        _element_text(result, 'span', 'gs_age'),
                        _element_text(result, 'div', 'gs_rs'),
                        _element_text(result, 'div', 'gs_fl'),
                        url,
                    ])

                page_number += 1

print("Search results exported to scholar_results.csv")

Search results exported to scholar_results.csv


In [1]:
"""Retrieve all the results"""
import csv
import requests
from bs4 import BeautifulSoup

# Lists of exact phrases
phrase_list1 = ['deep learning']
phrase_list2 = ['deep vein thrombosis']

# Seconds to wait for a response before giving up, so a stalled connection
# cannot hang the whole run.
REQUEST_TIMEOUT = 30


def _element_text(parent, tag, css_class):
    """Return the stripped text of the first matching child of *parent*, or ''."""
    element = parent.find(tag, {'class': css_class})
    return element.text.strip() if element is not None else ''


# Create a CSV file to store the results; one row is written per search hit.
with open('scholar_results.csv', 'w', newline='', encoding='utf-8') as csvfile:
    writer = csv.writer(csvfile)
    writer.writerow(['Query1', 'Query2', 'Title', 'Authors', 'Year', 'Abstract', 'Citations', 'URL'])

    # Query every (phrase1, phrase2) combination of the two lists.
    for phrase1 in phrase_list1:
        for phrase2 in phrase_list2:
            page = 0  # Current page number; the `start` offset is page * 10.
            while True:
                # Both phrases are quoted and separated by '+', so they are
                # sent as two distinct phrases rather than run together.
                search_query = (
                    f'https://scholar.google.com/scholar?start={page * 10}'
                    f'&q="{phrase1}"+"{phrase2}"&hl=en&as_sdt=0,5'
                )
                response = requests.get(search_query, timeout=REQUEST_TIMEOUT)

                # A non-success status would otherwise look like "no more
                # results"; report it so a truncated CSV is not mistaken for
                # a complete one.
                if not response.ok:
                    print(f"Request for {phrase1!r} + {phrase2!r} failed with "
                          f"HTTP {response.status_code}; skipping remaining pages")
                    break

                soup = BeautifulSoup(response.content, 'html.parser')
                search_results = soup.find_all('div', {'class': 'gs_ri'})

                # A page without result items ends the pagination for this query.
                if not search_results:
                    break

                for result in search_results:
                    title_element = result.find('h3', {'class': 'gs_rt'})
                    title = title_element.text.strip() if title_element is not None else ''

                    # The link is looked up inside the title heading; guard
                    # against a result that has no heading at all.
                    link = title_element.find('a') if title_element is not None else None
                    url = link.get('href', '') if link is not None else ''

                    writer.writerow([
                        phrase1,
                        phrase2,
                        title,
                        _element_text(result, 'div', 'gs_a'),
                        _element_text(result, 'span', 'gs_age'),
                        _element_text(result, 'div', 'gs_rs'),
                        _element_text(result, 'div', 'gs_fl'),
                        url,
                    ])

                page += 1

print("Search results exported to scholar_results.csv")

Search results exported to scholar_results.csv
