### Dependencies

In [None]:
import os
import re
from concurrent.futures import ThreadPoolExecutor

import pandas as pd
import requests
from bs4 import BeautifulSoup

### Settings

In [None]:
# Functions that build the Google Scholar listing URLs for a given university
def dynamic_url_to_scrape(university_id = "9671583371665794735"):
    """Return the URL of the first Google Scholar listing page for an organisation.

    Args:
        university_id: Value placed in the ``org`` query parameter.

    Returns:
        The listing URL with ``after_author=no-author`` and ``astart=0``.
    """
    first_page_template = (
        "https://scholar.google.com/citations"
        "?view_op=view_org&hl=en&org={org}&after_author=no-author&astart=0"
    )
    return first_page_template.format(org=university_id)

def next_page_url(author_id, university_id = "9671583371665794735"):
    """Return the listing URL that continues after a given pagination cursor.

    Args:
        author_id: Value placed in the ``after_author`` query parameter.
        university_id: Value placed in the ``org`` query parameter.

    Returns:
        The listing URL for that organisation and cursor (``astart`` is always 0).
    """
    continuation_template = (
        "https://scholar.google.com/citations"
        "?view_op=view_org&hl=en&org={org}&after_author={cursor}&astart=0"
    )
    return continuation_template.format(org=university_id, cursor=author_id)

### Proxy setup



In [None]:
# Proxy API key. Prefer supplying it through the SCRAPEOPS_API_KEY environment
# variable so the credential does not have to live in the notebook.
# SECURITY: the literal fallback below is a credential committed to source;
# rotate it and remove the fallback once the environment variable is in place.
PROXY_API_KEY = os.environ.get('SCRAPEOPS_API_KEY') or 'c7b595f3-3772-4ef7-bf62-6f3953308c72'

# Function to make requests with proxy rotation
def get_page_with_proxy(url, timeout=120):
    """Fetch ``url`` through the ScrapeOps proxy endpoint.

    Args:
        url: Target page URL, forwarded to the proxy as the ``url`` query parameter.
        timeout: Seconds passed to ``requests.get`` as its ``timeout`` argument.

    Returns:
        The ``requests`` response object when the proxy answers with a
        non-error HTTP status.

    Raises:
        Exception: If the request fails or the proxy returns an error status.
            The underlying ``requests`` exception is attached as ``__cause__``.
    """
    try:
        response = requests.get(
            url='https://proxy.scrapeops.io/v1/',
            params={
                'api_key': PROXY_API_KEY,
                'url': url,
            },
            # Without a timeout, requests waits indefinitely on a stalled connection.
            timeout=timeout,
        )
        response.raise_for_status()
        return response
    except requests.exceptions.RequestException as e:
        # Chain the original error so the traceback keeps the HTTP/connection details.
        raise Exception(f"Request failed: {e}") from e

### Variables

In [None]:
# Shared crawl state, read and mutated by the traversal functions below.
# Being module-level, both sets keep their contents between calls in the same kernel.
# URLs of listing pages that have already been fetched; its size is also used
# for the page limit check and for naming the per-page CSV file.
visited_links = set()
# `after_author` pagination cursors that have already been followed.
visited_after_authors = set()

### Scraping Utils

In [None]:
def matching_rule_to_extract_after_author(onclick_text):
    """Extract the ``after_author`` cursor from a button's ``onclick`` script.

    The script is expected to contain ``window.location='<url>'`` where the
    URL is written with JavaScript hex escapes (for example ``\\x3d`` for
    ``=`` and ``\\x26`` for ``&``).

    Args:
        onclick_text: The raw ``onclick`` attribute value; may be empty or None.

    Returns:
        The value of the ``after_author`` query parameter, or None when the
        input is empty, has no ``window.location`` assignment, or the URL has
        no ``after_author`` parameter.
    """
    if not onclick_text:
        return None

    target = re.search(r"window\.location='([^']+)'", onclick_text)
    if not target:
        return None

    # Decode every \xNN escape in a single pass, not only \x3d and \x26, so a
    # cursor containing any other escaped character is returned in its real form.
    url = re.sub(
        r'\\x([0-9a-fA-F]{2})',
        lambda escape: chr(int(escape.group(1), 16)),
        target.group(1),
    )

    cursor = re.search(r'after_author=([^&]+)', url)
    return cursor.group(1) if cursor else None

### Scraping functions

In [None]:
def get_next_page(soup, uni_id = None):
    """Work out the next listing page from the parsed page's "Next" button.

    Args:
        soup: Parsed page exposing ``find``; searched for a ``button`` whose
            ``aria-label`` is ``Next``.
        uni_id: Organisation id used to build the next URL; when falsy no
            lookup is attempted.

    Returns:
        ``(next_url, after_author)`` when the button carries an ``onclick``
        script from which a cursor can be extracted, otherwise ``(None, None)``.
    """
    nothing_found = (None, None)
    if not uni_id:
        return nothing_found

    button = soup.find('button', {'aria-label': 'Next'})
    if not button:
        return nothing_found

    handler = button.get('onclick')
    if not handler:
        return nothing_found

    cursor = matching_rule_to_extract_after_author(handler)
    if not cursor:
        return nothing_found

    return next_page_url(cursor, uni_id), cursor

In [None]:
# MULTI-THREADING
def process_page(soup):
    """Parse every profile card on a listing page and write them to a CSV file.

    Looks up the element with id ``gsc_sa_ccl``; when it is missing a message
    is printed and nothing is written. Otherwise each ``div.gsc_1usr`` inside
    it is passed to ``process_doctor`` on a thread pool and the resulting rows
    are saved to ``google_scolar_<n>.csv``, where ``<n>`` is the current size
    of ``visited_links``.

    Args:
        soup: Parsed listing page exposing ``find``.

    Returns:
        None.
    """
    container = soup.find(id="gsc_sa_ccl")
    if not container:
        print("Doctor container not found on the page.")
        return

    profile_cards = container.find_all("div", class_="gsc_1usr")

    # executor.map keeps the rows in the same order as the cards on the page.
    with ThreadPoolExecutor() as pool:
        parsed_rows = list(pool.map(process_doctor, profile_cards))

    # Drop falsy results before building the table.
    rows = list(filter(None, parsed_rows))

    column_names = ["doctor_id", "doctor_name", "department", "disciplines", "publications", "citations", "reads"]
    page_number = len(visited_links)
    table = pd.DataFrame(rows, columns=column_names)
    table.to_csv(f"google_scolar_{page_number}.csv", index=False)

In [None]:
def extract_doctor_id(doctor_div):
    """Return the ``user`` query parameter from a profile card's photo link.

    Args:
        doctor_div: Profile card element exposing ``find``; searched for an
            ``a`` tag with class ``gs_ai_pho``.

    Returns:
        The text following ``user=`` (up to the next ``&``) in the link's
        ``href``, or ``"No ID"`` when the link, its ``href`` or the parameter
        is missing.
    """
    fallback = "No ID"

    anchor = doctor_div.find("a", class_="gs_ai_pho")
    if not (anchor and anchor.has_attr("href")):
        return fallback

    user_param = re.search(r"user=([^&]+)", anchor["href"])
    return user_param.group(1) if user_param else fallback

In [None]:
def extract_doctor_citations(doctor_div):
    """Return the first integer found in a profile card's ``gs_ai_cby`` div.

    Args:
        doctor_div: Profile card element exposing ``find``.

    Returns:
        The first run of digits in the div's stripped text as an ``int``, or
        ``0`` when the div is missing or contains no digits.
    """
    cited_by = doctor_div.find("div", class_="gs_ai_cby")
    if not cited_by:
        return 0

    digits = re.search(r'\d+', cited_by.text.strip())
    return int(digits.group(0)) if digits else 0

In [None]:
def extract_doctor_name(doctor_div):
    """Return the stripped text of a profile card's ``h3.gs_ai_name`` heading.

    Args:
        doctor_div: Profile card element exposing ``find``.

    Returns:
        The heading text without surrounding whitespace, or ``"No Name"`` when
        the heading is missing.
    """
    heading = doctor_div.find("h3", class_="gs_ai_name")
    return heading.text.strip() if heading else "No Name"

In [None]:
def extract_disciplines(doctor_div):
    """Return the link texts inside a profile card's ``gs_ai_int`` div.

    Args:
        doctor_div: Profile card element exposing ``find``.

    Returns:
        The stripped text of every ``a`` tag in the div joined with ``", "``,
        or an empty string when the div is missing or holds no links.
    """
    interests = doctor_div.find("div", class_="gs_ai_int")
    if not interests:
        return ""

    # Joining an empty sequence yields "", which covers the no-links case.
    return ", ".join(anchor.text.strip() for anchor in interests.find_all("a"))

In [None]:
def process_doctor(doctor_div):
    """Turn one profile card into a row for the output table.

    Args:
        doctor_div: Profile card element handed to the ``extract_*`` helpers.

    Returns:
        A 7-tuple ``(doctor_id, doctor_name, department, disciplines,
        publications, citations, reads)``. ``department`` is always the
        placeholder string and ``publications`` / ``reads`` are always 0,
        since this function does not extract them.
    """
    profile_id = extract_doctor_id(doctor_div)
    citation_count = extract_doctor_citations(doctor_div)
    full_name = extract_doctor_name(doctor_div)
    interests = extract_disciplines(doctor_div)
    return (profile_id, full_name, "Unkown Yet", interests, 0, citation_count, 0)

### Pipeline Utils

In [None]:
def finish_scrapping(next_after_author, next_url):
    """Tell whether the proposed next page has already been seen.

    Args:
        next_after_author: Pagination cursor of the next page (may be falsy).
        next_url: URL of the next page (may be falsy).

    Returns:
        ``True`` when the cursor is in ``visited_after_authors`` or the URL is
        in ``visited_links``. Otherwise a falsy value: ``False``, or
        ``next_url`` itself when that is falsy.
    """
    cursor_seen = next_after_author and next_after_author in visited_after_authors
    if cursor_seen:
        return cursor_seen
    return next_url and next_url in visited_links

### Steps functions

In [None]:
def traverse_single_page(url, university_id):
    """Fetch one listing page, save its profiles and locate the next page.

    Args:
        url: Listing page URL to fetch through the proxy.
        university_id: Organisation id forwarded to ``get_next_page``.

    Returns:
        Whatever ``get_next_page`` returns for the parsed page, i.e. a
        ``(next_url, after_author)`` pair.
    """
    print(f"Visiting {url} *_*")
    page = get_page_with_proxy(url)

    # Record the URL before processing: process_page names its CSV file after
    # the current size of visited_links.
    visited_links.add(url)

    parsed = BeautifulSoup(page.text, 'html.parser')
    process_page(parsed)
    return get_next_page(parsed, university_id)

def traverse_pages(url, university_id, MAX_ITERATIONS=None):
    """Walk the listing pages starting at ``url`` until there is nothing new.

    The walk stops when any of the following holds:

    * ``MAX_ITERATIONS`` is set and ``visited_links`` already holds that many
      URLs (the set is module-level, so URLs from earlier runs in the same
      kernel count too);
    * the page just scraped has no next page;
    * ``finish_scrapping`` reports the next cursor or URL as already seen.

    Args:
        url: First listing page URL to fetch.
        university_id: Organisation id forwarded to ``traverse_single_page``.
        MAX_ITERATIONS: Optional cap on the size of ``visited_links``.

    Returns:
        None.
    """
    # Iterate rather than recurse so the number of pages is not bounded by
    # Python's recursion limit.
    current_url = url
    while True:
        if MAX_ITERATIONS and len(visited_links) >= MAX_ITERATIONS:
            print(f"done ^_^ Scraped pages = {MAX_ITERATIONS}")
            return

        next_url, next_after_author = traverse_single_page(current_url, university_id)

        # A missing next URL means the last page was reached; without this
        # check the crawler would go on to request the URL ``None``.
        if not next_url or finish_scrapping(next_after_author, next_url):
            print("done ^_^")
            return

        visited_after_authors.add(next_after_author)
        current_url = next_url

### Main function

In [None]:
def start_scraping(university_id = "9671583371665794735", pages = None):
    """Scrape the listing pages of one organisation.

    Args:
        university_id: Organisation id used to build the first listing URL.
        pages: Optional page cap handed to ``traverse_pages``; a falsy value
            means no cap.

    Returns:
        None.
    """
    first_page = dynamic_url_to_scrape(university_id)
    # A falsy cap is normalised to None, which is traverse_pages' own default.
    page_cap = pages if pages else None
    traverse_pages(first_page, university_id, page_cap)

### Scraping

In [None]:
# Run the scraper for one organisation, capped at two listing pages.
# NOTE: the cap is compared against the module-level visited_links set, so
# re-running this cell in the same kernel without resetting that set stops
# immediately.
university_id = "9671583371665794735"
start_scraping(university_id,pages=2)
print("thank you !")

Visiting https://scholar.google.com/citations?view_op=view_org&hl=en&org=9671583371665794735&after_author=no-author&astart=0 *_*
Visiting https://scholar.google.com/citations?view_op=view_org&hl=en&org=9671583371665794735&after_author=30YeAO_v__8J&astart=0 *_*
done ^_^ Scraped pages = 2
thank you !
