### Importing Libraries

In [None]:
import os
from concurrent.futures import ThreadPoolExecutor

import pandas as pd
import requests
from bs4 import BeautifulSoup

### Settings

In [None]:
# Function to build the members-page URL for a given university
def dynamic_url_to_scrape(university_id="Lebanese_University"):
    """Build the ResearchGate members-listing URL for an institution.

    Args:
        university_id: Institution identifier as it appears in the
            ResearchGate URL path (defaults to ``"Lebanese_University"``).

    Returns:
        The members-page URL as a string.
    """
    members_url = f"https://www.researchgate.net/institution/{university_id}/members"
    return members_url

### Proxy Setup

In [None]:
# Proxy API key, read from the SCRAPEOPS_API_KEY environment variable so the
# secret is never stored in (or committed with) the notebook.
# NOTE(review): a real key was previously hard-coded here; treat it as leaked
# and rotate it.
PROXY_API_KEY = os.environ.get("SCRAPEOPS_API_KEY", "")
if not PROXY_API_KEY:
    print("Warning: SCRAPEOPS_API_KEY is not set; proxy requests are expected to fail.")

# Function to make requests with proxy rotation
def get_page_with_proxy(url, timeout=120):
    """Fetch ``url`` through the ScrapeOps proxy endpoint.

    Args:
        url: Target page URL; forwarded to the proxy as the ``url`` query
            parameter alongside ``PROXY_API_KEY``.
        timeout: Seconds to wait for the proxy before giving up. Without a
            timeout a stalled connection would block the calling thread
            indefinitely.

    Returns:
        The ``requests.Response`` of a request that completed without an
        HTTP error status.

    Raises:
        Exception: If the request fails, times out, or returns an HTTP error
            status. The original ``requests`` exception is attached as
            ``__cause__``.
    """
    try:
        response = requests.get(
            url='https://proxy.scrapeops.io/v1/',
            params={
                'api_key': PROXY_API_KEY,
                'url': url,
            },
            timeout=timeout,
        )
        # Turn 4xx/5xx responses into exceptions so callers never parse an error page.
        response.raise_for_status()
        return response
    except requests.exceptions.RequestException as e:
        raise Exception(f"Request failed: {e}") from e

### Scraping functions

In [None]:
# Function to extract maximum page number from pagination
def get_maximuim_page_number(soup):
    """Return the highest page number shown in the pagination control.

    Args:
        soup: Parsed page exposing ``find_all(class_=...)``; each returned
            element must expose ``get_text(strip=True)``.

    Returns:
        The largest integer label among the pagination items, or ``1`` when
        there is no pagination or none of its items is numeric.
    """
    page_numbers = []
    for item in soup.find_all(class_="nova-legacy-c-pagination__item"):
        label = item.get_text(strip=True)
        try:
            page_numbers.append(int(label))
        except ValueError:
            # Skip items whose label is not a number (for example a "next"
            # control or an ellipsis, if the page renders one) instead of
            # letting int() abort the whole scrape.
            continue
    return max(page_numbers, default=1)

# Function to extract doctors divs
def extract_doctors_divs(soup):
    """Return the member entries found on an institution members page.

    Args:
        soup: Parsed page exposing ``find_all(class_=...)``.

    Returns:
        Whatever ``soup.find_all`` yields for the member-entry class string.
    """
    member_entry_class = "nova-legacy-o-stack__item institution-members-list"
    member_entries = soup.find_all(class_=member_entry_class)
    return member_entries

# Function to extract disciplines
def extract_disciplines(div):
    """Join the text of several elements into one comma-separated string.

    Args:
        div: Iterable of elements exposing ``get_text(strip=True)``. Despite
            the name, the caller in this file passes a list of ``<li>`` items.

    Returns:
        The stripped texts joined with ``", "``; an empty string when ``div``
        is empty.
    """
    discipline_names = [item.get_text(strip=True) for item in div]
    return ", ".join(discipline_names)

# Function to extract basic doctor information
def extract_doctor_basic_info(doctor_div):
    """Extract id, name, department and disciplines from one member entry.

    Every lookup is guarded independently, so a missing element only blanks
    its own field instead of discarding the whole entry.

    Args:
        doctor_div: Element for one member, exposing ``find`` and ``find_all``.

    Returns:
        A tuple ``(doctor_id, doctor_name, department, disciplines)``.
        ``doctor_id`` is the last path segment of the first link's ``href``;
        a field whose element is absent is ``None`` (``disciplines`` is an
        empty string when no items are found). On an unexpected error the
        tuple is ``(None, None, None, None)``.
    """
    try:
        doctor_link = doctor_div.find('a', href=True)
        doctor_id = doctor_link['href'].split('/')[-1] if doctor_link is not None else None

        # Look the name link up once and reuse it.
        name_link = doctor_div.find('a', class_="nova-legacy-e-link")
        doctor_name = name_link.get_text(strip=True) if name_link is not None else None

        # Guarded like the other fields: a missing department must not drop the entry.
        department_div = doctor_div.find('div', class_="nova-legacy-e-text nova-legacy-e-text--size-m nova-legacy-e-text--family-sans-serif nova-legacy-e-text--spacing-none nova-legacy-e-text--color-inherit")
        department = department_div.get_text(strip=True) if department_div is not None else None

        discipline_items = doctor_div.find_all('li', class_="nova-legacy-e-list__item nova-legacy-v-person-list-item__info-section-list-item")
        disciplines = extract_disciplines(discipline_items)
        return doctor_id, doctor_name, department, disciplines
    except Exception as e:
        # Best effort: report the problem and let the caller skip this entry.
        print(f"Error extracting doctor info: {e}")
        return None, None, None, None

# Function to fetch doctor's profile page
def extract_doctor_profile_page(doctor_id):
    """Download and parse the ResearchGate profile page of one member.

    Args:
        doctor_id: Profile identifier used as the last URL path segment.

    Returns:
        The profile page parsed with BeautifulSoup's ``html.parser``.

    Raises:
        Exception: Propagated from ``get_page_with_proxy`` when the request fails.
    """
    profile_url = f"https://www.researchgate.net/profile/{doctor_id}"
    profile_response = get_page_with_proxy(profile_url)
    return BeautifulSoup(profile_response.content, 'html.parser')

# Function to extract statistics from profile page
def extract_stat(soup):
    """Read the publication, citation and read counters from a profile page.

    Args:
        soup: Parsed profile page exposing ``find('div', attrs=...)``.

    Returns:
        A tuple ``(publications, citations, reads)`` of stripped text values.
        A counter whose element is missing is reported as ``"0"``; if anything
        raises, all three are ``"0"``.
    """
    stat_test_ids = (
        'publicProfileStatsPublications',
        'publicProfileStatsCitations',
        'publicProfileStatsReads',
    )
    try:
        # Locate all three counters first, then read their text.
        stat_nodes = [soup.find('div', attrs={'data-testid': test_id}) for test_id in stat_test_ids]
        return tuple(node.get_text(strip=True) if node else "0" for node in stat_nodes)
    except Exception as e:
        print(f"Error extracting statistics: {e}")
        return "0", "0", "0"

### Steps functions

In [None]:
def add_data_to_df(df, doctor_id, doctor_name, department, disciplines, publications, citations, reads):
    """Append one doctor record to ``df`` in place.

    The new row is stored under the label ``len(df.index)``, and the values
    are assigned in the order of the parameters, so ``df`` must already have
    seven columns in that order.
    """
    next_label = len(df.index)
    record = [
        doctor_id,
        doctor_name,
        department,
        disciplines,
        publications,
        citations,
        reads,
    ]
    df.loc[next_label] = record

In [None]:
def process_doctor_divs(df, doctor_divs, max_dcotors_per_page):
    """Scrape each member entry and append one row per doctor to ``df``.

    Args:
        df: DataFrame that receives the rows (modified in place through
            ``add_data_to_df``).
        doctor_divs: Sequence of member entries from one listing page.
        max_dcotors_per_page: Maximum number of entries to process, or
            ``None`` to process all of them.

    Entries without a doctor id are skipped. If fetching or parsing a profile
    fails, the error is printed and the doctor is still recorded with ``"0"``
    statistics (the same fallback ``extract_stat`` uses), so one bad profile
    does not discard the rows already collected for the page.
    """
    if max_dcotors_per_page is not None:
        doctor_divs = doctor_divs[:max_dcotors_per_page]
    for doctor_div in doctor_divs:
        doctor_id, doctor_name, department, disciplines = extract_doctor_basic_info(doctor_div)
        if not doctor_id:
            # Without an id there is no profile URL to fetch.
            continue
        try:
            profile_soup = extract_doctor_profile_page(doctor_id)
            publications, citations, reads = extract_stat(profile_soup)
        except Exception as e:
            print(f"Error fetching profile for {doctor_id}: {e}")
            publications, citations, reads = "0", "0", "0"
        add_data_to_df(df, doctor_id, doctor_name, department, disciplines, publications, citations, reads)

In [None]:
def process_page(page_number, url_to_scrape, max_dcotors_per_page):
    """Scrape one members-listing page and write its rows to a CSV file.

    Args:
        page_number: Page index appended to ``url_to_scrape`` as a path segment.
        url_to_scrape: Base members-listing URL.
        max_dcotors_per_page: Per-page limit forwarded to ``process_doctor_divs``.

    The output file is ``research_gate_page_<page_number>.csv`` in the current
    working directory, written without the index column.
    """
    print(f"Scraping page {page_number}...")
    page_url = f"{url_to_scrape}/{page_number}"
    page_response = get_page_with_proxy(page_url)
    page_soup = BeautifulSoup(page_response.content, 'html.parser')
    member_divs = extract_doctors_divs(page_soup)

    # Each page gets its own empty frame, filled in place by process_doctor_divs.
    column_names = [
        "doctor_id",
        "doctor_name",
        "department",
        "disciplines",
        "publications",
        "citations",
        "reads",
    ]
    page_df = pd.DataFrame(columns=column_names)
    process_doctor_divs(page_df, member_divs, max_dcotors_per_page)

    # Persist this page's rows.
    page_df.to_csv(f"research_gate_page_{page_number}.csv", index=False)
    print(f"Page {page_number} data saved to research_gate_page_{page_number}.csv")

### Multi-threading

In [None]:
def process_pages_concurrently(url_to_scrape, max_pages, max_dcotors_per_page):
    """Scrape pages ``1..max_pages`` in parallel on a thread pool.

    Args:
        url_to_scrape: Base members-listing URL passed to ``process_page``.
        max_pages: Last page number to scrape (inclusive).
        max_dcotors_per_page: Per-page limit passed to ``process_page``.

    Raises:
        Exception: Whatever the first failing page raised, in submission order.
    """
    with ThreadPoolExecutor() as pool:
        page_tasks = []
        for page in range(1, max_pages + 1):
            page_tasks.append(pool.submit(process_page, page, url_to_scrape, max_dcotors_per_page))
        # result() blocks until the page finishes and re-raises its exception, if any.
        for task in page_tasks:
            task.result()

### Main function

In [None]:
def scrapping_start(url_to_scrape, pages = None, doctors_by_page = None):
    """Entry point: scrape the member pages of one institution.

    Args:
        url_to_scrape: Base members-listing URL of the institution.
        pages: Maximum number of pages to scrape; ``None`` means every page
            reported by the pagination.
        doctors_by_page: Maximum number of doctors per page; ``None`` means all.

    Any exception raised while scraping is caught and reported with a
    ``Scraping failed: ...`` message instead of propagating.
    """
    try:
        first_page = get_page_with_proxy(url_to_scrape)
        soup = BeautifulSoup(first_page.content, 'html.parser')
        available_pages = get_maximuim_page_number(soup)
        # Never ask for more pages than the pagination reports.
        if pages is None:
            page_limit = available_pages
        else:
            page_limit = min(available_pages, pages)
        # Process each listing page of the institution that contains members.
        process_pages_concurrently(url_to_scrape, page_limit, doctors_by_page)
    except Exception as e:
        print(f"Scraping failed: {e}")
### Scraping

In [None]:
# Scrape at most 2 listing pages and at most 7 doctors per page.
url_to_scrape = dynamic_url_to_scrape("Lebanese_International_University")
scrapping_start(url_to_scrape, pages=2, doctors_by_page=7)

Scraping page 1...Scraping page 2...

Page 1 data saved to research_gate_page_1.csv
Page 2 data saved to research_gate_page_2.csv
