In [8]:
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import NoSuchElementException
import csv
import time

import pandas as pd

In [20]:
from typing import List

In [21]:
def scrape_sbu_solar(
    url: str,
    major_three_letter_code: str,
    wait_time: int = 10,
    headless: bool = False,
) -> "pd.DataFrame | None":
    """Scrape the course table of one major from the catalog page at ``url``.

    The function opens ``url`` in Chrome, clicks the navigation link whose
    text is the first letter of the major code (this click is skipped when
    the code starts with 'A'), clicks the first link whose text contains the
    major code, pauses for the table to render, and converts the last
    ``<tbody>`` containing a "Course Nbr" link into a DataFrame.

    Parameters
    ----------
    url : str
        Address of the catalog page to open.
    major_three_letter_code : str
        Major code to look up (e.g. ``"cme"``). Surrounding whitespace is
        stripped and the code is upper-cased before it is matched against
        link text.
    wait_time : int, default 10
        Timeout in seconds for each explicit wait; half of it (integer
        division) is also slept before the tables are read. Coerced with
        ``int()`` and must be greater than 0.
    headless : bool, default False
        When True, Chrome is started with ``--headless`` so no browser window
        is shown.

    Returns
    -------
    pandas.DataFrame or None
        One row per ``<tr>`` that has ``<td>`` cells, with columns named
        after the table's ``<th>`` texts. ``None`` (with a warning) when no
        ``<tbody>`` on the page contains a "Course Nbr" link.

    Raises
    ------
    ValueError
        If ``wait_time`` is not greater than 0, if the major code is empty,
        or if the selected table is not displayed or has no text.
    selenium.common.exceptions.TimeoutException
        If a navigation link does not become clickable within ``wait_time``.
    """
    import warnings  # local import: keeps this cell self-contained

    # Validate all arguments BEFORE launching a browser so that bad input
    # fails fast and never leaves a stray Chrome process behind.
    wait_time = int(wait_time)
    if wait_time <= 0:
        raise ValueError("Wait time must be greater than 0 seconds.")

    major_code = major_three_letter_code.strip().upper()
    if not major_code:
        raise ValueError("Major code must be a non-empty string.")
    nav_letter = major_code[0]

    # Setup Selenium WebDriver (the default path is unchanged: plain Chrome()).
    if headless:
        chrome_options = webdriver.ChromeOptions()
        chrome_options.add_argument("--headless")
        driver = webdriver.Chrome(options=chrome_options)
    else:
        driver = webdriver.Chrome()

    # Everything that touches the browser lives inside try/finally so the
    # driver is closed on every exit path: success, early return, or error.
    try:
        driver.get(url)
        wait = WebDriverWait(driver, wait_time)

        # NOTE: If the major code starts with 'A', the letter click is skipped
        # (presumably the page opens on the 'A' listing -- confirm).
        if nav_letter != "A":
            wait.until(
                EC.element_to_be_clickable((By.LINK_TEXT, nav_letter))
            ).click()

        # Navigate to the major.
        wait.until(
            EC.element_to_be_clickable((By.PARTIAL_LINK_TEXT, major_code))
        ).click()

        # Give the page's JavaScript time to populate the table.
        time.sleep(wait_time // 2)

        # Keep only the table bodies that contain a "Course Nbr" link.
        # find_elements returns an empty list (instead of raising) when
        # nothing matches, so no exception handling is needed here.
        course_tables = [
            body
            for body in driver.find_elements(By.TAG_NAME, "tbody")
            if body.find_elements(By.PARTIAL_LINK_TEXT, "Course Nbr")
        ]

        if not course_tables:
            warnings.warn(
                f"No course table found for major code {major_code!r}; "
                "returning None.",
                stacklevel=2,
            )
            return None

        table = course_tables[-1]  # the last matching table is the one used

        # Verify table
        if not table.is_displayed():
            raise ValueError("Table is not displayed. Check the URL and major code.")

        if not table.text:
            raise ValueError("Table is empty. Check the URL and major code.")

        # Extract headers
        headers = [header.text for header in table.find_elements(By.TAG_NAME, "th")]

        # Extract rows; rows without any <td> cells (e.g. header rows) are skipped.
        rows = []
        for row in table.find_elements(By.TAG_NAME, "tr"):
            cells = [cell.text for cell in row.find_elements(By.TAG_NAME, "td")]
            if cells:
                rows.append(cells)

        return pd.DataFrame(rows, columns=headers)
    finally:
        driver.quit()

In [22]:
# Catalog-browse page (COMMUNITY_ACCESS.SSS_BROWSE_CATLG) on the stonybrook.edu
# host; passed as the `url` argument to scrape_sbu_solar in the cell below.
url = "https://prod.ps.stonybrook.edu/psc/csprodg/EMPLOYEE/CAMP/c/COMMUNITY_ACCESS.SSS_BROWSE_CATLG.GBL?"

In [25]:
# Scrape the course list for major code 'cme' using a 5-second wait time.
# As the cell's last expression, the returned DataFrame is rendered below.
scrape_sbu_solar(url, 'cme',5)

Unnamed: 0,Course Nbr,Course Title
0,101,Introduction to Chemical and Molecular Enginee...
1,160,Introduction to Nanoscience and Nanotechnology
2,199,Introduction to Undergraduate Research
3,201,Sustainable Energy - Evaluating the Options
4,233,Ethics and Business Practices for Engineers
...,...,...
67,698,CME 698 Practicum in Teaching
68,699,Dissertation Research on Campus
69,700,Dissertation Research off Campus
70,701,Dissertation Research off Campus-International
