In [1]:
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import NoSuchElementException
import time

import pandas as pd

In [2]:
from typing import List, Optional, Tuple

In [3]:
def _get_course_components(driver: webdriver.Remote, max_components: int = 10) -> Tuple[str, ...]:
    """Collect the course-component labels shown on the current course page.

    Probes the element ids ``DERIVED_CRSECAT_DESCR$0``, ``DERIVED_CRSECAT_DESCR$1``,
    ... in order and stops at the first index that is not on the page.

    Args:
        driver: Selenium driver currently showing a course detail page.
        max_components: Upper bound on how many indexed ids are probed.

    Returns:
        The text of every element found, in index order. Empty tuple if the
        first id is already missing.
    """
    components: List[str] = []
    for num in range(max_components):
        try:
            element = driver.find_element(By.ID, f"DERIVED_CRSECAT_DESCR${num}")
        except NoSuchElementException:
            # First gap in the numbering marks the end of the component list.
            break
        components.append(element.text)

    # Return unconditionally: previously, when every probed index existed the
    # loop ran to completion and the function implicitly returned None.
    return tuple(components)

In [4]:
# Columns appended to the course table, in the order they are inserted.
_COURSE_DETAIL_COLUMNS: Tuple[str, ...] = (
    "Career",
    "Units",
    "Grading Basis",
    "Enrollment Requirement",
    "Course Components",
    "Academic Group",
    "Academic Organization",
    "Description",
)


def _text_or_empty(driver: webdriver.Remote, element_id: str) -> str:
    """Return the text of the element with id ``element_id``, or "" if it is absent."""
    try:
        return driver.find_element(By.ID, element_id).text
    except NoSuchElementException:
        return ""


def _scrape_course_details(driver: webdriver.Remote) -> dict:
    """Read the detail fields of the course page the driver is currently on.

    Keys match ``_COURSE_DETAIL_COLUMNS``; any field whose element is missing
    is returned as an empty string (or an empty tuple for the components).
    """
    return {
        "Career": _text_or_empty(driver, "win0divSSR_CRSE_OFF_VW_ACAD_CAREER$0"),
        "Units": _text_or_empty(driver, "DERIVED_CRSECAT_UNITS_RANGE$0"),
        "Grading Basis": _text_or_empty(driver, "win0divSSR_CRSE_OFF_VW_GRADING_BASIS$0"),
        # Enrollment requirements (pre-requisites)
        "Enrollment Requirement": _text_or_empty(driver, "DERIVED_CRSECAT_DESCR254A$0"),
        "Course Components": _get_course_components(driver),
        "Academic Group": _text_or_empty(driver, "ACAD_GROUP_TBL_DESCR$0"),
        "Academic Organization": _text_or_empty(driver, "win0divACAD_ORG_TBL_DESCR$0"),
        "Description": _text_or_empty(driver, "SSR_CRSE_OFF_VW_DESCRLONG$0"),
    }


def scrape_sbu_solar(url: str, major_three_letter_code: str, wait_time: int = 10, headless: bool = True, verbose: bool = False) -> Optional[pd.DataFrame]:
    """Scrape the course list of one major, plus per-course details, into a DataFrame.

    Opens ``url`` in Chrome, navigates to the major via its first letter and its
    code, reads the course table, then visits every course link in that table to
    collect the fields listed in ``_COURSE_DETAIL_COLUMNS``.

    Args:
        url: Page to open first; it must expose the letter navigation links.
        major_three_letter_code: Major code, case-insensitive (e.g. ``"cse"``).
        wait_time: Timeout in seconds for every explicit wait. Half of it
            (integer division) is also slept before the course table is read.
        headless: Run Chrome without a visible window when True.
        verbose: Print progress for each course when True.

    Returns:
        The course table with the extra detail columns appended, or ``None``
        when no table containing a "Course Nbr" link is found.

    Raises:
        ValueError: If ``wait_time`` is not positive, the major code is empty,
            or the course table is hidden or has no text.
        selenium.common.exceptions.TimeoutException: Propagated from
            ``WebDriverWait`` when an awaited link or element never appears.
    """
    # Validate everything before a browser is launched so bad input cannot leak one.
    wait_time = int(wait_time)
    if wait_time <= 0:
        raise ValueError("Wait time must be greater than 0 seconds.")

    major_three_letter_code = major_three_letter_code.upper()  # Ensure major code is uppercase
    if not major_three_letter_code:
        raise ValueError("Major code must not be empty.")
    nav_letter: str = major_three_letter_code[0]  # Page navigation letter

    # Headless option
    options = None
    if headless:
        options = webdriver.ChromeOptions()
        options.add_argument("headless")

    # Setup Selenium WebDriver
    driver = webdriver.Chrome(options=options)
    try:
        driver.get(url)

        # NOTE: If Major ID starts with 'A', skip the letter click.
        if nav_letter != 'A':
            WebDriverWait(driver, wait_time).until(
                EC.element_to_be_clickable((By.LINK_TEXT, nav_letter))
            ).click()

        # Navigate to major
        WebDriverWait(driver, wait_time).until(
            EC.element_to_be_clickable((By.PARTIAL_LINK_TEXT, major_three_letter_code))
        ).click()

        # Get table data
        time.sleep(wait_time//2)  # Time to wait for javascript to load the table.

        # Keep only the tables that contain a 'Course Nbr' link.
        course_tables = [
            tbody
            for tbody in driver.find_elements(By.TAG_NAME, "tbody")
            if tbody.find_elements(By.PARTIAL_LINK_TEXT, 'Course Nbr')
        ]

        # TODO: Add error handling or warning if table is empty
        if not course_tables:
            return None
        table = course_tables[-1]  # Get the last table

        # Verify table
        if not table.is_displayed():
            raise ValueError("Table is not displayed. Check the URL and major code.")

        if not table.text:
            raise ValueError("Table is empty. Check the URL and major code.")

        # Extract headers
        headers = [header.text for header in table.find_elements(By.TAG_NAME, "th")]

        # Extract rows, skipping rows without table data cells
        rows = []
        for row in table.find_elements(By.TAG_NAME, "tr"):
            cells = [cell.text for cell in row.find_elements(By.TAG_NAME, "td")]
            if cells:
                rows.append(cells)

        # Create DataFrame
        df = pd.DataFrame(rows, columns=headers)

        course_numbers: List[str] = df['Course Nbr'].tolist()
        details: List[dict] = []

        if verbose:
            print(f"\nScraping course information for {major_three_letter_code}...\n")

        # Get information for each course
        for course in course_numbers:

            if verbose:
                print(f"Processing course: {course}...")

            # Wait for the page to load and click on course number
            WebDriverWait(driver, wait_time).until(
                EC.element_to_be_clickable((By.LINK_TEXT, f"{course}"))
            ).click()

            # Wait until the detail page has rendered. The element is only
            # awaited, not clicked: it is a text container, and clicking an
            # element that is merely present (not necessarily clickable) can fail.
            WebDriverWait(driver, wait_time).until(
                EC.presence_of_element_located((By.ID, "win0divSSR_CRSE_OFF_VW_ACAD_CAREER$0"))
            )

            details.append(_scrape_course_details(driver))

            # Go back to the course list. The full timeout is used here because
            # wait_time//2 is zero when wait_time is 1.
            WebDriverWait(driver, wait_time).until(
                EC.element_to_be_clickable((By.LINK_TEXT, "Return to Browse Course Catalog"))
            ).click()

        # Append the detail columns to the right of the scraped table
        for column in _COURSE_DETAIL_COLUMNS:
            df.insert(len(df.columns), column, [detail[column] for detail in details])

        return df
    finally:
        # Quit the driver and close the browser on every exit path,
        # including the early return and any exception.
        driver.quit()

In [5]:
# Starting page passed to scrape_sbu_solar; the function expects the letter
# navigation links to be reachable from here.
# NOTE(review): judging by the URL this is Stony Brook's public "browse catalog" page — confirm.
url = "https://prod.ps.stonybrook.edu/psc/csprodg/EMPLOYEE/CAMP/c/COMMUNITY_ACCESS.SSS_BROWSE_CATLG.GBL?"

In [6]:
# Scrape all CSE courses; the returned DataFrame is rendered as this cell's output.
# This drives a real Chrome session and opens one detail page per course.
scrape_sbu_solar(url=url, major_three_letter_code="cse", wait_time=10, headless=True, verbose=True)


Scraping course information for CSE...

Processing course: 101...
Processing course: 102...
Processing course: 110...
Processing course: (113)...
Processing course: 114...
Processing course: 130...
Processing course: 150...
Processing course: 160...
Processing course: 161...
Processing course: 190...
Processing course: 191...
Processing course: 192...
Processing course: (213)...
Processing course: 214...
Processing course: 215...
Processing course: 216...
Processing course: 220...
Processing course: 230...
Processing course: 260...
Processing course: 261...
Processing course: 300...
Processing course: 301...
Processing course: 303...
Processing course: 304...
Processing course: 305...
Processing course: 306...
Processing course: 307...
Processing course: 310...
Processing course: 311...
Processing course: 312...
Processing course: 316...
Processing course: 320...
Processing course: 323...
Processing course: 325...
Processing course: 327...
Processing course: 328...
Processing course: 

Unnamed: 0,Course Nbr,Course Title,Career,Units,Grading Basis,Enrollment Requirement,Course Components,Academic Group,Academic Organization,Description
0,101,Computer Science Principles,Undergraduate,3.00,Student Option,Prerequisite: Level 3 or higher on the mathema...,"(Laboratory, Lecture)","Engring & App Sci, College of",Computer Science,Introduces central ideas of computing and comp...
1,102,Introduction to Web Design and Programming,Undergraduate,3.00,Student Option,Advisory Prerequisite: CSE 101 or basic comput...,"(Lecture,)",College of Engring & App Sci,Computer Science,"An introduction to the design of Web pages, sp..."
2,110,Introduction to Computer Science,Undergraduate,3.00,Student Option,Prerequisite: Level 3 or higher on the mathema...,"(Laboratory, Lecture)",College of Engring & App Sci,Computer Science,An introduction to fundamentals of computer sc...
3,(113),Foundations of Computer Science I\n\n** availa...,Undergraduate,4.00,A Through F Undergraduate,Prerequisite: AMS 151 or MAT 125 or MAT 131 or...,"(Lecture, Recitation)","Engring & App Sci, College of",Computer Science,Introduction to the mathematical foundations o...
4,114,Introduction to Object-Oriented Programming,Undergraduate,4.00,A Through F Undergraduate,Prerequisite: Level 5 or higher on the math pl...,"(Laboratory, Lecture)","Engring & App Sci, College of",Computer Science,An introduction to procedural and object-orien...
...,...,...,...,...,...,...,...,...,...,...
201,698,Practicum in Teaching,Graduate,0.00 - 3.00,Graduate Graded,,"(Tutorial,)","Engring & App Sci, College of",Computer Science,Supervised teaching in a course identified by ...
202,699,Dissertation Research on Campus,Graduate,0.00 - 9.00,Satisfactory/Unsatisfactory,,"(Tutorial,)","Engring & App Sci, College of",Computer Science,Thesis research for PhD students who have adva...
203,700,Dissertation Research off Campus - Domestic,Graduate,0.00 - 9.00,Satisfactory/Unsatisfactory,Prerequisite: G5 Standing,"(Tutorial,)","Engring & App Sci, College of",Computer Science,Prerequisite: Must be advanced to candidacy (G...
204,701,Dissertation Research off Campus - International,Graduate,0.00 - 9.00,Satisfactory/Unsatisfactory,Prerequisite: G5 Standing,"(Tutorial,)","Engring & App Sci, College of",Computer Science,Prerequisite: Must be advanced to candidacy (G...


In [7]:
# 2//2