In [3]:
import requests
from bs4 import BeautifulSoup
import re
import pandas as pd
import json
from openai import OpenAI

In [13]:
# Read list from a JSON file
with open('data/department_list.json', 'r') as file:
    department_list = json.load(file)

In [42]:
# Base URL for course descriptions
BASE_URL = "https://vancouver.calendar.ubc.ca/course-descriptions/subject/"

# Function to scrape courses for a department
def scrape_department_courses(department_code):
    url = f"{BASE_URL}{department_code}v"
    response = requests.get(url)

    if response.status_code == 404:
        print(f"Failed to fetch data for {department_code}: {response.status_code}")
        return [], []

    if response.status_code != 200:
        print(f"Failed to fetch data for {department_code}: {response.status_code}")
        return [], [department_code]


    # Parse the HTML content
    soup = BeautifulSoup(response.text, 'html.parser')

    # Extract all course blocks (using updated selectors based on the provided HTML structure)
    courses = []
    for course in soup.find_all("article", class_="node node--type-course node--promoted node--view-mode-teaser"):
        # Extract course code, name, and credit hours from the h3 tag
        header = course.find("h3")
        if header:
            course_details = header.text.strip()
            # Regex to match the course code, credit, and course name
            match = re.match(r"(?P<code>[A-Z]+_V \d+) \((?P<credits>\d+)\)\s+(?P<name>.+)$", course_details)
            if match:
                course_code = match.group('code')
                course_name = match.group('name')
                credits = int(match.group('credits'))
            else:
                continue
        
        # Extract course description and prerequisites from the p tag
        description_block = course.find("p")
        if description_block:
            description_text = description_block.text.strip()
            # Regex to extract prerequisites text
            prereq_match = re.search(r"Prerequisite[s]*: (.+?)(?:\s[Cc]orequisite[s]*:|$)", description_text)
            prerequisites = prereq_match.group(1) if prereq_match else None
            coreq_match = re.search(r"Corequisite[s]*: (.+)", description_text, re.IGNORECASE)
            corequisites = coreq_match.group(1) if coreq_match else None

            # Append course data
            courses.append({
                "course_code": course_code,
                "course_name": course_name,
                "credits": credits,
                "description": description_text,
                "prerequisites": prerequisites,
                "corequisites": corequisites
            })

    return courses, []

In [43]:
# Scrape multiple departments
departments = department_list  # Add more department codes as needed(!!!, replace the full course list after finish future design)
all_courses = []
fail_courses = []

for dept in departments:
    print(f"Scraping {dept}...")
    courses, fail = scrape_department_courses(dept)
    all_courses.extend(courses)
    fail_courses.extend(fail)

Scraping ACAM...
Scraping ADHE...
Scraping AFST...
Scraping AMNE...
Scraping ANAT...
Scraping ANTH...
Scraping APBI...
Scraping APSC...
Scraping ARBC...
Scraping ARBM...
Scraping ARCH...
Scraping ARCL...
Scraping ARTC...
Failed to fetch data for ARTC: 429
Scraping ARTH...
Failed to fetch data for ARTH: 429
Scraping ARTS...
Failed to fetch data for ARTS: 429
Scraping ASIA...
Failed to fetch data for ASIA: 429
Scraping ASIC...
Failed to fetch data for ASIC: 429
Scraping ASL...
Failed to fetch data for ASL: 429
Scraping ASLA...
Failed to fetch data for ASLA: 429
Scraping ASTR...
Failed to fetch data for ASTR: 429
Scraping ASTU...
Failed to fetch data for ASTU: 429
Scraping ATSC...
Failed to fetch data for ATSC: 429
Scraping AUDI...
Failed to fetch data for AUDI: 429
Scraping BEST...
Failed to fetch data for BEST: 429
Scraping BIOC...
Failed to fetch data for BIOC: 429
Scraping BIOL...
Failed to fetch data for BIOL: 429
Scraping BIOT...
Failed to fetch data for BIOT: 429
Scraping BMEG...
F

In [44]:
all_courses_copy = all_courses[:]
fail_courses_copy = fail_courses[:]

In [45]:
len(fail_courses_copy)

114

In [46]:
import time

In [47]:
tem = 1
fail_course_len = len(fail_courses_copy)
while fail_courses_copy != []:
    dept = fail_courses_copy.pop(0)

    if fail_course_len - len(fail_courses_copy) == 0: 
        print("time out")
        break

    print(f"Scraping {dept}...[fail course remain: {len(fail_courses_copy)}]")
    courses, fail = scrape_department_courses(dept)
    all_courses_copy.extend(courses)
    fail_courses_copy.extend(fail)
    if tem % 12 == 0: 
        time.sleep(10)
        fail_course_len = len(fail_courses_copy)
    tem += 1

Scraping ARTC...[fail course remain: 113]
Scraping ARTH...[fail course remain: 112]
Scraping ARTS...[fail course remain: 111]
Scraping ASIA...[fail course remain: 110]
Scraping ASIC...[fail course remain: 109]
Scraping ASL...[fail course remain: 108]
Scraping ASLA...[fail course remain: 107]
Scraping ASTR...[fail course remain: 106]
Scraping ASTU...[fail course remain: 105]
Scraping ATSC...[fail course remain: 104]
Scraping AUDI...[fail course remain: 103]
Scraping BEST...[fail course remain: 102]
Scraping BIOC...[fail course remain: 101]
Scraping BIOL...[fail course remain: 100]
Failed to fetch data for BIOL: 429
Scraping BIOT...[fail course remain: 100]
Failed to fetch data for BIOT: 429
Scraping BMEG...[fail course remain: 100]
Failed to fetch data for BMEG: 429
Scraping BUSI...[fail course remain: 100]
Failed to fetch data for BUSI: 429
Scraping CAPS...[fail course remain: 100]
Failed to fetch data for CAPS: 429
Scraping CDST...[fail course remain: 100]
Failed to fetch data for CDS

In [48]:
# Convert to DataFrame and save
df_courses = pd.DataFrame(all_courses)

In [49]:
df_courses['department'] = df_courses['course_code'].str.split().str[0]
df_courses['department'] = df_courses['department'].str.replace('_V', '')
df_courses['course_code'] = df_courses['course_code'].str.split().str[1].astype(int)

In [50]:
df_courses = df_courses[["department", "course_code", "course_name", "credits", "prerequisites", "corequisites", "description"]]

In [51]:
ubc_courses_df = df_courses

# Redefine the parsing function with handling for brackets as "all_of"
def parse_prerequisite(prerequisite_string):
    """
    Converts a human-readable prerequisite string into a Prerequisite object representation.
    """
    if not isinstance(prerequisite_string, str) or not prerequisite_string.strip():
        return None

    prerequisite_string = prerequisite_string.lower()

    # Handle simple course requirements (e.g., "CPSC 221")
    single_course_match = re.match(r"([a-z]+ \d+)", prerequisite_string, re.IGNORECASE)
    if single_course_match:
        return f'Prerequisite(type="course", details="{single_course_match.group(1).upper()}")'

    # Handle "one of" and "all of" scenarios
    if "one of" in prerequisite_string:
        courses = re.findall(r"([a-z]+ \d+)", prerequisite_string, re.IGNORECASE)
        parsed_courses = ", ".join([f'Prerequisite(type="course", details="{course.upper()}")' for course in courses])
        return f'Prerequisite(type="one_of", children=[{parsed_courses}])'

    if "all of" in prerequisite_string:
        courses = re.findall(r"([a-z]+ \d+)", prerequisite_string, re.IGNORECASE)
        parsed_courses = ", ".join([f'Prerequisite(type="course", details="{course.upper()}")' for course in courses])
        return f'Prerequisite(type="all_of", children=[{parsed_courses}])'

    # Handle brackets as "all_of" scenario (e.g., "[CPSC 220, CPSC 111]")
    bracket_match = re.findall(r"\[([^\]]+)\]", prerequisite_string)
    if bracket_match:
        courses = re.findall(r"([a-z]+ \d+)", bracket_match[0], re.IGNORECASE)
        parsed_courses = ", ".join([f'Prerequisite(type="course", details="{course.upper()}")' for course in courses])
        return f'Prerequisite(type="all_of", children=[{parsed_courses}])'

    # Handle credit requirements (e.g., "6 credits of MATH or STAT")
    credit_match = re.match(r"(\d+) credits of ([a-z]+( or [a-z]+)*)", prerequisite_string, re.IGNORECASE)
    if credit_match:
        credit_value = credit_match.group(1)
        departments = [dept.strip().upper() for dept in credit_match.group(2).split("or")]
        return f'Prerequisite(type="credits", details={{"min_credits": {credit_value}, "departments": {departments}}})'

    # Handle fallback (return original string as a comment for manual refinement)
    return f'/* Unparsed: {prerequisite_string} */'

# Apply the updated parsing logic
ubc_courses_df["prerequisites"] = ubc_courses_df["prerequisites"].apply(parse_prerequisite)
ubc_courses_df["corequisites"] = ubc_courses_df["corequisites"].apply(parse_prerequisite)

ubc_courses_df

Unnamed: 0,department,course_code,course_name,credits,prerequisites,corequisites,description
0,ACAM,250,Asian Canadians in Popular Culture,3,,,Popular culture's role in the production of As...
1,ACAM,300,Dis/Orienting Asian Canada,3,,,"The histories, cultures, social dynamics, and ..."
2,ACAM,310,Asian Canadian Cultural Studies,3,,,Examines Asian Canadian communities and cultur...
3,ACAM,350,Asian Canadian Community-Based Media,3,,,Digital media production in the context of com...
4,ADHE,327,Teaching Adults,3,,,"Planning, conducting and evaluating instructio..."
...,...,...,...,...,...,...,...
1988,WRDS,390,Writing Disability,3,"Prerequisite(type=""one_of"", children=[Prerequi...",,"Explores Disability theories, to understand th..."
1989,WRDS,400,Writing and Communication Capstone,3,"Prerequisite(type=""course"", details=""WRDS 200"")",,Critical assessment of approaches to writing a...
1990,WRDS,450,Writing Practices for Publics & Publications,3,"Prerequisite(type=""one_of"", children=[Prerequi...",,Writing processes and strategies conducive to ...
1991,WRIT,98,Preparation for University Writing and the LPI,0,,,This course is not eligible for Credit/D/Fail ...


In [52]:
df_courses.to_csv("data/ubc_courses.csv", index=False)
print("Course data saved to ubc_courses.csv.")

Course data saved to ubc_courses.csv.
