In [1]:
import requests
from bs4 import BeautifulSoup
import re
import pandas as pd
import json

In [2]:
# Read list from a JSON file
with open('data/department_list.json', 'r') as file:
    department_list = json.load(file)

In [3]:
# Base URL for course descriptions
BASE_URL = "https://vancouver.calendar.ubc.ca/course-descriptions/subject/"

# Function to scrape courses for a department
def scrape_department_courses(department_code):
    url = f"{BASE_URL}{department_code}v"
    response = requests.get(url)
    
    if response.status_code != 200:
        print(f"Failed to fetch data for {department_code}: {response.status_code}")
        return []

    # Parse the HTML content
    soup = BeautifulSoup(response.text, 'html.parser')

    # Extract all course blocks (using updated selectors based on the provided HTML structure)
    courses = []
    for course in soup.find_all("article", class_="node node--type-course node--promoted node--view-mode-teaser"):
        # Extract course code, name, and credit hours from the h3 tag
        header = course.find("h3")
        if header:
            course_details = header.text.strip()
            # Regex to match the course code, credit, and course name
            match = re.match(r"(?P<code>[A-Z]+_V \d+) \((?P<credits>\d+)\)\s+(?P<name>.+)$", course_details)
            if match:
                course_code = match.group('code')
                course_name = match.group('name')
                credits = int(match.group('credits'))
            else:
                continue
        
        # Extract course description and prerequisites from the p tag
        description_block = course.find("p")
        if description_block:
            description_text = description_block.text.strip()
            # Regex to extract prerequisites text
            prereq_match = re.search(r"Prerequisite[s]*: (.+?)(?:\s[Cc]orequisite[s]*:|$)", description_text)
            prerequisites = prereq_match.group(1) if prereq_match else None
            coreq_match = re.search(r"Corequisite[s]*: (.+)", description_text, re.IGNORECASE)
            corequisites = coreq_match.group(1) if coreq_match else None

            # Append course data
            courses.append({
                "course_code": course_code,
                "course_name": course_name,
                "credits": credits,
                "description": description_text,
                "prerequisites": prerequisites,
                "corequisites": corequisites
            })

    return courses

# Scrape multiple departments
departments = ["CPSC", "MATH"]  # Add more department codes as needed(!!!, replace the full course list after finish future design)
all_courses = []
for dept in departments:
    print(f"Scraping {dept}...")
    courses = scrape_department_courses(dept)
    all_courses.extend(courses)

Scraping CPSC...
Scraping MATH...


In [8]:
# Convert to DataFrame and save
df_courses = pd.DataFrame(all_courses)

In [9]:
df_courses

Unnamed: 0,course_code,course_name,credits,description,prerequisites,corequisites
0,CPSC_V 100,Computational Thinking,3,Meaning and impact of computational thinking. ...,,
1,CPSC_V 103,Introduction to Systematic Program Design,3,Computation as a tool for systematic problem s...,,
2,CPSC_V 107,Systematic Program Design,3,Fundamental computation and program structures...,CPSC 103.,
3,CPSC_V 110,"Computation, Programs, and Programming",4,Fundamental program and computation structures...,,
4,CPSC_V 121,Models of Computation,4,Physical and mathematical structures of comput...,Principles of Mathematics 12 or Pre-calculus 12.,"One of CPSC 107, CPSC 110."
...,...,...,...,...,...,...
242,MATH_V 567,Nonlinear Wave Equations,3,This course is not eligible for Credit/D/Fail ...,,
243,MATH_V 589,M.Sc. Major Essay,3,This course is not eligible for Credit/D/Fail ...,,
244,MATH_V 591,Graduate Seminar in Applied Mathematics,2,This course is not eligible for Credit/D/Fail ...,,
245,MATH_V 599,Mathematics Teaching Techniques,1,[3-0-0] This course is not eligible for Credit...,,


In [10]:
df_courses['department'] = df_courses['course_code'].str.split().str[0]
df_courses['department'] = df_courses['department'].str.replace('_V', '')
df_courses['course_code'] = df_courses['course_code'].str.split().str[1].astype(int)

In [11]:
df_courses = df_courses[["department", "course_code", "course_name", "credits", "prerequisites", "corequisites", "description"]]

In [12]:
def parse_requirements_safe(text, max_depth=10, current_depth=0):
    """
    A safer version of the parsing function that limits recursion depth.
    """
    if current_depth > max_depth:
        return [[f"Exceeded max depth: {text}"]]

    if pd.isna(text):
        return []

    # Step 1: Normalize the text
    text = text.lower()
    text = re.sub(r'\s*\b(or|and)\b\s*', r' \1 ', text)  # Normalize logical operators
    text = re.sub(r'\s*\b(one of)\b\s*', r' \1 ', text)
    text = re.sub(r'[\(\)]', '', text)  # Remove parentheses for simplicity initially

    # Step 2: Parse "either (a)... or (b)..." structures
    if "either" in text:
        parts = re.split(r'\s*either\s+', text)
        branches = []
        for part in parts:
            if part.strip():
                branches.append(parse_requirements_safe(part.strip(), max_depth, current_depth + 1))
        return branches

    # Step 3: Parse "one of X, Y, Z"
    if "one of" in text:
        options = re.split(r',|\bor\b', text.split("one of", 1)[1])
        return [[opt.strip()] for opt in options if opt.strip()]

    # Step 4: Quantitative or positional constraints
    if re.search(r'\bcredits\b|\bstanding\b', text):
        return [[f"Ambiguous: {text.strip()}"]]

    # Step 5: Split by "and" for simple conjunctions
    parts = re.split(r'\band\b', text)
    parsed = []
    for part in parts:
        subparts = re.split(r'\bor\b', part)
        parsed.append([sub.strip() for sub in subparts if sub.strip()])
    return parsed

# Apply the safer parsing function to a smaller subset for debugging
sample_df = df_courses.head(10)  # Process only the first 10 rows to debug
sample_df['parsed_prerequisites'] = sample_df['prerequisites'].apply(
    lambda x: parse_requirements_safe(x, max_depth=10)
)
sample_df['parsed_corequisites'] = sample_df['corequisites'].apply(
    lambda x: parse_requirements_safe(x, max_depth=10)
)

sample_df


# # Apply the transformation to both prerequisites and corequisites columns
# df_courses['parsed_prerequisites'] = df_courses['prerequisites'].apply(parse_requirements)
# df_courses['parsed_corequisites'] = df_courses['corequisites'].apply(parse_requirements)

# df_courses['parsed_corequisites']

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  sample_df['parsed_prerequisites'] = sample_df['prerequisites'].apply(
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  sample_df['parsed_corequisites'] = sample_df['corequisites'].apply(


Unnamed: 0,department,course_code,course_name,credits,prerequisites,corequisites,description,parsed_prerequisites,parsed_corequisites
0,CPSC,100,Computational Thinking,3,,,Meaning and impact of computational thinking. ...,[],[]
1,CPSC,103,Introduction to Systematic Program Design,3,,,Computation as a tool for systematic problem s...,[],[]
2,CPSC,107,Systematic Program Design,3,CPSC 103.,,Fundamental computation and program structures...,[[cpsc 103.]],[]
3,CPSC,110,"Computation, Programs, and Programming",4,,,Fundamental program and computation structures...,[],[]
4,CPSC,121,Models of Computation,4,Principles of Mathematics 12 or Pre-calculus 12.,"One of CPSC 107, CPSC 110.",Physical and mathematical structures of comput...,"[[principles of mathematics 12, pre-calculus 1...","[[cpsc 107], [cpsc 110.]]"
5,CPSC,203,"Programming, Problem Solving, and Algorithms",3,"One of CPSC 103, CPSC 110, APSC 160, EOSC 211,...",,Analysis of increasingly complex algorithmic p...,"[[cpsc 103], [cpsc 110], [apsc 160], [eosc 211...",[]
6,CPSC,210,Software Construction,4,"One of CPSC 107, CPSC 110.",,"Design, development, and analysis of robust so...","[[cpsc 107], [cpsc 110.]]",[]
7,CPSC,213,Introduction to Computer Systems,4,"All of CPSC 121, CPSC 210.",,"Software architecture, operating systems, and ...","[[all of cpsc 121, cpsc 210.]]",[]
8,CPSC,221,Basic Algorithms and Data Structures,4,"One of CPSC 210, CPEN 221 and either (a) one o...",,Design and analysis of basic algorithms and da...,"[[[cpsc 210], [cpen 221 and]], [[cpsc 121], [m...",[]
9,CPSC,259,Data Structures and Algorithms for Electrical ...,4,APSC 160.,,Advanced procedural programming. Fundamental a...,[[apsc 160.]],[]


In [13]:
df_courses.to_csv("data/ubc_courses.csv", index=False)
print("Course data saved to ubc_courses.csv.")

Course data saved to ubc_courses.csv.


In [1]:
print(1)
# 111111111111111111

1
