In [1]:
import json
import pickle
import re

import pandas as pd
import requests
from bs4 import BeautifulSoup

In [3]:
# Read the department list from a JSON file.
# The encoding is pinned to UTF-8 (the JSON interchange encoding) so the file
# is decoded the same way regardless of the platform's locale default.
with open('data/department_list.json', 'r', encoding='utf-8') as json_file:
    department_list = json.load(json_file)

In [53]:
# Base URL for course descriptions (subject pages on vancouver.calendar.ubc.ca).
# scrape_department_courses appends a department code followed by "v" to this
# prefix to build the URL of a single subject page.
BASE_URL = "https://vancouver.calendar.ubc.ca/course-descriptions/subject/"

# Function to scrape courses for a department
def scrape_department_courses(department_code):
    """Scrape every course listed on one department's calendar page.

    Args:
        department_code: Subject code such as "CPSC". A lowercase "v" is
            appended to it when building the page URL.

    Returns:
        A list of dicts, one per parsed course, with the keys
        "course_code", "course_name", "credits" (int), "description",
        "prerequisites" and "corequisites" (each a string or None).
        An empty list is returned when the page cannot be fetched.
    """
    url = f"{BASE_URL}{department_code}v"
    try:
        # A timeout keeps one unresponsive page from hanging the whole run.
        response = requests.get(url, timeout=30)
    except requests.RequestException as exc:
        # Treat network errors like a non-200 response: report and move on.
        print(f"Failed to fetch data for {department_code}: {exc}")
        return []

    if response.status_code != 200:
        print(f"Failed to fetch data for {department_code}: {response.status_code}")
        return []

    # Parse the HTML content
    soup = BeautifulSoup(response.text, 'html.parser')

    # Header text looks like "CPSC_V 121 (4) Models of Computation".
    header_pattern = re.compile(r"(?P<code>[A-Z]+_V \d+) \((?P<credits>\d+)\)\s+(?P<name>.+)$")
    # Prerequisite text runs up to a following "Corequisite(s):" label or the end.
    prereq_pattern = re.compile(r"Prerequisite[s]*: (.+?)(?:\s[Cc]orequisite[s]*:|$)")
    coreq_pattern = re.compile(r"Corequisite[s]*: (.+)", re.IGNORECASE)

    courses = []
    for course in soup.find_all("article", class_="node node--type-course node--promoted node--view-mode-teaser"):
        # Without a header there is no code/name/credits for this article.
        # Skip it instead of reusing values left over from the previous
        # iteration (or hitting a NameError on the very first article).
        header = course.find("h3")
        if header is None:
            continue

        header_match = header_pattern.match(header.text.strip())
        if header_match is None:
            continue

        # Courses without a description paragraph are not recorded.
        description_block = course.find("p")
        if description_block is None:
            continue

        description_text = description_block.text.strip()
        prereq_match = prereq_pattern.search(description_text)
        coreq_match = coreq_pattern.search(description_text)

        courses.append({
            "course_code": header_match.group('code'),
            "course_name": header_match.group('name'),
            "credits": int(header_match.group('credits')),
            "description": description_text,
            "prerequisites": prereq_match.group(1) if prereq_match else None,
            "corequisites": coreq_match.group(1) if coreq_match else None
        })

    return courses

# Scrape each listed department in turn and pool the results.
# TODO: only two departments are hard-coded for now; swap in the full
# department list once the downstream design is finalized.
departments = ["CPSC", "MATH"]
all_courses = []
for dept in departments:
    print(f"Scraping {dept}...")
    courses = scrape_department_courses(dept)
    all_courses += courses

Scraping CPSC...
Scraping MATH...


In [54]:
# Build a DataFrame from the scraped course records (one row per course dict).
# Nothing is written to disk in this cell.
df_courses = pd.DataFrame(all_courses)

In [55]:
# Display the scraped course table as the cell's output.
df_courses

Unnamed: 0,course_code,course_name,credits,description,prerequisites,corequisites
0,CPSC_V 100,Computational Thinking,3,Meaning and impact of computational thinking. ...,,
1,CPSC_V 103,Introduction to Systematic Program Design,3,Computation as a tool for systematic problem s...,,
2,CPSC_V 107,Systematic Program Design,3,Fundamental computation and program structures...,CPSC 103.,
3,CPSC_V 110,"Computation, Programs, and Programming",4,Fundamental program and computation structures...,,
4,CPSC_V 121,Models of Computation,4,Physical and mathematical structures of comput...,Principles of Mathematics 12 or Pre-calculus 12.,"One of CPSC 107, CPSC 110."
...,...,...,...,...,...,...
242,MATH_V 567,Nonlinear Wave Equations,3,This course is not eligible for Credit/D/Fail ...,,
243,MATH_V 589,M.Sc. Major Essay,3,This course is not eligible for Credit/D/Fail ...,,
244,MATH_V 591,Graduate Seminar in Applied Mathematics,2,This course is not eligible for Credit/D/Fail ...,,
245,MATH_V 599,Mathematics Teaching Techniques,1,[3-0-0] This course is not eligible for Credit...,,


In [56]:
# Split "CPSC_V 100" style codes once into [subject token, number token],
# then derive the department (subject without the "_V" marker) and replace
# course_code with the bare integer course number.
code_tokens = df_courses['course_code'].str.split()
df_courses['department'] = code_tokens.str[0].str.replace('_V', '')
df_courses['course_code'] = code_tokens.str[1].astype(int)

In [57]:
# Spot-check the parsed prerequisites of the fifth row. The column is selected
# by name rather than by position so the check keeps pointing at
# "prerequisites" even if columns are added or reordered.
print(df_courses.iloc[4]["prerequisites"])

Principles of Mathematics 12 or Pre-calculus 12.


In [36]:
# Keep only the listed columns, in this presentation order.
column_order = ["department", "course_code", "course_name", "credits", "prerequisites", "corequisites", "description"]
df_courses = df_courses.loc[:, column_order]

In [136]:
# Show only the rows whose department is CPSC.
df_courses.loc[df_courses["department"].eq("CPSC")]

Unnamed: 0,department,course_code,course_name,credits,prerequisites,corequisites,dependencies,description,prerequisites_description,corequisites_description
1451,CPSC,107,Systematic Program Design,3.0,[CPSC 103],[],"[CPSC 210, CPSC 330, DSCI 310]",Fundamental computation and program structures...,CPSC 103.,
1454,CPSC,203,"Programming, Problem Solving, and Algorithms",3.0,"[CPSC 103, CPSC 110, EOSC 211, MATH 210, PHYS ...",[],"[CPSC 330, CPSC 368, DSCI 310, DSCI 320, EOSC ...",Analysis of increasingly complex algorithmic p...,"One of CPSC 103, CPSC 110, EOSC 211, MATH 210,...",
1466,CPSC,404,Advanced Relational Databases,3.0,"[CPSC 304, CPSC 213, CPSC 261, CPEN 212]",[],[],"Physical database design, indexing, relational...","CPSC 304 and one of CPSC 213, CPSC 261, CPEN 212.",
1470,CPSC,455,Applied Industry Practices,3.0,"[CPSC 310, CPEN 321]",[],[],"Hands-on project, mentored by industry experts...","One of CPSC 310, CPEN 321.",
1489,CPSC,103,Introduction to Systematic Program Design,3.0,[],[],"[ATSC 212, ATSC 313, CPSC 107, CPSC 203, CPSC ...",Computation as a tool for systematic problem s...,,
...,...,...,...,...,...,...,...,...,...,...
1898,CPSC,410,Advanced Software Engineering,3.0,"[CPSC 310, CPEN 321, CPEN 331]",[],[],"Specification, design, construction and valida...","Either (a) CPSC 310 or (b) all of CPEN 321, CP...",
1899,CPSC,444,Advanced Methods for Human Computer Interaction,3.0,"[CPSC 344, STAT 200, STAT 201, STAT 203, STAT ...",[],[],Design and evaluation methodologies and theori...,"CPSC 344 and one of STAT 200, STAT 201, STAT 2...",
1927,CPSC,213,Introduction to Computer Systems,4.0,"[CPSC 121, CPSC 210]",[],"[CPSC 310, CPSC 313, CPSC 317, CPSC 404, CPSC ...","Software architecture, operating systems, and ...","All of CPSC 121, CPSC 210.",
1928,CPSC,436,Topics in Computer Science,,[],[],[],Selected topics in a specific area within Comp...,Third-year standing in a Computer Science or C...,


In [None]:
# Count missing values per column, restricted to CPSC rows.
cpsc_rows = df_courses["department"] == "CPSC"
missing_data_counts_courses = df_courses.loc[cpsc_rows].isna().sum()
missing_data_counts_courses

department                    0
course_code                   0
course_name                   0
credits                       2
prerequisites                 0
corequisites                  0
dependencies                  0
description                   0
prerequisites_description    10
corequisites_description     60
dtype: int64

In [None]:
# Write the course table to CSV without the index column, then confirm.
df_courses.to_csv(path_or_buf="ubc_courses.csv", index=False)
print("Course data saved to ubc_courses.csv.")

In [5]:
# Optionally structure the data: index every course record by its code.
# Records built by scrape_department_courses store the code under
# "course_code", while the records in this cell's saved output use "code";
# accept either so the cell also works on a top-to-bottom run.
course_dict = {
    (course["code"] if "code" in course else course["course_code"]): course
    for course in all_courses
}
course_dict

{'AMNE 356': {'preq': [],
  'creq': [],
  'depn': [],
  '_id': '62bed48b71b2809473f23a69',
  'dept': 'AMNE',
  'code': 'AMNE 356',
  'cred': 3,
  'link': 'https://courses.students.ubc.ca/cs/courseschedule?pname=subjarea&tname=subj-course&dept=AMNE&course=356',
  'name': 'Roman Religion',
  'desc': 'Roman religions between the ninth century BCE and the fourth century CE, including mystery religions, magic, emperor worship, and early Christianity, with particular attention devoted to the primary sources. Some knowledge of ancient Rome is recommended. Credit will be granted for only one of CLST 334 or AMNE 356.'},
 'AMNE 261': {'preq': [],
  'creq': [],
  'depn': [],
  '_id': '62bed48b71b2809473f23a49',
  'dept': 'AMNE',
  'code': 'AMNE 261',
  'cred': 3,
  'link': 'https://courses.students.ubc.ca/cs/courseschedule?pname=subjarea&tname=subj-course&dept=AMNE&course=261',
  'name': 'Eden to Exile: Introduction to the Hebrew Bible',
  'desc': 'An overview of the Hebrew Bible ("Old Testament"

In [None]:
# Save the code -> course mapping to a pickle file.
# Requires `pickle` to be imported in the notebook's import cell.
# NOTE: pickle files can execute arbitrary code when loaded; only load
# ubc_courses.pkl from a trusted source.
with open("ubc_courses.pkl", "wb") as pkl_file:
    pickle.dump(course_dict, pkl_file)