In [12]:
import requests
from bs4 import BeautifulSoup
import re
import pandas as pd
import json
from openai import OpenAI

In [13]:
# Load the department list from its JSON file. JSON text is UTF-8 by
# specification, so decode it explicitly instead of relying on the platform's
# default text encoding (which differs between operating systems).
with open('data/department_list.json', 'r', encoding='utf-8') as file:
    department_list = json.load(file)

In [14]:
# Base URL for course descriptions on the UBC Vancouver calendar site.
# scrape_department_courses appends the department code followed by "v" to
# this prefix to build each subject page URL.
BASE_URL = "https://vancouver.calendar.ubc.ca/course-descriptions/subject/"

# Function to scrape courses for a department
def scrape_department_courses(department_code, timeout=30):
    """Scrape the course listing page of a single department.

    Requests ``f"{BASE_URL}{department_code}v"``, walks every course
    ``<article>`` on the page and builds one record per course.

    An article is skipped when it has no ``<h3>`` header, when the header does
    not match the ``"<DEPT>_V <number> (<credits>) <name>"`` layout, or when it
    has no ``<p>`` description.

    Args:
        department_code: Subject code appended to ``BASE_URL`` (e.g. ``"CPSC"``).
        timeout: Seconds to wait for the server before ``requests.get`` gives
            up. Without a timeout a stalled connection blocks indefinitely.

    Returns:
        A list of dicts with the keys ``course_code``, ``course_name``,
        ``credits`` (int), ``description``, ``prerequisites`` and
        ``corequisites`` (the last two are ``None`` when not present in the
        description). An empty list is returned, after printing a message,
        when the response status is not 200.

    Raises:
        requests.RequestException: Network failures and timeouts are not
            caught here and propagate to the caller.
    """
    url = f"{BASE_URL}{department_code}v"
    response = requests.get(url, timeout=timeout)

    if response.status_code != 200:
        print(f"Failed to fetch data for {department_code}: {response.status_code}")
        return []

    # Parse the HTML content
    soup = BeautifulSoup(response.text, 'html.parser')

    # NOTE(review): a multi-word string passed to class_ is compared against the
    # exact value of the class attribute, so this depends on the site keeping
    # these classes in this order -- confirm, or consider a CSS selector.
    courses = []
    for course in soup.find_all("article", class_="node node--type-course node--promoted node--view-mode-teaser"):
        # Extract course code, name, and credit hours from the h3 tag
        header = course.find("h3")
        if header is None:
            # No header means no code/name/credits for this article. Skip it
            # rather than fall through with values left over from the previous
            # article (or undefined names on the first one).
            continue

        course_details = header.text.strip()
        # Regex to match the course code, credit, and course name
        match = re.match(r"(?P<code>[A-Z]+_V \d+) \((?P<credits>\d+)\)\s+(?P<name>.+)$", course_details)
        if not match:
            # Headers outside the expected layout (e.g. a non-integer credit
            # value) are skipped.
            continue
        course_code = match.group('code')
        course_name = match.group('name')
        credits = int(match.group('credits'))

        # Extract course description and prerequisites from the p tag
        description_block = course.find("p")
        if description_block is None:
            continue
        description_text = description_block.text.strip()

        # Prerequisites run up to a following "Corequisite(s):" label or the
        # end of the text. This pattern is case-sensitive on "Prerequisite",
        # while the corequisite search below ignores case.
        prereq_match = re.search(r"Prerequisite[s]*: (.+?)(?:\s[Cc]orequisite[s]*:|$)", description_text)
        prerequisites = prereq_match.group(1) if prereq_match else None
        coreq_match = re.search(r"Corequisite[s]*: (.+)", description_text, re.IGNORECASE)
        corequisites = coreq_match.group(1) if coreq_match else None

        # Append course data
        courses.append({
            "course_code": course_code,
            "course_name": course_name,
            "credits": credits,
            "description": description_text,
            "prerequisites": prerequisites,
            "corequisites": corequisites
        })

    return courses

# Scrape a sample of departments and pool their course records.
# TODO: replace this sample with the full department list once the downstream
# design is finished.
departments = ["CPSC", "MATH"]
all_courses = []
for dept in departments:
    print(f"Scraping {dept}...")
    courses = scrape_department_courses(dept)
    # In-place concatenation: appends this department's records to the pool.
    all_courses += courses

Scraping CPSC...
Scraping MATH...


In [15]:
# Convert the scraped course records (a list of dicts) into a DataFrame.
# Nothing is written to disk in this cell.
df_courses = pd.DataFrame(all_courses)

In [16]:
# Split the scraped course code on whitespace once, then derive both columns:
# the first token minus its "_V" marker becomes the department, and the second
# token becomes the integer course number that replaces `course_code`.
code_tokens = df_courses['course_code'].str.split()
df_courses['department'] = code_tokens.str[0].str.replace('_V', '')
df_courses['course_code'] = code_tokens.str[1].astype(int)

In [17]:
# Reorder the columns: identifying fields first, free-text description last.
column_order = ["department", "course_code", "course_name", "credits", "prerequisites", "corequisites", "description"]
df_courses = df_courses[column_order]

In [19]:
# Display the course table as this cell's output.
df_courses

Unnamed: 0,department,course_code,course_name,credits,prerequisites,corequisites,description
0,CPSC,100,Computational Thinking,3,,,Meaning and impact of computational thinking. ...
1,CPSC,103,Introduction to Systematic Program Design,3,,,Computation as a tool for systematic problem s...
2,CPSC,107,Systematic Program Design,3,CPSC 103.,,Fundamental computation and program structures...
3,CPSC,110,"Computation, Programs, and Programming",4,,,Fundamental program and computation structures...
4,CPSC,121,Models of Computation,4,Principles of Mathematics 12 or Pre-calculus 12.,"One of CPSC 107, CPSC 110.",Physical and mathematical structures of comput...
...,...,...,...,...,...,...,...
242,MATH,567,Nonlinear Wave Equations,3,,,This course is not eligible for Credit/D/Fail ...
243,MATH,589,M.Sc. Major Essay,3,,,This course is not eligible for Credit/D/Fail ...
244,MATH,591,Graduate Seminar in Applied Mathematics,2,,,This course is not eligible for Credit/D/Fail ...
245,MATH,599,Mathematics Teaching Techniques,1,,,[3-0-0] This course is not eligible for Credit...


In [24]:
# `api_local` is imported from the project-local `api` module and passed as the
# API key, so the key value is not written in this notebook.
# NOTE(review): presumably `api.py` is kept out of version control -- confirm.
from api import api_local

# OpenAI client pointed at the DeepSeek endpoint through `base_url`.
client = OpenAI(api_key=api_local, base_url="https://api.deepseek.com")

In [None]:
# Sample prerequisite sentence used to exercise the prompt.
pre = "One of CPSC 103, CPSC 110, CPEN 221, EOSC 211, PHYS 210 and one of MATH 101, MATH 103, MATH 105, MATH 121, SCIE 001 and one of MATH 152, MATH 221, MATH 223."

# The prompt is assembled from adjacent string literals. A backslash
# continuation inside a single literal keeps the next source line's leading
# indentation as part of the string, which padded every prompt line with runs
# of spaces (and one stray tab).
# NOTE(review): the first sentence asks for a nested list while the closing
# instruction asks for CSV output -- confirm which format is intended.
prompt_text = (
    "Your task is to convert the following course prerequisites description into a nested list format.\n"
    "Here is an example of a course prerequisites description you may encounter.\n"
    "Here is the paragraph:\n"
    "~~~~~\n"
    f"{pre}\n"
    "~~~~~\n"
    "If any details are found, please return them in the form of a CSV file. "
    "Return None if a piece of information is not found.\n"
    "Do not return any words that I did not ask you to return."
)

# System message describing the persona the model should adopt.
role = "You're a skilled instructional designer with a deep understanding of course structure and content organization. You excel at transforming course descriptions into clear, nested lists that enhance readability and comprehension for learners."

In [None]:
# Conversation sent to the model: the persona as the system message, followed
# by the prerequisite-conversion prompt as the user message.
chat_messages = [
    {"role": "system", "content": role},
    {"role": "user", "content": prompt_text},
]

# Request a single, non-streamed completion.
response = client.chat.completions.create(
    model="deepseek-chat",
    messages=chat_messages,
    stream=False,
)

# Print the text of the first returned choice.
reply_text = response.choices[0].message.content
print(reply_text)

In [10]:
# Write the course table to CSV without the DataFrame index column.
output_path = "data/ubc_courses.csv"
df_courses.to_csv(output_path, index=False)
print("Course data saved to ubc_courses.csv.")

Course data saved to ubc_courses.csv.
