In [None]:
from bs4 import BeautifulSoup
import requests
import re

### SCRAPING LIST OF COURSES ###

def get_courses(html: bytes):
    """Parse the course listing out of a calendar section page.

    Args:
        html: Raw HTML of the page.

    Returns:
        A list with one dict per direct child ``<div>`` of the listing's
        ``view-content`` container. A dict holds ``"name"``,
        ``"description"`` and ``"prerequisites"`` when all three elements are
        present in the entry, and is empty otherwise. An empty list is
        returned when the listing container itself cannot be found.
    """
    soup = BeautifulSoup(html, 'html.parser')
    div = soup.find('div', class_=re.compile('^w3-row view view-courses-view view-id-courses_view'))
    if div is None:
        # Page has no course listing; nothing to parse.
        return []

    view_content_div = div.find('div', class_=re.compile('view-content'))
    if view_content_div is None:
        return []

    courses = []

    for child_div in view_content_div.find_all('div', recursive=False):
        d = {}

        try:
            # Extract course name
            course_name = child_div.find('div', {'aria-label': True}).get_text(strip=True)

            # Extract description
            description = child_div.find('div', class_='views-field-field-desc').get_text(strip=True)

            # Extract prerequisites
            prerequisites = child_div.find('span', class_='views-field-field-prerequisite').get_text(strip=True)

            d["name"] = course_name
            d["description"] = description
            d["prerequisites"] = prerequisites
        except AttributeError:
            # find() returned None for one of the fields, so this entry is
            # left as an empty dict (best-effort, one dict per entry).
            pass

        courses.append(d)

    return courses


def getHTML(url: str, timeout: float = 10.0):
    """Download ``url`` and return the courses parsed from its body.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds to wait for the server before ``requests`` gives up.
            Without a timeout a stalled connection would block indefinitely.

    Returns:
        The result of ``get_courses`` on the response body, or ``None`` when
        the server answers with a status other than 200.
    """
    response = requests.get(url, timeout=timeout)

    if response.status_code != 200:
        print("Error getting HTML")
        return None

    return get_courses(response.content)
    

# Scrape the Mathematical Sciences section and show what was parsed.
courses = getHTML(url='https://utm.calendar.utoronto.ca/section/Mathematical-Sciences')
print(courses)

In [None]:
import traceback
from bs4 import BeautifulSoup
import requests
import re
from openai import OpenAI
import os
from dotenv import load_dotenv

# Called before os.getenv below -- presumably so that a key kept in a local
# .env file is present in the environment (confirm against python-dotenv docs).
load_dotenv()

# Instruction sentence placed at the very start of the message that
# get_requirements builds.
prompt = "The answer to this question should be a CourseRequirements Document. Respond only with a CourseRequirements Document and nothing else."
# Contents of courseRequirementsFormat.txt; interpolated right after `prompt`
# in get_requirements.
# NOTE(review): `format` shadows the builtin of the same name, and
# get_requirements reads it as a module-level global, so a rename has to
# touch both places.
format = ""
with open("courseRequirementsFormat.txt") as file:
    format = file.read()

# API key taken from the OPENAI_API_KEY environment variable (None if unset).
secret = os.getenv("OPENAI_API_KEY")


### GET LIST OF PROGRAMS ###

# Module-level client used by get_requirements for the chat-completions call.
client = OpenAI(api_key=secret)

def get_requirements(html: bytes):
    """Extract one program's completion requirements and have the model structure them.

    Locates the programs listing in ``html`` and takes its second-to-last
    entry. Starting after that entry's "completion requirements" heading, it
    walks the following ``<p>``/``<ol>`` elements in document order and keeps
    every ``<p>`` whose next such element is an ``<ol>``, together with the
    text of that list's ``<li>`` items. The collected text is sent to the
    chat-completions API, prefixed with the module-level ``prompt`` and
    ``format`` strings, and the reply is printed.

    Args:
        html: Raw HTML of a calendar section page.

    Returns:
        A list with one dict per processed program entry. On success the dict
        holds the model's reply under ``"requirements"``; if extraction or the
        API call raises, the traceback is printed and the dict stays empty.
        An empty list is returned when the programs listing cannot be found
        or has fewer than two entries.
    """
    soup = BeautifulSoup(html, 'html.parser')
    div = soup.find('div', class_=re.compile('^w3-row view view-programs-view view-id-programs_view'))
    if div is None:
        return []

    view_content_div = div.find('div', class_=re.compile('view-content'))
    if view_content_div is None:
        return []

    child_divs = view_content_div.find_all('div', recursive=False)
    if len(child_divs) < 2:
        # The entry of interest is addressed as [-2]; avoid an IndexError on
        # short listings.
        return []

    programs = []

    # Only the second-to-last program entry is processed for now.
    for child_div in [child_divs[-2]]:
        d = {}

        try:
            completion_section = child_div.find('h3', class_='views-label views-label-field-completion-req')

            # Every <p>/<ol> that appears after the element following the heading.
            # NOTE(review): find_all_next is not limited to this entry, so it
            # may run on into later programs -- confirm against the page markup.
            content_after_completion = completion_section.find_next().find_all_next(['p', 'ol'])

            # Keep each <p> that is directly followed by an <ol>, plus the
            # text of that list's items.
            filtered_content = []
            for current, following in zip(content_after_completion, content_after_completion[1:]):
                if current.name == 'p' and following.name == 'ol':
                    filtered_content.append(current.get_text())
                    filtered_content.extend(li.get_text() for li in following.find_all('li'))

            # One item per line.
            plaintext = "\n".join(filtered_content)

            content = f"{prompt} {format} You will be given a text description of course requirements, taken from the course requirements page from the University of Toronto. It is your job to decide which courses a student will need to take. If you are unsure which course to pick, use the fallback course code, which is the empty string. Here is the course requirements page: {plaintext}. Remember, ONLY respond using the CourseRequirements format, and NOTHING ELSE."
            chat_completion = client.chat.completions.create(
                model="gpt-4o-mini",
                messages=[
                    {
                        "role": "user",
                        "content": [{"type": "text", "text": content}],
                    }
                ],
            )
            reply = chat_completion.choices[0].message.content
            print(reply)
            d["requirements"] = reply

        except Exception:
            # Best-effort: report the failure and still return an (empty) entry.
            traceback.print_exc()

        programs.append(d)

    return programs


def getHTML(url: str, timeout: float = 10.0):
    """Download ``url`` and return the program requirements parsed from its body.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds to wait for the server before ``requests`` gives up.
            Without a timeout a stalled connection would block indefinitely.

    Returns:
        The result of ``get_requirements`` on the response body, or ``None``
        when the server answers with a status other than 200.
    """
    response = requests.get(url, timeout=timeout)

    if response.status_code != 200:
        print("Error getting HTML")
        return None

    return get_requirements(response.content)

# Run the requirements extraction on the Computer Science section and show the result.
courses = getHTML(url='https://utm.calendar.utoronto.ca/section/Computer-science')
print(courses)

[REDACTED: an API secret key was captured here in the notebook output; revoke and rotate it, and clear cell outputs before committing]
{
	year1: ["CSC108H5", "CSC148H5", "ISP100H5", "MAT102H5", "MAT132H5", "MAT134H5"],
	year2: ["CSC207H5", "CSC236H5", "CSC209H5", "MAT223H5", "STA246H5"],
	year3: ["CSC369H5"],
	year4: [""]
}
None
