In [6]:
from bs4 import BeautifulSoup
import requests
import re
import os
from openai import OpenAI
from dotenv import load_dotenv

load_dotenv()

prompt = "The answer to this question should be a Preerquisites Document. Respond only with a Prerequisites Document and nothing else."
format = ""
with open("prerequisitesFormat.txt") as file:
    format = file.read()


secret = os.getenv("OPENAI_API_KEY")
client = OpenAI(api_key=secret)



### SCRAPING LIST OF COURSES ###

def get_courses(html: bytes):
    soup = BeautifulSoup(html, 'html.parser')
    div = soup.find('div', class_=re.compile('^w3-row view view-courses-view view-id-courses_view'))
    view_content_div = div.find('div', class_=re.compile('view-content'))
    child_divs = view_content_div.find_all('div', recursive=False)
    courses = []

    i = 0
    

    for child_div in child_divs:
        i += 1
        d = {}

        try:
            # Extract course name
            course_name = child_div.find('div', {'aria-label': True}).get_text(strip=True)

            # Extract description
            description = child_div.find('div', class_='views-field-field-desc').get_text(strip=True)

            # Extract prerequisites
            prerequisites = child_div.find('span', class_='views-field-field-prerequisite').get_text(strip=True)
            prerequisites = f"{prompt} {format} You will be given a text description of prerequisites, taken from the course requirements page from the University of Toronto. It is your job to decide which courses a student will need to take. If you are unsure which course to pick, use the fallback course code, which is the empty string. IF you see a course code that is not 8 characters long, it is not a valid, and you should use the fallback course code. Here is the prerequisites: {prerequisites[14:]}. Remember, ONLY respond using the Prerequisites format, and NOTHING ELSE. Prerequisites seperated by 'or' do not both need to be taken, however, prerequisites seperated by 'and' must include both.",
            prerequisites = client.chat.completions.create(
                model="gpt-4o-mini",
                messages=[
                    {
                        "role": "user",
                        "content": [{"type": "text", "text": prerequisites[0]}],
                    }
                ],
            )
            prerequisites = prerequisites.choices[0].message.content

            d["name"] = course_name
            d["description"] = description
            d["prerequisites"] = prerequisites
            courses.append(d)
        except:
            pass

    return courses


def getHTMLCourses(url: str): 
    response = requests.get(url) 

    if response.status_code != 200:
        print("Error getting HTML")
    else:
        html = response.content
        return get_courses(html)
    

courses = getHTMLCourses('https://utm.calendar.utoronto.ca/section/Mathematical-Sciences')
print(courses)

[{'name': 'MAT102H5 • Introduction to Mathematical Proofs', 'description': 'Understanding, using and developing precise expressions of mathematical ideas, including definitions and theorems. Set theory, logical statements and proofs, induction, topics chosen from combinatorics, elementary number theory, Euclidean geometry.', 'prerequisites': '["MHF4U"]'}, {'name': 'MAT132H5 • Differential Calculus for Life Sciences', 'description': 'Review of functions and their graphs, trigonometry, exponentials and logarithms. Limits and continuity of functions of a single variable. Derivatives and differentiation techniques. Applications of differentiation, including extreme values, related rates and optimization. Life science applications are emphasized.', 'prerequisites': '["MHF4U"]'}, {'name': 'MAT133Y5 • Calculus and Linear Algebra for Commerce', 'description': 'Mathematics of finance, matrices and linear equations. Review of differential calculus; applications. Integration and fundamental theor

In [None]:
import traceback
from bs4 import BeautifulSoup
import requests
import re
from openai import OpenAI
import os
from dotenv import load_dotenv

load_dotenv()

prompt = "The answer to this question should be a CourseRequirements Document. Respond only with a CourseRequirements Document and nothing else."
format = ""
with open("courseRequirementsFormat.txt") as file:
    format = file.read()

secret = os.getenv("OPENAI_API_KEY")


### GET LIST OF PROGRAMS ###

client = OpenAI(api_key=secret)

def get_requirements(html: bytes):
    soup = BeautifulSoup(html, 'html.parser')
    div = soup.find('div', class_=re.compile('^w3-row view view-programs-view view-id-programs_view'))
    view_content_div = div.find('div', class_=re.compile('view-content'))
    child_divs = view_content_div.find_all('div', recursive=False)
    programs = []
    child_divs = [child_divs[-2]]
    for child_div in child_divs:
        d = {}

        try:
            completion_section = child_div.find('h3', class_='views-label views-label-field-completion-req')

            # Get all content after the "Completion Requirements" section
            content_after_completion = completion_section.find_next().find_all_next(['p', 'ol'])

            # Filter for <p> followed directly by <ol>
            filtered_content = []
            for i in range(len(content_after_completion) - 1):
                if content_after_completion[i].name == 'p' and content_after_completion[i + 1].name == 'ol':
                    # Add the text from <p> and <ol> to the filtered content
                    filtered_content.append(content_after_completion[i].get_text())  # Text from <p>
                    
                    # Add the text from each <li> inside <ol>, preserving newlines
                    for li in content_after_completion[i + 1].find_all('li'):
                        filtered_content.append(li.get_text())

            # Join the filtered content as a single string with a newline between each item
            plaintext = "\n".join(filtered_content)

            # Print the plain text content with whitespace preserved
            content = f"{prompt} {format} You will be given a text description of course requirements, taken from the course requirements page from the University of Toronto. It is your job to decide which courses a student will need to take. If you are unsure which course to pick, use the fallback course code, which is the empty string. Here is the course requirements page: {plaintext}. Remember, ONLY respond using the CourseRequirements format, and NOTHING ELSE.",
            chat_completion = client.chat.completions.create(
                model="gpt-4o-mini",
                messages=[
                    {
                        "role": "user",
                        "content": [{"type": "text", "text": content[0]}],
                    }
                ],
                
            )
            print(chat_completion.choices[0].message.content)

        except Exception:
            traceback.print_exc()

        programs.append(d)
        break # just getting the first one in the list
    return programs


def getHTMLPrograms(url: str): 
    response = requests.get(url) 

    if response.status_code != 200:
        print("Error getting HTML")
    else:
        html = response.content
        return get_requirements(html)

courses = getHTML('https://utm.calendar.utoronto.ca/section/Computer-science')
print(courses)

{
	year1: ["CSC108H5", "CSC148H5", "ISP100H5", "MAT102H5", "MAT132H5", "MAT135H5", "MAT137H5", "MAT157H5", "MAT134H5", "MAT136H5", "MAT139H5", "MAT159H5", "MAT137Y5", "MAT157Y5", "MAT233H5"],
	year2: ["CSC207H5", "CSC236H5", "CSC209H5", "CSC258H5", "CSC263H5", "MAT223H5", "MAT240H5", "STA246H5", "STA256H5", "ECO227Y5"],
	year3: ["CSC369H5", "CSC311H5", "CSC338H5", "CSC347H5", "CSC376H5", "GGR335H5", "GGR337H5", "GGR437H5"],
	year4: [""]
}
[{}]


In [2]:
import chromadb
import chromadb.utils.embedding_functions as embedding_functions
import os

CHUNKS = 1 # Depending on the size of the input data, this number may need to be increased due to batch size limits

chroma_client = chromadb.Client()
collection = chroma_client.create_collection(name="vector_search")

import csv

documents = []
metadatas = []
ids = []
id = 1

courses = getHTMLCourses('https://utm.calendar.utoronto.ca/section/Computer-science')

for course in courses:
    documents.append(course["description"])
    metadatas.append({'item_id': course["name"]})
    ids.append(f'id{id}')
    id += 1

t = len(documents) // CHUNKS

for i in range(0, len(documents), t):
    collection.add(
        documents=documents[i:i+t],
        metadatas=metadatas[i:i+t],
        ids=ids[i:i+t]
    )

In [9]:
results = collection.query(
    query_texts=["I want to learn machine learning"],
    n_results=5
)

print(results)

{'ids': [['id1', 'id39', 'id34', 'id55', 'id53']], 'embeddings': None, 'documents': [['This course is intended for students in the Bioinformatics Specialist degree program. Possible areas in which the research may take place include: functional genomics (e.g., microarray and proteomic data analysis); systems biology; and the development of novel analytical methods for large datasets. Students will be required to produce a written document of their project and present it orally. In order to enrol in this course, students must obtain, several months in advance, approval from a faculty member(s) who will serve as supervisor(s).', 'An introduction to neural networks and deep learning. Backpropagation and automatic differentiation. Architectures: convolutional networks and recurrent neural networks. Methods for improving optimization and generalization. Neural networks for unsupervised and reinforcement learning.', 'This course involves a significant literature search and expository work in