In [1]:
from bs4 import BeautifulSoup
import requests
import re
import os
from openai import OpenAI
from dotenv import load_dotenv

load_dotenv()

# Module-level prompt pieces read by get_courses() when it builds each request.
# NOTE: `format` shadows the builtin of the same name; the name is kept because
# get_courses() refers to it.
prompt = "The answer to this question should be a Prerequisites Document. Respond only with a Prerequisites Document and nothing else."
format = ""
examples = ""
with open("prerequisitesFormat.txt") as file:
    format = file.read()
# Bug fix: this file used to be read into `format`, which discarded the format
# specification loaded just above and left `examples` as an empty string.
with open("prerequisiteExamples.txt") as file:
    examples = file.read()


# The API key is taken from the environment (populated from .env by load_dotenv()).
secret = os.getenv("OPENAI_API_KEY")
client = OpenAI(api_key=secret)



### SCRAPING LIST OF COURSES ###

def get_courses(html: bytes):
    """Extract courses from a calendar section page and structure their prerequisites.

    For every direct child ``div`` of the courses view, the course name,
    description and raw prerequisite text are read from the markup, and the
    prerequisite text is sent to the chat-completions API together with the
    module-level ``prompt``, ``format`` and ``examples`` strings.

    Args:
        html: Raw HTML of the section page.

    Returns:
        A list of dicts with the keys ``"name"``, ``"description"`` and
        ``"prerequisites"`` (the model's reply as a string). Entries that lack
        any of the three fields, or whose API request fails, are skipped and a
        one-line message is printed for them.

    Raises:
        AttributeError: if the page does not contain the expected courses view.
    """
    soup = BeautifulSoup(html, 'html.parser')
    div = soup.find('div', class_=re.compile('^w3-row view view-courses-view view-id-courses_view'))
    view_content_div = div.find('div', class_=re.compile('view-content'))
    child_divs = view_content_div.find_all('div', recursive=False)
    courses = []

    # The scraped prerequisite text begins with this label (previously removed
    # with a blind [14:] slice, which also chopped text when the label was absent).
    label = "Prerequisites:"

    for child_div in child_divs:
        try:
            # Extract course name
            course_name = child_div.find('div', {'aria-label': True}).get_text(strip=True)

            # Extract description
            description = child_div.find('div', class_='views-field-field-desc').get_text(strip=True)

            # Extract prerequisites
            raw_prerequisites = child_div.find('span', class_='views-field-field-prerequisite').get_text(strip=True)
            if raw_prerequisites.startswith(label):
                prerequisite_text = raw_prerequisites[len(label):]
            else:
                prerequisite_text = raw_prerequisites

            # Built as a plain string; the original had a stray trailing comma
            # that produced a 1-tuple which was then unpacked with [0].
            request_text = f"{prompt} {format} You will be given a text description of prerequisites, taken from the course requirements page from the University of Toronto. It is your job to decide which courses a student will need to take. Here are some examples of expected values given an input of prerequisites: {examples}. Here is the prerequisites: {prerequisite_text}. Remember, ONLY respond using the Prerequisites format, and NOTHING ELSE."
            completion = client.chat.completions.create(
                model="gpt-4o-mini",
                messages=[
                    {
                        "role": "user",
                        "content": [{"type": "text", "text": request_text}],
                    }
                ],
            )

            course = {
                "name": course_name,
                "description": description,
                "prerequisites": completion.choices[0].message.content,
            }
            courses.append(course)
            print(course, raw_prerequisites)
        except Exception as exc:
            # Still best-effort (one bad entry must not abort the scrape), but the
            # reason is reported instead of being silently discarded, and
            # KeyboardInterrupt/SystemExit are no longer swallowed.
            print(f"Skipping course entry: {exc!r}")

    return courses


def getHTMLCourses(url: str, timeout: float = 30):
    """Download a calendar section page and parse its courses.

    Args:
        url: Address of the section page to fetch.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests.get`` can block indefinitely.

    Returns:
        The list produced by ``get_courses`` on success, or ``None`` (after
        printing a message) when the response status is not 200.

    Raises:
        requests.exceptions.RequestException: on connection errors or timeout.
    """
    response = requests.get(url, timeout=timeout)

    if response.status_code != 200:
        print("Error getting HTML")
        return None

    return get_courses(response.content)
    

# courses = getHTMLCourses('https://utm.calendar.utoronto.ca/section/Mathematical-Sciences')
# print(courses)

In [None]:
import traceback
from bs4 import BeautifulSoup
import requests
import re
from openai import OpenAI
import os
from dotenv import load_dotenv

load_dotenv()

# OpenAI client, authenticated with the key found in the environment.
secret = os.getenv("OPENAI_API_KEY")
client = OpenAI(api_key=secret)

# Prompt pieces read by get_requirements().
# NOTE(review): `prompt`, `format` and `client` are also defined by the first
# cell and read by get_courses(); running this cell rebinds them.
prompt = (
    "The answer to this question should be a CourseRequirements Document. "
    "Respond only with a CourseRequirements Document and nothing else."
)
format = ""
with open("courseRequirementsFormat.txt") as file:
    format = file.read()


### GET LIST OF PROGRAMS ###

def get_requirements(html: bytes):
    """Extract completion requirements for a program and structure them via the API.

    Locates the programs view in the page, takes the "Completion Requirements"
    heading of the selected program block, collects each ``<p>`` that is
    immediately followed by an ``<ol>`` (plus the text of that list's ``<li>``
    items), and sends the joined text to the chat-completions API together
    with the module-level ``prompt`` and ``format`` strings.

    Args:
        html: Raw HTML of the section page.

    Returns:
        A list with one dict per processed program block. On success the dict
        holds the model's reply under ``"requirements"`` (previously the reply
        was only printed and the dict was always empty). If processing a block
        fails, the traceback is printed and an empty dict is appended.

    Raises:
        AttributeError: if the page does not contain the expected programs view.
    """
    soup = BeautifulSoup(html, 'html.parser')
    div = soup.find('div', class_=re.compile('^w3-row view view-programs-view view-id-programs_view'))
    view_content_div = div.find('div', class_=re.compile('view-content'))
    child_divs = view_content_div.find_all('div', recursive=False)
    programs = []

    # Only the second program block is processed, as in the original code.
    # Slicing (instead of indexing [1]) yields an empty list rather than an
    # IndexError when the page has fewer than two blocks.
    child_divs = child_divs[1:2]

    for child_div in child_divs:
        d = {}

        try:
            completion_section = child_div.find('h3', class_='views-label views-label-field-completion-req')

            # Get all <p>/<ol> elements after the "Completion Requirements" section.
            # NOTE(review): find_all_next() walks the rest of the document, so this
            # is not limited to child_div - confirm that is intended.
            content_after_completion = completion_section.find_next().find_all_next(['p', 'ol'])

            # Keep each <p> that is directly followed by an <ol>, then the text
            # of every <li> inside that <ol>.
            filtered_content = []
            for heading, following in zip(content_after_completion, content_after_completion[1:]):
                if heading.name == 'p' and following.name == 'ol':
                    filtered_content.append(heading.get_text())
                    filtered_content.extend(li.get_text() for li in following.find_all('li'))

            # Join the filtered content as a single string with a newline between each item
            plaintext = "\n".join(filtered_content)

            # Built as a plain string; the original had a stray trailing comma
            # that produced a 1-tuple which was then unpacked with [0].
            content = f"{prompt} {format} You will be given a text description of course requirements, taken from the course requirements page from the University of Toronto. It is your job to decide which courses a student will need to take. If you are unsure which course to pick, use the fallback course code, which is the empty string. Here is the course requirements page: {plaintext}. Remember, ONLY respond using the CourseRequirements format, and NOTHING ELSE."
            chat_completion = client.chat.completions.create(
                model="gpt-4o-mini",
                messages=[
                    {
                        "role": "user",
                        "content": [{"type": "text", "text": content}],
                    }
                ],
            )
            answer = chat_completion.choices[0].message.content
            print(answer)
            d["requirements"] = answer

        except Exception:
            traceback.print_exc()

        programs.append(d)
    return programs


def getHTMLPrograms(url: str, timeout: float = 30):
    """Download a calendar section page and parse its program requirements.

    Args:
        url: Address of the section page to fetch.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests.get`` can block indefinitely.

    Returns:
        The list produced by ``get_requirements`` on success, or ``None``
        (after printing a message) when the response status is not 200.

    Raises:
        requests.exceptions.RequestException: on connection errors or timeout.
    """
    response = requests.get(url, timeout=timeout)

    if response.status_code != 200:
        print("Error getting HTML")
        return None

    return get_requirements(response.content)

# courses = getHTMLPrograms('https://utm.calendar.utoronto.ca/section/Computer-science')
# print(courses)

In [3]:
import chromadb
import chromadb.utils.embedding_functions as embedding_functions
from chromadb.config import Settings
import os

CHUNKS = 1 # Depending on the size of the input data, this number may need to be increased due to batch size limits

def create_model(courses):
    """Store course descriptions in the persistent Chroma collection "vector_search".

    Each course description becomes a document; the course name and
    prerequisites are attached as metadata under ``item_id`` and
    ``prerequisites``. Ids are ``id1``, ``id2``, ... in input order.

    Args:
        courses: Iterable of dicts with the keys ``"name"``, ``"description"``
            and ``"prerequisites"``.

    Returns:
        None. Nothing is written when ``courses`` is empty.
    """
    chroma_client = chromadb.PersistentClient(settings=Settings(allow_reset=True))
    try:
        # Bug fix: this lookup used the misspelled name "vector_earch", so it
        # always failed and every call wiped the database in the except branch.
        collection = chroma_client.get_collection("vector_search")
    except Exception:
        # The exception type for a missing collection differs between chromadb
        # versions, hence the broad (but no longer bare) handler.
        chroma_client.reset()
        collection = chroma_client.create_collection(name="vector_search")

    documents = []
    metadatas = []
    ids = []
    for number, course in enumerate(courses, start=1):
        documents.append(course["description"])
        metadatas.append({'item_id': course["name"], "prerequisites": course["prerequisites"]})
        ids.append(f'id{number}')

    if not documents:
        # Previously an empty input produced a step of 0 and range() raised ValueError.
        return

    # At least one document per batch, so the range step can never be zero
    # (this also covers len(documents) < CHUNKS).
    batch_size = max(1, len(documents) // CHUNKS)

    for start in range(0, len(documents), batch_size):
        stop = start + batch_size
        # upsert rather than add: now that an existing collection is reused, the
        # ids id1..idN may already be present and must be overwritten.
        # NOTE(review): entries with ids beyond the new count are left in place
        # from earlier runs - confirm whether the collection should be rebuilt.
        collection.upsert(
            documents=documents[start:stop],
            metadatas=metadatas[start:stop],
            ids=ids[start:stop],
        )

In [1]:
def query_courses(query: str, n_results: int = 10):
    """Return the courses whose stored descriptions best match ``query``.

    Args:
        query: Free-text description of an interest.
        n_results: Maximum number of matches to request from the collection.

    Returns:
        A list of dicts with the keys ``"name"``, ``"description"`` and
        ``"prerequisites"``, one per match returned by the collection. The list
        may be shorter than ``n_results``.
    """
    chroma_client = chromadb.PersistentClient(settings=Settings(allow_reset=True))
    collection = chroma_client.get_collection("vector_search")
    results = collection.query(
        query_texts=[query],
        n_results=n_results
    )

    # One query text was sent, so the matches are in the first (only) inner list.
    metadatas = results["metadatas"][0]
    documents = results["documents"][0]

    # Iterate over what actually came back; indexing a fixed range(n) raised
    # IndexError whenever the collection returned fewer than n matches.
    return [
        {
            "name": metadata["item_id"],
            "description": document,
            "prerequisites": metadata["prerequisites"],
        }
        for metadata, document in zip(metadatas, documents)
    ]


In [None]:
# Scrape the Mathematical Sciences section and keep the parsed courses in
# `courses_db`. get_courses() sends one chat-completion request per course
# entry it can parse, so running this cell is slow and consumes API quota.
courses_db = getHTMLCourses('https://utm.calendar.utoronto.ca/section/Mathematical-sciences')

{'name': 'MAT102H5 • Introduction to Mathematical Proofs', 'description': 'Understanding, using and developing precise expressions of mathematical ideas, including definitions and theorems. Set theory, logical statements and proofs, induction, topics chosen from combinatorics, elementary number theory, Euclidean geometry.', 'prerequisites': '{\n\tyear1: ["MHF4U"],\n\tyear2: [],\n\tyear3: [],\n\tyear4: []\n}'} Prerequisites:Minimum 70% in Grade 12 Advanced Functions (MHF4U)
{'name': 'MAT132H5 • Differential Calculus for Life Sciences', 'description': 'Review of functions and their graphs, trigonometry, exponentials and logarithms. Limits and continuity of functions of a single variable. Derivatives and differentiation techniques. Applications of differentiation, including extreme values, related rates and optimization. Life science applications are emphasized.', 'prerequisites': '{\n\tyear1: ["MHF4U"],\n\tyear2: [],\n\tyear3: [],\n\tyear4: []\n}'} Prerequisites:Minimum 70% in Grade 12 A

In [None]:
import ast

def _parse_prerequisites(text: str):
    """Evaluate a prerequisites literal, accepting bare identifiers as dict keys.

    The stored prerequisites look like ``{year1: ["MHF4U"], year2: []}``, which
    plain ``ast.literal_eval`` rejects because the keys are unquoted names.
    The text is parsed to an AST, bare-name dict keys are turned into string
    constants, and the result is evaluated with ``ast.literal_eval`` so that
    still only literals are accepted (no code is executed).
    """
    tree = ast.parse(text.strip(), mode="eval")
    for node in ast.walk(tree):
        if isinstance(node, ast.Dict):
            node.keys = [
                ast.Constant(value=key.id) if isinstance(key, ast.Name) else key
                for key in node.keys
            ]
    return ast.literal_eval(tree)


def search_db(name: str, courses_db):
    """Look up the parsed prerequisites of the course with the given code.

    Args:
        name: Course code, compared against the first 8 characters of each
            course's ``"name"``.
        courses_db: Iterable of course dicts with ``"name"`` and
            ``"prerequisites"`` keys.

    Returns:
        The evaluated prerequisites literal of the first matching course, or
        ``[]`` when no course matches.

    Raises:
        ValueError, SyntaxError: if the stored prerequisites text is not a
            literal (apart from bare-name dict keys).
    """
    for course in courses_db:
        if course["name"][:8] == name:
            return _parse_prerequisites(course["prerequisites"])
    return []


def get_full_trajectory(query, courses_db):
    """Build the prerequisite tree for the courses that match an interest query.

    The courses returned by ``query_courses(query)`` are reduced to the first
    8 characters of their names, and that list is handed to ``get_prereqs``
    together with ``courses_db``.
    """
    course_codes = [match["name"][:8] for match in query_courses(query)]
    return get_prereqs(course_codes, courses_db)


def get_prereqs(courses, courses_db, _path=()):
    """Recursively expand courses into a nested dict of their prerequisites.

    Args:
        courses: Iterable of course codes to expand.
        courses_db: Course records passed through to ``search_db``.
        _path: Internal - codes on the current recursion path, used to stop at
            cycles. Callers should not pass it.

    Returns:
        A dict mapping each course code to the same kind of dict for its
        prerequisites; a course with no (known) prerequisites maps to ``{}``.
    """
    tree = {}
    for course in courses:
        if course in _path:
            # The course is its own (indirect) prerequisite; stop here instead
            # of recursing until RecursionError.
            tree[course] = {}
            continue

        prereqs = search_db(course, courses_db)
        if isinstance(prereqs, dict):
            # Stored prerequisites may be grouped in buckets (e.g. year1..year4);
            # iterating the dict directly would treat the bucket names as course
            # codes, so the buckets are flattened to a list of codes.
            # NOTE(review): assumes each bucket is a list of codes or a single
            # code string - confirm against the prerequisites format file.
            flattened = []
            for bucket in prereqs.values():
                if isinstance(bucket, str):
                    flattened.append(bucket)
                else:
                    flattened.extend(bucket)
            prereqs = flattened

        tree[course] = get_prereqs(prereqs, courses_db, _path + (course,))

    return tree


# get_full_trajectory takes (query, courses_db). The call used to pass the
# section URL as the query and the string "Linear algebra" as the course list,
# which fails as soon as search_db indexes a character with ["name"].
get_full_trajectory("Linear algebra", courses_db)

In [2]:
from flask import Flask, abort, request, session

import os
import secrets

app = Flask(__name__)
# The secret key signs the session cookie, so it must not be a literal in the
# notebook. It is read from FLASK_SECRET_KEY; when that is unset, a random key
# is generated for this process, which means sessions do not survive a restart
# and FLASK_SECRET_KEY must be set when several worker processes are used.
app.secret_key = os.environ.get("FLASK_SECRET_KEY") or secrets.token_hex(32)

@app.route("/set_courses", methods=["POST"])
def set_courses():
    """Scrape the courses of the requested calendar section into the session.

    Expects the form field ``program_type`` (the section name used in the
    calendar URL). Responds 400 when it is missing or empty and 502 when the
    calendar page could not be fetched; otherwise stores the course list under
    ``session['courses']`` and returns "Success".
    """
    from urllib.parse import quote

    program_type = request.form.get('program_type')
    if not program_type:
        # Previously a missing field was interpolated as the text "None".
        abort(400)

    # The value comes from the client: escape it (including "/") so it stays a
    # single path segment of the calendar URL.
    section = quote(program_type, safe='')
    courses = getHTMLCourses(f'https://utm.calendar.utoronto.ca/section/{section}')
    if courses is None:
        # getHTMLCourses returns None on a non-200 response; do not report
        # success or store None in the session.
        abort(502)

    # NOTE(review): with Flask's default cookie-backed session a full course
    # list may exceed browser cookie size limits - confirm, or store server-side.
    session['courses'] = courses
    return "Success"

@app.route("/get_courses", methods=["GET"])
def get_crsz():
    """Return the course list stored in the session, or respond 500 if none is stored."""
    try:
        return session['courses']
    except KeyError:
        # Only the "no courses stored yet" case is translated to an abort; a
        # bare except also caught unrelated errors and interrupts.
        abort(500)
    

@app.route("/set_collection", methods=["POST"])
def set_collection():
    """Load the session's course list into the vector collection via create_model."""
    stored_courses = session['courses']
    create_model(stored_courses)
    return "Success"


@app.route("/interest_query", methods=["POST"])
def interest_query():
    """Return the courses matching the ``query`` form field.

    Responds 400 when the field is missing or empty; otherwise returns the
    result of ``query_courses``.
    """
    # Local name differs from the view function's name to avoid shadowing it.
    query_text = request.form.get('query')
    if not query_text:
        # Previously None was passed on to the collection query.
        abort(400)
    print(query_text)
    return query_courses(query_text)


@app.route("/interest_timeline", methods=["POST"])
def interest_timeline():
    """Return the prerequisite tree for the courses matching the ``query`` form field.

    Responds 400 when the field is missing or empty, and 500 when no course
    list has been stored in the session (the same status /get_courses uses
    for that case).
    """
    query_text = request.form.get('query')
    if not query_text:
        abort(400)

    try:
        stored_courses = session['courses']
    except KeyError:
        abort(500)

    return get_full_trajectory(query_text, stored_courses)


# Start Flask's built-in server on port 5328 when this code runs as the main module.
# NOTE(review): app.run() serves until interrupted (see "Press CTRL+C to quit"
# in the cell output), so in a notebook this cell occupies the kernel while it runs.
if __name__ == '__main__':
    app.run(port=5328)

 * Serving Flask app '__main__'
 * Debug mode: off


 * Running on http://127.0.0.1:5328
Press CTRL+C to quit
