In [None]:
from bs4 import BeautifulSoup
import requests
import re
import os
from openai import OpenAI
from dotenv import load_dotenv
import chromadb
import chromadb.utils.embedding_functions as embedding_functions
from chromadb.config import Settings
import os
import ast

# Load environment variables through python-dotenv before the API key is read.
load_dotenv()

# Text prepended to every prerequisite-formatting prompt in get_courses().
with open("prerequisiteExamples.txt") as file:
    examples = file.read()


# Shared OpenAI client, authenticated with the OPENAI_API_KEY environment variable.
secret = os.getenv("OPENAI_API_KEY")
client = OpenAI(api_key=secret)



### SCRAPING LIST OF COURSES ###

def get_courses(html: bytes):
    """Parse a course-listing page into a list of course dicts.

    Each direct child ``div`` of the listing's ``view-content`` container is
    treated as one course. Its prerequisite text is sent to the
    ``gpt-4o-mini`` chat model, prefixed with the module-level ``examples``
    text, and the model's reply is stored as the course's prerequisites.

    Args:
        html: Raw HTML of the course-listing page.

    Returns:
        A list of dicts with the keys ``"name"``, ``"description"`` and
        ``"prerequisites"``. Entries that lack one of the expected fields, or
        whose prerequisite formatting request fails, are skipped. An empty
        list is returned when the listing container is not found.
    """
    soup = BeautifulSoup(html, 'html.parser')
    listing = soup.find('div', class_=re.compile('^w3-row view view-courses-view view-id-courses_view'))
    content = listing.find('div', class_=re.compile('view-content')) if listing is not None else None
    if content is None:
        # Unrecognised page layout: report it instead of failing on None.
        print("Error parsing HTML: course listing not found")
        return []

    courses = []
    for child_div in content.find_all('div', recursive=False):
        try:
            # Extract course name
            course_name = child_div.find('div', {'aria-label': True}).get_text(strip=True)

            # Extract description
            description = child_div.find('div', class_='views-field-field-desc').get_text(strip=True)

            # Extract prerequisites
            raw_prerequisites = child_div.find('span', class_='views-field-field-prerequisite').get_text(strip=True)
        except AttributeError:
            # find() returned None: this entry lacks a required field, skip it.
            continue

        # The [14:] slice drops the first 14 characters of the scraped text,
        # presumably the "Prerequisites:" label -- TODO confirm against the page.
        prompt = f"{examples}. The following is an unformatted string containing course codes that you will format: {raw_prerequisites[14:]}. Remember, ONLY respond using the given format rules, and nothing else."

        try:
            completion = client.chat.completions.create(
                model="gpt-4o-mini",
                messages=[
                    {
                        "role": "user",
                        "content": prompt,
                    }
                ],
            )
            prerequisites = completion.choices[0].message.content
        except Exception as exc:
            # Scraping stays best-effort: one failed API call must not abort
            # the whole run, but the failure is reported rather than hidden.
            print(f"Skipping {course_name}: prerequisite formatting failed ({exc})")
            continue

        course = {
            "name": course_name,
            "description": description,
            "prerequisites": prerequisites,
        }
        courses.append(course)
        print(course)

    return courses


def getHTMLCourses(url: str):
    """Download ``url`` and return the courses parsed from the page.

    Args:
        url: Address of the course-listing page to fetch.

    Returns:
        The list produced by ``get_courses`` for the downloaded HTML, or an
        empty list when the server does not answer with status 200.

    Raises:
        requests.RequestException: If the request itself fails or times out.
    """
    print(url)
    # Without a timeout, requests waits indefinitely on an unresponsive server.
    response = requests.get(url, timeout=30)

    if response.status_code != 200:
        print("Error getting HTML")
        # Explicit empty result so every path returns a list.
        return []

    return get_courses(response.content)


CHUNKS = 1 # Depending on the size of the input data, this number may need to be increased due to batch size limits

def create_model(courses):
    """Rebuild the ``vector_search`` Chroma collection from ``courses``.

    Each course description becomes a document; the course name and its
    prerequisites are stored as metadata under ``item_id`` and
    ``prerequisites``. Ids are ``id1``, ``id2``, ... in input order.

    Args:
        courses: Iterable of dicts with the keys ``"name"``,
            ``"description"`` and ``"prerequisites"``. When it is empty or
            ``None`` nothing is indexed and the existing store is untouched.
    """
    if not courses:
        # Nothing to index: avoid wiping the current store for an empty input.
        print("No courses to index")
        return

    chroma_client = chromadb.PersistentClient(settings=Settings(allow_reset=True))
    # Every call rebuilds the index from scratch so that documents and ids from
    # a previous course list cannot survive. reset() clears the whole
    # persistent store, which is why the client is created with allow_reset=True.
    chroma_client.reset()
    collection = chroma_client.create_collection(name="vector_search")

    documents = []
    metadatas = []
    ids = []
    for index, course in enumerate(courses, start=1):
        documents.append(course["description"])
        metadatas.append({'item_id': course["name"], "prerequisites": course["prerequisites"]})
        ids.append(f'id{index}')

    # Ceiling division spreads the documents over at most CHUNKS batches; the
    # floor of 1 keeps the range() step non-zero when there are fewer
    # documents than CHUNKS.
    batch_size = max(1, -(-len(documents) // CHUNKS))

    for start in range(0, len(documents), batch_size):
        stop = start + batch_size
        collection.add(
            documents=documents[start:stop],
            metadatas=metadatas[start:stop],
            ids=ids[start:stop],
        )
    

def query_courses(query: str, n: int = 5):
    """Return the courses whose descriptions best match ``query``.

    Args:
        query: Free-text description of the user's interest.
        n: Maximum number of matches requested from the collection.

    Returns:
        A list of dicts with the keys ``"name"``, ``"description"`` and
        ``"prerequisites"``, one per hit returned by the ``vector_search``
        collection.
    """
    chroma_client = chromadb.PersistentClient(settings=Settings(allow_reset=True))
    collection = chroma_client.get_collection("vector_search")
    results = collection.query(
        query_texts=[query],
        n_results=n
    )
    print(results)

    # Only one query text is sent, so the hits are in the first inner list.
    # Pair up what actually came back instead of indexing 0..n-1: the result
    # may hold fewer than n hits when the collection is small.
    return [
        {
            "name": metadata["item_id"],
            "description": document,
            "prerequisites": metadata["prerequisites"],
        }
        for metadata, document in zip(results["metadatas"][0], results["documents"][0])
    ]

def resolve_data(years, interest_courses):
    """Expand per-year course codes into course entries for years 1 to 4.

    Args:
        years: Mapping from year number to a list of course codes. Years
            that are absent from the mapping are treated as empty.
        interest_courses: Course dicts (each with a ``"name"`` key) that
            matched the user's query.

    Returns:
        A dict with the keys 1, 2, 3 and 4. For each code, the first
        interest course whose name starts with that 8-character code is used
        in full; any other code becomes ``{"name": code}``.
    """
    timeline = {1: [], 2: [], 3: [], 4: []}
    for year, entries in timeline.items():
        # .get(): a year with no courses has no key in the mapping.
        for code in years.get(year, []):
            match = next(
                (course for course in interest_courses if course["name"][:8] == code),
                None,
            )
            entries.append(match if match is not None else {"name": code})
    return timeline




def search_db(name: str, courses_db):
    """Look up the prerequisite codes stored for the course ``name``.

    Args:
        name: Course code, compared against the first 8 characters of each
            entry's ``"name"``.
        courses_db: Iterable of course dicts whose ``"prerequisites"`` value
            is the text of a Python list literal.

    Returns:
        The prerequisites of the first matching course as a list. An empty
        list is returned when no course matches, or when the stored text is
        missing, cannot be parsed as a literal, or is not a list, tuple or set.
    """
    for course in courses_db:
        if course["name"][:8] != name:
            continue
        try:
            # literal_eval only accepts literals, so it is safe on this text,
            # but it raises when the text is not a valid literal.
            parsed = ast.literal_eval(course.get("prerequisites"))
        except (ValueError, SyntaxError, TypeError):
            return []
        # Callers iterate the result as course codes; reject other literals.
        return list(parsed) if isinstance(parsed, (list, tuple, set)) else []
    return []


def get_full_trajectory(query, courses_db):
    """Build the per-year course timeline for an interest query.

    The courses matching ``query`` are looked up, their prerequisite tree is
    resolved against ``courses_db``, the tree is grouped by year, and the
    grouped codes are expanded into course entries.

    Args:
        query: Free-text description of the user's interest.
        courses_db: Course dicts used to resolve prerequisites.

    Returns:
        The dict produced by ``resolve_data``.
    """
    interest_courses = query_courses(query)
    # Only the first 8 characters of each name are used as the course code.
    codes = [match["name"][:8] for match in interest_courses]
    prereq_tree = get_prereqs(codes, courses_db)
    by_year = categorize_courses(prereq_tree)
    return resolve_data(by_year, interest_courses)


def get_prereqs(courses, courses_db, _path=()):
    """Recursively build the prerequisite tree for ``courses``.

    Args:
        courses: Iterable of course codes.
        courses_db: Course dicts passed through to ``search_db``.
        _path: Internal. Codes on the current recursion path, used to stop
            at circular prerequisites. Callers should not pass it.

    Returns:
        A dict mapping each code to the tree (same shape) of its own
        prerequisites. A code that already appears on the current path maps
        to an empty dict instead of being expanded again.
    """
    tree = {}
    for course in courses:
        if course in _path:
            # Circular prerequisite chain: cut it here rather than recursing
            # until the interpreter's recursion limit is hit.
            tree[course] = {}
            continue
        prereqs = search_db(course, courses_db)
        tree[course] = get_prereqs(prereqs, courses_db, _path + (course,))

    return tree


def categorize_courses(course_dict, categorized_courses=None):
    """Group every course code in a prerequisite tree by its year digit.

    The year is the character at index 3 of the code, converted to ``int``.

    Args:
        course_dict: Nested dict mapping a course code to the tree of its
            prerequisites.
        categorized_courses: Optional accumulator to extend in place. A fresh
            dict is created when omitted, so separate calls never share state.

    Returns:
        A dict mapping year number to the list of distinct codes for that
        year, in first-seen order. Codes without a digit at index 3 are left
        out, but their prerequisites are still visited.
    """
    if categorized_courses is None:
        # A literal {} default would be shared by every call and would leak
        # the courses of earlier queries into later results.
        categorized_courses = {}

    for course, prereqs in course_dict.items():
        try:
            year_label = int(course[3])
        except (IndexError, ValueError, TypeError):
            # Not a recognisable course code: it cannot be placed in a year.
            year_label = None

        if year_label is not None:
            bucket = categorized_courses.setdefault(year_label, [])
            if course not in bucket:
                bucket.append(course)

        categorize_courses(prereqs, categorized_courses)

    return categorized_courses

In [None]:
from flask import Flask, request, jsonify
from flask_cors import CORS

# Flask application object; the route handlers below are registered on it.
app = Flask(__name__)
# Enable CORS for the app; supports_credentials=True is passed to flask-cors.
CORS(app, supports_credentials=True)

@app.route("/set_courses", methods=["POST"])
def set_courses():
    """Scrape the courses of a calendar section and index them.

    Form fields:
        program_type: Section name appended to the calendar URL.

    Returns:
        The scraped courses as JSON. Responds with 400 when ``program_type``
        is missing and with 502 when no courses could be retrieved.
    """
    program_type = request.form.get('program_type')
    if not program_type:
        # Without this check the request would go to ".../section/None".
        return jsonify(error="Missing 'program_type' form field"), 400

    # NOTE(review): program_type is client-supplied and is placed in the URL
    # path unchanged -- consider restricting it to known section names.
    courses = getHTMLCourses(f'https://utm.calendar.utoronto.ca/section/{program_type}')
    if not courses:
        return jsonify(error="No courses could be retrieved"), 502

    create_model(courses)
    # jsonify keeps the list response working on Flask versions that do not
    # accept a bare list as a view return value.
    return jsonify(courses)


@app.route("/set_collection", methods=["POST"])
def set_collection():
    """Index a client-supplied course list.

    Form fields:
        courses: JSON-encoded list of course dicts, decoded the same way as
            in the ``/interest_timeline`` handler.

    Returns:
        ``{"success": true}`` as JSON, or a 400 response when the field is
        missing or is not valid JSON.
    """
    try:
        # Form values arrive as text; create_model needs the decoded dicts.
        courses = json.loads(request.form.get("courses"))
    except (TypeError, ValueError):
        # TypeError: field missing (None). ValueError: malformed JSON.
        return jsonify(success=False, error="'courses' must be a JSON-encoded list"), 400

    create_model(courses)
    return jsonify(success=True)


import json
@app.route("/interest_timeline", methods=["POST"])
def interest_timeline():
    """Return the per-year course timeline for an interest query.

    Form fields:
        courses: JSON-encoded list of course dicts used to resolve
            prerequisites.
        query: Free-text description of the user's interest.

    Returns:
        The dict built by ``get_full_trajectory``, or a 400 response when a
        field is missing or ``courses`` is not valid JSON.
    """
    interest_query = request.form.get('query')
    if not interest_query:
        return jsonify(error="Missing 'query' form field"), 400

    try:
        courses_db = json.loads(request.form.get("courses"))
    except (TypeError, ValueError):
        # TypeError: field missing (None). ValueError: malformed JSON.
        return jsonify(error="'courses' must be a JSON-encoded list"), 400

    return get_full_trajectory(interest_query, courses_db)


# Start the Flask server on port 5328 when this file is executed directly.
if __name__ == '__main__':
    app.run(port=5328)

 * Serving Flask app '__main__'
 * Debug mode: off


 * Running on http://127.0.0.1:5328
Press CTRL+C to quit
127.0.0.1 - - [15/Feb/2025 22:05:39] "POST /interest_timeline HTTP/1.1" 200 -


{'ids': [['id15', 'id39', 'id31', 'id40', 'id41']], 'embeddings': None, 'documents': [['An introduction to methods for automated learning of relationships on the basis of empirical data. Classification and regression using nearest neighbour methods, decision trees, linear models, and neural networks. Clustering algorithms. Problems of overfitting and of assessing accuracy. Basics of reinforcement learning.', 'An introduction to neural networks and deep learning. Backpropagation and automatic differentiation. Architectures: convolutional networks and recurrent neural networks. Methods for improving optimization and generalization. Neural networks for unsupervised and reinforcement learning.', 'Theories and algorithms that capture (or approximate) some of the core elements of computational intelligence. Topics include: search, logical representations and reasoning, classical automated planning, representing and reasoning with uncertainty, learning, decision making (planning) under uncert

127.0.0.1 - - [15/Feb/2025 22:06:09] "POST /interest_timeline HTTP/1.1" 200 -


{'ids': [['id15', 'id39', 'id31', 'id40', 'id41']], 'embeddings': None, 'documents': [['An introduction to methods for automated learning of relationships on the basis of empirical data. Classification and regression using nearest neighbour methods, decision trees, linear models, and neural networks. Clustering algorithms. Problems of overfitting and of assessing accuracy. Basics of reinforcement learning.', 'An introduction to neural networks and deep learning. Backpropagation and automatic differentiation. Architectures: convolutional networks and recurrent neural networks. Methods for improving optimization and generalization. Neural networks for unsupervised and reinforcement learning.', 'Theories and algorithms that capture (or approximate) some of the core elements of computational intelligence. Topics include: search, logical representations and reasoning, classical automated planning, representing and reasoning with uncertainty, learning, decision making (planning) under uncert