In [None]:
# Script for data processing and filtering

import requests
import csv
import re
import pandas as pd

term = 'FA24'

# Seconds to wait on each roster API call. Without a timeout, requests.get can
# block forever on a stalled connection and hang the whole cell.
REQUEST_TIMEOUT = 30

# Load exam_df -- future user: make sure to load it in!
exam_df = pd.read_excel("2024-08-30_MidAdd_ToSend_YYO.xlsx")

# Account for repeats: create a set of unique course ids from the "Course ID" column
unique_exam_course_ids = set(exam_df["Course ID"].unique())
# Create a df that contains associated course (course name ex. MATH2940) for each course id
#unique_exam_courses = exam_df.groupby("Course ID")["Course"].first().reset_index()

# Endpoints
subjects_url = f'https://classes.cornell.edu/api/2.0/config/subjects.json?roster={term}'
classes_url = 'https://classes.cornell.edu/api/2.0/search/classes.json'

# Single definition of the output path so the file that is written and the
# file named in the final message cannot drift apart.
requisites_csv = f'all_subjects_{term}_courses_requisites.csv'

response = requests.get(subjects_url, timeout=REQUEST_TIMEOUT)
if response.status_code == 200:
    subjects_data = response.json()
    # "or" fallbacks guard against a payload where 'data' or 'subjects' is null.
    subject_entries = (subjects_data.get('data') or {}).get('subjects') or []
    subjects = [subject['value'] for subject in subject_entries]

    with open(requisites_csv, mode='w', newline='', encoding='utf-8') as file:
        writer = csv.writer(file)
        writer.writerow(['Course ID', 'Prereq/Coreq'])

        for subject in subjects:
            params = {
                'roster': term,
                'subject': subject
            }
            try:
                classes_response = requests.get(classes_url, params=params, timeout=REQUEST_TIMEOUT)
            except requests.RequestException as exc:
                # One unreachable subject should not discard the rows already
                # written; report it and move on to the next subject.
                print(f'Failed to retrieve courses for subject {subject}: {exc}')
                continue

            if classes_response.status_code == 200:
                classes = (classes_response.json().get('data') or {}).get('classes') or []

                for course in classes:
                    course_id = course.get('crseId', 'N/A')
                    # Keep only courses that appear in the exam spreadsheet.
                    if course_id not in unique_exam_course_ids:
                        continue
                    prereq_coreq = course.get('catalogPrereqCoreq', 'None')
                    writer.writerow([course_id, prereq_coreq])
            else:
                print(f'Failed to retrieve courses for subject {subject}: HTTP {classes_response.status_code}')

    print(f'Data for all subjects has been written to {requisites_csv}')
else:
    print(f'Failed to retrieve subject list: HTTP {response.status_code}')


Failed to retrieve courses for subject CHERO: HTTP 404
Data for all subjects has been written to all_subjects_FA24_courses_requisites.csv


In [None]:
#check if CSV we wrote (and returned) from above was correct
#output_csv now stores the newly returned CSV
import pandas as pd

# Build the path from `term` (set in the first cell) so this check always reads
# the file the download step wrote, even after the roster term is changed.
output_csv = f'all_subjects_{term}_courses_requisites.csv'

df = pd.read_csv(output_csv)
print(df)

     Course ID                                       Prereq/Coreq
0       352628                                                NaN
1       367086                                                NaN
2       350500                                                NaN
3       350518                                                NaN
4       366462                                                NaN
..         ...                                                ...
718     352526  Prerequisite: BIOG 1500 and BIOG 1440 or BIOG ...
719     361232             Recommended prerequisite: BIOAP 4130. 
720     359148                                                NaN
721     359821                          Prerequisite: YORUB 1109.
722     366105  Prerequisite: ZULU 2117 or permission of instr...

[723 rows x 2 columns]


In [None]:
#Parse 'Prereq/Coreq' (treats "OR" like "AND" for now)

def parse_prerequisites(prereq_str):
    """
    Extract course codes from a free-text prerequisite/corequisite description.

    Every code found is returned, in order of appearance, with no whitespace
    between subject and number (e.g. "MATH 1920" -> "MATH1920"). "and"/"or"
    wording is ignored, so alternatives are treated the same as requirements.

    Parameters:
        prereq_str: the description text. Non-string values (None, NaN from
            pandas) are treated as "no prerequisites".

    Returns:
        list of str: the course codes found, possibly empty.
    """
    # Missing values arrive as None or float NaN; there is nothing to parse.
    if not isinstance(prereq_str, str):
        return []

    # Remove common prefixes and parentheses before matching.
    cleaned = prereq_str
    for token in ("Prerequisite:", "Corequisite:", "Prereq:", "Coreq:", "(", ")"):
        cleaned = cleaned.replace(token, "")
    cleaned = cleaned.strip()

    # Capture subject letters and catalog digits separately. Joining the two
    # groups drops whatever whitespace sat between them (space, tab, newline,
    # non-breaking space), not just a plain space character.
    matches = re.findall(r'([A-Z]{2,})\s*(\d{3,})', cleaned)
    return [f"{subject}{number}" for subject, number in matches]

# Redefine output_csv here so this cell does not depend on the CSV-check cell above.
# The path is built from `term` (set in the first cell) so it always names the
# file produced for the roster that was actually downloaded.
output_csv = f'all_subjects_{term}_courses_requisites.csv'

In [None]:
#exam mapping to map course to its course id:
# Key: upper-cased subject joined to the catalog value with all dashes removed
# (e.g. "MATH2940"). Value: that row's "Course ID". When two rows produce the
# same key, the later row wins.
exam_mapping = {
    (
        str(exam_row["Subject"]).strip().upper() + str(exam_row["Catalog"]).strip()
    ).replace("-", ""): exam_row["Course ID"]
    for _, exam_row in exam_df.iterrows()
}

# Build a dictionary where:
# - Each key is a course id string (from the "Course ID" column)
# - Each value is the list of prerequisite course codes, e.g. "MATH1920",
#   parsed from the "Prereq/Coreq" column (codes are mapped to ids further down)
prereq_dict = {}

# The file was written as UTF-8 with newline=''; read it back the same way so
# non-ASCII text and embedded line breaks are decoded identically everywhere.
with open(output_csv, 'r', newline='', encoding='utf-8') as file:
    reader = csv.DictReader(file)
    for row in reader:
        course_id = row['Course ID']
        # DictReader fills a field missing from a short row with None;
        # treat that like an empty description instead of crashing the parser.
        prereq_str = row['Prereq/Coreq'] or ''
        prerequisites = parse_prerequisites(prereq_str)
        prereq_dict[course_id] = prerequisites

# Translate each parsed prerequisite code into its course id via exam_mapping.
# Codes are normalized (dashes removed, stripped, upper-cased) before lookup;
# codes with no entry in exam_mapping are dropped.
updated_prereq_dict = {
    target_course: [
        exam_mapping[code]
        for code in (raw.replace("-", "").strip().upper() for raw in raw_codes)
        if code in exam_mapping
    ]
    for target_course, raw_codes in prereq_dict.items()
}

# For consistency, convert all keys and values to strings in updated_dictionary and remove the '.0' from prerequisites
def normalize_course_id(cid):
    """
    Return the course id as a stripped string with one trailing '.0' removed.

    Ids that passed through a float (e.g. 350500.0) are rendered like integer
    ids ('350500'); anything not ending in '.0' is returned unchanged apart
    from the surrounding whitespace.
    """
    text = str(cid).strip()
    return text[:-2] if text.endswith(".0") else text

# Rebuild the mapping with every key and every prerequisite id passed through
# normalize_course_id, so neither side carries a trailing ".0".
normalized_prereqs = {}
for raw_course, raw_prereqs in updated_prereq_dict.items():
    normalized_prereqs[normalize_course_id(raw_course)] = list(map(normalize_course_id, raw_prereqs))
updated_prereq_dict = normalized_prereqs

print(updated_prereq_dict)

{'352628': [], '367086': [], '350500': [], '350518': [], '366462': [], '351509': ['350500', '350518'], '367367': [], '364480': ['350025', '352116'], '350563': [], '351500': ['350025'], '351920': ['350500', '364480'], '351969': ['350500'], '368693': ['350518', '366462', '350500'], '361193': [], '361619': [], '361620': [], '364181': [], '369779': ['350025', '350038', '352116'], '352811': ['350500', '352116'], '354538': [], '358518': [], '358517': [], '357472': ['355146', '355147', '355207', '352257', '352263'], '353818': ['355146', '355147', '370289'], '353824': ['355208', '355210', '370289'], '370289': [], '353828': ['353824', '370289'], '355226': [], '355230': ['355213', '353824'], '350010': [], '351395': [], '365511': [], '371419': [], '368622': [], '354657': [], '368928': [], '370009': [], '373983': [], '368996': [], '373273': [], '352154': ['357658'], '352180': [], '352191': ['352133'], '371726': ['352154'], '354567': ['352133', '363758'], '372651': [], '350234': [], '363836': [], '

In [None]:

#Creating initial adjacency matrix (maps direct pairs of course_ids)
import numpy as np

# Every course id that appears anywhere -- as a key or inside a prerequisite
# list -- de-duplicated and sorted.
all_courses = sorted(set(updated_prereq_dict).union(*updated_prereq_dict.values()))
n = len(all_courses)

# Position of each course id along both axes of the matrix.
course_index = dict(zip(all_courses, range(n)))

# Directed adjacency matrix: entry [i, j] is 1 when course i is listed as a
# prerequisite of course j. Starting from the identity matrix supplies the
# reflexive (self-loop) entries on the diagonal.
adj_matrix = np.eye(n, dtype=int)
for target, sources in updated_prereq_dict.items():
    col = course_index[target]  # target course index
    for source in sources:
        adj_matrix[course_index[source], col] = 1


In [None]:
# Use Floyd-Warshall (Warshall's algorithm) to compute the transitive closure.
# For each intermediate course k, every course that already reaches k also
# reaches everything k reaches. The update for one k is done as a single
# vectorized row operation instead of nested i/j Python loops: the result is
# identical, but it avoids n**3 interpreted iterations.
reach_mask = adj_matrix.astype(bool)
for k in range(n):
    # Copy the column so the row selection is fixed before rows are rewritten.
    reaches_k = reach_mask[:, k].copy()
    reach_mask[reaches_k] |= reach_mask[k]
# Keep `reach` as a 0/1 matrix with the same dtype as adj_matrix.
reach = reach_mask.astype(adj_matrix.dtype)

# ---------------------------
# Step 4: Construct the Conflict Matrix
# ---------------------------
# Two courses conflict when either one is reachable from the other, so the
# matrix is the element-wise OR of the closure and its transpose.
conflict_matrix = (reach_mask | reach_mask.T).astype(int)

conflict_df = pd.DataFrame(conflict_matrix, index=all_courses, columns=all_courses)
print("Conflict Matrix:")
print(conflict_df)
conflict_df.to_csv("conflict_matrix.csv", index=True)


Conflict Matrix:
        350007  350010  350025  350038  350152  350154  350163  350167  \
350007       1       0       0       0       0       0       0       0   
350010       0       1       0       0       0       0       0       0   
350025       0       0       1       0       0       0       0       0   
350038       0       0       0       1       0       0       0       0   
350152       0       0       0       0       1       0       0       0   
...        ...     ...     ...     ...     ...     ...     ...     ...   
374096       0       0       0       0       0       0       0       0   
374120       0       0       0       0       0       0       0       0   
374130       0       0       0       0       0       0       0       0   
374197       0       0       0       0       0       0       0       0   
374270       0       0       0       0       0       0       0       0   

        350173  350174  ...  374009  374021  374040  374062  374092  374096  \
350007       0 