In [1]:
from bs4 import BeautifulSoup
import json
import numpy as np
import pandas as pd

import re
import os

In [2]:
def _class_table_to_frame(detail_row):
    """Parse the nested class-info table of one course detail row.

    Args:
        detail_row: the ``<tr>`` element holding a course's details; only
            ``find``/``find_all``/``.text`` of the BeautifulSoup API are used.

    Returns:
        A ``pandas.DataFrame`` with one column per ``<th>`` text and one row
        per data ``<tr>``, or ``None`` when the detail row contains no
        ``table.datadisplaytable`` at all.
    """
    class_table = detail_row.find("table", {"class": "datadisplaytable"})
    if class_table is None:
        return None

    # Parsers such as html.parser do not synthesise <tbody>; when it is
    # absent, search the table element itself.
    class_info_data = class_table.find("tbody")
    if class_info_data is None:
        class_info_data = class_table

    headers = [th.text for th in class_info_data.find_all("th")]
    if not headers:
        # Nothing to label the cells with; same empty frame a header-less,
        # cell-less table produced before.
        return pd.DataFrame()

    n_cols = len(headers)
    records = []
    # The first <tr> is skipped (it is expected to carry the <th> cells).
    for row in class_info_data.find_all("tr")[1:]:
        cells = [td.text for td in row.find_all("td")]
        if not cells:
            # Rows without <td> cells never contributed data; keep it so.
            continue
        # Pad short rows with "" and drop cells beyond the header width so
        # every record lines up with the headers (ragged rows used to raise
        # ValueError / IndexError).
        records.append((cells + [""] * n_cols)[:n_cols])

    return pd.DataFrame(records, columns=headers)


def get_courses_info(soup, major_name=None):
    """Build one plain-text document per course from a parsed course-listing page.

    The first ``table.datadisplaytable`` in ``soup`` is read as alternating
    rows: a title row (``th.ddlabel > a``) followed by a detail row
    (``td.dddefault``) whose text, up to the marker "View Catalog Entry",
    is the description and whose nested ``table.datadisplaytable`` holds the
    class information.

    Args:
        soup: parsed page exposing the BeautifulSoup ``find``/``find_all`` API.
        major_name: optional major name; when given, each document starts
            with the line ``"<major_name> courses"``.

    Returns:
        A list of strings, one per course: optional major line, title,
        description and the class-info table rendered as Markdown, with runs
        of newlines collapsed to a single newline. Courses whose title
        contains "Master Thesis" or "Dissertation" are skipped. An empty list
        is returned when the page has no listing table.
    """
    listing_table = soup.find("table", {"class": "datadisplaytable"})
    if listing_table is None:
        # Page without a course table: nothing to extract.
        return []

    # Fall back to the table itself when the markup has no explicit <tbody>.
    listing_body = listing_table.find("tbody")
    if listing_body is None:
        listing_body = listing_table

    rows = listing_body.find_all("tr", recursive=False)
    title_rows = rows[::2]
    course_detail_rows = rows[1::2]

    course_titles = [row.find("th", {"class": "ddlabel"}).find("a").text for row in title_rows]
    course_descriptions = [
        row.find("td", {"class": "dddefault"}).text.split("View Catalog Entry")[0]
        for row in course_detail_rows
    ]
    all_class_info = [_class_table_to_frame(row) for row in course_detail_rows]

    course_docs = []
    for course_title, course_description, class_info_df in zip(
        course_titles, course_descriptions, all_class_info
    ):
        if "Master Thesis" in course_title or "Dissertation" in course_title:
            continue

        parts = []
        if major_name is not None:
            parts.append(major_name + " courses")
        parts.append(course_title)
        parts.append(course_description)
        if class_info_df is not None:
            # A course without a class table simply gets no Markdown table.
            parts.append(class_info_df.to_markdown())

        # Remove multiple new lines
        doc = re.sub(r'\n+', '\n', "\n".join(parts))
        course_docs.append(doc)

    return course_docs


In [170]:
# Collect one record per course document from every saved course-listing page.
all_course_docs = []
# Display names for the major codes that prefix the page file names.
code2name = {
    "DA": "Data Analytics",
    "CS": "Computer Science",
    "ECON": "Economics",
    "IE": "Industrial Engineering",
}

pages_dir = "data/sabanci_course_pages"

# os.listdir order is arbitrary; sort so the document order is reproducible.
for file_name in sorted(os.listdir(pages_dir)):
    file_path = os.path.join(pages_dir, file_name)
    name_parts = file_name.split("_")
    # Skip stray entries (sub-directories such as .ipynb_checkpoints, .DS_Store,
    # ...) that cannot provide a code (part 0) and a term (part 2).
    if not os.path.isfile(file_path) or len(name_parts) < 3:
        continue
    course_code = name_parts[0]
    term = name_parts[2].split(".")[0]

    # NOTE(review): read with the platform default encoding, as before —
    # confirm the saved pages' encoding before pinning one here.
    with open(file_path, 'r') as f:
        page = f.read()
    soup = BeautifulSoup(page, 'html.parser')

    # Unknown codes map to None, so no major line is added to the documents.
    major_name = code2name.get(course_code, None)
    course_texts = get_courses_info(soup, major_name)

    all_course_docs.extend(
        {
            "major_code": course_code,
            "term": term,
            "content": course_text
        }
        for course_text in course_texts
    )
len(all_course_docs)

386

In [3]:
docs_json_path = "data/sabanci_course_docs.json"

# Create the JSON cache from the scraped documents only when it is missing,
# so a fresh run does not depend on a file produced by a commented-out cell.
# An existing cache is never overwritten.
if not os.path.exists(docs_json_path):
    with open(docs_json_path, "w", encoding="utf-8") as f:
        json.dump(all_course_docs, f, indent=4)

# Load the cached course documents (list of dicts with "major_code", "term"
# and "content" keys, as written above).
with open(docs_json_path, "r", encoding="utf-8") as f:
    data = json.load(f)

In [6]:
# Tabulate the loaded course documents and derive the per-file key
# "<major_code>_<term>" used to group documents when exporting.
df = pd.DataFrame(data).assign(
    file_name=lambda frame: frame["major_code"] + "_" + frame["term"]
)

In [7]:
docs_dir = "data/sabanci_course_docs"
# Make sure the export directory exists before writing into it.
os.makedirs(docs_dir, exist_ok=True)

# One text file per "<major_code>_<term>" key; sort=False keeps the keys in
# order of first appearance and rows keep their original order within a key.
for file_name, group in df.groupby("file_name", sort=False):
    course_docs = group["content"].values
    # Explicit UTF-8 so non-ASCII course text is written identically on
    # every platform instead of depending on the locale encoding.
    with open(os.path.join(docs_dir, f"{file_name}.txt"), "w", encoding="utf-8") as f:
        f.write("\n\n".join(course_docs))

In [4]:
# Merge every document's text into one string, separated by a blank line.
concated_text = "\n\n".join(doc["content"] for doc in data)

In [8]:
# Write the combined corpus (surrounding whitespace stripped). UTF-8 is set
# explicitly so non-ASCII course text does not depend on the locale encoding.
with open("data/sabanci_course_docs.txt", "w", encoding="utf-8") as f:
    f.write(concated_text.strip())