In [75]:
from bs4 import BeautifulSoup
from tqdm import tqdm
import pandas as pd
import os

# Step 1: Open File

In [76]:
fileName = input("What is the name of the file (format: SemesterNameYY.html): ")
x = 0
while fileName not in os.listdir("workday-data") and x < 3:
  fileName = input("{} not found. Try again".format(fileName))
  x += 1
if x >= 3 and fileName not in os.listdir("workday-data"):
  raise Exception("File not found")
print("Success! fileName = {}".format(fileName))

Success! fileName = Spring 24.html


In [77]:
file_path = "workday-data/" + fileName

with open(file_path, "r") as file:
    content = file.read()

soup = BeautifulSoup(content, "html.parser")
semester = fileName.replace(".html", "")
# WOVQ WPUQ WHWQ
# data-automation-id= ["promptOption", "menuItem", "compositeSubHeaderOne", ...]
# promptOption: course, schedule, schedule-details, classmode, prqs
# menuItem: schedule, schedule-details, classmode, prqs
# compositeSubHeaderOne: instructor
div_store = {"Spring 25" : "WGVF WEVF",\
             "Fall 24" : "WJVF WHVF",
             "Spring 24" : "WJVF WHVF",
             "Fall 23" : "WJVF WHVF",
             "Spring 23" : "WJVF WHVF",
             "Fall 22" : "WJVF WHVF"}
# found these manually, may be a way to find them automatically?
# seemingly, either Workday changed the code for last spring specifically, 
# or archived datacodes are stored differently as opposed to recent datacodes
course_classes = soup.find_all(attrs={"class": div_store[semester]})

# Step 2: Parse File for Courses

In [78]:
# Each course is stored under <div class="WGVF WEVF"
# this class changes per webpage, but here is wgvf wevf
# WJVF WHVF

id_lod = []
for course_data in tqdm(course_classes):
  # grab course from this div
  all_ids = course_data.find_all(attrs={"id" : lambda x: x and "gwt-uid-" in x})
  x = 0
  id_dict = {}
  for element in all_ids:
    # course should be lowest id
    # remove the prompt option:
    uid = int(element.attrs["id"].replace("promptOption-", "").replace("gwt-uid-",""))

    if element.text not in id_dict.values():
      id_dict[uid] = element.text

  id_lod.append(id_dict)


# PRQS and other section details are written somewhere else, under same div

100%|██████████| 1076/1076 [00:00<00:00, 5306.67it/s]


In [79]:
# use this to test individual indexes
# for item in id_lod:
#   print(id_lod.index(item), len(item))
id_lod[895]

{2382: 'PSY 509-03 - Supervised Research',
 2383: 'Open   |   Veronica Flores   |   In-Person   |   1/0',
 5967: 'Independent Study',
 5968: 'PRQ - Instructor permission required'}

# Step 3: Sort by UID

In [80]:
id_df = pd.DataFrame([], index=["gwt-uid", "info", "courseIdx"])
for idx in range(len(id_lod)):
  # print(idx)
  dict_lst = [id_lod[idx].keys(), id_lod[idx].values(), [idx]*len(id_lod[idx])]
  id_df = pd.concat([id_df, pd.DataFrame(dict_lst, index=["gwt-uid", "info", "courseIdx"])], axis=1)
id_df = id_df.T
id_df
# id_lod[0]

Unnamed: 0,gwt-uid,info,courseIdx
0,4,ACC 120-01 - Principles of Financial and Manag...,0
1,6,Closed | Mark Bettner | In-Person | ...,0
2,5,HIP-202 | TR | 8:30 AM - 9:45 AM,0
3,2915,Lecture,0
4,2916,PRQ - (No credit for ACC 111 or ACC 301) AND (...,0
...,...,...,...
2,2912,PYR-144 | TR | 1:00 PM - 2:15 PM,1075
3,6697,"IDS :: Medicine, Health and Culture",1075
4,6698,IDS :: Science Education,1075
5,6699,"IDS :: Womenâ€™s, Gender, and Sexuality Studies",1075


In [82]:
id_lod[0]

{4: 'ACC 120-01 - Principles of Financial and Managerial Accounting',
 6: 'Closed   |   Mark Bettner   |   In-Person   |   24/24',
 5: 'HIP-202 | TR | 8:30 AM - 9:45 AM',
 2915: 'Lecture',
 2916: 'PRQ - (No credit for ACC 111 or ACC 301) AND (First-Year or Sophomore Class Standing)'}

In [81]:
id_df = id_df.reset_index(drop=True).sort_values(by="gwt-uid", ascending=True)

In [83]:
num_classes = id_df["courseIdx"].max() + 1
num_classes

1076

# Step 4: Convert *Info* to usable data (Course: Instructor: Section)



In [84]:
# min = id_df["gwt-uid"].min()
min_cond = id_df["courseIdx"] == 1
# the idea is that the minimum value in the next course is always higher than smaller values
# in the first course, but it is smaller than the larger values in the first course
threshold = id_df[min_cond]["gwt-uid"].min()
upper_limit = id_df["gwt-uid"].max()

for id in id_df[id_df["courseIdx"]==0]["gwt-uid"]:
  if id > threshold:
    if id < upper_limit:
      upper_limit = id
upper_limit

2915

In [87]:
# verify the upper limit is accurate manually
id_df[id_df["courseIdx"]==0]

Unnamed: 0,gwt-uid,info,courseIdx
0,4,ACC 120-01 - Principles of Financial and Manag...,0
2,5,HIP-202 | TR | 8:30 AM - 9:45 AM,0
1,6,Closed | Mark Bettner | In-Person | ...,0
3,2915,Lecture,0
4,2916,PRQ - (No credit for ACC 111 or ACC 301) AND (...,0


In [197]:
course_cond = id_df["courseIdx"] == 0
lower_cond = id_df["gwt-uid"] < upper_limit
for item in id_df[course_cond].values:
  print(item)

[4 'ACC 111-01 - Principles of Financial Accounting' 0]
[5 'HIP-202 | TR | 11:30 AM - 12:45 PM' 0]
[6 'Closed   |   Mark Bettner   |   In-Person   |   27/24' 0]
[2854 'Lecture' 0]
[2855 'PRQ - First Year or Sophomore Class Standing' 0]


In [88]:
# with all these things sorted by gwt-uid, let's assign courses, schedule, instructor
# class_len = len(id_lod)
d = []
lower_cond  = id_df["gwt-uid"] < upper_limit
try:
  for course_idx in range(num_classes):
    course_cond = id_df["courseIdx"] == course_idx
    course_values = id_df[lower_cond & course_cond]

    course_scale = len(course_values)
    if course_scale == 3:
      course, schedule, instructor = [value for value in course_values["info"].values]
    elif course_scale == 2:
      course, instructor = [value for value in course_values["info"].values]
      schedule = "empty | empty | empty"

    # deconstructing course
    course_result = course.split("-")
    course_result = [x.strip() for x in course_result]
    if len(course_result) <= 3:
      courseCode, courseSection, courseTitle = course_result
    elif len(course_result) > 3:
      # in case the title has dashes in it
      courseCode, courseSection, courseTitle  = (course_result[0], course_result[1], "-".join((course_result[2:])))

    # deconstructing schedule (if it exists)
    # the [-1] checks if there is a date attached (some courses do that)
    schedule_result = schedule.split(" | ")
    if len(schedule_result) == 3 and "/"  not in schedule_result[-1]:
      roomId, days, time                      = schedule_result
    elif len(schedule_result) == 3 and "/" in schedule_result[-1]:
      days, time                              = schedule_result[0], schedule_result[1]
      roomId = "empty"

    elif len(schedule_result) == 4 and "/" in schedule_result[-1]:
      roomId, days, time                      = (schedule_result[0], schedule_result[1], schedule_result[2])

    # deconstructing instructor
    instructor_result = instructor.split(" | ")
    if len(instructor_result) == 4:
      cStatus, insName, cMode, cFill          = [x.strip() for x in instructor_result]
    else:
      cStatus, cMode, cFill                   = [x.strip() for x in instructor_result]
      insName = "empty"

    # deal with this later
    detail_values = id_df[course_cond & ~lower_cond]
    other = detail_values["info"].values
    tags, prqs, cluster, others = ["empty"], ["empty"], ["empty"], ["empty"]
    format = "empty"

    for detail in other:
      if detail in ["Lecture", "Laboratory", "Seminar", "Studio", "Independent Study",\
                    "Experiential", "Discussion", "Internship", "Combination"]:
        format = detail

      # specifically in Fall 23, they formatted "AR - GER" for FYW prereqs
      elif detail.startswith("PRQ") or detail.startswith("AR - GER"):
        prqs.remove("empty") if "empty" in prqs else None
        if detail.startswith("PRQ"):
          prqs.append(detail.replace("PRQ - ", ""))
        else:
          prqs.append(detail.replace("AR - GER - ", ""))

      elif detail.startswith("GER") or detail.startswith("IDS") or \
      detail.startswith("Special Course") or detail.startswith("FYW topics"):
        tags.remove("empty") if "empty" in tags else None
        tags.append(detail)

      # clusters will be dealt with later


      else:
        others.remove("empty") if "empty" in others else None
        others.append(detail)

    d.append([courseCode, courseSection, courseTitle, [roomId], [days], [time],\
              cStatus, insName, cMode, cFill, tags, prqs, cluster, [format], others])

except Exception as e:
  print(e)
  print(course_idx)
d;

In [199]:
# in the dictionary keys, if the value is less than the length of id_lod (i.e. # classes), check if its a course, schedule, or instructor
# otherwise, check if its a schedule-details, courseTags, format, prqs, or course cluster
courses_df = pd.DataFrame(d, columns=["CourseCode", "Section", "CourseTitle", "RoomId", "Days", "Time", "Status", "InstructorName", "Mode", "Fill", "CourseTags", "Prerequisites", "CourseCluster", "InstructionalFormat", "Other"])
course = courses_df["CourseCode"] == "BIO 111"
# sect = courses_df["Section"] == "01"
# courses_df[course & sect]["Other"].values
courses_df.loc[course]

Unnamed: 0,CourseCode,Section,CourseTitle,RoomId,Days,Time,Status,InstructorName,Mode,Fill,CourseTags,Prerequisites,CourseCluster,InstructionalFormat,Other
49,BIO 111,01,Foundations of Biology,[PYR-028],[MWF],[11:30 AM - 12:20 PM],Open,Grace Freundlich,In-Person,23/28,[GER - CORE :: NWL - Natural World with Lab (m...,[empty],[empty],[Lecture],[BIO 111-01L - Foundations of Biology]
50,BIO 111,01L,Foundations of Biology,[PYR-142],[W],[2:30 PM - 5:20 PM],Open,Grace Freundlich,In-Person,23/28,[GER - CORE :: NWL - Natural World with Lab (m...,[empty],[empty],[Laboratory],[BIO 111-01 - Foundations of Biology]
51,BIO 111,02,Foundations of Biology,[PYR-126],[TR],[8:30 AM - 9:45 AM],Open,Travis Perry,In-Person,26/28,[GER - CORE :: NWL - Natural World with Lab (m...,[empty],[empty],[Lecture],[BIO 111-02L - Foundations of Biology]
52,BIO 111,02L,Foundations of Biology,[PYR-142],[R],[2:30 PM - 5:20 PM],Open,Travis Perry,In-Person,26/28,[GER - CORE :: NWL - Natural World with Lab (m...,[empty],[empty],[Laboratory],[BIO 111-02 - Foundations of Biology]
53,BIO 111,04,Foundations of Biology,[PYR-026],[TR],[10:00 AM - 11:15 AM],Open,David Hollis,In-Person,22/28,[GER - CORE :: NWL - Natural World with Lab (m...,[empty],[empty],[Lecture],[BIO 111-04L - Foundations of Biology]
54,BIO 111,04L,Foundations of Biology,[PYR-142],[T],[2:30 PM - 5:20 PM],Open,David Hollis,In-Person,22/28,[GER - CORE :: NWL - Natural World with Lab (m...,[empty],[empty],[Laboratory],[BIO 111-04 - Foundations of Biology]


In [None]:
# use this to test individual indexes
course_idx = int(input("# "))
courses_df[course_idx:course_idx+1]

# 4


Unnamed: 0,CourseCode,Section,CourseTitle,RoomId,Days,Time,Status,InstructorName,Mode,Fill,CourseTags,Prerequisites,CourseCluster,InstructionalFormat,Other
4,ACC 311,1,Intermediate Accounting I,[HIP-204],[MWF],[9:30 AM - 10:20 AM],Open,Sandy Roberson,In-Person,23/27,[empty],"[(ACC 111 or ACC 120) AND (First-Year, Sophomo...",[empty],[Lecture],[empty]


## Step 4.5: Create Prefixes, Locs to fill *CourseCluster*

In [200]:
prefix_lst = sorted(set(courses_df["CourseCode"].apply(lambda x: x.split(" ")[0])))
print(prefix_lst)

locations = sorted(set(courses_df["RoomId"].apply(lambda x: x[0].split("-")[0] if x != "empty" else x)))
locations.remove("empty")
print(locations)

['ACC', 'AFS', 'ANT', 'ART', 'AST', 'BIO', 'BUS', 'CHM', 'CHN', 'CLS', 'COM', 'CSC', 'DAN', 'ECN', 'EDU', 'EES', 'ENG', 'EST', 'FRN', 'FYW', 'GRK', 'GRM', 'HSC', 'HST', 'HUM', 'IDS', 'IGD', 'JPN', 'LAS', 'LNG', 'LTN', 'MES', 'MLL', 'MSL', 'MTH', 'MUS', 'NSC', 'PHL', 'PHY', 'POL', 'PSY', 'PTH', 'PVS', 'REL', 'SCI', 'SOC', 'SPN', 'SUS', 'THA', 'UNV', 'UST', 'WGS']
['BCM', 'CHP', 'DMB', 'FUR', 'HIP', 'HMP', 'JHN', 'LIB', 'PAC', 'PYR', 'RAB', 'RLY', 'RNK', 'SHI', 'TNS', 'TPH']


In [201]:
index = 0
cond = courses_df["Other"].apply(lambda x: True if x == ("empty") else False)
# cond = courses_df["CourseCode"]=="ACC 120"
courses_df[~cond]["CourseCluster"]
# courses_df.loc[0, "Other"][0]

0       [empty]
1       [empty]
2       [empty]
3       [empty]
4       [empty]
         ...   
1060    [empty]
1061    [empty]
1062    [empty]
1063    [empty]
1064    [empty]
Name: CourseCluster, Length: 1065, dtype: object

In [173]:
course = courses_df["CourseCode"] == "PSY 201"
sect = courses_df["Section"] == "01"
courses_df[course & sect]["CourseCluster"]

884    [empty]
Name: CourseCluster, dtype: object

# Step 5: Fill *CourseCluster* from remaining *Other* data

In [202]:
for idx in courses_df.index:
  # other = courses_df.loc[idx, "Other"]
  if courses_df.loc[idx, "Other"] != ["empty"]:
    store = []
    for item in courses_df.loc[idx, "Other"]:

      # courseCluster gets resolved here - only courses get added
      if item.startswith(tuple(prefix_lst)):
        courses_df.loc[idx, "CourseCluster"].remove("empty") if "empty" in courses_df.loc[idx, "CourseCluster"] else courses_df.loc[idx, "CourseCluster"]
        courses_df.loc[idx, "CourseCluster"].append(item)
        store.append(item)

      # some courses have two "schedules", this should resolve that
      elif item.startswith(tuple(locations)):
        # if there is a date there
        if len(item.split(" | ")) > 3:
          split = item.split(" | ")
          roomId, days, time = split[0], split[1], split[2]
        else:
          roomId, days, time = item.split(" | ")
        courses_df.loc[idx, "RoomId"].append(roomId)
        courses_df.loc[idx, "Days"].append(days)
        courses_df.loc[idx, "Time"].append(time)
        store.append(item)

      # but what if there is no roomId, muhahahaha
      elif item.startswith(("M", "T", "W", "R", "F")):
        days, time = item.split(" | ")
        roomId = "empty"
        courses_df.loc[idx, "RoomId"].append(roomId)
        courses_df.loc[idx, "Days"].append(days)
        courses_df.loc[idx, "Time"].append(time)
        store.append(item)

    for item in store:
      courses_df.loc[idx, "Other"].remove(item) if item in courses_df.loc[idx, "Other"] else courses_df.loc[idx, "Other"]
      if len(courses_df.loc[idx, "Other"]) == 0:
        courses_df.loc[idx, "Other"] = [("empty")]


courses_df;
    # print(other)

In [203]:
# ONLY RUN THIS ONCE
courses_df["CourseCluster"] = courses_df["CourseCluster"].apply(lambda x: ", ".join(x) if x != ["empty"] else "empty")
# courses_df["Other"] = courses_df["Other"].apply(lambda x: "\n".join(x)

In [204]:
course = courses_df["CourseCode"] == "BIO 111"
sect = courses_df["Section"] == "02"
courses_df[course & sect]#["CourseCluster"]

Unnamed: 0,CourseCode,Section,CourseTitle,RoomId,Days,Time,Status,InstructorName,Mode,Fill,CourseTags,Prerequisites,CourseCluster,InstructionalFormat,Other
51,BIO 111,2,Foundations of Biology,[PYR-126],[TR],[8:30 AM - 9:45 AM],Open,Travis Perry,In-Person,26/28,[GER - CORE :: NWL - Natural World with Lab (m...,[empty],BIO 111-02L - Foundations of Biology,[Lecture],[empty]


In [205]:
# check to make sure these are just courses
for item in courses_df[(courses_df['CourseCluster']!="empty")]["CourseCluster"]:
  print(item)

ANT 304-01L - Archaeology Method and Theory
ANT 304-01 - Archaeology Method and Theory
BIO 111-01L - Foundations of Biology
BIO 111-01 - Foundations of Biology
BIO 111-02L - Foundations of Biology
BIO 111-02 - Foundations of Biology
BIO 111-04L - Foundations of Biology
BIO 111-04 - Foundations of Biology
BIO 210-01L - Genetics
BIO 210-01 - Genetics
BIO 210-02L - Genetics
BIO 210-02 - Genetics
BIO 210-03L - Genetics
BIO 210-03 - Genetics
BIO 210-04L - Genetics
BIO 210-04 - Genetics
BIO 222-01L - Research and Analysis
BIO 222-01 - Research and Analysis
BIO 222-02L - Research and Analysis
BIO 222-02 - Research and Analysis
BIO 300-01L - Cell Biology
BIO 300-01 - Cell Biology
BIO 303-01L - Biochemistry of the Cell
BIO 303-01 - Biochemistry of the Cell
BIO 322-01L - Human Physiology
BIO 322-01 - Human Physiology
BIO 340-01L - Ecology
BIO 340-01 - Ecology
BIO 340-02L - Ecology
BIO 340-02 - Ecology
BIO 401-01L - Economic Botany
BIO 401-01 - Economic Botany
BIO 402-01L - Ethnobiology
BIO 402-0

In [206]:
is_empty = courses_df["Other"].apply(lambda x: False if x == [("empty")] else True)
courses_df[is_empty]

Unnamed: 0,CourseCode,Section,CourseTitle,RoomId,Days,Time,Status,InstructorName,Mode,Fill,CourseTags,Prerequisites,CourseCluster,InstructionalFormat,Other


In [207]:
large_sched_cond = courses_df["RoomId"].apply(lambda x: False if len(x) == 1 else True)
courses_df[large_sched_cond]

Unnamed: 0,CourseCode,Section,CourseTitle,RoomId,Days,Time,Status,InstructorName,Mode,Fill,CourseTags,Prerequisites,CourseCluster,InstructionalFormat,Other
72,BIO 320,01L,Animal Physiology,"[PYR-027, PYR-143]","[W, F]","[2:30 PM - 5:20 PM, 8:30 AM - 9:20 AM]",Open,Dennis Haney,In-Person,15/18,[IDS :: Neuroscience],[BIO 222 Research and Analysis],empty,[Laboratory],[empty]
80,BIO 401,01L,Economic Botany,"[PYR-144, PYR-144]","[T, R]","[11:30 AM - 12:45 PM, 2:30 PM - 5:20 PM]",Open,Ashley Morris,In-Person,9/14,[GER - Global Awareness :: NE (Humans and the ...,[BIO 101 OR BIO 102 OR BIO 111],BIO 401-01 - Economic Botany,[Laboratory],[empty]
197,CHN 202,01,Intermediate Chinese II,"[FUR-207, FUR-126]","[MRF, T]","[10:30 AM - 11:20 AM, 1:30 PM - 2:20 PM]",Open,Dongming Zhang,In-Person,2/24,"[GER - CORE :: FL (Foreign Language), IDS :: A...",[CHN 201 OR CHN PL 202],empty,[Lecture],[empty]
355,FYW 1127,01,To Walk the Land,"[PYR-108, PYR-108]","[T, R]","[1:00 PM - 2:15 PM, 1:00 PM - 5:30 PM]",Open,Ruth Aronoff,In-Person,6/12,"[FYW topics :: PE (Physical Environment), FYW ...",[empty],empty,[Seminar],[empty]
497,MLL 236,01,Nazi Cinema and Culture,"[FUR-214, FUR-229]","[T, TR]","[2:30 PM - 5:00 PM, 10:00 AM - 11:15 AM]",Closed,Ilka Rasch,In-Person,20/24,[GER - CORE :: VP (Human Cultures - Visual and...,[FYW (First Year Writing Seminar)],empty,[Lecture],[empty]
856,POL 150,01,Introduction to Political Analysis,"[JHN-109, JHN-203]","[TR, W]","[8:30 AM - 9:45 AM, 3:30 PM - 4:20 PM]",Open,David Fleming,In-Person,8/18,[GER - CORE :: HB (Empirical Study of Human Be...,[FYW (First Year Writing Seminar) AND (POL 101...,empty,[Lecture],[empty]
857,POL 150,02,Introduction to Political Analysis,"[JHN-109, JHN-203]","[TR, W]","[10:00 AM - 11:15 AM, 4:30 PM - 5:20 PM]",Open,David Fleming,In-Person,14/18,[GER - CORE :: HB (Empirical Study of Human Be...,[FYW (First Year Writing Seminar) AND (POL 101...,empty,[Lecture],[empty]
1054,THA 310,01,Directing,"[TPH-MTR, TPH-MTR]","[M, TR]","[2:30 PM - 5:20 PM, 1:00 PM - 2:15 PM]",Open,Maegan Azar,In-Person,6/12,[empty],[THA 120 Acting],empty,[Lecture],[empty]


In [208]:
if len(courses_df[is_empty]) == 0:
  courses_df = courses_df.drop(columns=["Other"])
  print("Success")
else:
  print("Uh oh")

Success


In [209]:
courses_df;

In [210]:
# find index of given course
courses_idx = int(input("#"))
courses_df[courses_idx:courses_idx+1][["CourseCode", "Section", "CourseTags", \
                                       "Prerequisites", "CourseCluster", "InstructionalFormat"]]

Unnamed: 0,CourseCode,Section,CourseTags,Prerequisites,CourseCluster,InstructionalFormat
45,AST 240,1,"[GER - CORE :: HA (Historical Analysis), GER -...",[empty],empty,[Lecture]


## Step 6: Send data to .csv for web-parsing

In [211]:
# first, make things into comma-delimited strings, not lists - lists complicate js a bit
# RUN ONCE
courses_df = courses_df.apply(lambda x: x.apply(lambda y: ", ".join(y) if type(y) == list else y))

In [212]:
# second, filter things we actually need:
courses = courses_df[["CourseCode", "Section", "CourseTitle", "CourseTags", "Prerequisites", "CourseCluster", "InstructionalFormat"]]
courses

Unnamed: 0,CourseCode,Section,CourseTitle,CourseTags,Prerequisites,CourseCluster,InstructionalFormat
0,ACC 111,01,Principles of Financial Accounting,empty,First Year or Sophomore Class Standing,empty,Lecture
1,ACC 111,03,Principles of Financial Accounting,empty,empty,empty,Lecture
2,ACC 111,04,Principles of Financial Accounting,empty,empty,empty,Lecture
3,ACC 301,BLK,Principles of Managerial Accounting,empty,ACC 111 AND ECN 111 AND (ECN 120 OR MTH 120) A...,empty,Lecture
4,ACC 311,01,Intermediate Accounting I,empty,ACC 111 Principles of Financial Accounting,empty,Lecture
...,...,...,...,...,...,...,...
1060,UNV TSGR,TS12,Germany-Travel Study,empty,empty,empty,Lecture
1061,UNV TSLN,TS8,England-Furman in London,empty,empty,empty,Experiential
1062,UST 501,01,Independent Study,empty,empty,empty,Independent Study
1063,WGS 230,01,Issues in Women's Gender and Sexuality Studies,"IDS :: Womenâ€™s, Gender, and Sexuality Studies",empty,empty,Lecture


In [213]:
# drop any repetitive lines
final_courses = courses.drop_duplicates()

In [214]:
len(final_courses), len(courses)

(1065, 1065)

In [None]:
file_path = "csv-files/{}".format(fileName.replace(".html",".csv"))
final_courses.to_csv(file_path)

## Step 7: Merge all .csv files into one workday-courses.csv file

In [None]:
file_lst = []
for file in os.listdir("csv-files"):
    if file != "programs.csv":
        file_df = pd.read_csv("csv-files/{}".format(file))
        file_df = file_df.drop(columns=["Unnamed: 0"])
        file_df["Semester"] = file.replace(".csv", "")
        file_lst.append(file_df)

print(len(file_lst[0]), len(file_lst[1]), len(file_lst[2]), len(file_lst[3]), len(file_lst[4]), len(file_lst[5]))
print(sum((\
            len(file_lst[0]),\
            len(file_lst[1]),\
            len(file_lst[2]),\
            len(file_lst[3]),\
            len(file_lst[4]),\
            len(file_lst[5])\
        )))
df = pd.concat(file_lst).sort_values(by="CourseCode").drop_duplicates(subset=['CourseCode', 'Section',\
                                                                              'CourseTitle', 'CourseTags', 'Prerequisites',\
                                                                              'CourseCluster', 'InstructionalFormat'])
print(len(df))
df[df["CourseCode"]=="ANT 101"]

1072 1098 1105 1065 1076 1121
6537
3022


Unnamed: 0,CourseCode,Section,CourseTitle,CourseTags,Prerequisites,CourseCluster,InstructionalFormat,Semester
9,ANT 101,3,Introduction to Anthropology,GER - CORE :: HB (Empirical Study of Human Beh...,empty,empty,Lecture,Fall 24
10,ANT 101,2,Introduction to Anthropology,GER - CORE :: HB (Empirical Study of Human Beh...,Instructor Permission Required,empty,Lecture,Spring 25
9,ANT 101,1,Introduction to Anthropology,GER - CORE :: HB (Empirical Study of Human Beh...,Instructor Permission Required,empty,Lecture,Spring 25
11,ANT 101,3,Introduction to Anthropology,GER - CORE :: HB (Empirical Study of Human Beh...,empty,empty,Lecture,Spring 23
10,ANT 101,2,Introduction to Anthropology,GER - CORE :: HB (Empirical Study of Human Beh...,empty,empty,Lecture,Spring 23
8,ANT 101,2,Introduction to Anthropology,GER - CORE :: HB (Empirical Study of Human Beh...,empty,empty,Lecture,Fall 24
7,ANT 101,1,Introduction to Anthropology,GER - CORE :: HB (Empirical Study of Human Beh...,empty,empty,Lecture,Fall 24
10,ANT 101,4,Introduction to Anthropology,GER - CORE :: HB (Empirical Study of Human Beh...,empty,empty,Lecture,Fall 24
9,ANT 101,1,Introduction to Anthropology,GER - CORE :: HB (Empirical Study of Human Beh...,empty,empty,Lecture,Spring 23


## Step 8: Pull out appropriate Prereqs and Course Tags

In [None]:
# fuck