In [199]:
from extract_data import run_etl
import pandas as pd

In [200]:
def update_names():
    # get all dataframes as a dictionary of (df, table_name)
    dataframes = run_etl()

    # Create a dictionary to hold the dataframes
    df_dict = {}
    for df, table_name in dataframes:
        df_dict[f"df_{table_name}"] = df
    return df_dict

In [201]:
# Change the working directory (If this is changed, the following cells will crash because it won't find the data folder)
import os

# Define the relative path
target_path = './../data'
move_path = "./../"

# Check if the directory exists before changing
if os.path.exists(target_path):
    os.chdir(move_path)
    print(f"Changed directory to: {os.getcwd()}")


In [202]:
df_dict = update_names()

display_table = "df_section" # Change to display table

In [203]:
# Remove rows with null values across all dataframes
df_dict["df_class"].dropna(inplace=True)
df_dict["df_section"].dropna(inplace=True)
df_dict["df_meeting"].dropna(inplace=True)
df_dict["df_room"].dropna(inplace=True)
df_dict["df_requisite"].dropna(inplace=True)

In [204]:

# Init the dataframes
df_class = df_dict["df_class"]
df_section = df_dict["df_section"]
df_meeting = df_dict["df_meeting"]
df_room = df_dict["df_room"]
df_requisite = df_dict["df_requisite"]

display(df_dict[display_table])

Unnamed: 0,sid,roomid,cid,mid,semester,years,capacity
0,0,22,0,12,Spring,2025,22
1,1,15,12,23,Spring,2025,17
2,2,4,17,12,Spring,2025,42
3,3,9,8,9,Fall,2023,30
4,4,10,40,4,Spring,2022,15
...,...,...,...,...,...,...,...
1328,1328,23,12,12,V2,2017,42
1329,1329,7,19,13,Spring,2025,55
1330,1330,7,29,0,Spring,2018,31
1331,1331,17,14,0,Spring,2023,24


### 1. Ensure that classes have ID starting from 2

In [205]:
df_class["cid"] = pd.to_numeric(df_class["cid"], errors="coerce")
df_class.dropna(subset=["cid"], inplace=True)
df_class = df_class[df_class["cid"] >= 2]
df_section = df_section[df_section["cid"] >= 2]

display(df_dict["df_class"])

Unnamed: 0,cid,cname,ccode,cdesc,term,years,cred,csyllabus
0,2,CIIC,3015,Introduction to Computer Programming I,"First Semester, Second Semester",Every Year,4,https://www.uprm.edu/cse/wp-content/uploads/si...
1,3,CIIC,3075,Foundations of Computing,"First Semester, Second Semester",Every Year,3,https://www.uprm.edu/cse/wp-content/uploads/si...
2,4,CIIC,3081,Computer Architecture I,First Semester,Every Year,3,https://www.uprm.edu/cse/wp-content/uploads/si...
3,5,CIIC,4010,Advanced Programming,"First Semester, Second Semester",Every Year,4,https://www.uprm.edu/cse/wp-content/uploads/si...
4,6,CIIC,4020,Data Structures,"First Semester, Second Semester",Every Year,4,https://www.uprm.edu/cse/wp-content/uploads/si...
5,7,CIIC,4025,Analysis and Design of Algorithms,Second Semester,Every Year,3,https://www.uprm.edu/cse/wp-content/uploads/si...
6,8,CIIC,4030,Programming Languages,First Semester,Every Year,3,https://www.uprm.edu/cse/wp-content/uploads/si...
7,9,CIIC,4050,Operating Systems,"First Semester, Second Semester",Every Year,4,https://www.uprm.edu/cse/wp-content/uploads/si...
8,10,CIIC,4151,Senior Design Project I,First Semester,Every Year,3,https://www.uprm.edu/cse/wp-content/uploads/si...
9,11,CIIC,4060,Database Systems,Second Semester,Every Year,3,https://www.uprm.edu/cse/wp-content/uploads/si...


#### 2. Two sections cannot be taught at the same hour in the same classroom. 
#### 3. A class cannot have the same section, they must be taught at different hours.

In [206]:
# Delete sections with duplicate 'sid'
df_section = df_section.drop_duplicates(subset=["sid"], keep=False)

# Merge 'section' with 'meeting' to check for overlapping sections in the same room, time, and semester
df_section_meeting = df_section.merge(df_meeting, on="mid")

# Convert 'starttime' and 'endtime' to time objects
df_section_meeting["starttime"] = pd.to_datetime(
    df_section_meeting["starttime"], format="%H:%M:%S"
).dt.time
df_section_meeting["endtime"] = pd.to_datetime(
    df_section_meeting["endtime"], format="%H:%M:%S"
).dt.time

# Sort the dataframe by room, semester, starttime, and sid
df_section_meeting = df_section_meeting.sort_values(
    ["roomid", "semester", "starttime", "sid"]
)

# Detect overlapping sections (same room, same semester, same time)
overlaps = []
for (roomid, semester, starttime, cdays), group in df_section_meeting.groupby(
    ["roomid", "semester", "starttime", "cdays"]
):  
    
    for i in range(1, len(group)):
        previous = group.iloc[i - 1]
        current = group.iloc[i]

        # Check if sections overlap in the same room, same time, and same semester
        if (
            current["starttime"] == previous["starttime"]
            and current["roomid"] == previous["roomid"]
            and current["semester"] == previous["semester"]
            and current["years"] == previous["years"]
            and current["cdays"] == previous["cdays"]
        ):
            # Add the section with the higher sid to the list of overlaps to delete
            if current["sid"] > previous["sid"]:
                overlaps.append(current["sid"])
            else:
                overlaps.append(previous["sid"])

# Remove overlapping sections with the higher sid
df_section = df_section[~df_section["sid"].isin(overlaps)]

display(df_dict["df_section"])

Unnamed: 0,sid,roomid,cid,mid,semester,years,capacity,ccode,starttime,endtime,cdays
312,430,28,10,1,Fall,2023,53,16,07:30:00,08:45:00,MJ
145,189,1,20,1,Fall,2025,50,16,07:30:00,08:45:00,MJ
864,1196,9,16,1,V2,2017,60,16,07:30:00,08:45:00,MJ
771,1072,18,26,1,Spring,2021,34,16,07:30:00,08:45:00,MJ
251,338,18,18,1,Spring,2022,32,16,07:30:00,08:45:00,MJ
...,...,...,...,...,...,...,...,...,...,...,...
413,568,19,15,20,Fall,2017,22,100,16:30:00,17:20:00,LWV
222,295,6,30,20,Spring,2018,45,100,16:30:00,17:20:00,LWV
198,269,29,18,20,Spring,2025,31,100,16:30:00,17:20:00,LWV
823,1140,17,20,20,Fall,2020,48,100,16:30:00,17:20:00,LWV


Unnamed: 0,sid,roomid,cid,mid,semester,years,capacity
0,0,22,0,12,Spring,2025,22
1,1,15,12,23,Spring,2025,17
2,2,4,17,12,Spring,2025,42
3,3,9,8,9,Fall,2023,30
4,4,10,40,4,Spring,2022,15
...,...,...,...,...,...,...,...
1328,1328,23,12,12,V2,2017,42
1329,1329,7,19,13,Spring,2025,55
1330,1330,7,29,0,Spring,2018,31
1331,1331,17,14,0,Spring,2023,24


#### 4. Adjust 'MJ' meetings and remove overlaps


In [207]:
df_meeting["starttime"] = pd.to_datetime(df_meeting["starttime"], format="%H:%M:%S").dt.time
df_meeting["endtime"] = pd.to_datetime(df_meeting["endtime"], format="%H:%M:%S").dt.time

# Remove all 'MJ' meetings with start time after 10:15 AM and end time before 12:30 PM
df_meeting = df_meeting[
    ~(
        (df_meeting["cdays"] == "MJ")
        & (df_meeting["starttime"] > pd.to_datetime("10:15", format="%H:%M").time())
        & (df_meeting["endtime"] < pd.to_datetime("12:30", format="%H:%M").time())
    )
]

# Remove all meetings that start after 19:45
df_meeting = df_meeting[
    df_meeting["starttime"] <= pd.to_datetime("19:45", format="%H:%M").time()
]





####  5. All ‘LWV’ sections have the correct hours


#### 6. ‘LWV’ meetings have a duration of 50 minutes; ‘MJ’ meetings have a duration of 75 minutes.


In [208]:
df_meeting.loc[df_meeting["cdays"] == "LWV", "duration"] = 50
df_meeting.loc[df_meeting["cdays"] == "MJ", "duration"] = 75

df_meeting = df_meeting[
    (
        (df_meeting["cdays"] == "LWV")
        & (
            pd.to_datetime(df_meeting["endtime"], format="%H:%M:%S")
            - pd.to_datetime(df_meeting["starttime"], format="%H:%M:%S")
            == pd.Timedelta(minutes=50)
        )
    )
    | (
        (df_meeting["cdays"] == "MJ")
        & (
            pd.to_datetime(df_meeting["endtime"], format="%H:%M:%S")
            - pd.to_datetime(df_meeting["starttime"], format="%H:%M:%S")
            == pd.Timedelta(minutes=75)
        )
    )
]

df_meeting.drop(columns=["duration"], inplace=True)

#### 7. Sections cannot be in overcapacity, classrooms have limits.


In [209]:
df_section_room = df_section.merge(df_room, left_on="roomid", right_on="rid")
df_section_room = df_section_room[
    df_section_room["capacity_x"] <= df_section_room["capacity_y"]
]
df_section = df_section[df_section["sid"].isin(df_section_room["sid"])]

#### 8. Courses must be taught in the correct year and correct semester.


#### 9. Sections must be taught in a valid classroom and meeting, and the class must exist.


In [210]:
df_section = df_section[df_section["roomid"].isin(df_room["rid"])]
df_section = df_section[df_section["mid"].isin(df_meeting["mid"])]
df_section = df_section[df_section["cid"].isin(df_class["cid"])]

#### 10. Delete all section with Dummy class as Foreign Key


In [211]:
dummy_class_ids = df_class[
    df_class["cname"] == "Authorization from the Director of the Department"
]["cid"].tolist()

df_section = df_section[~df_section["cid"].isin(dummy_class_ids)]

In [212]:
# Dsplay Cleaned Section DataFrame
display(df_section)

Unnamed: 0,sid,roomid,cid,mid,semester,years,capacity
9,9,24,26,4,Fall,2025,25
12,12,14,27,19,Fall,2022,28
13,13,22,21,1,Spring,2022,35
15,15,11,15,6,V2,2020,30
16,16,2,35,2,Spring,2019,18
...,...,...,...,...,...,...,...
1281,1281,20,2,17,Spring,2023,16
1289,1289,24,30,5,Fall,2018,15
1308,1308,16,6,8,Spring,2025,23
1314,1314,12,32,17,Spring,2018,23
