In [1]:
from extract_data import run_etl
import pandas as pd

In [2]:
def update_names():
    # get all dataframes as a dictionary of (df, table_name)
    dataframes = run_etl()

    # Create a dictionary to hold the dataframes
    df_dict = {}
    for df, table_name in dataframes:
        df_dict[f"df_{table_name}"] = df
    return df_dict

In [3]:
# Change the working directory (If this is changed, the following cells will crash because it won't find the data folder)
import os

# Define the relative path
target_path = './../data'
move_path = "./../"

# Check if the directory exists before changing
if os.path.exists(target_path):
    os.chdir(move_path)
    print(f"Changed directory to: {os.getcwd()}")


Changed directory to: /Users/lennon/Github/rumad-v2-app-segmentation-fault


In [4]:
df_dict = update_names()

display_table = "df_section" # Change to display table

In [5]:
# Remove rows with null values across all dataframes
df_dict["df_class"].dropna(inplace=True)
df_dict["df_section"].dropna(inplace=True)
df_dict["df_meeting"].dropna(inplace=True)
df_dict["df_room"].dropna(inplace=True)
df_dict["df_requisite"].dropna(inplace=True)

In [6]:

# Init the dataframes
df_class = df_dict["df_class"]
df_section = df_dict["df_section"]
df_meeting = df_dict["df_meeting"]
df_room = df_dict["df_room"]
df_requisite = df_dict["df_requisite"]

display(df_dict[display_table])

Unnamed: 0,sid,roomid,cid,mid,semester,years,capacity
0,0,22,0,12,Spring,2025,22
1,1,15,12,23,Spring,2025,17
2,2,4,17,12,Spring,2025,42
3,3,9,8,9,Fall,2023,30
4,4,10,40,4,Spring,2022,15
...,...,...,...,...,...,...,...
1328,1328,23,12,12,V2,2017,42
1329,1329,7,19,13,Spring,2025,55
1330,1330,7,29,0,Spring,2018,31
1331,1331,17,14,0,Spring,2023,24


### 1. Ensure that classes have ID starting from 2

In [7]:
df_class["cid"] = pd.to_numeric(df_class["cid"], errors="coerce")
df_class.dropna(subset=["cid"], inplace=True)
df_class = df_class[df_class["cid"] >= 2]
df_section = df_section[df_section["cid"] >= 2]

display(df_dict["df_class"])

Unnamed: 0,cid,cname,ccode,cdesc,term,years,cred,csyllabus
0,2,CIIC,3015,Introduction to Computer Programming I,"First Semester, Second Semester",Every Year,4,https://www.uprm.edu/cse/wp-content/uploads/si...
1,3,CIIC,3075,Foundations of Computing,"First Semester, Second Semester",Every Year,3,https://www.uprm.edu/cse/wp-content/uploads/si...
2,4,CIIC,3081,Computer Architecture I,First Semester,Every Year,3,https://www.uprm.edu/cse/wp-content/uploads/si...
3,5,CIIC,4010,Advanced Programming,"First Semester, Second Semester",Every Year,4,https://www.uprm.edu/cse/wp-content/uploads/si...
4,6,CIIC,4020,Data Structures,"First Semester, Second Semester",Every Year,4,https://www.uprm.edu/cse/wp-content/uploads/si...
5,7,CIIC,4025,Analysis and Design of Algorithms,Second Semester,Every Year,3,https://www.uprm.edu/cse/wp-content/uploads/si...
6,8,CIIC,4030,Programming Languages,First Semester,Every Year,3,https://www.uprm.edu/cse/wp-content/uploads/si...
7,9,CIIC,4050,Operating Systems,"First Semester, Second Semester",Every Year,4,https://www.uprm.edu/cse/wp-content/uploads/si...
8,10,CIIC,4151,Senior Design Project I,First Semester,Every Year,3,https://www.uprm.edu/cse/wp-content/uploads/si...
9,11,CIIC,4060,Database Systems,Second Semester,Every Year,3,https://www.uprm.edu/cse/wp-content/uploads/si...


#### 2. Two sections cannot be taught at the same hour in the same classroom. 
#### 3. A class cannot have the same section, they must be taught at different hours.

In [8]:
# Delete sections with duplicate 'sid'
df_section = df_section.drop_duplicates(subset=["sid"], keep=False)

# Merge 'section' with 'meeting' to check for overlapping sections in the same room, time, and semester
df_section_meeting = df_section.merge(df_meeting, on="mid")

# Convert 'starttime' and 'endtime' to time objects
df_section_meeting["starttime"] = pd.to_datetime(
    df_section_meeting["starttime"], format="%H:%M:%S"
).dt.time
df_section_meeting["endtime"] = pd.to_datetime(
    df_section_meeting["endtime"], format="%H:%M:%S"
).dt.time

# Sort the dataframe by room, semester, starttime, and sid
df_section_meeting = df_section_meeting.sort_values(
    ["roomid", "semester", "starttime", "sid"]
)

# Detect overlapping sections (same room, same semester, same time)
overlaps = []
for (roomid, semester, starttime, cdays), group in df_section_meeting.groupby(
    ["roomid", "semester", "starttime", "cdays"]
):  
    
    for i in range(1, len(group)):
        previous = group.iloc[i - 1]
        current = group.iloc[i]

        # Check if sections overlap in the same room, same time, and same semester
        if (
            current["starttime"] == previous["starttime"]
            and current["roomid"] == previous["roomid"]
            and current["semester"] == previous["semester"]
            and current["years"] == previous["years"]
            and current["cdays"] == previous["cdays"]
        ):
            # Add the section with the higher sid to the list of overlaps to delete
            if current["sid"] > previous["sid"]:
                overlaps.append(current["sid"])
            else:
                overlaps.append(previous["sid"])
                
# Remove overlapping sections with the higher sid
df_section = df_section[~df_section["sid"].isin(overlaps)]



#### 4. Adjust 'MJ' meetings and remove overlaps


In [9]:
from IPython.display import display_html
import pandas as pd

# Function to convert 'HH:MM:SS' format to minutes
def convert_to_minutes(time_str):
    hours, minutes, _ = map(int, time_str.split(':'))
    return hours * 60 + minutes

# Function to convert minutes back to 'HH:MM' format
def convert_to_hhmm(total_minutes):
    hours = total_minutes // 60
    minutes = total_minutes % 60
    return f'{hours}:{minutes:02d}' 

# Make a deep copy of df_meeting to preserve original data
df_meeting_prev = df_meeting.copy(deep=True)

# Convert 'starttime' and 'endtime' columns to minutes for easier manipulation
df_meeting[['starttime', 'endtime']] = df_meeting[['starttime', 'endtime']].applymap(convert_to_minutes)

# Filter out meetings on 'MJ' days between 10:15 and 12:30
df_meeting = df_meeting[
    ~(
        (df_meeting['cdays'] == "MJ") & 
        (df_meeting['starttime'] > convert_to_minutes("10:15:00")) & 
        (df_meeting['endtime'] < convert_to_minutes("12:30:00"))
    )
]

# Function to find the index of the earliest and latest class based on condition
def find_class_index(condition):
    try:
        return df_meeting[condition].index[0]
    except IndexError:
        return -1

# Find earliest and latest class times between 10:15 and 12:30 on 'MJ' days
index_earliest_class_after_1030 = find_class_index(
    (df_meeting['cdays'] == "MJ") & 
    (df_meeting['starttime'] > convert_to_minutes("10:15:00")) & 
    (df_meeting['starttime'] < convert_to_minutes("12:30:00"))
)

index_latest_class_before_1230 = find_class_index(
    (df_meeting['cdays'] == "MJ") & 
    (df_meeting['endtime'] < convert_to_minutes("12:30:00")) & 
    (df_meeting['endtime'] > convert_to_minutes("10:15:00"))
)

# Remove all meetings that start after 19:45
df_meeting = df_meeting[df_meeting["starttime"] <= convert_to_minutes("19:45:00")]


# Adjust class timings if valid indices are found
if index_earliest_class_after_1030 != -1:
    delta_time = convert_to_minutes("12:30:00") - df_meeting.loc[index_earliest_class_after_1030, 'starttime']
    df_meeting.loc[(df_meeting['cdays'] == "MJ") & (df_meeting.index >= index_earliest_class_after_1030), ['starttime', 'endtime']] += delta_time

if index_latest_class_before_1230 != -1:
    delta_time = df_meeting.loc[index_earliest_class_after_1030, 'endtime'] - convert_to_minutes("10:15:00")
    df_meeting.loc[(df_meeting['cdays'] == "MJ") & (df_meeting.index <= index_latest_class_before_1230), ['starttime', 'endtime']] -= delta_time

# Convert 'starttime' and 'endtime' columns back to 'HH:MM' format
df_meeting[['starttime', 'endtime']] = df_meeting[['starttime', 'endtime']].applymap(convert_to_hhmm)

# Convert 'starttime' and 'endtime' back to datetime.time format for display purposes
df_meeting["starttime"] = pd.to_datetime(df_meeting["starttime"], format="%H:%M").dt.time
df_meeting["endtime"] = pd.to_datetime(df_meeting["endtime"], format="%H:%M").dt.time

# Create HTML representations of the two DataFrames for side-by-side comparison
df_meeting_html = df_meeting.to_html()
df_meeting_prev_html = df_meeting_prev.to_html()

# Display the two DataFrames side by side using HTML tables
display_html(f"<div style='display: flex; justify-content: space-around;'>"
             f"<div>{df_meeting_prev_html}</div><div>{df_meeting_html}</div></div>", raw=True)

# Doy clases los jueves, no cobro mucho @Onnelle 


  df_meeting[['starttime', 'endtime']] = df_meeting[['starttime', 'endtime']].applymap(convert_to_minutes)
  df_meeting[['starttime', 'endtime']] = df_meeting[['starttime', 'endtime']].applymap(convert_to_hhmm)


Unnamed: 0,mid,ccode,starttime,endtime,cdays
0,1,16,07:30:00,08:45:00,MJ
1,2,26,09:00:00,10:15:00,MJ
2,3,36,10:30:00,11:45:00,MJ
3,4,46,12:00:00,13:15:00,MJ
4,5,56,13:30:00,14:45:00,MJ
5,6,66,15:00:00,16:15:00,MJ
6,7,76,16:30:00,17:45:00,MJ
7,8,86,18:00:00,19:15:00,MJ
8,9,96,19:30:00,20:45:00,MJ
9,10,106,21:00:00,22:15:00,MJ

Unnamed: 0,mid,ccode,starttime,endtime,cdays
0,1,16,07:30:00,08:45:00,MJ
1,2,26,09:00:00,10:15:00,MJ
3,4,46,12:30:00,13:45:00,MJ
4,5,56,14:00:00,15:15:00,MJ
5,6,66,15:30:00,16:45:00,MJ
6,7,76,17:00:00,18:15:00,MJ
7,8,86,18:30:00,19:45:00,MJ
8,9,96,20:00:00,21:15:00,MJ
10,11,10,07:30:00,08:20:00,LWV
11,12,20,08:30:00,09:20:00,LWV


####  5. All ‘LWV’ sections have the correct hours
#### 6. ‘LWV’ meetings have a duration of 50 minutes; ‘MJ’ meetings have a duration of 75 minutes.


In [10]:
df_meeting.loc[df_meeting["cdays"] == "LWV", "duration"] = 50
df_meeting.loc[df_meeting["cdays"] == "MJ", "duration"] = 75

df_meeting = df_meeting[
    (
        (df_meeting["cdays"] == "LWV")
        & (
            pd.to_datetime(df_meeting["endtime"], format="%H:%M:%S")
            - pd.to_datetime(df_meeting["starttime"], format="%H:%M:%S")
            == pd.Timedelta(minutes=50)
        )
    )
    | (
        (df_meeting["cdays"] == "MJ")
        & (
            pd.to_datetime(df_meeting["endtime"], format="%H:%M:%S")
            - pd.to_datetime(df_meeting["starttime"], format="%H:%M:%S")
            == pd.Timedelta(minutes=75)
        )
    )
]

df_meeting.drop(columns=["duration"], inplace=True)

#### 7. Sections cannot be in overcapacity, classrooms have limits.


In [11]:
df_section_room = df_section.merge(df_room, left_on="roomid", right_on="rid")
df_section_room = df_section_room[
    df_section_room["capacity_x"] <= df_section_room["capacity_y"]
]
df_section = df_section[df_section["sid"].isin(df_section_room["sid"])]

#### 8. Courses must be taught in the correct year and correct semester.


In [12]:
df_section_class = df_section.merge(df_class, on="cid")
years_x = pd.to_numeric(df_section_class["years_x"], errors="coerce")

# Boolean Conditions
First_semester = (
    (df_section_class["term"] == "First Semester")
    | (df_section_class["term"] == "First Semester, Second Semester")
) & (df_section_class["semester"] == "Fall")
Second_semester = (
    (df_section_class["term"] == "Second Semester")
    | (df_section_class["term"] == "First Semester, Second Semester")
) & (df_section_class["semester"] == "Spring")
According_Demand = (df_section_class["term"] == "According to Demand") & (
    (df_section_class["semester"] == "Fall")
    | (df_section_class["semester"] == "Spring")
    | (df_section_class["semester"] == "V1")
    | (df_section_class["semester"] == "V2")
)
Even_year = (df_section_class["years_y"] == "Even Years") & ((years_x % 2) == 0)
Odd_year = (df_section_class["years_y"] == "Odd Years") & ((years_x % 2) != 0)
Every_Year = df_section_class["years_y"] == "Every Year"
According_Demand_Year = df_section_class["years_y"] == "According to Demand"

# Combine the boolean conditions into a single series
combined_conditions = (First_semester | Second_semester | According_Demand) & (
    Even_year | Odd_year | Every_Year | According_Demand_Year
)

# Ensure the combined_conditions series has the same index as df_section_class
combined_conditions = combined_conditions.reindex(df_section_class.index)

# Filter the sections based on the combined boolean conditions
df_section_class = df_section_class[~combined_conditions]
# Update the section dataframe
df_section = df_section[~df_section["sid"].isin(df_section_class["sid"])]
display(df_section_class)

Unnamed: 0,sid,roomid,cid,mid,semester,years_x,capacity,cname,ccode,cdesc,term,years_y,cred,csyllabus
2,6,19,21,25,V2,2021,40,CIIC,5045,Automata and Formal Languages,First Semester,Every Year,3,https://www.uprm.edu/cse/wp-content/uploads/si...
5,13,22,21,1,Spring,2022,35,CIIC,5045,Automata and Formal Languages,First Semester,Every Year,3,https://www.uprm.edu/cse/wp-content/uploads/si...
10,20,14,36,22,Fall,2024,29,INSO,5118,Software Engineering Project Management,First Semester,Odd Years,3,https://www.uprm.edu/cse/wp-content/uploads/si...
11,23,25,24,0,Fall,2018,39,CIIC,5130,Cloud Computing Infrastructures,First Semester,Odd Years,3,https://www.uprm.edu/cse/wp-content/uploads/si...
12,28,10,34,3,Spring,2025,21,INSO,4151,Software Engineering Project I,First Semester,Every Year,3,https://www.uprm.edu/cse/wp-content/uploads/si...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
342,1255,10,20,25,Fall,2021,20,CIIC,5029,Compilers Development,Second Semester,Odd Years,3,https://www.uprm.edu/cse/wp-content/uploads/si...
345,1271,22,7,20,V2,2017,16,CIIC,4025,Analysis and Design of Algorithms,Second Semester,Every Year,3,https://www.uprm.edu/cse/wp-content/uploads/si...
350,1293,22,18,25,Fall,2019,20,CIIC,5018,Cryptography and Network Security,Second Semester,Even Years,3,https://www.uprm.edu/cse/wp-content/uploads/si...
351,1298,1,34,0,Spring,2023,69,INSO,4151,Software Engineering Project I,First Semester,Every Year,3,https://www.uprm.edu/cse/wp-content/uploads/si...


#### 9. Sections must be taught in a valid classroom and meeting, and the class must exist.


In [13]:
df_section = df_section[df_section["roomid"].isin(df_room["rid"])]
df_section = df_section[df_section["mid"].isin(df_meeting["mid"])]
df_section = df_section[df_section["cid"].isin(df_class["cid"])]

#### 10. Delete all section with Dummy class as Foreign Key


In [14]:
dummy_class_ids = df_class[
    df_class["cname"] == "Authorization from the Director of the Department"
]["cid"].tolist()

df_section = df_section[~df_section["cid"].isin(dummy_class_ids)]

In [15]:
# Dsplay Cleaned Section DataFrame
print(f"Length of df_section: {len(df_section)}")
print(f"Length of df_class: {len(df_class)}")
print(f"Length of df_meeting: {len(df_meeting)}")
print(f"Length of df_room: {len(df_room)}")
print(f"Length of df_requisite: {len(df_requisite)}")
print(f"Total length of all dataframes: {len(df_section) + len(df_class) + len(df_meeting) + len(df_room) + len(df_requisite)}")

display(df_section)
display(df_class)
display(df_meeting)
display(df_room)
display(df_requisite)

Length of df_section: 135
Length of df_class: 36
Length of df_meeting: 18
Length of df_room: 25
Length of df_requisite: 59
Total length of all dataframes: 273


Unnamed: 0,sid,roomid,cid,mid,semester,years,capacity
9,9,24,26,4,Fall,2025,25
12,12,14,27,19,Fall,2022,28
15,15,11,15,6,V2,2020,30
16,16,2,35,2,Spring,2019,18
17,17,1,3,17,Spring,2021,16
...,...,...,...,...,...,...,...
1276,1276,22,28,13,Spring,2017,16
1281,1281,20,2,17,Spring,2023,16
1289,1289,24,30,5,Fall,2018,15
1308,1308,16,6,8,Spring,2025,23


Unnamed: 0,cid,cname,ccode,cdesc,term,years,cred,csyllabus
0,2,CIIC,3015,Introduction to Computer Programming I,"First Semester, Second Semester",Every Year,4,https://www.uprm.edu/cse/wp-content/uploads/si...
1,3,CIIC,3075,Foundations of Computing,"First Semester, Second Semester",Every Year,3,https://www.uprm.edu/cse/wp-content/uploads/si...
2,4,CIIC,3081,Computer Architecture I,First Semester,Every Year,3,https://www.uprm.edu/cse/wp-content/uploads/si...
3,5,CIIC,4010,Advanced Programming,"First Semester, Second Semester",Every Year,4,https://www.uprm.edu/cse/wp-content/uploads/si...
4,6,CIIC,4020,Data Structures,"First Semester, Second Semester",Every Year,4,https://www.uprm.edu/cse/wp-content/uploads/si...
5,7,CIIC,4025,Analysis and Design of Algorithms,Second Semester,Every Year,3,https://www.uprm.edu/cse/wp-content/uploads/si...
6,8,CIIC,4030,Programming Languages,First Semester,Every Year,3,https://www.uprm.edu/cse/wp-content/uploads/si...
7,9,CIIC,4050,Operating Systems,"First Semester, Second Semester",Every Year,4,https://www.uprm.edu/cse/wp-content/uploads/si...
8,10,CIIC,4151,Senior Design Project I,First Semester,Every Year,3,https://www.uprm.edu/cse/wp-content/uploads/si...
9,11,CIIC,4060,Database Systems,Second Semester,Every Year,3,https://www.uprm.edu/cse/wp-content/uploads/si...


Unnamed: 0,mid,ccode,starttime,endtime,cdays
0,1,16,07:30:00,08:45:00,MJ
1,2,26,09:00:00,10:15:00,MJ
3,4,46,12:30:00,13:45:00,MJ
4,5,56,14:00:00,15:15:00,MJ
5,6,66,15:30:00,16:45:00,MJ
6,7,76,17:00:00,18:15:00,MJ
7,8,86,18:30:00,19:45:00,MJ
8,9,96,20:00:00,21:15:00,MJ
10,11,10,07:30:00,08:20:00,LWV
11,12,20,08:30:00,09:20:00,LWV


Unnamed: 0,rid,building,room_number,capacity
0,1,Stefani,113,120
1,2,Stefani,114,25
2,3,Stefani,121,30
3,4,Stefani,215,30
4,5,Stefani,226,40
5,6,Stefani,207,30
6,7,Stefani,214,30
7,8,Stefani,322,30
8,9,Stefani,317,28
9,10,Stefani,330,30


Unnamed: 0,classid,reqid,prereq
0,3,2,1
1,4,2,1
2,5,2,1
3,6,5,1
4,6,3,1
5,7,6,1
6,8,6,1
7,9,6,1
8,9,13,1
9,10,7,1
