# Dataframe Processing

In [3]:
# Imports used throughout the notebook
import os
import pandas as pd
import numpy as np

In [4]:
# Debug aid: list the entries of the current working directory.
# The relative folders used later in the notebook ('output/', 'message_requests/') are resolved against it.
files_in_current_directory = os.listdir('.')
print(files_in_current_directory)

['gradesync_sheets_to_df.ipynb', 'message_requests', 'df_to_message_requests.py', 'generate_assignment_deadlines.ipynb', '.DS_Store', 'old_files', 'requirements.txt', 'config', 'shared_data', 'df_to_message_requests_single.py', 'df_processing_manan_output.ipynb', 'output', '__pycache__', 'gradesync_to_df.ipynb', 'output_files.csv', 'deadlines_temp.ipynb', 'sample_data', '.gitignore', '.env', 'notification_frequency_temp.ipynb', 'gradesync_to_df.py', 'df_processing.ipynb', 'main.py']


In [5]:
# Read a CSV file from a data folder (the pipeline's 'output/' folder by default) into a dataframe
def get_data(file_name, folder='output/'):
    """Load a CSV file into a pandas DataFrame.

    Parameters
    ----------
    file_name : str
        Name of the CSV file, including the ``.csv`` extension.
    folder : str, optional
        Directory that contains the file. Defaults to ``'output/'``, the
        folder this function has always read from, so existing one-argument
        calls behave exactly as before.

    Returns
    -------
    pandas.DataFrame
        The parsed contents of ``folder/file_name``.

    Raises
    ------
    FileNotFoundError
        Propagated from ``pandas.read_csv`` when the file does not exist.
    """
    file_path = os.path.join(folder, file_name)
    return pd.read_csv(file_path)

## 1. Open the files and do pre-processing

In [6]:
# Load the export of a single assignment (get_data reads it from the 'output/' folder) and display it.
# NOTE(review): the 'xe2x84xa2' in the file name looks like an escaped byte sequence that lost its
# backslashes somewhere upstream — confirm against the code that writes these files.
assignment_df = get_data('Project 1: Wordlexe2x84xa2-lite.csv')
assignment_df

Unnamed: 0,assignment,first name,last name,sid,email,status,submission time,lateness (h:m:s)
0,Project 1: Wordlexe2x84xa2-lite,Malachite,McEvoy,3036547996,malachitemcevoy@berkeley.edu,Graded,2025-01-22 14:36:02 -0800,0:00:00
1,Project 1: Wordlexe2x84xa2-lite,Jiuxi,Wang,3039900360,jiuxi_wang@berkeley.edu,Graded,2025-01-30 18:06:41 -0800,0:00:00
2,Project 1: Wordlexe2x84xa2-lite,Qiqing,Deng,3040948524,dqq22071011@berkeley.edu,Graded,2025-01-30 18:06:41 -0800,0:00:00
3,Project 1: Wordlexe2x84xa2-lite,Jialing,Zhou,3040940542,jialingzhou@berkeley.edu,Graded,2025-02-02 12:35:45 -0800,0:00:00
4,Project 1: Wordlexe2x84xa2-lite,Yutong,Sun,3040864726,sun0226@berkeley.edu,Graded,2025-02-02 12:35:45 -0800,0:00:00
...,...,...,...,...,...,...,...,...
63,Project 1: Wordlexe2x84xa2-lite,Kaixing,Zhang,3040937812,keson@berkeley.edu,Missing,,
64,Project 1: Wordlexe2x84xa2-lite,Erin,ZHOU,3040947731,uerinzhou@berkeley.edu,Missing,,
65,Project 1: Wordlexe2x84xa2-lite,Efe,Atli,23772813,eatli@berkeley.edu,Missing,,
66,Project 1: Wordlexe2x84xa2-lite,Felipe,Ruiz-Tagle,3040705060,felipe.ruiztagle@berkeley.edu,Missing,,


## Integrating with deadlines

In [9]:
# Load the assignment deadlines from deadlines.csv
deadlines_df = get_data('deadlines.csv')
deadlines_df

# Disabled alternative: read random_deadlines.csv from the shared_data/ folder instead
# file_name = 'random_deadlines.csv'
# folder = 'shared_data/'
# file_path = os.path.join(folder, file_name)
# deadlines_df = pd.read_csv(file_path)
# deadlines_df

Unnamed: 0,assignment,due
0,Presemester Survey,2025-01-31 23:59:59
1,Project 1: Wordlexe2x84xa2-lite,2025-02-05 23:59:59
2,Project 2: Spelling Bee,2025-02-24 23:59:59
3,Project 3: 2048,2025-03-17 23:59:59
4,Project 4: Artifact + Documentation,2025-03-31 23:59:59
5,Project 4: Comments + Peer Feedback,2025-04-07 23:59:59
6,Project 5: Final Project Proposal,2025-04-14 23:59:59
7,Project 5: Final Project Submission,2025-04-14 23:59:59
8,Project 6: Final Project Proposal,2025-04-14 23:59:59
9,Project 6: Final Project Submission,2025-04-14 23:59:59


In [8]:
# For each row of assignment_df, look up the due date by matching the row's assignment name against deadlines_df
def get_due_date(row, deadlines=None):
    """Look up the due date of the assignment named in ``row``.

    Parameters
    ----------
    row : mapping or pandas.Series
        Must provide an ``'assignment'`` entry.
    deadlines : pandas.DataFrame, optional
        Table with ``'assignment'`` and ``'due'`` columns. When omitted, the
        notebook-level ``deadlines_df`` is used, so the function can still be
        passed directly to ``DataFrame.apply(..., axis=1)``.

    Returns
    -------
    object
        The ``'due'`` value of the first row whose ``'assignment'`` equals the
        row's assignment, or ``pd.NaT`` when no such row exists.
    """
    if deadlines is None:
        # Fall back to the notebook-level deadlines table.
        deadlines = deadlines_df

    project_name = row['assignment']
    matching_due = deadlines.loc[deadlines['assignment'] == project_name, 'due']

    # Without this guard an assignment missing from the deadlines table fails
    # with an opaque IndexError from `.values[0]`. NaT marks the due date as
    # unknown instead.
    if matching_due.empty:
        return pd.NaT

    return matching_due.values[0]

assignment_df['due date'] = assignment_df.apply(get_due_date, axis=1)
assignment_df.head()

KeyError: 'assignment'

In [57]:
# Parse the submission time and due date columns as datetimes; values that cannot be parsed become NaT (errors='coerce').
# NOTE(review): 'submission time' is parsed with utc=True (timezone-aware) while 'due date' is parsed without a
# timezone (naive) — presumably the two columns need a common timezone handling before they are compared directly; confirm.
assignment_df['submission time'] = pd.to_datetime(assignment_df['submission time'], errors='coerce', utc=True)
assignment_df['due date'] = pd.to_datetime(assignment_df['due date'], errors='coerce')
assignment_df.head()

Unnamed: 0,assignment,first name,last name,sid,email,status,submission time,lateness (h:m:s),due date
0,Project 1: Wordlexe2x84xa2-lite,Malachite,McEvoy,3036547996,malachitemcevoy@berkeley.edu,Graded,2025-01-22 22:36:02+00:00,0:00:00,2025-02-05 23:59:59
1,Project 1: Wordlexe2x84xa2-lite,Jiuxi,Wang,3039900360,jiuxi_wang@berkeley.edu,Graded,2025-01-31 02:06:41+00:00,0:00:00,2025-02-05 23:59:59
2,Project 1: Wordlexe2x84xa2-lite,Qiqing,Deng,3040948524,dqq22071011@berkeley.edu,Graded,2025-01-31 02:06:41+00:00,0:00:00,2025-02-05 23:59:59
3,Project 1: Wordlexe2x84xa2-lite,Jialing,Zhou,3040940542,jialingzhou@berkeley.edu,Graded,2025-02-02 20:35:45+00:00,0:00:00,2025-02-05 23:59:59
4,Project 1: Wordlexe2x84xa2-lite,Yutong,Sun,3040864726,sun0226@berkeley.edu,Graded,2025-02-02 20:35:45+00:00,0:00:00,2025-02-05 23:59:59


## Integrating with Notification Frequency

In [58]:
assignment_df.head()

Unnamed: 0,assignment,first name,last name,sid,email,status,submission time,lateness (h:m:s),due date
0,Project 1: Wordlexe2x84xa2-lite,Malachite,McEvoy,3036547996,malachitemcevoy@berkeley.edu,Graded,2025-01-22 22:36:02+00:00,0:00:00,2025-02-05 23:59:59
1,Project 1: Wordlexe2x84xa2-lite,Jiuxi,Wang,3039900360,jiuxi_wang@berkeley.edu,Graded,2025-01-31 02:06:41+00:00,0:00:00,2025-02-05 23:59:59
2,Project 1: Wordlexe2x84xa2-lite,Qiqing,Deng,3040948524,dqq22071011@berkeley.edu,Graded,2025-01-31 02:06:41+00:00,0:00:00,2025-02-05 23:59:59
3,Project 1: Wordlexe2x84xa2-lite,Jialing,Zhou,3040940542,jialingzhou@berkeley.edu,Graded,2025-02-02 20:35:45+00:00,0:00:00,2025-02-05 23:59:59
4,Project 1: Wordlexe2x84xa2-lite,Yutong,Sun,3040864726,sun0226@berkeley.edu,Graded,2025-02-02 20:35:45+00:00,0:00:00,2025-02-05 23:59:59


In [59]:
notification_frequency_df = get_data('notification_frequency.csv')
notification_frequency_df

Unnamed: 0,first name,last name,sid,notification_frequency
0,Anita,Qian,3040005244,3
1,Malachite,McEvoy,3036547996,4
2,Abigail,Johnson,3040928660,5
3,Jack,Hulse,3040081294,2
4,Kelly,Chou,3038568093,1
...,...,...,...,...
60,Kaixing,Zhang,3040937812,4
61,Erin,ZHOU,3040947731,5
62,Efe,Atli,23772813,2
63,Felipe,Ruiz-Tagle,3040705060,1


In [60]:
# Report every student in assignment_df that has no row in notification_frequency_df
# with the same first name and last name.
for first_name, last_name in zip(assignment_df['first name'], assignment_df['last name']):
    same_first = notification_frequency_df['first name'] == first_name
    same_last = notification_frequency_df['last name'] == last_name

    if not (same_first & same_last).any():
        print(f"No match for: {first_name} {last_name}")

No match for: Ella Pockman
No match for: Fangyin Ge
No match for: Xueyuan Cheng


In [61]:
# Look up a student's notification frequency by matching first and last name against the notification frequency table
def get_notification_frequency(row, frequencies=None, default=3):
    """Return the notification frequency recorded for the student in ``row``.

    Parameters
    ----------
    row : mapping or pandas.Series
        Must provide ``'first name'`` and ``'last name'`` entries.
    frequencies : pandas.DataFrame, optional
        Table with ``'first name'``, ``'last name'`` and
        ``'notification_frequency'`` columns. When omitted, the notebook-level
        ``notification_frequency_df`` is used, so the function can still be
        passed directly to ``DataFrame.apply(..., axis=1)``.
    default : optional
        Value returned when no row matches both names. Defaults to ``3``
        (the value previously hard-coded here).

    Returns
    -------
    object
        The ``'notification_frequency'`` value of the first matching row, or
        ``default`` when there is no match.
    """
    if frequencies is None:
        # Fall back to the notebook-level preferences table.
        frequencies = notification_frequency_df

    same_student = (
        (frequencies['first name'] == row['first name'])
        & (frequencies['last name'] == row['last name'])
    )
    match = frequencies.loc[same_student, 'notification_frequency']

    if match.empty:
        return default
    return match.values[0]

In [62]:
# Create a new column called 'notification_frequency' in assignment_df
assignment_df['notification_frequency'] = assignment_df.apply(get_notification_frequency, axis=1)
assignment_df

Unnamed: 0,assignment,first name,last name,sid,email,status,submission time,lateness (h:m:s),due date,notification_frequency
0,Project 1: Wordlexe2x84xa2-lite,Malachite,McEvoy,3036547996,malachitemcevoy@berkeley.edu,Graded,2025-01-22 22:36:02+00:00,0:00:00,2025-02-05 23:59:59,4
1,Project 1: Wordlexe2x84xa2-lite,Jiuxi,Wang,3039900360,jiuxi_wang@berkeley.edu,Graded,2025-01-31 02:06:41+00:00,0:00:00,2025-02-05 23:59:59,1
2,Project 1: Wordlexe2x84xa2-lite,Qiqing,Deng,3040948524,dqq22071011@berkeley.edu,Graded,2025-01-31 02:06:41+00:00,0:00:00,2025-02-05 23:59:59,3
3,Project 1: Wordlexe2x84xa2-lite,Jialing,Zhou,3040940542,jialingzhou@berkeley.edu,Graded,2025-02-02 20:35:45+00:00,0:00:00,2025-02-05 23:59:59,2
4,Project 1: Wordlexe2x84xa2-lite,Yutong,Sun,3040864726,sun0226@berkeley.edu,Graded,2025-02-02 20:35:45+00:00,0:00:00,2025-02-05 23:59:59,5
...,...,...,...,...,...,...,...,...,...,...
63,Project 1: Wordlexe2x84xa2-lite,Kaixing,Zhang,3040937812,keson@berkeley.edu,Missing,NaT,,2025-02-05 23:59:59,4
64,Project 1: Wordlexe2x84xa2-lite,Erin,ZHOU,3040947731,uerinzhou@berkeley.edu,Missing,NaT,,2025-02-05 23:59:59,5
65,Project 1: Wordlexe2x84xa2-lite,Efe,Atli,23772813,eatli@berkeley.edu,Missing,NaT,,2025-02-05 23:59:59,2
66,Project 1: Wordlexe2x84xa2-lite,Felipe,Ruiz-Tagle,3040705060,felipe.ruiztagle@berkeley.edu,Missing,NaT,,2025-02-05 23:59:59,1


In [63]:
# show assignment_df where notification_frequency = nan
assignment_df[assignment_df['notification_frequency'].isna()]


Unnamed: 0,assignment,first name,last name,sid,email,status,submission time,lateness (h:m:s),due date,notification_frequency


In [64]:
# Convert the notification_frequency column from a number of days to a timedelta.
# pd.to_timedelta converts the whole column at once and accepts a column that is
# already timedelta-typed, so this cell can be re-run without failing.
assignment_df['notification_frequency'] = pd.to_timedelta(assignment_df['notification_frequency'], unit='D')
assignment_df.head()

Unnamed: 0,assignment,first name,last name,sid,email,status,submission time,lateness (h:m:s),due date,notification_frequency
0,Project 1: Wordlexe2x84xa2-lite,Malachite,McEvoy,3036547996,malachitemcevoy@berkeley.edu,Graded,2025-01-22 22:36:02+00:00,0:00:00,2025-02-05 23:59:59,4 days
1,Project 1: Wordlexe2x84xa2-lite,Jiuxi,Wang,3039900360,jiuxi_wang@berkeley.edu,Graded,2025-01-31 02:06:41+00:00,0:00:00,2025-02-05 23:59:59,1 days
2,Project 1: Wordlexe2x84xa2-lite,Qiqing,Deng,3040948524,dqq22071011@berkeley.edu,Graded,2025-01-31 02:06:41+00:00,0:00:00,2025-02-05 23:59:59,3 days
3,Project 1: Wordlexe2x84xa2-lite,Jialing,Zhou,3040940542,jialingzhou@berkeley.edu,Graded,2025-02-02 20:35:45+00:00,0:00:00,2025-02-05 23:59:59,2 days
4,Project 1: Wordlexe2x84xa2-lite,Yutong,Sun,3040864726,sun0226@berkeley.edu,Graded,2025-02-02 20:35:45+00:00,0:00:00,2025-02-05 23:59:59,5 days


In [65]:
assignment_df['notification_frequency'].dtypes

dtype('<m8[ns]')

## Date Comparison Logic

In [67]:
# Get the time remaining between "today" and the due date
def get_date_difference(row, today=None):
    """Return how far the due date in ``row`` lies after ``today``.

    Parameters
    ----------
    row : mapping or pandas.Series
        Must provide a ``'due date'`` entry that supports datetime arithmetic
        (e.g. a ``pandas.Timestamp``).
    today : datetime-like, optional
        The date to measure from. When omitted, a placeholder of "5 days
        before the due date" is used (the behaviour this notebook has always
        had), which makes the result a constant 5 days. Pass a real timestamp
        to get the actual remaining time, for example
        ``df.apply(get_date_difference, axis=1, today=pd.Timestamp.now())``.

    Returns
    -------
    timedelta-like
        ``row['due date'] - today``.
    """
    due_date = row['due date']

    if today is None:
        # Placeholder clock: pretend it is exactly 5 days before the deadline.
        today = due_date - pd.Timedelta(days=5)

    return due_date - today

# Add a 'date_diff' column by applying get_date_difference to every row of assignment_df
assignment_df['date_diff'] = assignment_df.apply(get_date_difference, axis=1)
# This mid-cell expression is not displayed; only the last expression of a cell is shown.
assignment_df.head()

# Record the dtype of the notification_frequency column of assignment_df
# (not used again in the visible part of the notebook)
notification_frequency_type = assignment_df['notification_frequency'].dtype

# Record the dtype of the date_diff column of assignment_df
# (not used again in the visible part of the notebook)
date_difference_type = assignment_df['date_diff'].dtype

# Flag the rows where the number of whole days in notification_frequency equals the number of whole days in date_diff
assignment_df['is_equal'] = assignment_df['notification_frequency'].dt.days == assignment_df['date_diff'].dt.days
assignment_df.head()

Unnamed: 0,assignment,first name,last name,sid,email,status,submission time,lateness (h:m:s),due date,notification_frequency,date_diff,is_equal
0,Project 1: Wordlexe2x84xa2-lite,Malachite,McEvoy,3036547996,malachitemcevoy@berkeley.edu,Graded,2025-01-22 22:36:02+00:00,0:00:00,2025-02-05 23:59:59,4 days,5 days,False
1,Project 1: Wordlexe2x84xa2-lite,Jiuxi,Wang,3039900360,jiuxi_wang@berkeley.edu,Graded,2025-01-31 02:06:41+00:00,0:00:00,2025-02-05 23:59:59,1 days,5 days,False
2,Project 1: Wordlexe2x84xa2-lite,Qiqing,Deng,3040948524,dqq22071011@berkeley.edu,Graded,2025-01-31 02:06:41+00:00,0:00:00,2025-02-05 23:59:59,3 days,5 days,False
3,Project 1: Wordlexe2x84xa2-lite,Jialing,Zhou,3040940542,jialingzhou@berkeley.edu,Graded,2025-02-02 20:35:45+00:00,0:00:00,2025-02-05 23:59:59,2 days,5 days,False
4,Project 1: Wordlexe2x84xa2-lite,Yutong,Sun,3040864726,sun0226@berkeley.edu,Graded,2025-02-02 20:35:45+00:00,0:00:00,2025-02-05 23:59:59,5 days,5 days,True


## Create Missing Students DataFrame

In [68]:
"""
What will the code do:			
1. Iterate through every single row in this table			
2. Student SID has not submitted assignment, status = 'missing' from student_data_one_assignment AND today_date == due_date - notification_freq			
3. Append that student, row, assignment to the message_requests temporary dataframe			
4. Continue iterating through every single one			
"""

"\nWhat will the code do:\t\t\t\n1. Iterate through every single row in this table\t\t\t\n2. Student SID has not submitted assignment, status = 'missing' from student_data_one_assignment AND today_date == due_date - notification_freq\t\t\t\n3. Append that student, row, assignment to the message_requests temporary dataframe\t\t\t\n4. Continue iterating through every single one\t\t\t\n"

In [69]:
# Keep the rows whose status is 'Missing' and whose is_equal flag is True
status_is_missing = assignment_df['status'].eq('Missing')
flag_is_true = assignment_df['is_equal'].eq(True)
missing_students_df = assignment_df.loc[status_is_missing & flag_is_true]
missing_students_df

Unnamed: 0,assignment,first name,last name,sid,email,status,submission time,lateness (h:m:s),due date,notification_frequency,date_diff,is_equal
48,Project 1: Wordlexe2x84xa2-lite,test3,test3,,test3@test.com,Missing,NaT,,2025-02-05 23:59:59,5 days,5 days,True
53,Project 1: Wordlexe2x84xa2-lite,Leona,Katibah,,leonakatibah@berkeley.edu,Missing,NaT,,2025-02-05 23:59:59,5 days,5 days,True
57,Project 1: Wordlexe2x84xa2-lite,Qurratul,Sanjida,,qurratulain.508@berkeley.edu,Missing,NaT,,2025-02-05 23:59:59,5 days,5 days,True
64,Project 1: Wordlexe2x84xa2-lite,Erin,ZHOU,3040947731.0,uerinzhou@berkeley.edu,Missing,NaT,,2025-02-05 23:59:59,5 days,5 days,True
67,Project 1: Wordlexe2x84xa2-lite,Jerome,Martel,,jerome.martel.t@berkeley.edu,Missing,NaT,,2025-02-05 23:59:59,5 days,5 days,True


## Create the Message Requests

In [70]:
# Build the reminder text for one student row
def create_message(row):
    """Compose the reminder message for a student with a missing assignment.

    Parameters
    ----------
    row : mapping or pandas.Series
        Must provide ``'assignment'``, ``'first name'`` and
        ``'notification_frequency'``. The latter is either a timedelta-like
        value exposing ``.days`` or a plain number of days.

    Returns
    -------
    str
        The message, with the lead time written as a whole number of days
        ("1 day", "5 days").
    """
    assignment_name = row['assignment']
    first_name = row["first name"]
    lead_time = row['notification_frequency']

    # Interpolating a Timedelta directly renders its full string form
    # ("5 days 00:00:00"), so only the whole-day count is used in the text.
    days_left = lead_time.days if hasattr(lead_time, 'days') else int(lead_time)
    day_word = "day" if days_left == 1 else "days"

    return (
        f"Dear {first_name}, your {assignment_name} assignment is missing and it is due in "
        f"{days_left} {day_word}. Please submit it as soon as possible."
    )

In [71]:
# Add a 'message_requests' column holding the text built by create_message for each row.
# Working on a copy means the new column is not written into missing_students_df, which is a filtered selection of assignment_df.
message_requests_df = missing_students_df.copy()
message_requests_df['message_requests'] = message_requests_df.apply(create_message, axis=1)
message_requests_df.head()


Unnamed: 0,assignment,first name,last name,sid,email,status,submission time,lateness (h:m:s),due date,notification_frequency,date_diff,is_equal,message_requests
48,Project 1: Wordlexe2x84xa2-lite,test3,test3,,test3@test.com,Missing,NaT,,2025-02-05 23:59:59,5 days,5 days,True,"Dear test3, your Project 1: Wordlexe2x84xa2-li..."
53,Project 1: Wordlexe2x84xa2-lite,Leona,Katibah,,leonakatibah@berkeley.edu,Missing,NaT,,2025-02-05 23:59:59,5 days,5 days,True,"Dear Leona, your Project 1: Wordlexe2x84xa2-li..."
57,Project 1: Wordlexe2x84xa2-lite,Qurratul,Sanjida,,qurratulain.508@berkeley.edu,Missing,NaT,,2025-02-05 23:59:59,5 days,5 days,True,"Dear Qurratul, your Project 1: Wordlexe2x84xa2..."
64,Project 1: Wordlexe2x84xa2-lite,Erin,ZHOU,3040947731.0,uerinzhou@berkeley.edu,Missing,NaT,,2025-02-05 23:59:59,5 days,5 days,True,"Dear Erin, your Project 1: Wordlexe2x84xa2-lit..."
67,Project 1: Wordlexe2x84xa2-lite,Jerome,Martel,,jerome.martel.t@berkeley.edu,Missing,NaT,,2025-02-05 23:59:59,5 days,5 days,True,"Dear Jerome, your Project 1: Wordlexe2x84xa2-l..."


In [72]:
# Simplify this down to the information that is necessary
message_requests_df_condensed = message_requests_df[['first name', 'last name', 'sid', 'email', 'assignment', 'message_requests']]
message_requests_df_condensed.head()

Unnamed: 0,first name,last name,sid,email,assignment,message_requests
48,test3,test3,,test3@test.com,Project 1: Wordlexe2x84xa2-lite,"Dear test3, your Project 1: Wordlexe2x84xa2-li..."
53,Leona,Katibah,,leonakatibah@berkeley.edu,Project 1: Wordlexe2x84xa2-lite,"Dear Leona, your Project 1: Wordlexe2x84xa2-li..."
57,Qurratul,Sanjida,,qurratulain.508@berkeley.edu,Project 1: Wordlexe2x84xa2-lite,"Dear Qurratul, your Project 1: Wordlexe2x84xa2..."
64,Erin,ZHOU,3040947731.0,uerinzhou@berkeley.edu,Project 1: Wordlexe2x84xa2-lite,"Dear Erin, your Project 1: Wordlexe2x84xa2-lit..."
67,Jerome,Martel,,jerome.martel.t@berkeley.edu,Project 1: Wordlexe2x84xa2-lite,"Dear Jerome, your Project 1: Wordlexe2x84xa2-l..."


In [73]:
# Name the CSV file after the assignment of the first message request, then write
# message_requests_df (all columns, not the condensed view) to the current working directory.
# NOTE(review): `.iloc[0]` raises IndexError when message_requests_df_condensed has no rows — confirm
# whether an empty set of message requests can reach this cell.
csv_file_name = "message_requests_{}.csv".format(message_requests_df_condensed['assignment'].iloc[0])
message_requests_df.to_csv(csv_file_name, index=False)

In [74]:
# Save the condensed dataframe to the message_requests/ folder
message_requests_folder = 'message_requests/'
# exist_ok=True creates the folder only when it is missing, without a separate
# exists() check that another process could invalidate in between.
os.makedirs(message_requests_folder, exist_ok=True)

# Build the destination path once so the file written and the path reported cannot drift apart.
message_requests_path = os.path.join(message_requests_folder, csv_file_name)
message_requests_df_condensed.to_csv(message_requests_path, index=False)
print(f"Message requests saved to {message_requests_path}")

Message requests saved to message_requests/message_requests_Project 1: Wordlexe2x84xa2-lite.csv


List of things to be done:
1. notification_frequency: we are generating a synthetic set of dates at random. Instead, we should ideally integrate each student's notification-frequency preferences (which come from GradeView).
  - Read from the notification_frequency.csv file for example, but make sure that the IDs are correct and similar to our test case of project 3
2. Integrate the temporary deadlines with the set of files
3. Right now, this is all done for a SINGLE project. The next job is to generalize it, which means iterating through the output folder, and for each project / lab in the output folder, work with that dataframe and the deadlines and the notification_frequency to get this going
4. Create an output of a set of message_request dataframes. This could be a mapping of the assignment name --> message request dataframe
  - Exploded dataframe