# Dataframe Processing

In [1]:
# Imports used to locate and load the files in the sample_data folder
import os
import pandas as pd

In [2]:
# List the entries of the current working directory (the output below includes 'sample_data')
files_in_current_directory = os.listdir('.')
print(files_in_current_directory)

['gradesync_sheets_to_df.ipynb', 'message_requests', 'message_requests.csv', '.DS_Store', 'requirements.txt', 'config', 'output', 'sample_data', '.gitignore', '.env', 'df_processing copy.ipynb', 'gradesync_to_df.py', 'df_processing.ipynb']


In [3]:
# Read the data from the sample_data folder and convert to a dataframe
def get_data(file_name):
    """Load a CSV file from the ``sample_data/`` folder into a DataFrame.

    Args:
        file_name: Name of the CSV file inside ``sample_data/``.

    Returns:
        The DataFrame that ``pd.read_csv`` produces for that file.
    """
    # The folder is relative, so it is resolved against the current working directory
    csv_path = os.path.join('sample_data/', file_name)
    return pd.read_csv(csv_path)

## 1. Open the files and do pre-processing

In [16]:
# Load the Project 3 grades export from sample_data/ and preview its first rows.
# Note: the first-name column arrives with the header "b'First Name" (see the preview below).
project3_df = get_data('CS10_Sp25_GradeScope Grades - Project 3_2048.csv')
project3_df.head()

Unnamed: 0,b'First Name,Last Name,SID,Email,Sections,Total Score,Max Points,Status,Submission ID,Submission Time,Lateness (H:M:S),View Count,Submission Count,1: Autograder (35.0 pts)
0,Anita,Qian,3040005244,anita_qian@berkeley.edu,compsci-10-dis-801-in-person,35.0,35,Graded,314621931.0,2025-03-05 10:45:24 -0800,0:00:00,0.0,1.0,35.0
1,Malachite,McEvoy,3036547996,malachitemcevoy@berkeley.edu,compsci-10-dis-801-in-person,35.0,35,Graded,314680106.0,2025-03-05 13:17:20 -0800,0:00:00,0.0,2.0,35.0
2,Abigail,Johnson,3040928660,abigail.johnson@berkeley.edu,compsci-10-dis-801-in-person,35.0,35,Graded,314713498.0,2025-03-05 14:46:33 -0800,0:00:00,0.0,1.0,35.0
3,Jack,Hulse,3040081294,johnhulse@berkeley.edu,compsci-10-dis-801-in-person,35.0,35,Graded,314713498.0,2025-03-05 14:46:33 -0800,0:00:00,0.0,1.0,35.0
4,Kelly,Chou,3038568093,kelly0919@berkeley.edu,compsci-10-dis-801-in-person,30.35,35,Graded,314886910.0,2025-03-06 00:34:41 -0800,0:00:00,0.0,1.0,30.35


## Preprocessing Data

In [19]:
# Build the trimmed Project 3 frame in one chain:
#   1. keep only the student-identity, status and timing columns
#   2. label every row with the assignment name 'Proj 3: 2048'
#   3. move 'Assignment' to the front
#   4. rename "b'First Name" to 'First Name'
#   5. lowercase the column names and strip surrounding whitespace
project3_filtered_df = (
    project3_df[["b'First Name", 'Last Name', 'SID', 'Email', 'Status', 'Submission Time', 'Lateness (H:M:S)']]
    .assign(Assignment='Proj 3: 2048')
    .loc[:, ['Assignment', "b'First Name", 'Last Name', 'SID', 'Email', 'Status', 'Submission Time', 'Lateness (H:M:S)']]
    .rename(columns={"b'First Name": 'First Name'})
    .rename(columns=lambda column_name: column_name.lower().strip())
)

project3_filtered_df.head()

Unnamed: 0,assignment,first name,last name,sid,email,status,submission time,lateness (h:m:s)
0,Proj 3: 2048,Anita,Qian,3040005244,anita_qian@berkeley.edu,Graded,2025-03-05 10:45:24 -0800,0:00:00
1,Proj 3: 2048,Malachite,McEvoy,3036547996,malachitemcevoy@berkeley.edu,Graded,2025-03-05 13:17:20 -0800,0:00:00
2,Proj 3: 2048,Abigail,Johnson,3040928660,abigail.johnson@berkeley.edu,Graded,2025-03-05 14:46:33 -0800,0:00:00
3,Proj 3: 2048,Jack,Hulse,3040081294,johnhulse@berkeley.edu,Graded,2025-03-05 14:46:33 -0800,0:00:00
4,Proj 3: 2048,Kelly,Chou,3038568093,kelly0919@berkeley.edu,Graded,2025-03-06 00:34:41 -0800,0:00:00


## Integrating with deadlines

In [13]:
 # Get the data from deadlines.csv
# The preview below shows the columns: id, assignment, due, done, time_submitted
deadlines_df = get_data('deadlines.csv')
deadlines_df.head()

Unnamed: 0,id,assignment,due,done,time_submitted
0,0,Presemester Survey,2025-01-31T23:59:59,0,
1,1,Proj 1: Worldle,2025-02-05T23:59:59,0,
2,2,Proj 2: Spelling Bee,2025-02-24T23:59:59,0,
3,3,Proj 3: 2048,2025-03-17T23:59:59,0,
4,4,Proj 4: Tech in Context,2025-03-31T23:59:59,0,


In [None]:

# Iterate through the project3_filtered_df, and for each row, add on the due date based on joining the project3_filtered_df and the deadlines_df
def get_due_date(row):
    """Look up the due date for the assignment named in ``row``.

    Args:
        row: A row (or any mapping) with an 'assignment' entry.

    Returns:
        The 'due' value of the first row of the module-level ``deadlines_df``
        whose 'assignment' equals ``row['assignment']``, or ``pd.NaT`` when no
        deadline is listed for that assignment.
    """
    # Get the project name from the row
    project_name = row['assignment']

    # Collect every 'due' value recorded for this assignment
    matching_due_dates = deadlines_df.loc[deadlines_df['assignment'] == project_name, 'due'].values

    # An assignment with no deadlines entry used to fail with an opaque
    # IndexError from `.values[0]`; return NaT so datetime code can treat it as missing.
    if len(matching_due_dates) == 0:
        return pd.NaT

    return matching_due_dates[0]

# Look up the due date row by row and store it in a new 'due date' column
project3_filtered_df['due date'] = project3_filtered_df.apply(get_due_date, axis=1)
project3_filtered_df.head()

Unnamed: 0,assignment,first name,last name,sid,email,status,submission time,lateness (h:m:s),due date
0,Proj 3: 2048,Anita,Qian,3040005244,anita_qian@berkeley.edu,Graded,2025-03-05 10:45:24 -0800,0:00:00,2025-03-17T23:59:59
1,Proj 3: 2048,Malachite,McEvoy,3036547996,malachitemcevoy@berkeley.edu,Graded,2025-03-05 13:17:20 -0800,0:00:00,2025-03-17T23:59:59
2,Proj 3: 2048,Abigail,Johnson,3040928660,abigail.johnson@berkeley.edu,Graded,2025-03-05 14:46:33 -0800,0:00:00,2025-03-17T23:59:59
3,Proj 3: 2048,Jack,Hulse,3040081294,johnhulse@berkeley.edu,Graded,2025-03-05 14:46:33 -0800,0:00:00,2025-03-17T23:59:59
4,Proj 3: 2048,Kelly,Chou,3038568093,kelly0919@berkeley.edu,Graded,2025-03-06 00:34:41 -0800,0:00:00,2025-03-17T23:59:59


In [None]:
# Convert the due date and submission time to datetime objects, in order to compare the dates for the date difference.
# errors='coerce' turns values that cannot be parsed into NaT instead of raising.
# NOTE(review): 'submission time' becomes timezone-aware (utc=True) while 'due date' stays timezone-naive;
# pandas will not compare or subtract aware and naive datetimes, so confirm the intended timezone of 'due'
# before comparing these two columns directly.

project3_filtered_df['submission time'] = pd.to_datetime(project3_filtered_df['submission time'], errors='coerce', utc=True)
project3_filtered_df['due date'] = pd.to_datetime(project3_filtered_df['due date'], errors='coerce')

project3_filtered_df.head()

Unnamed: 0,assignment,first name,last name,sid,email,status,submission time,lateness (h:m:s),due date
0,Proj 3: 2048,Anita,Qian,3040005244,anita_qian@berkeley.edu,Graded,2025-03-05 18:45:24+00:00,0:00:00,2025-03-17 23:59:59
1,Proj 3: 2048,Malachite,McEvoy,3036547996,malachitemcevoy@berkeley.edu,Graded,2025-03-05 21:17:20+00:00,0:00:00,2025-03-17 23:59:59
2,Proj 3: 2048,Abigail,Johnson,3040928660,abigail.johnson@berkeley.edu,Graded,2025-03-05 22:46:33+00:00,0:00:00,2025-03-17 23:59:59
3,Proj 3: 2048,Jack,Hulse,3040081294,johnhulse@berkeley.edu,Graded,2025-03-05 22:46:33+00:00,0:00:00,2025-03-17 23:59:59
4,Proj 3: 2048,Kelly,Chou,3038568093,kelly0919@berkeley.edu,Graded,2025-03-06 08:34:41+00:00,0:00:00,2025-03-17 23:59:59


## Integrating with Notification Frequency

In [None]:
# This is where we should get the input from GradeView from the student's configuration

import random

def random_timedelta(min_days=3, max_days=7):
    """Return a random whole-day ``pd.Timedelta``.

    Args:
        min_days: Smallest number of days that may be drawn (inclusive).
        max_days: Largest number of days that may be drawn (inclusive).

    Returns:
        A ``pd.Timedelta`` of ``n`` days, where ``n`` is drawn with
        ``random.randint(min_days, max_days)``. With the defaults this is
        3 to 7 days, as before.
    """
    # randint includes both endpoints
    num_days = random.randint(min_days, max_days)
    return pd.Timedelta(days=num_days)

# Create a new column called notification_frequency.
# The constant Timedelta(0) column only gives .apply one element per row to iterate over;
# every value is then replaced by an independent random_timedelta() draw (the lambda ignores x).

project3_filtered_df['notification_frequency'] = pd.Timedelta(days=0)
project3_filtered_df['notification_frequency'] = project3_filtered_df['notification_frequency'].apply(lambda x: random_timedelta())
project3_filtered_df.head()

Unnamed: 0,assignment,first name,last name,sid,email,status,submission time,lateness (h:m:s),due date,notification_frequency
0,Proj 3: 2048,Anita,Qian,3040005244,anita_qian@berkeley.edu,Graded,2025-03-05 18:45:24+00:00,0:00:00,2025-03-17 23:59:59,5 days
1,Proj 3: 2048,Malachite,McEvoy,3036547996,malachitemcevoy@berkeley.edu,Graded,2025-03-05 21:17:20+00:00,0:00:00,2025-03-17 23:59:59,4 days
2,Proj 3: 2048,Abigail,Johnson,3040928660,abigail.johnson@berkeley.edu,Graded,2025-03-05 22:46:33+00:00,0:00:00,2025-03-17 23:59:59,4 days
3,Proj 3: 2048,Jack,Hulse,3040081294,johnhulse@berkeley.edu,Graded,2025-03-05 22:46:33+00:00,0:00:00,2025-03-17 23:59:59,5 days
4,Proj 3: 2048,Kelly,Chou,3038568093,kelly0919@berkeley.edu,Graded,2025-03-06 08:34:41+00:00,0:00:00,2025-03-17 23:59:59,5 days


## Date Comparison Logic

In [None]:
# Create a test case where today's date is fixed at 2025-03-12 00:00:00, so that the number of
# whole days between it and the 2025-03-17 due date is 5, matching a notification frequency of 5 days

import datetime

today_date = datetime.datetime(2025, 3, 12, 0, 0, 0)
today_date

datetime.datetime(2025, 3, 12, 0, 0)

In [31]:
# Get the difference between the due date and the current date
def get_date_difference(row):
    """Return how far ``row``'s due date lies from the module-level ``today_date``.

    Args:
        row: A row (or any mapping) with a 'due date' entry.

    Returns:
        ``row['due date'] - today_date``.
    """
    return row['due date'] - today_date

# Compute, for every row, how far the due date lies from today_date
project3_filtered_df['date_diff'] = project3_filtered_df.apply(get_date_difference, axis=1)
project3_filtered_df.head()  # not the last expression of the cell, so by default nothing is displayed here

# Record the dtype of the notification_frequency column in project3_filtered_df
notification_frequency_type = project3_filtered_df['notification_frequency'].dtype

# Record the dtype of the date_diff column in project3_filtered_df
date_difference_type = project3_filtered_df['date_diff'].dtype

# A reminder is due when the whole-day parts match: .dt.days drops the hours/minutes/seconds,
# so a date_diff of "5 days 23:59:59" is compared as 5 days
project3_filtered_df['is_equal'] = project3_filtered_df['notification_frequency'].dt.days == project3_filtered_df['date_diff'].dt.days
project3_filtered_df.head()

Unnamed: 0,assignment,first name,last name,sid,email,status,submission time,lateness (h:m:s),due date,notification_frequency,date_diff,is_equal
0,Proj 3: 2048,Anita,Qian,3040005244,anita_qian@berkeley.edu,Graded,2025-03-05 18:45:24+00:00,0:00:00,2025-03-17 23:59:59,5 days,5 days 23:59:59,True
1,Proj 3: 2048,Malachite,McEvoy,3036547996,malachitemcevoy@berkeley.edu,Graded,2025-03-05 21:17:20+00:00,0:00:00,2025-03-17 23:59:59,4 days,5 days 23:59:59,False
2,Proj 3: 2048,Abigail,Johnson,3040928660,abigail.johnson@berkeley.edu,Graded,2025-03-05 22:46:33+00:00,0:00:00,2025-03-17 23:59:59,4 days,5 days 23:59:59,False
3,Proj 3: 2048,Jack,Hulse,3040081294,johnhulse@berkeley.edu,Graded,2025-03-05 22:46:33+00:00,0:00:00,2025-03-17 23:59:59,5 days,5 days 23:59:59,True
4,Proj 3: 2048,Kelly,Chou,3038568093,kelly0919@berkeley.edu,Graded,2025-03-06 08:34:41+00:00,0:00:00,2025-03-17 23:59:59,5 days,5 days 23:59:59,True


## Create Missing Students DataFrame

In [32]:
 """
What will the code do:			
1. Iterate through every single row in this table			
2. Student SID has not submitted assignment, status = 'missing' from student_data_one_assignment AND today_date == due_date - notification_freq			
3. Append that student, row, assignment to the message_requests temporary dataframe			
4. Continue iterating through every single one			
"""

"\nWhat will the code do:\t\t\t\n1. Iterate through every single row in this table\t\t\t\n2. Student SID has not submitted assignment, status = 'missing' from student_data_one_assignment AND today_date == due_date - notification_freq\t\t\t\n3. Append that student, row, assignment to the message_requests temporary dataframe\t\t\t\n4. Continue iterating through every single one\t\t\t\n"

In [37]:
# Keep only the rows whose status is 'Missing' and whose is_equal flag is True
missing_students_df = project3_filtered_df.loc[
    project3_filtered_df['status'].eq('Missing') & project3_filtered_df['is_equal'].eq(True)
]
missing_students_df.head()

Unnamed: 0,assignment,first name,last name,sid,email,status,submission time,lateness (h:m:s),due date,notification_frequency,date_diff,is_equal
39,Proj 3: 2048,test4,test4,,test4@test.com,Missing,NaT,,2025-03-17 23:59:59,5 days,5 days 23:59:59,True
57,Proj 3: 2048,Zahra,Mokhtari,3040528611.0,zahra_mokhtari@berkeley.edu,Missing,NaT,,2025-03-17 23:59:59,5 days,5 days 23:59:59,True
61,Proj 3: 2048,Erin,ZHOU,3040947731.0,uerinzhou@berkeley.edu,Missing,NaT,,2025-03-17 23:59:59,5 days,5 days 23:59:59,True
63,Proj 3: 2048,Felipe,Ruiz-Tagle,3040705060.0,felipe.ruiztagle@berkeley.edu,Missing,NaT,,2025-03-17 23:59:59,5 days,5 days 23:59:59,True


## Create the Message Requests

In [38]:
# Build the reminder message for one row; it is applied later to fill the message_requests column
def create_message(row):
    """Build the reminder text for one student/assignment row.

    Args:
        row: A row (or any mapping) providing 'assignment', 'first name'
            and 'notification_frequency'.

    Returns:
        The reminder message as a string. When the notification frequency
        exposes an integer ``days`` attribute (as a Timedelta does), only
        the whole-day part is shown, e.g. "5 days" or "1 day".
    """
    assignment_name = row['assignment']
    first_name = row["first name"]
    notification_frequency = row['notification_frequency']

    # A Timedelta formats as e.g. "5 days 00:00:00", which reads badly in a
    # student-facing message. Show whole days instead, and fall back to the
    # value's own text for anything without an integer `.days`.
    whole_days = getattr(notification_frequency, 'days', None)
    if isinstance(whole_days, int):
        time_left = f"{whole_days} day" if whole_days == 1 else f"{whole_days} days"
    else:
        time_left = notification_frequency

    # Create the message
    message = f"Dear {first_name}, your {assignment_name} assignment is missing and it is due in {time_left}. Please submit it as soon as possible."

    return message

In [39]:
# Copy the missing-students frame and add a 'message_requests' column with each row's reminder text
message_requests_df = missing_students_df.assign(
    message_requests=lambda students: students.apply(create_message, axis=1)
)
message_requests_df.head()


Unnamed: 0,assignment,first name,last name,sid,email,status,submission time,lateness (h:m:s),due date,notification_frequency,date_diff,is_equal,message_requests
39,Proj 3: 2048,test4,test4,,test4@test.com,Missing,NaT,,2025-03-17 23:59:59,5 days,5 days 23:59:59,True,"Dear test4, your Proj 3: 2048 assignment is mi..."
57,Proj 3: 2048,Zahra,Mokhtari,3040528611.0,zahra_mokhtari@berkeley.edu,Missing,NaT,,2025-03-17 23:59:59,5 days,5 days 23:59:59,True,"Dear Zahra, your Proj 3: 2048 assignment is mi..."
61,Proj 3: 2048,Erin,ZHOU,3040947731.0,uerinzhou@berkeley.edu,Missing,NaT,,2025-03-17 23:59:59,5 days,5 days 23:59:59,True,"Dear Erin, your Proj 3: 2048 assignment is mis..."
63,Proj 3: 2048,Felipe,Ruiz-Tagle,3040705060.0,felipe.ruiztagle@berkeley.edu,Missing,NaT,,2025-03-17 23:59:59,5 days,5 days 23:59:59,True,"Dear Felipe, your Proj 3: 2048 assignment is m..."


In [41]:
# Keep only the columns needed for a message request: who to contact, for which assignment, and the text
message_requests_df_condensed = message_requests_df.loc[:, ['sid', 'email', 'assignment', 'message_requests']]
message_requests_df_condensed.head()

Unnamed: 0,sid,email,assignment,message_requests
39,,test4@test.com,Proj 3: 2048,"Dear test4, your Proj 3: 2048 assignment is mi..."
57,3040528611.0,zahra_mokhtari@berkeley.edu,Proj 3: 2048,"Dear Zahra, your Proj 3: 2048 assignment is mi..."
61,3040947731.0,uerinzhou@berkeley.edu,Proj 3: 2048,"Dear Erin, your Proj 3: 2048 assignment is mis..."
63,3040705060.0,felipe.ruiztagle@berkeley.edu,Proj 3: 2048,"Dear Felipe, your Proj 3: 2048 assignment is m..."


In [42]:
# Save the message requests to a CSV file named after the assignment.
# NOTE(review): the full message_requests_df is written, not message_requests_df_condensed — confirm which is intended.
# NOTE(review): the assignment name ('Proj 3: 2048') puts ':' and spaces in the file name; ':' is not allowed in
# Windows file names — confirm this name is acceptable wherever the notebook runs.
if message_requests_df_condensed.empty:
    # No student needs a reminder; `.iloc[0]` on an empty frame would raise IndexError.
    print("No message requests to save.")
else:
    csv_file_name = "message_requests_" + message_requests_df_condensed['assignment'].iloc[0] + ".csv"
    message_requests_df.to_csv(csv_file_name, index=False)

List of things to be done:
1. notification_frequency: we are generating a synthetic set of notification frequencies with `random`. Instead, we should ideally integrate each student's notification-frequency preferences (which come from GradeView).
  - Read from the notification_frequency.csv file for example, but make sure that the IDs are correct and similar to our test case of project 3
2. Integrate the temporary deadlines with the set of files
3. Right now, this is all done for a SINGLE project. The next job is to generalize it, which means iterating through the output folder, and for each project / lab in the output folder, work with that dataframe and the deadlines and the notification_frequency to get this going
4. Create an output of a set of message_request dataframes. This could be a mapping of the assignment name --> message request dataframe
  - Exploded dataframe