# Dataframe Processing

In [3]:
# Get the data from the sample_data folder
import os
import pandas as pd

In [4]:
files_in_current_directory = os.listdir('.')
print(files_in_current_directory)

['gradesync_sheets_to_df.ipynb', 'message_requests', 'message_requests.csv', '.DS_Store', 'requirements.txt', 'config', 'output', 'sample_data', '.gitignore', '.env', 'gradesync_to_df.py', 'df_processing.ipynb']


In [203]:
# Read the data from the sample_data folder and convert to a dataframe
def get_data(file_name):
    # Get the path to the sample_data folder
    sample_data_folder = 'sample_data/'
    file_path = os.path.join(sample_data_folder, file_name)

    # Read the data from the CSV file
    df = pd.read_csv(file_path)

    return df

## 1. Open the files and do pre-processing

In [None]:
def df_to_df(project3_df, input_file_name):
    # Sort project3_df to contain only the 'SID', 'Status', 'Submission Time', 'Lateness (H:M:S)' columns
    project3_filtered_df = project3_df[["b'First Name", 'Last Name', 'SID', 'Email', 'Status', 'Submission Time', 'Lateness (H:M:S)']]
    
    # Add a new column which labeles this data as 'Proj 3: 2048'
    project3_filtered_df = project3_filtered_df.copy()
    project3_filtered_df['Project'] = 'Proj 3: 2048'
    
    
    # Reorder the columns to have 'Project' as the first column
    project3_filtered_df = project3_filtered_df[['Project', "b'First Name", 'Last Name', 'SID', 'Email', 'Status', 'Submission Time', 'Lateness (H:M:S)']]
    
    # Get the data from deadlines.csv
    deadlines_df = get_data('deadlines.csv')
    
    # Get the data from notification_frequency.csv
    notification_frequency_df = get_data('notification_frequency.csv')
    
    # Iterate through the project3_filtered_df, and for each row, add on the due date based on joining the project3_filtered_df and the deadlines_df
    def get_due_date(row):
        # Get the project name from the row
        project_name = row['Project']
    
        # Get the due date from the deadlines_df
        due_date = deadlines_df.loc[deadlines_df['project'] == project_name, 'due'].values[0]
    
        return due_date
    
    project3_filtered_df['Due Date'] = project3_filtered_df.apply(get_due_date, axis=1)
    
    # Import datetime to compare the difference in times
    import datetime
    
    # Convert the 'Submission Time' and 'Due Date' columns to datetime objects
    #project3_filtered_df['Submission Time'] = pd.to_datetime(project3_filtered_df['Submission Time'], format='%Y-%m-%d %H:%M:%S')
    #project3_filtered_df['Due Date'] = pd.to_datetime(project3_filtered_df['Due Date'], format='%Y-%m-%d %H:%M:%S')
    
    project3_filtered_df['Submission Time'] = pd.to_datetime(project3_filtered_df['Submission Time'], errors='coerce', utc=True)
    project3_filtered_df['Due Date'] = pd.to_datetime(project3_filtered_df['Due Date'], errors='coerce')
    
    
    # Get the type of the 'SID' column in project3_filtered_df
    sid_type = project3_filtered_df['SID'].dtype
    
    notification_frequency_df.rename(columns={'sid': 'SID'}, inplace=True)
    
    # Convert the SID column in notification_frequency_df to an object
    notification_frequency_df = notification_frequency_df.astype({'SID': 'object'})
    
    # Get the type of the 'SID' column in project3_filtered_df
    sid_type = notification_frequency_df['SID'].dtype
    
    # Get the difference between SID in the two dataframes
    def get_sid_difference(df1, df2):
        # Get the SIDs from both dataframes
        df1_sids = set(df1['SID'])
        df2_sids = set(df2['SID'])
    
        # Get the difference between the two sets
        sid_difference = df1_sids - df2_sids
    
        return sid_difference
    
    sid_difference = get_sid_difference(project3_filtered_df, notification_frequency_df)
    
    ## 2. Merge the Dataframes
    
    # Merge the project3_filtered_df and notification_frequency_df on the 'SID' column
    merged_df = pd.merge(project3_filtered_df, notification_frequency_df, on='SID', how='left')
    
    # # Convert the notification frequency to timedelta object in notification_frequency_df
    
    # def convert_to_timedelta(row):
    #     # Get the notification frequency from the row
    #     notification_frequency = row['notification_frequency']
    
    #     # Check if the notification_frequency is already a timedelta object
    #     if isinstance(notification_frequency, pd.Timedelta):
    #         return notification_frequency
    
    #     # Extract the number and unit
    #     num, unit = notification_frequency.split()
    
    #     # Convert to timedelta
    #     if unit.startswith('day'):
    #         delta = datetime.timedelta(days=int(num))
    #     elif unit.startswith('hour'):
    #         delta = datetime.timedelta(hours=int(num))
    #     elif unit.startswith('minute'):
    #         delta = datetime.timedelta(minutes=int(num))
    #     else:
    #         raise ValueError("Unsupported time unit.")
    #     return delta 
        
    # merged_df['notification_frequency'] = merged_df.apply(convert_to_timedelta, axis=1)
    # merged_df
    
    # Make the column notification_frequency in merged_df a timedelta object where it is random number from 3-7 days in length
    
    import random
    
    def random_timedelta():
        # Generate a random number of days between 3 and 7
        num_days = random.randint(3, 7)
        return pd.Timedelta(days=num_days)
    merged_df['notification_frequency'] = merged_df['notification_frequency'].apply(lambda x: random_timedelta())
    
    
    # Get the current date, for example let's say today's date is 3-17-25 minus 3 days which is 3-14-25.
    # This will help with the example above
    
    today_date = datetime.datetime(2025, 3, 12, 0, 0, 0)
    
    # Get the difference between the due date and the current date
    def get_date_difference(row):
        # Get the due date from the row
        due_date = row['Due Date']
    
        # Get the difference between the due date and the current date
        date_difference = due_date - today_date
    
        return date_difference
    
    merged_df['Date Difference'] = merged_df.apply(get_date_difference, axis=1)
    
    # Get the type of the notification_frequency column in merged_df
    notification_frequency_type = merged_df['notification_frequency'].dtype
    
    # Get the type of the Date Difference column in merged_df
    date_difference_type = merged_df['Date Difference'].dtype
    
    merged_df['is_equal'] = merged_df['notification_frequency'].dt.days == merged_df['Date Difference'].dt.days
    
    
    ## 3. Create missing_students dataframe
    
    """
    What will the code do:			
    1. Iterate through every single row in this table			
    2. Student SID has not submitted assignment, status = 'missing' from student_data_one_assignment AND today_date == due_date - notification_freq			
    3. Append that student, row, assignment to the message_requests temporary dataframe			
    4. Continue iterating through every single one			
    """
    
    # See which rows where row['Status'] == 'missing' and row['is_equal'] == True:
    missing_students = merged_df[(merged_df['Status'] == 'Missing') & (merged_df['is_equal'] == True)]
    
    
    ## Generate the Message Requests
    
    # Now create a message_requests column in the merged_df dataframe with the f_string message
    def create_message(row):
        # Get the SID from the row
        sid = row['SID']
    
        # Get the project name from the row
        project_name = row['Project']
    
        notification_frequency = row['notification_frequency']
    
        first_name = row["b'First Name"]
        last_name = row['Last Name']
    
        # Create the message
        message = f"Dear {first_name}, your {project_name} assignment is missing and it is due in {notification_frequency}. Please submit it as soon as possible."
    
        return message
    
    # Create the message_requests column in the merged_df dataframe
    missing_students = missing_students.copy()
    missing_students['message_requests'] = missing_students.apply(create_message, axis=1)
    
    # Get the list of message requests
    message_requests = missing_students['message_requests'].tolist()
    
    # This is the output which should go into the message_requests.csv file
    
    # Create a new dataframe with the message requests
    message_requests_df = missing_students[['SID', 'Email', 'message_requests']].copy()
    
    # Save the message requests to a CSV file
    csv_file_name = "message_requests_" + input_file_name
    message_requests_df.to_csv(csv_file_name, index=False)

In [205]:
deadlines_df = pd.read_csv('sample_data/deadlines.csv')
project_list = deadlines_df["project"].iloc[1:].str.replace(":", "", regex=False).str.replace(" ", "_").tolist()
project_list

['Proj_1_Worldle',
 'Proj_2_Spelling_Bee',
 'Proj_3_2048',
 'Proj_4_Tech_in_Context',
 'Proj_6_Final_Project_Proposals']

In [208]:
files_list = []
for i in project_list:
    string = "CS10_Sp25_GradeScope Grades - " + i + ".csv"
    files_list.append(string)

projectans_df = get_data('CS10_Sp25_GradeScope Grades - Proj_3_2048.csv')
files_list

FileNotFoundError: [Errno 2] No such file or directory: 'sample_data/CS10_Sp25_GradeScope Grades - Proj_3_2048.csv'

In [207]:
for j in files_list:
    projectans_df = get_data(j)
    df_to_df(projectans_df, j)

FileNotFoundError: [Errno 2] No such file or directory: 'sample_data/CS10_Sp25_GradeScope Grades - Proj_1_Worldle.csv'