## Procrastination analysis

This notebook calculates procrastination and regularity indices for enrollment activity, and identifies procrastinator student-semester pairs.

In [None]:
import pandas as pd
from datetime import datetime
from dateutil.relativedelta import relativedelta
import time
from datetime import datetime, timedelta

# Load the enrollment transaction data.
# NOTE(review): TRANSACTION_DATA is not defined in any visible cell — it must be set before this cell runs.
df = pd.read_csv(TRANSACTION_DATA)

# Lookup from the numeric semester/term code to a readable "<year> <term>" label.
semester_dict = {
    2128: '2012 Fall',
    2132: '2013 Spring',
    2135: '2013 Summer',
    2138: '2013 Fall',
    2142: '2014 Spring',
    2145: '2014 Summer',
    2148: '2014 Fall',
    2152: '2015 Spring',
    2155: '2015 Summer',
    2158: '2015 Fall',
    2162: '2016 Spring',
    2165: '2016 Summer',
    2168: '2016 Fall',
    2172: '2017 Spring',
    2175: '2017 Summer',
    2178: '2017 Fall',
    2182: '2018 Spring',
    2185: '2018 Summer',
    2188: '2018 Fall',
    2192: '2019 Spring',
    2195: '2019 Summer',
    2198: '2019 Fall',
    2202: '2020 Spring',
    2205: '2020 Summer',
    2208: '2020 Fall',
    2212: '2021 Spring',
    2215: '2021 Summer',
    2218: '2021 Fall',
    2222: '2022 Spring',
    2225: '2022 Summer',
    2228: '2022 Fall',
}

# Term codes missing from the lookup map to NaN in 'semester_clean'.
df['semester_clean'] = df['semester_year_term_cd'].map(semester_dict)
# Course label = subject description + ' ' + course number, both rendered as strings.
df['course_clean'] = df['subject_desc'].map(str) + ' ' + df['course_number'].map(str)

## filter data to only contain actions that affect student enrollment status & are initiated by students

affects_enrollment = df['action_affects_enrollment_status_flag'] == 'Y'
student_initiated = df['enrollment_intitiator_type'] == 'Student'
df_filtered = df[affects_enrollment & student_initiated]


In [None]:
## load add drop calendar
# Path is relative to the notebook's working directory.
ADD_DROP_CALENDAR_FILE = 'add drop calendar fa16-fa22 (1) (1).xlsx'
add_drop_cal = pd.read_excel(ADD_DROP_CALENDAR_FILE)


### split transaction data into enrollment phases 

The transactions are split into four phases: phase 1, phase 2, the add-drop period, and late add-drop.

In [None]:
# NOTE(review): `df_filtered_w_ddl` is not created in any visible cell. Presumably it is
# `df_filtered` joined with the add/drop calendar (`add_drop_cal`) — confirm and add that step.
request_tmsp = df_filtered_w_ddl['enrollment_request_tmsp']
add_drop_deadline = df_filtered_w_ddl['Deadline to add/drop or swap/change units for classes – undergrad']

# Series.between is inclusive on both ends: start <= request <= end, compared row by row.
during_phase_1_enroll = df_filtered_w_ddl[
    request_tmsp.between(df_filtered_w_ddl['phase 1 starts'], df_filtered_w_ddl['phase 1 ends'])
]
during_phase_2_enroll = df_filtered_w_ddl[
    request_tmsp.between(df_filtered_w_ddl['phase 2 starts'], df_filtered_w_ddl['phase 2 ends'])
]
add_drop = df_filtered_w_ddl[
    request_tmsp.between(df_filtered_w_ddl['adjustment starts'], add_drop_deadline)
]
# NOTE(review): a request made exactly at the deadline satisfies both this filter and the
# add_drop filter above, so it appears in both frames — confirm whether that is intended.
late_drop = df_filtered_w_ddl[request_tmsp >= add_drop_deadline]

In [None]:
def get_std_of_diff(tmsp_series):
    """Return the standard deviation of the gaps between consecutive values.

    The input is wrapped in a pandas Series, consecutive differences are taken
    (the first difference is always missing), and the sample standard deviation
    of those differences is returned. With fewer than three values there are
    fewer than two gaps, so the result is missing (NaN/NaT).
    """
    gaps = pd.Series(tmsp_series).diff()
    return gaps.std()

In [None]:
import numpy as np
def get_enroll_action_range_std_result_df(df, phase_start_col, phase_end_col, phase_name):
    '''
    Get the normalized range and regularity of enrollment activity per
    (semester, student) within one enrollment phase.

    For every ('semester_clean', 'student_id') group the function computes the
    first and last 'enrollment_request_tmsp', the standard deviation of the
    request timestamps, and the standard deviation of the gaps between
    consecutive requests (via get_std_of_diff). Each measure is also divided by
    the phase length (phase_end_col - phase_start_col) of the semester.

    The input frame is NOT modified.

    inputs:
        - df = transaction data of a specific enrollment phase; must contain
          'semester_clean', 'student_id', 'enrollment_request_tmsp' and the two
          phase boundary columns
          (assumes the timestamp and boundary columns are datetime-like so that
          differences and ratios are well defined — TODO confirm)
        - phase_start_col (str) = the column that indicates the date for the beginning of the phase
        - phase_end_col (str) = the column that indicates the date for the end of the phase
        - phase_name (str) = suffix appended to every output column name
    outputs:
        - result dataframe with the columns:
            'semester_clean', 'student_id', 'days_range_'+phase_name, 'std_'+phase_name,
            'diff_std_'+phase_name, 'std_normalized_'+phase_name,
            'diff_std_normalized_'+phase_name, 'range_normalized_'+phase_name

    notes:
        - the phase length is merged on 'semester_clean' only; if one semester
          carries more than one distinct phase length in df, each student row of
          that semester is repeated once per distinct length.
        - groups with a single request have a missing std; groups with fewer than
          three requests have a missing diff_std.
    '''
    tmsp_col = 'enrollment_request_tmsp'
    std_col = 'std_' + phase_name
    diff_std_col = 'diff_std_' + phase_name
    days_range_col = 'days_range_' + phase_name

    # The string 'std' selects pandas' own groupby std (sample std, ddof=1). Passing
    # np.std is currently remapped to the same method, but that remapping is deprecated,
    # so the string keeps the result stable across pandas versions.
    enroll_range_std_df = (
        df.sort_values(tmsp_col, ascending=True)
        .groupby(['semester_clean', 'student_id'])
        .agg({tmsp_col: ['first', 'last', 'std', get_std_of_diff]})
        .reset_index()
    )
    # Flatten the two-level header by plain concatenation, e.g.
    # ('enrollment_request_tmsp', 'first') -> 'enrollment_request_tmspfirst'.
    enroll_range_std_df.columns = enroll_range_std_df.columns.map(''.join)

    # Phase length per semester, built on a projection of df so that no column is
    # written into the caller's frame (which is typically a filtered slice).
    semester_phase_len = (
        df[['semester_clean']]
        .assign(length_phase=df[phase_end_col] - df[phase_start_col])
        .drop_duplicates()
    )

    enroll_range_std_df = enroll_range_std_df.merge(semester_phase_len, on='semester_clean')

    enroll_range_std_df = enroll_range_std_df.rename(columns={
        tmsp_col + 'std': std_col,
        tmsp_col + 'get_std_of_diff': diff_std_col,
    })

    # Span between the first and last request of the group.
    enroll_range_std_df[days_range_col] = (
        enroll_range_std_df[tmsp_col + 'last'] - enroll_range_std_df[tmsp_col + 'first']
    )

    # Normalize every measure by the length of the phase in that semester.
    length_phase = enroll_range_std_df['length_phase']
    enroll_range_std_df['std_normalized_' + phase_name] = enroll_range_std_df[std_col] / length_phase
    enroll_range_std_df['diff_std_normalized_' + phase_name] = enroll_range_std_df[diff_std_col] / length_phase
    enroll_range_std_df['range_normalized_' + phase_name] = enroll_range_std_df[days_range_col] / length_phase

    return enroll_range_std_df[[
        'semester_clean',
        'student_id',
        days_range_col,
        std_col,
        diff_std_col,
        'std_normalized_' + phase_name,
        'diff_std_normalized_' + phase_name,
        'range_normalized_' + phase_name,
    ]]
     

In [None]:
# get_enroll_action_range_std_result_df takes exactly four arguments
# (df, phase_start_col, phase_end_col, phase_name); a fifth positional argument raises TypeError.
# NOTE(review): the phase length here runs to 'phase 2 starts', while the phase 1 rows were
# selected up to 'phase 1 ends' — confirm this normalization window is intended.
enroll_action_range_std_result_df_p1 = get_enroll_action_range_std_result_df(during_phase_1_enroll, 'phase 1 starts', 'phase 2 starts', 'p1')

In [None]:
# Phase 2 metrics. get_enroll_action_range_std_result_df takes exactly four arguments
# (df, phase_start_col, phase_end_col, phase_name); a fifth positional argument raises TypeError.
enroll_action_range_std_result_df_p2 = get_enroll_action_range_std_result_df(during_phase_2_enroll, 'phase 2 starts', 'phase 2 ends', 'phase_2')


In [None]:
# Add/drop period metrics. get_enroll_action_range_std_result_df takes exactly four arguments
# (df, phase_start_col, phase_end_col, phase_name); a fifth positional argument raises TypeError.
enroll_action_range_std_result_df_add_drop = get_enroll_action_range_std_result_df(add_drop, 'adjustment starts', 'Deadline to add/drop or swap/change units for classes – undergrad', 'add_drop')


### get procrastinator student-semester pairs

In [None]:
# The phase 1 metrics are the frame from which procrastinators are identified below.
# This is an alias, not a copy: both names refer to the same DataFrame.
df_modal = enroll_action_range_std_result_df_p1

In [None]:
# Names of all columns in df_modal whose label matches the regex 'relative_location'.
# NOTE(review): the frame returned by get_enroll_action_range_std_result_df, as defined in this
# notebook, has no column with that text in its name, so this selection would be empty —
# confirm which step is expected to add the relative-location columns.
relative_loc_cols = df_modal.filter(regex='relative_location').columns

#### We define a procrastinator as someone who enrolls/waitlists later than the median time during the enrollment phase

In [None]:
# One frame per relative-location column: the (semester, student) pairs whose value in
# that column is strictly above the column's median over all rows of df_modal.
procastinator_student_sems = [
    df_modal.loc[
        df_modal[rel_loc] > df_modal[rel_loc].median(),
        ['semester_clean', 'student_id'],
    ]
    for rel_loc in relative_loc_cols
]

In [None]:
# Stack the per-column selections and keep each (semester, student) pair only once.
# NOTE(review): pd.concat raises ValueError when the list is empty (no relative-location columns).
# NOTE(review): this rebinds the name from a list of frames to a single DataFrame, so running
# this cell a second time without re-running the previous one fails.
procastinator_student_sems = pd.concat(procastinator_student_sems).drop_duplicates()

In [None]:
# Save the procrastinator pairs to the working directory. to_csv writes the DataFrame index
# as an unnamed first column by default, so the file has three columns.
procastinator_student_sems.to_csv('procrast_student_sems.csv')

The student-semester pairs can then be used to plot the basket size/workload of procrastinators vs. non-procrastinators.