In [None]:
import pandas as pd
from tqdm import tqdm

In [None]:
# Root directory of the input CSVs. The trailing slash is required because
# the code below builds file paths with `path + <file name>`.
path = '/data/groups/CTd/'

## Read in data


In [None]:
def process_file(f, suffix, base_dir=None):
    """Load one feature CSV and optionally tag its feature columns.

    Parameters
    ----------
    f : str
        File name of the CSV to read.
    suffix : str
        Text appended to every column name except the merge keys
        ``semester_clean`` and ``student_id``. An empty string leaves all
        column names unchanged.
    base_dir : str, optional
        Directory that contains ``f``. When omitted, the notebook-level
        ``path`` prefix is used exactly as before (``path + f``).

    Returns
    -------
    pandas.DataFrame
        The file contents without an ``Unnamed: 0`` column (if the file had
        one) and with the suffix applied to the non-key columns.
    """
    import os  # local import: keeps this cell runnable without touching the import cell

    key_columns = ('semester_clean', 'student_id')

    if base_dir is None:
        csv_path = path + f
    else:
        csv_path = os.path.join(base_dir, f)

    frame = pd.read_csv(csv_path, low_memory=False)

    # 'Unnamed: 0' is presumably a row index saved by an earlier to_csv call;
    # errors='ignore' makes this a no-op for files that do not have it.
    frame = frame.drop(columns=['Unnamed: 0'], errors='ignore')

    if suffix != '':
        # The merge keys must keep their names so the later merge can join on them.
        frame = frame.rename(
            columns=lambda col: col if col in key_columns else col + suffix
        )
    return frame

In [None]:
# (file name, column suffix) pairs passed to process_file. Every suffix is
# the empty string, so the column names stored in the files are used as-is.
fs = [
    (file_name, '')
    for file_name in (
        'procrastinator_modal_activity_range_std_D_all_phases.csv',
        'procrastinator_modal_activity_range_std_W_all_phases.csv',
        'procrastinator_modal_activity_range_std_E_all_phases.csv',
        'procrastinator_modal_activity_range_std_all_actions_all_phases.csv',
        'procrastinator_modal_activity_range_std_all_actions.csv',
        'procrastinator_modal_activity_range_std_D.csv',
        'procrastinator_modal_activity_range_std_W.csv',
        'procrastinator_modal_activity_range_std_E.csv',
    )
]

In [None]:
# Start from the dropped-courses/units table (missing values filled with 0),
# then left-join every feature file on the student/semester keys.
df_outcomes = pd.read_csv(path + 'student_semester_dropped_courses_units.csv').fillna(0)

# Wrap the list itself: tqdm(enumerate(...)) hides the length from tqdm, so
# the progress bar would show neither a total nor an ETA.
for f, suff in tqdm(fs):
    tmp = process_file(f, suff)
    df_outcomes = df_outcomes.merge(tmp, how='left', on=['student_id', 'semester_clean'])

In [None]:
# Columns carried into the modelling table, grouped by role.
columns = (
    # Keys identifying one student-semester row.
    ['semester_clean', 'student_id']
    # relative_location_* features.
    + [
        'relative_location_btw_phase1_add_drop_ddl_D_all_phases',
        'relative_location_btw_phase1_add_drop_ddl_E_all_phases',
        'relative_location_btw_phase1_add_drop_ddl_all_actions_all_phases',
    ]
    # diff_std_normalized_* features.
    + [
        'diff_std_normalized_btw_phase1_add_drop_ddl_E_all_phases',
        'diff_std_normalized_btw_phase1_add_drop_ddl_D_all_phases',
        'diff_std_normalized_btw_phase1_add_drop_ddl_all_actions_all_phases',
    ]
    # Dropped-unit totals.
    + ['total_dropped_units', 'total_late_dropped_units']
)

In [None]:
# Keep only the columns listed in `columns`; .copy() gives an independent
# frame, so the column assignments in later cells do not touch df_outcomes.
df_outcomes_cleaned = df_outcomes.loc[:, columns].copy()

In [None]:
# Inspection only: show the key columns plus every relative_location /
# diff_std_normalized feature that was kept. Nothing is assigned.
[
    col
    for col in df_outcomes_cleaned.columns
    if col in ('semester_clean', 'student_id')
    or any(tag in col for tag in ('relative_location', 'diff_std_normalized'))
]

## Control variables for regression modeling and preparing export

In [None]:
# Missing-value indicators: 1 where the standardized difference is NaN for
# that row, 0 where a value is present.
# This cell must run BEFORE the next cell replaces those NaNs with 0;
# re-running it afterwards would set every flag to 0.
df_outcomes_cleaned['phase1_add_drop_noE_SD'] = (
    df_outcomes_cleaned['diff_std_normalized_btw_phase1_add_drop_ddl_E_all_phases']
    .isna()
    .astype('int64')
)

df_outcomes_cleaned['phase1_add_drop_noD_SD'] = (
    df_outcomes_cleaned['diff_std_normalized_btw_phase1_add_drop_ddl_D_all_phases']
    .isna()
    .astype('int64')
)

In [None]:
# Replace missing standardized differences with 0 so these rows survive the
# later dropna(). Which rows were missing is recorded in the
# phase1_add_drop_noE_SD / phase1_add_drop_noD_SD flags created in the
# previous cell, which therefore has to run first.
df_outcomes_cleaned['diff_std_normalized_btw_phase1_add_drop_ddl_E_all_phases'] = (
    df_outcomes_cleaned['diff_std_normalized_btw_phase1_add_drop_ddl_E_all_phases']
    .fillna(0)
)

df_outcomes_cleaned['diff_std_normalized_btw_phase1_add_drop_ddl_D_all_phases'] = (
    df_outcomes_cleaned['diff_std_normalized_btw_phase1_add_drop_ddl_D_all_phases']
    .fillna(0)
)

In [None]:
# Keep complete rows only: a row is dropped as soon as any of its columns
# still holds a missing value.
df_model = df_outcomes_cleaned.dropna(axis=0, how='any')

## Export for Further Processing

In [None]:
# Write the modelling table to the current working directory (relative
# path), leaving out the row index.
df_model.to_csv(path_or_buf='df_model-procrastination-v1.csv', index=False)

## Longitudinal Workload Aggregation

In [None]:
import seaborn as sns

import glob

import numpy as np
import pandas as pd
# Import the callable, not the module. A bare `import tqdm` rebinds the
# notebook-wide name `tqdm` (imported as a function in the first cell) to the
# module, so re-running the merge loop earlier in the notebook would fail with
# "'module' object is not callable".
from tqdm import tqdm

# Sorted so the concatenation order does not depend on the file system.
files = sorted(glob.glob("/data/groups/CTd/daily_basket_cla_ch/*student_daily_basket_cla_ch.csv"))
print(files)

if not files:
    # Fail with a clear message instead of pd.concat's "No objects to concatenate".
    raise FileNotFoundError(
        "No *student_daily_basket_cla_ch.csv files found in /data/groups/CTd/daily_basket_cla_ch/"
    )

dfs = [pd.read_csv(f) for f in tqdm(files)]

df_concat = pd.concat(dfs)

# df_concat = df_concat[['student_id', 'semester_clean','day_num', 'tl', 'me', 'ps', 'cl', 'ch']]
# Zeros are treated as missing values.
# NOTE(review): this replaces 0 in EVERY column (including ids and day_num),
# not only in the load columns; confirm that this is intended.
df_concat = df_concat.replace(0, np.nan)

# Export time series of load per student
# z-score 'cl' and 'ch' over all rows (NaNs skipped); 'cla_disc' is the
# difference between the two standardized values.
df_concat['cl_standardized'] = (df_concat['cl'] - df_concat['cl'].mean(skipna=True)) / df_concat['cl'].std(skipna=True)
df_concat['ch_standardized'] = (df_concat['ch'] - df_concat['ch'].mean(skipna=True)) / df_concat['ch'].std(skipna=True)
df_concat['cla_disc'] = df_concat['cl_standardized'] - df_concat['ch_standardized']

# The largest day_num per semester is used as the deadline day. Per the
# original author's note this equals the late add/drop deadline
# ('change to grading option').
d_sem_lastday = df_concat.groupby('semester_clean').day_num.max().to_dict()

df_concat['is_deadline_day'] = df_concat['semester_clean'].map(d_sem_lastday) == df_concat['day_num']

# One row per student-semester, taken from the deadline day only. Rows with
# any missing value are dropped first. The original author reports having
# checked that each group holds a single distinct value (nunique() == 1), so
# mean() only collapses duplicates.
df_workloads_export = (
    df_concat
    [df_concat['is_deadline_day']]
    [['student_id', 'semester_clean', 'cl', 'ch', 'cla_disc']]
    .dropna()
    .groupby(['student_id', 'semester_clean'])
    [['cl', 'ch', 'cla_disc']]
    .mean()
    .reset_index()
)

df_workloads_export.to_csv('/data/groups/CTd/longitudinal-workloads-jedm-v3.csv', index=False)