In [None]:
import pandas as pd
import numpy as np

In [None]:
from tqdm import tqdm

In [None]:
# Main modelling table, read from the working directory. Later cells select
# student_id, semester_clean, the enrollment-timing column and
# total_late_dropped_units from it.
df = pd.read_csv('df_model-procrastination-cb-v1.csv')

In [None]:
# Workload table; it is left-joined onto `df` on (student_id, semester_clean)
# in the next cell.
df_workloads = pd.read_csv('/data/groups/CTd/longitudinal-workloads-jedm-v2.csv')

In [None]:
# Build the analysis frame exported at the end of the notebook: keep the two
# key columns plus the two outcome columns (renamed to shorter names), attach
# the workload table on (student_id, semester_clean), then drop every row that
# has a missing value in ANY column after the left join.
df_analysis = df[['student_id', 'semester_clean', 
    'relative_location_btw_phase1_add_drop_ddl_E_all_phases',
   'total_late_dropped_units']].rename(columns={
    'relative_location_btw_phase1_add_drop_ddl_E_all_phases': 'enrollment_procrastination',
    'total_late_dropped_units': 'late_dropped_units'
}).merge(df_workloads, how='left', on=['student_id', 'semester_clean']).dropna()

## Which courses are late-dropped out of the basket?

In [None]:
# One predicted-course-load file per semester (there is no file for 2021 Fall
# in this list).
fa_17_course_load = pd.read_csv('/data/groups/CTd/cla-predictions/predicted-course-loads-2017 Fall.csv')
sp_17_course_load = pd.read_csv('/data/groups/CTd/cla-predictions/predicted-course-loads-2017 Spring.csv')
fa_18_course_load = pd.read_csv('/data/groups/CTd/cla-predictions/predicted-course-loads-2018 Fall.csv')
sp_18_course_load = pd.read_csv('/data/groups/CTd/cla-predictions/predicted-course-loads-2018 Spring.csv')
fa_19_course_load = pd.read_csv('/data/groups/CTd/cla-predictions/predicted-course-loads-2019 Fall.csv')
sp_19_course_load = pd.read_csv('/data/groups/CTd/cla-predictions/predicted-course-loads-2019 Spring.csv')
fa_20_course_load = pd.read_csv('/data/groups/CTd/cla-predictions/predicted-course-loads-2020 Fall.csv')
sp_20_course_load = pd.read_csv('/data/groups/CTd/cla-predictions/predicted-course-loads-2020 Spring.csv')
sp_21_course_load = pd.read_csv('/data/groups/CTd/cla-predictions/predicted-course-loads-2021 Spring.csv')

# Key each frame by its semester label; pd.concat turns the dict keys into the
# outer index level, which reset_index() exposes as the column 'level_0'.
course_load_df_dict={'2017 Fall': fa_17_course_load,
                    '2017 Spring': sp_17_course_load,
                     '2018 Fall': fa_18_course_load,
                     '2018 Spring': sp_18_course_load,
                    '2019 Fall': fa_19_course_load,
                     '2019 Spring': sp_19_course_load,
                     '2020 Fall': fa_20_course_load,
                     '2020 Spring': sp_20_course_load,
                     '2021 Spring': sp_21_course_load,
                    }

course_load_concat = pd.concat(course_load_df_dict).reset_index()
# Keep the semester label, the course identifier and the four load columns,
# then rename them positionally (the order below must match the selection).
course_load_concat = course_load_concat[['level_0','course_name_number', 'tl1', 'me', 'ps',
       'cl_combined']]
course_load_concat.columns=['semester_clean', 'course_clean','time_load','mental_effort','psych_stress', 'cl_combined']

In [None]:
# Nested lookup table: cla_dict[semester][course] -> combined course-load value.
# If a (semester, course) pair occurs on several rows, the last row wins.
cla_dict = {}
for semester, course, cl_combined in zip(
    course_load_concat["semester_clean"],
    course_load_concat["course_clean"],
    course_load_concat["cl_combined"],
):
    cla_dict.setdefault(semester, {})[course] = cl_combined

In [None]:
# Add overall: average each course's combined load across every semester in
# which it has a prediction.
#
# The values are grouped in a single pass over cla_dict instead of re-scanning
# all semesters once per course. The 'overall' pseudo-semester (added to
# cla_dict by the next cell) is skipped so that re-running this cell does not
# average the previous averages back into the result.
values_by_course = {}
for semester, d_cla in cla_dict.items():
    if semester == 'overall':
        continue
    for course, cla in d_cla.items():
        values_by_course.setdefault(course, []).append(cla)

# `all_courses` is kept as the set of course names, as before; note that a
# later cell rebinds this name to a DataFrame.
all_courses = set(values_by_course)
cla_overall = {course: np.mean(values) for course, values in values_by_course.items()}

In [None]:
# Store the cross-semester averages under the pseudo-semester key 'overall';
# the lookup loops further down use it when a row's semester is not a key of
# cla_dict.
cla_dict['overall'] = cla_overall

In [None]:
# NOTE: this rebinds `all_courses`, which held the set of course names in the
# 'Add overall' cell. Unpickling can execute arbitrary code, so only load
# files from a trusted source.
all_courses = pd.read_pickle('/data/groups/CTd/add_drop_courses_by_student_semester.pkl')

In [None]:
# Late-dropped courses, merged with `all_courses` on (student_id,
# semester_clean) in the next cell. Unpickling can execute arbitrary code, so
# only load files from a trusted source.
dropped_courses = pd.read_pickle('/data/groups/CTd/late_dropped_courses_by_student_semester.pkl')

In [None]:
# Outer join: keep every (student_id, semester_clean) present in either table.
# The shared 'course_clean' column comes out as course_clean_x / course_clean_y
# (renamed in the next cell); the side without a match is filled with NaN.
df_courses = all_courses.merge(dropped_courses, how='outer', on=['student_id', 'semester_clean'])

In [None]:
# Give the two merge-suffixed course columns descriptive names.
df_courses = df_courses.rename(
    columns={
        'course_clean_x': 'all_courses',
        'course_clean_y': 'dropped_courses',
    }
)

In [None]:
def get_retained_courses(row):
    """Return the courses in ``row['all_courses']`` that were not late-dropped.

    Parameters
    ----------
    row : mapping
        Must provide 'all_courses' and 'dropped_courses'.

    Returns
    -------
    list or the original 'all_courses' value
        If either field is not a list (e.g. NaN from the outer merge), the
        'all_courses' value is returned unchanged. Otherwise a de-duplicated
        list of the courses that do not appear in 'dropped_courses'.

    The result keeps the first-seen order of 'all_courses'. A plain set
    difference would return the courses in hash order, which varies between
    interpreter runs for strings.
    """
    enrolled = row['all_courses']
    dropped = row['dropped_courses']
    if not isinstance(dropped, list) or not isinstance(enrolled, list):
        return enrolled
    dropped_lookup = set(dropped)
    # dict.fromkeys de-duplicates while preserving insertion order.
    return [course for course in dict.fromkeys(enrolled) if course not in dropped_lookup]
# Row-wise: retained = all courses minus late-dropped courses.
df_courses['retained_courses'] = df_courses.apply(get_retained_courses, axis=1)

In [None]:
def _lookup_course_loads(sem, courses):
    """Look up the predicted load of every course in `courses` for `sem`.

    Returns ['NONE'] when `courses` is not a list. Otherwise returns one entry
    per course, taken from cla_dict[sem] when `sem` is a key of cla_dict and
    from cla_dict['overall'] when it is not. A course missing from the chosen
    table yields None (there is no per-course fallback to 'overall').
    """
    if not isinstance(courses, list):
        return ['NONE']
    loads = cla_dict.get(sem) if sem in cla_dict else cla_dict.get('overall')
    return [loads.get(course) for course in courses]


# Per-row lists of predicted loads, aligned with 'all_courses' and
# 'dropped_courses' respectively.
all_courses_cla, dropped_courses_cla = [], []
for sem, enrolled, dropped in tqdm(zip(df_courses['semester_clean'],
                                       df_courses['all_courses'],
                                       df_courses['dropped_courses'])):
    all_courses_cla.append(_lookup_course_loads(sem, enrolled))
    dropped_courses_cla.append(_lookup_course_loads(sem, dropped))
df_courses['all_courses_cla'] = all_courses_cla
df_courses['dropped_courses_cla'] = dropped_courses_cla

In [None]:
# Per-row list of predicted loads aligned with 'retained_courses'.
# Rows whose 'retained_courses' is not a list get the placeholder ['NONE'].
# The semester-specific table is used when the semester is a key of cla_dict,
# otherwise the cross-semester 'overall' table.
retained_courses_cla = []
# total= lets tqdm show a real progress bar; iterrows() has no length.
for _, row in tqdm(df_courses.iterrows(), total=len(df_courses)):
    sem = row['semester_clean']
    row_retained_courses_cla = []
    if not isinstance(row['retained_courses'], list):
        retained_courses_cla.append(['NONE'])
    else:
        for course in row['retained_courses']:
            if sem in cla_dict:
                row_retained_courses_cla.append(cla_dict.get(sem).get(course))
            else:
                row_retained_courses_cla.append(cla_dict.get('overall').get(course))
        retained_courses_cla.append(row_retained_courses_cla)
df_courses['retained_courses_cla'] = retained_courses_cla

In [None]:
# Keep only the rows that carry a list of late-dropped courses. The copy means
# the columns added below do not write into df_courses.
has_drop_list = df_courses['dropped_courses'].map(lambda courses: isinstance(courses, list))
df_courses_latedrop = df_courses[has_drop_list].copy()

In [None]:
def remove_none_and_cast_to_float(lst):
    """Drop None and the 'NONE' placeholder from `lst`; cast the rest to float.

    NOTE: a later cell ('Workload Ranking analysis') redefines this name with a
    variant that imputes 2.5 instead of dropping, so which version a cell uses
    depends on execution order.
    """
    kept = []
    for entry in lst:
        if entry is not None and entry != 'NONE':
            kept.append(float(entry))
    return kept

In [None]:
# Mean predicted load per row for the full, dropped and retained baskets.
# np.mean of an empty list yields NaN (e.g. when no course has a prediction).
_mean_targets = {
    'cla_all_m': 'all_courses_cla',
    'cla_dropped_m': 'dropped_courses_cla',
    'cla_retained_m': 'retained_courses_cla',
}
for _target, _source in _mean_targets.items():
    df_courses_latedrop[_target] = df_courses_latedrop[_source].map(
        lambda loads: np.mean(remove_none_and_cast_to_float(loads))
    )

In [None]:
# Summed predicted load per row for the full, dropped and retained baskets.
# np.sum of an empty list is 0.0.
_sum_targets = {
    'cla_all_s': 'all_courses_cla',
    'cla_dropped_s': 'dropped_courses_cla',
    'cla_retained_s': 'retained_courses_cla',
}
for _target, _source in _sum_targets.items():
    df_courses_latedrop[_target] = df_courses_latedrop[_source].map(
        lambda loads: np.sum(remove_none_and_cast_to_float(loads))
    )

In [None]:
# Student-semesters used as the procrastination group in the merge below. The
# 'Unnamed: 0' column is dropped (presumably a saved index column; confirm).
procrast_E = pd.read_csv('/data/groups/CTd/E_procrast_student_sems.csv').drop(columns=['Unnamed: 0'])

In [None]:
# Every row of this table gets flag 1; after the left merge below, unmatched
# rows have NaN here, which is then converted to 0.
procrast_E['enrollment_procrast'] = 1

In [None]:
# Left join keeps every late-drop row; rows with no match in procrast_E get
# NaN in 'enrollment_procrast'.
# NOTE(review): if procrast_E has duplicate (semester_clean, student_id) keys,
# this merge duplicates rows. Confirm the keys are unique.
df_courses_latedrop_procrast = df_courses_latedrop.merge(procrast_E, how = 'left', on = ['semester_clean', 'student_id'])

In [None]:
# Turn the merge result into a 0/1 indicator: 1 where a flag was merged in,
# 0 where the left join left NaN.
df_courses_latedrop_procrast['enrollment_procrast'] = (
    df_courses_latedrop_procrast['enrollment_procrast'].notna().astype('int64')
)

In [None]:
# Group means of the per-row load summaries, by procrastination flag.
(
    df_courses_latedrop_procrast
    .groupby('enrollment_procrast')[
        ['cla_all_m', 'cla_dropped_m', 'cla_retained_m', 'cla_dropped_s', 'cla_retained_s']
    ]
    .mean()
)

## Workload Differences by Procrastination Group

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# Per-group means of the per-row mean loads (retained vs. dropped), with
# 'enrollment_procrast' restored as a column by reset_index(). These are the
# points drawn in the line plot below.
group_means = df_courses_latedrop_procrast.groupby('enrollment_procrast')[['cla_retained_m', 'cla_dropped_m']].mean().reset_index()

# Function to calculate Cohen's d
def cohens_d(x, y):
    """Cohen's d for two samples, using the pooled sample standard deviation.

    Parameters
    ----------
    x, y : array-like of numbers
        The two samples. NaN entries are removed from each sample before
        anything is computed, so the sample sizes, means and variances all
        refer to the same observations. Counting NaN rows in the sample size
        while the mean and std skip them would give wrong pooled-SD weights.

    Returns
    -------
    float
        (mean(x) - mean(y)) / pooled_sd, or NaN when either sample has fewer
        than two non-missing values.

    NOTE(review): this is the independent-samples form. The callers in this
    notebook pass two columns of the same rows; confirm that is the intended
    effect size.
    """
    x = np.asarray(x, dtype=float)
    y = np.asarray(y, dtype=float)
    x = x[~np.isnan(x)]
    y = y[~np.isnan(y)]
    nx = len(x)
    ny = len(y)
    if nx < 2 or ny < 2:
        # A sample variance with ddof=1 is undefined for fewer than 2 values.
        return np.nan
    dof = nx + ny - 2
    pooled_std = np.sqrt(((nx - 1) * np.var(x, ddof=1) + (ny - 1) * np.var(y, ddof=1)) / dof)
    return (np.mean(x) - np.mean(y)) / pooled_std

# Split the DataFrame by 'enrollment_procrast' groups
df_non_procrast = df_courses_latedrop_procrast[df_courses_latedrop_procrast['enrollment_procrast'] == 0]
df_procrast = df_courses_latedrop_procrast[df_courses_latedrop_procrast['enrollment_procrast'] == 1]

# Calculate Cohen's d for mean retained load ('cla_retained_m') vs. mean
# dropped load ('cla_dropped_m') within each group
cohens_d_non_procrast = cohens_d(df_non_procrast['cla_retained_m'], df_non_procrast['cla_dropped_m'])
cohens_d_procrast = cohens_d(df_procrast['cla_retained_m'], df_procrast['cla_dropped_m'])

# Plotting
fig, ax = plt.subplots()

# Interaction plot: one line per basket type across the two groups (x = 0 / 1)
label_d = {'cla_retained_m': 'Retained', 'cla_dropped_m': 'Dropped'}
for label in ['cla_retained_m', 'cla_dropped_m']:
    ax.plot(group_means['enrollment_procrast'], group_means[label], label=label_d.get(label), marker='o')

# Annotate |Cohen's d| for each group, placed vertically midway between the two
# group means. iloc[0] / iloc[1] rely on group_means being ordered 0 then 1.
ax.text(0.1, (group_means.iloc[0]['cla_retained_m'] + group_means.iloc[0]['cla_dropped_m'])/2,
        f"Cohen's d: {abs(cohens_d_non_procrast):.2f}",
        horizontalalignment='center', verticalalignment='center', fontsize=10, bbox=dict(facecolor='white', alpha=0.5))

ax.text(0.9, (group_means.iloc[1]['cla_retained_m'] + group_means.iloc[1]['cla_dropped_m'])/2,
        f"Cohen's d: {abs(cohens_d_procrast):.2f}",
        horizontalalignment='center', verticalalignment='center', fontsize=10, bbox=dict(facecolor='white', alpha=0.5))

# Customization
ax.set_xticks(group_means['enrollment_procrast'])
ax.set_xticklabels(['Non-Procrastinator', 'Procrastinator'])
ax.set_xlabel('Enrollment Procrastination')
ax.set_ylabel('Average Predicted Course Load')
ax.set_title('Retained vs. Dropped Courses of Late-Dropping Students')
ax.legend(title='', bbox_to_anchor=(0.5, 0.5), loc='center', ncol=1)
plt.grid(True)
plt.show()


In [None]:
from statsmodels.formula.api import ols
from statsmodels.stats.anova import anova_lm

# Reshape to long format: one row per (student-semester, course_type) with the
# mean load in 'score'.
df_long = pd.melt(df_courses_latedrop_procrast, id_vars=['enrollment_procrast'], value_vars=['cla_retained_m', 'cla_dropped_m'],
                  var_name='course_type', value_name='score')

# Two-way ANOVA (type II sums of squares) with interaction.
# NOTE(review): each original row contributes two observations (retained and
# dropped), but OLS treats all rows as independent. Confirm whether a
# repeated-measures / mixed model is needed.
model = ols('score ~ C(enrollment_procrast) * C(course_type)', data=df_long).fit()
anova_results = anova_lm(model, typ=2)

print(anova_results)

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# Requires df_courses_latedrop_procrast from the cells above.

# Per-group means of the per-row SUMMED loads (retained vs. dropped)
group_means = df_courses_latedrop_procrast.groupby('enrollment_procrast')[['cla_retained_s', 'cla_dropped_s']].mean().reset_index()


def dropped_share_pct(retained, dropped):
    """Percentage of the total summed load (retained + dropped) that was dropped.

    This is a share, not an effect size. It has its own name so that it does
    not overwrite the `cohens_d` function defined in the mean-load plot cell.
    Returns the percentage rounded to one decimal.
    """
    dropped_total = np.sum(dropped)
    overall_total = np.sum(retained) + dropped_total
    return round(dropped_total / overall_total * 100, 1)


# Split the DataFrame by 'enrollment_procrast' groups
df_non_procrast = df_courses_latedrop_procrast[df_courses_latedrop_procrast['enrollment_procrast'] == 0]
df_procrast = df_courses_latedrop_procrast[df_courses_latedrop_procrast['enrollment_procrast'] == 1]

# Share of summed workload that was dropped, within each group
pct_dropped_non_procrast = dropped_share_pct(df_non_procrast['cla_retained_s'], df_non_procrast['cla_dropped_s'])
pct_dropped_procrast = dropped_share_pct(df_procrast['cla_retained_s'], df_procrast['cla_dropped_s'])

# Plotting
fig, ax = plt.subplots()

# Interaction plot: one line per basket type across the two groups (x = 0 / 1)
label_d = {'cla_retained_s': 'Retained', 'cla_dropped_s': 'Dropped'}
for label in ['cla_retained_s', 'cla_dropped_s']:
    ax.plot(group_means['enrollment_procrast'], group_means[label], label=label_d.get(label), marker='o')

# Annotate the dropped share for each group, midway between the two group means
ax.text(0.1, (group_means.iloc[0]['cla_retained_s'] + group_means.iloc[0]['cla_dropped_s'])/2,
        f"Dropped: {pct_dropped_non_procrast:.1f}%",
        horizontalalignment='center', verticalalignment='center', fontsize=10, bbox=dict(facecolor='white', alpha=0.5))

ax.text(0.9, (group_means.iloc[1]['cla_retained_s'] + group_means.iloc[1]['cla_dropped_s'])/2,
        f"Dropped: {pct_dropped_procrast:.1f}%",
        horizontalalignment='center', verticalalignment='center', fontsize=10, bbox=dict(facecolor='white', alpha=0.5))

# Customization
ax.set_xticks(group_means['enrollment_procrast'])
ax.set_xticklabels(['Non-Procrastinator', 'Procrastinator'])
ax.set_xlabel('Enrollment Procrastination')
ax.set_ylabel('Sum Predicted Course Load')
ax.set_title('Retained vs. Dropped Workload of Late-Dropping Students')
ax.legend(title='', bbox_to_anchor=(0.5, 0.4), loc='center', ncol=1)
plt.grid(True)
plt.show()


In [None]:
from statsmodels.formula.api import ols
from statsmodels.stats.anova import anova_lm

# Reshape to long format: one row per (student-semester, course_type) with the
# summed load in 'score'.
df_long = pd.melt(df_courses_latedrop_procrast, id_vars=['enrollment_procrast'], value_vars=['cla_retained_s', 'cla_dropped_s'],
                  var_name='course_type', value_name='score')

# Two-way ANOVA (type II sums of squares) with interaction.
# NOTE(review): each original row contributes two observations (retained and
# dropped), but OLS treats all rows as independent. Confirm whether a
# repeated-measures / mixed model is needed.
model = ols('score ~ C(enrollment_procrast) * C(course_type)', data=df_long).fit()
anova_results = anova_lm(model, typ=2)

print(anova_results)

### Workload Ranking analysis

In [None]:
# 2.5 average imputation for missing courses
def remove_none_and_cast_to_float(lst, fill_value=2.5):
    """Cast every entry of `lst` to float, imputing `fill_value` for missing ones.

    An entry counts as missing when it is None or the placeholder 'NONE'.

    The output has the same length AND order as the input: each missing entry
    is replaced in place. This matters because callers zip the result with the
    parallel list of course names; appending the imputed values at the end
    would pair courses with the wrong loads.

    NOTE: this definition replaces the earlier function of the same name,
    which drops missing entries instead of imputing them. Cells executed after
    this one therefore sum or average imputed values.
    """
    return [
        fill_value if (item is None or item == 'NONE') else float(item)
        for item in lst
    ]

In [None]:
def remove_none(lst):
    """Return the entries of `lst` that are neither None nor the 'NONE' placeholder."""
    kept = []
    for entry in lst:
        if entry is not None and entry != 'NONE':
            kept.append(entry)
    return kept

In [None]:
# Function to rank courses and propagate ranks
def rank_and_propagate(row, impute_value=2.5):
    """Rank a row's courses by predicted load and attach ranks to dropped/retained.

    Rank 1 is the course with the highest predicted load in 'all_courses'.

    Parameters
    ----------
    row : mapping (a DataFrame row when used with apply(axis=1))
        Reads 'all_courses', 'all_courses_cla', 'dropped_courses' and
        'retained_courses'; writes 'dropped_courses_ranks' and
        'retained_courses_ranks'.
    impute_value : float, default 2.5
        Load assigned to a course whose predicted load is None or 'NONE'.

    Returns
    -------
    The same `row`, with the two rank fields set: both NaN when 'all_courses'
    is not a list, otherwise lists of 1-based ranks.

    Differences from the previous implementation:
    * Each course is paired with its own load BEFORE any filtering or
      imputation, so a missing load in the middle of the list can no longer
      shift the loads of the courses after it.
    * Ranks are compacted over the distinct ranks in use, so a course listed
      twice in 'dropped_courses' no longer leaves a gap in the numbering.
    * The function no longer depends on which module-level definition of
      `remove_none_and_cast_to_float` happens to be active.
    """
    def _is_missing(value):
        return value is None or value == 'NONE'

    all_courses = row['all_courses']
    all_courses_cla = row['all_courses_cla']

    if not isinstance(all_courses, list):
        row['dropped_courses_ranks'] = np.nan
        row['retained_courses_ranks'] = np.nan
        return row

    # (course, load) pairs; a missing course name is skipped together with its load.
    course_cla_pairs = [
        (course, impute_value if _is_missing(cla) else float(cla))
        for course, cla in zip(all_courses, all_courses_cla)
        if not _is_missing(course)
    ]
    # Stable sort, highest load first; ties keep their order in 'all_courses'.
    course_cla_pairs.sort(key=lambda pair: pair[1], reverse=True)

    # If a course appears more than once, its last (lowest) position wins.
    ranking_dict = {course: rank for rank, (course, _) in enumerate(course_cla_pairs, start=1)}

    def _ranks_of(courses):
        # Courses that are not in 'all_courses' (or are missing) have no rank.
        if not isinstance(courses, list):
            return []
        return [ranking_dict[course] for course in courses if course in ranking_dict]

    dropped_courses_ranks = _ranks_of(row['dropped_courses'])
    retained_courses_ranks = _ranks_of(row['retained_courses'])

    # Re-number the ranks in use as 1..k without gaps.
    ranks_in_use = sorted(set(dropped_courses_ranks + retained_courses_ranks))
    rank_map = {old_rank: new_rank for new_rank, old_rank in enumerate(ranks_in_use, start=1)}

    row['dropped_courses_ranks'] = [rank_map[rank] for rank in dropped_courses_ranks]
    row['retained_courses_ranks'] = [rank_map[rank] for rank in retained_courses_ranks]
    return row

# Apply the function row-wise. tqdm.pandas() registers `progress_apply` on
# pandas objects; it is `apply` with a progress bar.
tqdm.pandas()
df_courses_latedrop_rank = df_courses_latedrop.progress_apply(rank_and_propagate, axis=1)

In [None]:
from collections import Counter

def _flatten_ranks(rank_lists):
    """Flatten a column of per-row rank lists into one list, dropping missing entries.

    explode() turns empty lists and NaN cells into NaN, so pd.notna is used as
    the filter; an `is not None` test would let those NaN values through.
    """
    return [rank for rank in rank_lists.explode().tolist() if pd.notna(rank)]


# How often each rank occurs among late-dropped courses.
dropped_rank_counts = Counter(_flatten_ranks(df_courses_latedrop_rank.dropped_courses_ranks))

# How often each rank occurs among retained courses. `all_ranks` and
# `rank_counts` end up bound to the retained-course values, as before.
all_ranks = _flatten_ranks(df_courses_latedrop_rank.retained_courses_ranks)
rank_counts = Counter(all_ranks)
retained_rank_counts = rank_counts

# Only the last expression of a cell is displayed, so show both counters together.
dropped_rank_counts, retained_rank_counts

In [None]:
# Attach the 0/1 procrastination flag to the ranked rows.
# NOTE(review): a left merge duplicates rows if the right-hand side has
# repeated (student_id, semester_clean) keys. Confirm they are unique.
df_courses_latedrop_rank2 = df_courses_latedrop_rank.merge(df_courses_latedrop_procrast[['student_id', 'semester_clean', 'enrollment_procrast']],
                               how='left', on=['student_id', 'semester_clean'])

In [None]:
# Import necessary libraries
import pandas as pd
from scipy.stats import ranksums
import matplotlib.pyplot as plt
import numpy as np

# Works on df_courses_latedrop_rank2 (ranked rows plus the procrastination flag).
_procrast_flag = df_courses_latedrop_rank2['enrollment_procrast']

# Procrastination group (flag == 1) and non-procrastination group (flag == 0)
tmp_procrast = df_courses_latedrop_rank2[_procrast_flag == 1].copy()
tmp_non_procrast = df_courses_latedrop_rank2[_procrast_flag == 0].copy()


def _flat_ranks(rank_lists):
    """One flat list of ranks from a column of rank lists, without missing entries."""
    return rank_lists.explode().dropna().tolist()


# Dropped and retained ranks for the procrastination group
dropped_ranks_procrast = _flat_ranks(tmp_procrast.dropped_courses_ranks)
retained_ranks_procrast = _flat_ranks(tmp_procrast.retained_courses_ranks)

# Dropped and retained ranks for the non-procrastination group
dropped_ranks_non_procrast = _flat_ranks(tmp_non_procrast.dropped_courses_ranks)
retained_ranks_non_procrast = _flat_ranks(tmp_non_procrast.retained_courses_ranks)

In [None]:
# Keep ranks 1-5 only (strictly below 6)
dropped_ranks_procrast = [rank for rank in dropped_ranks_procrast if rank < 6]
retained_ranks_procrast = [rank for rank in retained_ranks_procrast if rank < 6]

dropped_ranks_non_procrast = [rank for rank in dropped_ranks_non_procrast if rank < 6]
retained_ranks_non_procrast = [rank for rank in retained_ranks_non_procrast if rank < 6]

# Bin edges 1..6 give one bin per integer rank; ticks sit at the bin centres
# (1.5, 2.5, ...) and are labelled with the integer rank.
bins = range(1, 7)
bin_centers = [(lower + upper) / 2 for lower, upper in zip(bins[:-1], bins[1:])]

# Two side-by-side histograms of dropped-course ranks sharing the y axis
plt.rcParams.update({'font.size': 16})
fig, axes = plt.subplots(nrows=1, ncols=2, figsize=(16, 6), sharey=True)

_panels = [
    (axes[0], dropped_ranks_procrast, 'Dropped Courses (Procrastination)'),
    (axes[1], dropped_ranks_non_procrast, 'Dropped Courses (Non-Procrastination)'),
]
for _panel_ax, _ranks, _title in _panels:
    _panel_ax.hist(_ranks, bins=bins, alpha=0.75, edgecolor='black', rwidth=0.8)
    _panel_ax.set_title(_title)
    _panel_ax.set_xlabel('Rank')
    _panel_ax.set_xticks(bin_centers)
    _panel_ax.set_xticklabels([str(int(center)) for center in bin_centers])
# Only the left panel carries the shared y label.
axes[0].set_ylabel('Number of Courses')

plt.show()

In [None]:
def plot_ranks(n_courses_totals=(4, 5, 6, 7)):
    """Plot where late-dropped courses fall in the workload ranking, by basket size.

    One bar-chart panel is drawn per value in `n_courses_totals`. A panel uses
    only the rows of the module-level `df_courses_latedrop_rank2` whose number
    of ranked courses (dropped + retained) equals that value. It shows, for
    each rank, the percentage of the group's dropped courses that have that
    rank, separately for the procrastination (flag 1) and non-procrastination
    (flag 0) groups. The dashed red line at 100/n marks the share each rank
    would get if drops were spread evenly over ranks.

    Parameters
    ----------
    n_courses_totals : sequence of int, default (4, 5, 6, 7)
        Basket sizes to plot, one panel each.

    Uses the module-level `pd` and `plt` imports. The retained-course
    distributions are not computed here because nothing plots them.
    """
    ranked = df_courses_latedrop_rank2

    def _n_ranked(ranks):
        return len(ranks) if isinstance(ranks, list) else 0

    # Number of ranked courses per row; computed once and reused for every panel.
    basket_size = ranked.dropped_courses_ranks.map(_n_ranked) + ranked.retained_courses_ranks.map(_n_ranked)

    def get_data(n_courses_total):
        """Percentage of dropped courses at each rank, one column per group."""
        subset = ranked[basket_size == n_courses_total]
        shares = {}
        for group_label, flag in (('Procrastination', 1), ('Non-Procrastination', 0)):
            dropped_ranks = (
                subset.loc[subset['enrollment_procrast'] == flag, 'dropped_courses_ranks']
                .explode().dropna().tolist()
            )
            shares[group_label] = pd.Series(dropped_ranks).value_counts(normalize=True) * 100
        # Ranks absent from one group count as 0 % for that group.
        return pd.DataFrame(shares).fillna(0).sort_index()

    nns = list(n_courses_totals)

    plt.rcParams.update({'font.size': 14})
    # squeeze=False keeps `axes` two-dimensional even for a single panel.
    fig, axes = plt.subplots(nrows=1, ncols=len(nns), figsize=(20, 6), sharey=True, squeeze=False)
    axes = axes[0]

    for i, n in enumerate(nns):
        dropped_ranks_count = get_data(n)
        dropped_ranks_count.plot(kind='bar', ax=axes[i], alpha=0.75, edgecolor='black', legend=False, rot=0)
        axes[i].set_title(f'n={n}')
        axes[i].set_xlabel('Predicted Workload Rank')
        if i == 0:
            axes[i].set_ylabel('% of Courses')

        # Draw a horizontal line at y = 100 / n
        axes[i].axhline(y=100/n, color='red', linestyle='--')

    # One shared colour legend, taken from the first panel
    handles, labels = axes[0].get_legend_handles_labels()
    fig.legend(handles, labels, loc='upper center', ncol=2, title='Group')

    plt.tight_layout(rect=[0, 0, 1, 0.95])  # leave room at the top for the legend
    plt.show()


In [None]:
# Draw the dropped-course rank distributions, one panel per basket size.
plot_ranks()

In [None]:
# Distribution of the number of dropped / retained courses per row, by group.

def _n_listed(courses):
    """Length of a course list; 0 for a non-list placeholder such as NaN.

    The same guarded length is used for both groups, so a NaN
    'retained_courses' value cannot raise in one group while being counted as
    0 in the other.
    """
    return len(courses) if isinstance(courses, list) else 0


tmp_procrast = df_courses_latedrop_rank2[df_courses_latedrop_rank2['enrollment_procrast'] == 1].copy()
tmp_non_procrast = df_courses_latedrop_rank2[df_courses_latedrop_rank2['enrollment_procrast'] == 0].copy()

# Dropped and retained course counts for the procrastination group
dropped_courses_procrast = tmp_procrast['dropped_courses'].map(_n_listed).value_counts()
retained_courses_procrast = tmp_procrast['retained_courses'].map(_n_listed).value_counts()

# Dropped and retained course counts for the non-procrastination group
dropped_courses_non_procrast = tmp_non_procrast['dropped_courses'].map(_n_listed).value_counts()
retained_courses_non_procrast = tmp_non_procrast['retained_courses'].map(_n_listed).value_counts()

# Combine counts for plotting (index = number of courses)
dropped_courses_count = pd.DataFrame({
    'Procrastination': dropped_courses_procrast,
    'Non-Procrastination': dropped_courses_non_procrast
}).fillna(0).astype(int)

retained_courses_count = pd.DataFrame({
    'Procrastination': retained_courses_procrast,
    'Non-Procrastination': retained_courses_non_procrast
}).fillna(0).astype(int)

# Convert counts to within-group percentages
for _counts in (retained_courses_count, dropped_courses_count):
    for _group in ('Procrastination', 'Non-Procrastination'):
        _counts[_group] = _counts[_group] / _counts[_group].sum() * 100

# Two bar-chart panels sharing the y axis; .loc[1:10] keeps 1 to 10 courses
plt.rcParams.update({'font.size': 16})
fig, axes = plt.subplots(nrows=1, ncols=2, figsize=(16, 6), sharey=True)

for _panel_ax, _counts, _title in ((axes[0], dropped_courses_count, 'Dropped Courses'),
                                   (axes[1], retained_courses_count, 'Retained Courses')):
    _counts.loc[1:10].plot(kind='bar', ax=_panel_ax, alpha=0.75, edgecolor='black')
    _panel_ax.set_title(_title)
    _panel_ax.set_xlabel('# Courses')
    _panel_ax.legend(title='Group')
axes[0].set_ylabel('% of Student Semesters')

plt.show()

In [None]:
# NOTE: this cell repeats the basket-size plot of the preceding cell.

def _n_listed(courses):
    """Length of a course list; 0 for a non-list placeholder such as NaN.

    The same guarded length is used for both groups, so a NaN
    'retained_courses' value cannot raise in one group while being counted as
    0 in the other.
    """
    return len(courses) if isinstance(courses, list) else 0


# Procrastination group (flag == 1) and non-procrastination group (flag == 0)
tmp_procrast = df_courses_latedrop_rank2[df_courses_latedrop_rank2['enrollment_procrast'] == 1].copy()
tmp_non_procrast = df_courses_latedrop_rank2[df_courses_latedrop_rank2['enrollment_procrast'] == 0].copy()

# Dropped and retained course counts for the procrastination group
dropped_courses_procrast = tmp_procrast['dropped_courses'].map(_n_listed).value_counts()
retained_courses_procrast = tmp_procrast['retained_courses'].map(_n_listed).value_counts()

# Dropped and retained course counts for the non-procrastination group
dropped_courses_non_procrast = tmp_non_procrast['dropped_courses'].map(_n_listed).value_counts()
retained_courses_non_procrast = tmp_non_procrast['retained_courses'].map(_n_listed).value_counts()

# Combine counts for plotting (index = number of courses)
dropped_courses_count = pd.DataFrame({
    'Procrastination': dropped_courses_procrast,
    'Non-Procrastination': dropped_courses_non_procrast
}).fillna(0).astype(int)

retained_courses_count = pd.DataFrame({
    'Procrastination': retained_courses_procrast,
    'Non-Procrastination': retained_courses_non_procrast
}).fillna(0).astype(int)

# Convert counts to within-group percentages
for _counts in (retained_courses_count, dropped_courses_count):
    for _group in ('Procrastination', 'Non-Procrastination'):
        _counts[_group] = _counts[_group] / _counts[_group].sum() * 100

# Two bar-chart panels sharing the y axis; .loc[1:10] keeps 1 to 10 courses
plt.rcParams.update({'font.size': 16})
fig, axes = plt.subplots(nrows=1, ncols=2, figsize=(16, 6), sharey=True)

for _panel_ax, _counts, _title in ((axes[0], dropped_courses_count, 'Dropped Courses'),
                                   (axes[1], retained_courses_count, 'Retained Courses')):
    _counts.loc[1:10].plot(kind='bar', ax=_panel_ax, alpha=0.75, edgecolor='black')
    _panel_ax.set_title(_title)
    _panel_ax.set_xlabel('# Courses')
    _panel_ax.legend(title='Group')
axes[0].set_ylabel('% of Student Semesters')

plt.show()

In [None]:
# Workload (courses, credits, workload) by 2x2 late drop x late shop

In [None]:
# 1 when the row has late-dropped courses (a list, or any other non-missing
# value), 0 when the field is missing.
df_courses['late_dropped'] = df_courses.dropped_courses.map(
    lambda courses: 0 if (not isinstance(courses, list) and pd.isna(courses)) else 1
)

# Basket size; rows without a course list count as 0 courses.
df_courses['n_courses'] = df_courses['all_courses'].map(
    lambda courses: len(courses) if isinstance(courses, list) else 0
)

# Summed predicted load of the full basket.
# NOTE(review): `remove_none_and_cast_to_float` is defined twice in this
# notebook. If the imputing version is the live one here, missing loads and the
# ['NONE'] placeholder each contribute 2.5 to the sum. Confirm that is intended.
df_courses['cla_sum'] = df_courses.all_courses_cla.map(
    lambda loads: np.sum(remove_none_and_cast_to_float(loads))
)

# Attach the enrollment-timing measure and keep complete rows only.
df_analysis_desc = (
    df_courses[['student_id', 'semester_clean', 'late_dropped', 'n_courses', 'cla_sum']]
    .merge(
        df[['student_id', 'semester_clean', 'relative_location_btw_phase1_add_drop_ddl_E_all_phases']],
        how='left',
        on=['student_id', 'semester_clean'],
    )
    .dropna()
)

# Median split: procrast = 1 for values strictly above the median.
ref = np.median(df_analysis_desc['relative_location_btw_phase1_add_drop_ddl_E_all_phases'])

df_analysis_desc['procrast'] = (
    df_analysis_desc['relative_location_btw_phase1_add_drop_ddl_E_all_phases'] > ref
).astype('int64')

In [None]:
# Mean basket size and summed load for each procrast x late_dropped cell.
(
    df_analysis_desc
    .groupby(['procrast', 'late_dropped'])[['n_courses', 'cla_sum']]
    .mean()
)

In [None]:
# Mean basket size and summed load by procrastination group.
(
    df_analysis_desc
    .groupby(['procrast'])[['n_courses', 'cla_sum']]
    .mean()
)

In [None]:
# Group means divided by a fixed constant.
# NOTE(review): 10.230532 is hard-coded and applied to BOTH columns. It looks
# like a reference mean copied from an earlier output; confirm which value it
# is and recompute it from the data.
(
    df_analysis_desc
    .groupby(['procrast'])[['n_courses', 'cla_sum']]
    .mean()
    / 10.230532
)

In [None]:
# Group means divided by a fixed constant.
# NOTE(review): 27.172229 is hard-coded and applied to BOTH columns. It looks
# like a reference mean copied from an earlier output; confirm which value it
# is and recompute it from the data.
(
    df_analysis_desc
    .groupby(['procrast'])[['n_courses', 'cla_sum']]
    .mean()
    / 27.172229
)

In [None]:
# Mean basket size and summed load by late-drop status.
(
    df_analysis_desc
    .groupby(['late_dropped'])[['n_courses', 'cla_sum']]
    .mean()
)

In [None]:
# Group means divided by a fixed constant.
# NOTE(review): 10.916221 is hard-coded and applied to BOTH columns. It looks
# like a reference mean copied from an earlier output; confirm which value it
# is and recompute it from the data.
(
    df_analysis_desc
    .groupby(['late_dropped'])[['n_courses', 'cla_sum']]
    .mean()
    / 10.916221
)

In [None]:
# Group means divided by a fixed constant.
# NOTE(review): 28.997658 is hard-coded and applied to BOTH columns. It looks
# like a reference mean copied from an earlier output; confirm which value it
# is and recompute it from the data.
(
    df_analysis_desc
    .groupby(['late_dropped'])[['n_courses', 'cla_sum']]
    .mean()
    / 28.997658
)

In [None]:
# 2x2 cell means divided by a fixed constant (labelled "Courses" by the author).
# NOTE(review): 10.136084 is hard-coded and applied to BOTH columns; presumably
# only the n_courses column is meant to be read. Confirm the reference value
# and recompute it from the data.
(
    df_analysis_desc
    .groupby(['procrast', 'late_dropped'])[['n_courses', 'cla_sum']]
    .mean()
    / 10.136084  # Courses
)

In [None]:
# 2x2 cell means divided by a fixed constant (labelled "CLA" by the author).
# NOTE(review): 26.913211 is hard-coded and applied to BOTH columns; presumably
# only the cla_sum column is meant to be read. Confirm the reference value and
# recompute it from the data.
(
    df_analysis_desc
    .groupby(['procrast', 'late_dropped'])[['n_courses', 'cla_sum']]
    .mean()
    / 26.913211  # CLA
)

In [None]:
# NOTE: this cell rebinds `df_analysis_desc`. After it runs, the frame has
# 'cla_retained' instead of 'cla_sum', so the cla_sum summary cells above can
# no longer be re-run without re-running their setup cell first.

# 1 when the row has late-dropped courses (a list, or any other non-missing
# value), 0 when the field is missing.
df_courses['late_dropped'] = df_courses.dropped_courses.map(
    lambda courses: 0 if (not isinstance(courses, list) and pd.isna(courses)) else 1
)

# Basket size; rows without a course list count as 0 courses.
df_courses['n_courses'] = df_courses['all_courses'].map(
    lambda courses: len(courses) if isinstance(courses, list) else 0
)

# Summed predicted load of the retained courses only.
# NOTE(review): `remove_none_and_cast_to_float` is defined twice in this
# notebook. If the imputing version is the live one here, missing loads and the
# ['NONE'] placeholder each contribute 2.5 to the sum. Confirm that is intended.
df_courses['cla_retained'] = df_courses.retained_courses_cla.map(
    lambda loads: np.sum(remove_none_and_cast_to_float(loads))
)

# Attach the enrollment-timing measure and keep complete rows only.
df_analysis_desc = (
    df_courses[['student_id', 'semester_clean', 'late_dropped', 'n_courses', 'cla_retained']]
    .merge(
        df[['student_id', 'semester_clean', 'relative_location_btw_phase1_add_drop_ddl_E_all_phases']],
        how='left',
        on=['student_id', 'semester_clean'],
    )
    .dropna()
)

# Median split: procrast = 1 for values strictly above the median.
ref = np.median(df_analysis_desc['relative_location_btw_phase1_add_drop_ddl_E_all_phases'])

df_analysis_desc['procrast'] = (
    df_analysis_desc['relative_location_btw_phase1_add_drop_ddl_E_all_phases'] > ref
).astype('int64')

In [None]:
# Mean basket size and retained load for each procrast x late_dropped cell.
(
    df_analysis_desc
    .groupby(['procrast', 'late_dropped'])[['n_courses', 'cla_retained']]
    .mean()
)

In [None]:
# 2x2 cell means divided by a fixed constant.
# NOTE(review): 26.913211 is hard-coded and applied to BOTH columns. The same
# constant is used above on the cla_sum summary; confirm it is the intended
# reference here and recompute it from the data.
(
    df_analysis_desc
    .groupby(['procrast', 'late_dropped'])[['n_courses', 'cla_retained']]
    .mean()
    / 26.913211
)

In [None]:
# 2x2 cell means divided by a fixed constant.
# NOTE(review): 15.416705 is hard-coded and applied to BOTH columns; its origin
# is not visible in this notebook. Confirm the reference value and recompute it
# from the data.
(
    df_analysis_desc
    .groupby(['procrast', 'late_dropped'])[['n_courses', 'cla_retained']]
    .mean()
    / 15.416705
)

In [None]:
# Mean basket size and retained load by procrastination group.
(
    df_analysis_desc
    .groupby(['procrast'])[['n_courses', 'cla_retained']]
    .mean()
)

In [None]:
# Group means divided by a fixed constant.
# NOTE(review): 25.254182 is hard-coded and applied to BOTH columns. It looks
# like a reference mean copied from an earlier output; confirm which value it
# is and recompute it from the data.
(
    df_analysis_desc
    .groupby(['procrast'])[['n_courses', 'cla_retained']]
    .mean()
    / 25.254182
)

In [None]:
# Mean basket size and retained load by late-drop status.
(
    df_analysis_desc
    .groupby(['late_dropped'])[['n_courses', 'cla_retained']]
    .mean()
)

In [None]:
# Group means divided by a fixed constant.
# NOTE(review): 28.997658 is hard-coded and applied to BOTH columns. The same
# constant is used above on the cla_sum summary; confirm it is the intended
# reference here and recompute it from the data.
(
    df_analysis_desc
    .groupby(['late_dropped'])[['n_courses', 'cla_retained']]
    .mean()
    / 28.997658
)

### What courses were late-added?

In [None]:
def get_adds(path='~/student_enrollment_requests_non_pii_data_may4.csv'):
    """Load enrollment requests and keep student-initiated, status-changing rows.

    Parameters
    ----------
    path : str, optional
        CSV file to read. Defaults to the file in the user's home directory
        that this notebook has always used.

    Returns
    -------
    pandas.DataFrame
        An independent copy of the rows where
        action_affects_enrollment_status_flag == 'Y' and
        enrollment_intitiator_type == 'Student', with two added columns:
        * semester_clean: the semester label for semester_year_term_cd
          (NaN for codes that are not in the table below);
        * course_clean: "<subject_desc> <course_number>".

    Note: the filter does not look at the outcome code, so the result is not
    restricted to successful adds.
    """
    # Term code -> label. Last digit: 2 = Spring, 5 = Summer, 8 = Fall.
    code_to_semester = {
        2128: '2012 Fall',
        2132: '2013 Spring', 2135: '2013 Summer', 2138: '2013 Fall',
        2142: '2014 Spring', 2145: '2014 Summer', 2148: '2014 Fall',
        2152: '2015 Spring', 2155: '2015 Summer', 2158: '2015 Fall',
        2162: '2016 Spring', 2165: '2016 Summer', 2168: '2016 Fall',
        2172: '2017 Spring', 2175: '2017 Summer', 2178: '2017 Fall',
        2182: '2018 Spring', 2185: '2018 Summer', 2188: '2018 Fall',
        2192: '2019 Spring', 2195: '2019 Summer', 2198: '2019 Fall',
        2202: '2020 Spring', 2205: '2020 Summer', 2208: '2020 Fall',
        2212: '2021 Spring', 2215: '2021 Summer', 2218: '2021 Fall',
        2222: '2022 Spring', 2225: '2022 Summer', 2228: '2022 Fall',
    }
    # Named `requests`, not `df`, so the notebook-level `df` is not shadowed.
    requests = pd.read_csv(path)
    requests['semester_clean'] = requests.semester_year_term_cd.map(code_to_semester)
    requests['course_clean'] = requests.subject_desc.map(str) + ' ' + requests['course_number'].map(str)
    keep = (
        (requests['action_affects_enrollment_status_flag'] == 'Y')
        & (requests['enrollment_intitiator_type'] == 'Student')
    )
    # .copy() so callers can add columns without SettingWithCopy warnings.
    return requests[keep].copy()

In [None]:
# Student-initiated, status-affecting enrollment requests (see get_adds).
df_adds = get_adds()

In [None]:
# Inspect the request-timestamp column that the next cell sorts by.
df_adds['enrollment_request_tmsp']

In [None]:
# Order requests by timestamp within each student-semester, so that 'first'
# and 'last' in the aggregation below follow request order.
# NOTE(review): the timestamp is not parsed in this notebook. If it is a
# string, the sort is lexicographic; confirm the format sorts chronologically.
df_adds = df_adds.sort_values(by=['student_id', 'semester_clean', 'enrollment_request_tmsp'])

In [None]:
# Keep only requests whose outcome code is 'E' (presumably "enrolled";
# confirm against the data dictionary).
df_adds_e = df_adds[df_adds['student_enrollment_status_outcome_cd'] == 'E'].copy()

In [None]:
# Narrow to the grouping keys and the course label; row order (by timestamp)
# is preserved.
df_adds_e_s = df_adds_e[['student_id', 'semester_clean', 'course_clean']]

In [None]:
# First and last course per student-semester, in the row order established by
# the sort above. The result has two-level column names such as
# ('course_clean', 'first'); they are flattened in the next cell.
df_first_last = df_adds_e_s.groupby(['student_id', 'semester_clean']).agg(['first', 'last'])\
    .reset_index()

In [None]:
# Flatten the two-level column names with '_'. filter(None, ...) drops the
# empty second level of the key columns, so ('student_id', '') becomes
# 'student_id' and ('course_clean', 'first') becomes 'course_clean_first'.
df_first_last.columns = ['_'.join(filter(None, col)) for col in df_first_last.columns]

In [None]:
# Predicted load of each row's FIRST course ('course_clean_first').
# The semester-specific table is used when the semester is a key of cla_dict,
# otherwise the cross-semester 'overall' table; a course missing from the
# chosen table yields None.
# NOTE: this reuses the list name `retained_courses_cla` from the retained-load
# cell; the df_courses column of that name is not affected.
retained_courses_cla = []
# total= lets tqdm show a real progress bar; iterrows() has no length.
for _, row in tqdm(df_first_last.iterrows(), total=len(df_first_last)):
    sem = row['semester_clean']
    course = row['course_clean_first']
    if sem in cla_dict:
        add = cla_dict.get(sem).get(course)
    else:
        add = cla_dict.get('overall').get(course)
    retained_courses_cla.append(add)
df_first_last['course_clean_first_cla'] = retained_courses_cla

In [None]:
# Predicted load of each row's LAST course ('course_clean_last').
# The semester-specific table is used when the semester is a key of cla_dict,
# otherwise the cross-semester 'overall' table; a course missing from the
# chosen table yields None.
# NOTE: this reuses the list name `retained_courses_cla` from the retained-load
# cell; the df_courses column of that name is not affected.
retained_courses_cla = []
# total= lets tqdm show a real progress bar; iterrows() has no length.
for _, row in tqdm(df_first_last.iterrows(), total=len(df_first_last)):
    sem = row['semester_clean']
    course = row['course_clean_last']
    if sem in cla_dict:
        add = cla_dict.get(sem).get(course)
    else:
        add = cla_dict.get('overall').get(course)
    retained_courses_cla.append(add)
df_first_last['course_clean_last_cla'] = retained_courses_cla

In [None]:
# Mean first/last course load per (student_id, semester_clean).
# df_first_last was itself produced by grouping on these two keys, so each
# group should hold a single row and the mean should leave values unchanged.
# NOTE(review): confirm that this regrouping is still needed.
df_join = df_first_last\
    .groupby(['student_id', 'semester_clean'])\
    [['course_clean_first_cla', 'course_clean_last_cla']]\
    .mean()\
    .reset_index()

In [None]:
# Attach the first/last course loads to the descriptive frame. Rows without a
# match in df_join get NaN, which the group means below skip.
df_analysis_desc2 = df_analysis_desc.merge(df_join, how='left', on=['student_id', 'semester_clean'])

In [None]:
# Mean load of the first and last course for each procrast x late_dropped cell.
(
    df_analysis_desc2
    .groupby(['procrast', 'late_dropped'])[['course_clean_first_cla', 'course_clean_last_cla']]
    .mean()
)

In [None]:
# Mean load of the first and last course by procrastination group.
(
    df_analysis_desc2
    .groupby(['procrast'])[['course_clean_first_cla', 'course_clean_last_cla']]
    .mean()
)

In [None]:
# Group means divided by a fixed constant.
# NOTE(review): 2.703285 is hard-coded and applied to BOTH columns. It looks
# like a reference mean copied from an earlier output; confirm which value it
# is and recompute it from the data.
(
    df_analysis_desc2
    .groupby(['procrast'])[['course_clean_first_cla', 'course_clean_last_cla']]
    .mean()
    / 2.703285
)

In [None]:
# Group means divided by a fixed constant.
# NOTE(review): 2.675813 is hard-coded and applied to BOTH columns. It looks
# like a reference mean copied from an earlier output; confirm which value it
# is and recompute it from the data.
(
    df_analysis_desc2
    .groupby(['procrast'])[['course_clean_first_cla', 'course_clean_last_cla']]
    .mean()
    / 2.675813
)

In [None]:
# Group means divided by a fixed constant.
# NOTE(review): 2.676555 is hard-coded and applied to BOTH columns. It looks
# like a reference mean copied from an earlier output; confirm which value it
# is and recompute it from the data.
(
    df_analysis_desc2
    .groupby(['late_dropped'])[['course_clean_first_cla', 'course_clean_last_cla']]
    .mean()
    / 2.676555
)

## Export for Growth Curve Modeling in R

In [None]:
# Insert a term digit ("1 Spring" / "2 Fall") so that a plain string sort of
# semester_clean is chronological within a year.
# The negative look-behinds skip values that already carry the digit, so
# re-running this cell leaves the column unchanged instead of producing
# "2017 2 2 Fall". A plain substring test would not work: "2012 Fall" already
# contains the text "2 Fall".
df_analysis['semester_clean'] = (
    df_analysis['semester_clean']
    .str.replace(r'(?<! 2 )Fall', '2 Fall', regex=True)
    .str.replace(r'(?<! 1 )Spring', '1 Spring', regex=True)
)

In [None]:
# Sort each student's rows by the sortable semester label built above.
df_analysis = df_analysis.sort_values(by=['student_id', 'semester_clean'])

In [None]:
# 1-based position of each row within its student, in the sorted order above.
# This counts the rows present in df_analysis; semesters dropped earlier (for
# example by dropna) are not counted, so it is not a calendar distance.
df_analysis['semester_count'] = df_analysis.groupby('student_id').cumcount() + 1

In [None]:
# Write the analysis frame without the pandas index; this overwrites any
# existing file at the path.
df_analysis.to_csv('/data/groups/CTd/jedm-analysis-dataset-growthcurve-v3.csv', index=False)