In [None]:
import pandas as pd
import numpy as np
from IPython.core.display import display, HTML
import os
import sys
sys.path.append('../')

# THIS NOTEBOOK IS DEPRECATED

# Load Student Data

In [None]:
from src.d01_data.student_data_api import StudentDataApi

def isFocal(row):
    """Tell whether a student record counts as focal.

    Parameters
    ----------
    row : mapping-like
        Must provide the keys 'FRL' and 'AALPI'.

    Returns
    -------
    bool-like
        True when the sum of the two flags is greater than 1.
    """
    flag_total = sum(row[flag] for flag in ('FRL', 'AALPI'))
    return 1 < flag_total

# Path template of the student file; the placeholder receives the period code.
student_data_file = "/share/data/school_choice_equity/simulator_data/student/drop_optout_{}.csv"
period = "1819"

# Read the student table and key it by student number.
student_df = pd.read_csv(student_data_file.format(period))
student_df = student_df.set_index('studentno')

# Optional filter (disabled): keep kindergarten students only.
# mask = student_df['grade'] == 'KG'
# student_df = student_df.loc[mask]

# Evaluate the focal rule row by row and store the result as 0/1 integers.
focal_flags = student_df.apply(isFocal, axis=1)
student_df['focal'] = focal_flags.astype('int64')

student_df.head()

# Load Assignment Results

In [None]:
# Column names shared by the assignment helpers in this notebook.

# Diversity category of the student.
diversity_category_col = 'Diversity_Category3'
# Program cutoff, expressed in points.
program_cutoff = 'program_cutoff'
# Tiebreaker implied by the cutoff of the program the student was assigned to.
cutoff_tiebreaker = 'cutoff_tiebreaker'
# Whether the student gets the equity tiebreaker.
focal_block = 'focal_block'
# Whether the student counts as TP, TN, FP or FN.
tiebreaker_status = 'status'

def get_specific_program_cutoff(x, diversity_category):
    """Return the program cutoff that applies to one diversity category.

    Parameters
    ----------
    x : number or str
        Either a single numeric cutoff (returned unchanged) or a bracketed
        list of cutoffs stored as text, e.g. "[1.5 2.5 3.5]" or
        "[1.5, 2.5, 3.5]".
    diversity_category : int-like
        Position of the wanted cutoff inside the list. Integral floats such
        as 1.0 are accepted and converted with ``int``.

    Returns
    -------
    number
        ``x`` itself when it is already numeric, otherwise the selected
        entry parsed as ``float``.

    Raises
    ------
    IndexError
        If ``diversity_category`` is outside the parsed list.
    ValueError
        If the selected entry is not a number or the category is NaN.
    """
    # A scalar cutoff applies to every category. NumPy scalars are accepted
    # too, since values read back from a DataFrame are often not plain floats.
    if isinstance(x, (int, float, np.integer, np.floating)):
        return x
    # Strip the surrounding brackets and tolerate both whitespace- and
    # comma-separated entries.
    x_list = x.strip()[1:-1].replace(',', ' ').split()
    return float(x_list[int(diversity_category)])

def check_tiebreaker(row):
    """Translate a row's program cutoff into a tiebreaker label.

    When the row carries a diversity category column, the cutoff is first
    resolved for that category through ``get_specific_program_cutoff``.

    Returns
    -------
    str
        The label of the first lower bound (4, 3, 2, 1, 0) that the cutoff
        strictly exceeds, or "none" when it exceeds none of them.
    """
    cut_off = row[program_cutoff]
    if diversity_category_col in row.index:
        cut_off = get_specific_program_cutoff(cut_off, row[diversity_category_col])

    # Checked from the highest bound downwards; comparisons are strict.
    labelled_bounds = (
        (4, "sibiling"),
        (3, "equity+zone"),
        (2, "equity"),
        (1, "zone"),
        (0, "lottery"),
    )
    for lower_bound, label in labelled_bounds:
        if cut_off > lower_bound:
            return label
    return "none"
    
def q1(x):
    """Return the 0.25 quantile (first quartile) of ``x``."""
    return np.quantile(x, q=0.25)


def q2(x):
    """Return the 0.5 quantile (median) of ``x``."""
    return np.quantile(x, q=0.5)


def q3(x):
    """Return the 0.75 quantile (third quartile) of ``x``."""
    return np.quantile(x, q=0.75)

def augment_assigment(assignment_df, equity_tiebreaker):
    """Add tiebreaker-related columns to ``assignment_df`` in place.

    Columns written: the cutoff tiebreaker label, the focal block flag
    (looked up in the notebook-level ``student_df``), the 'focal' flag and
    the TP/FP/FN/TN status.

    Side effect: for the special names 'none' and 'test' a column of that
    name is created on ``student_df`` (all zeros, respectively a copy of
    'ctip1') before the lookup.
    """
    if equity_tiebreaker == 'none':
        student_df[equity_tiebreaker] = 0.
    elif equity_tiebreaker == 'test':
        student_df[equity_tiebreaker] = student_df['ctip1']

    # Label every assignment row by the tiebreaker its cutoff corresponds to.
    assignment_df[cutoff_tiebreaker] = assignment_df.apply(check_tiebreaker, axis=1, raw=False)

    # Align the student-level columns with the students in this assignment.
    students = assignment_df.index
    assignment_df[focal_block] = student_df[equity_tiebreaker].reindex(students)
    assignment_df['focal'] = student_df['focal'].reindex(students)

    get_student_tiebreaker_status(assignment_df)
    
def get_student_tiebreaker_status(df):
    """Label every row of ``df`` as TP, FP, FN or TN, in place.

    The label is written to the ``tiebreaker_status`` column and compares
    the 'focal' column with the ``focal_block`` column:

    * TP - focal == 1 and focal block == 1
    * FP - focal != 1 and focal block == 1
    * FN - focal == 1 and focal block != 1
    * TN - everything else

    Missing values compare unequal to 1, so such rows are treated as
    "not focal" / "not in the focal block".

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the 'focal' and ``focal_block`` columns.

    Returns
    -------
    None
    """
    mask_focal = df['focal'] == 1
    mask_focal_block = df[focal_block] == 1
    df[tiebreaker_status] = "TN"
    # `.loc` is the accessor that accepts boolean masks; `.at` only addresses
    # a single cell by row/column label.
    df.loc[mask_focal & mask_focal_block, tiebreaker_status] = "TP"
    df.loc[~mask_focal & mask_focal_block, tiebreaker_status] = "FP"
    df.loc[mask_focal & ~mask_focal_block, tiebreaker_status] = "FN"
   

In [None]:
# Directory prefix of the simulated assignment CSV files; file names are
# appended to it directly, so it must end with a path separator.
assignments_dir = "/share/data/school_choice_equity/local_runs/Assignments/"
# Alternative location (disabled):
# assignments_dir = "/share/data/school_choice_equity/temp/Assignments/"

def get_filename_template(policy, guard_rails):
    """Build the assignment file-name template for a policy configuration.

    Parameters
    ----------
    policy : str
        Policy name; "Con1" and "Medium1" are supported.
    guard_rails : int
        When equal to 0 the "GuardRails0" tag is inserted after the policy
        name; any other value selects the file name without the tag.

    Returns
    -------
    str
        A template with two ``{}`` placeholders, filled by the callers with
        the equity tiebreaker name and the iteration number (in that order).

    Raises
    ------
    ValueError
        If ``policy`` is not supported. This replaces the previous silent
        ``None`` return, which only failed later when the template was used.
    """
    supported_policies = ("Con1", "Medium1")
    if policy not in supported_policies:
        raise ValueError(
            "Unsupported policy %r; expected one of %s" % (policy, ", ".join(supported_policies))
        )

    guard_rails_tag = "GuardRails0" if guard_rails == 0 else ""
    # Plain concatenation keeps the two `{}` placeholders untouched.
    return (
        "Assignment_{}_CTIP1_round_merged123_policy"
        + policy
        + guard_rails_tag
        + "-RealPref_tiesSTB_prefExtend0_iteration{}.csv"
    )
    

# Experiment configuration: the tiebreakers to compare, the number of
# simulation iterations to read, and the policy variant of the runs.
equity_tiebreaker_list = ['ctip1', 'knapsack008', 'pc1002_050', 'naive004']
num_iterations = 20
policy = "Medium1"
guard_rails = 0

# File-name template of the selected policy variant.
filename_template = get_filename_template(policy=policy, guard_rails=guard_rails)

# Assignment Summary

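Get some summary statistics of the assigned rank, grouped by Diversity Category and Focal status (the Cutoff Tiebreaker, derived from the Program Cutoff, is computed but is not currently used as a grouping key). The summary statistics are count, mean, min, Q1, Q2 (median), Q3 and max.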

In [None]:
def get_summary_iteration(assignment_df, equity_tiebreaker):
    """Summarise one simulated assignment per diversity category and focal flag.

    ``assignment_df`` is augmented in place first (see ``augment_assigment``).
    For every (diversity category, focal) group the result holds the count,
    mean, min, q1, q2, q3 and max of 'rank', plus the mean of the column
    named after ``equity_tiebreaker``.
    """
    augment_assigment(assignment_df, equity_tiebreaker)

    # All of these columns must be present in the assignment table.
    evaluation_columns = [
        diversity_category_col,
        'rank',
        'designation',
        'In-Zone Rank',
        cutoff_tiebreaker,
        equity_tiebreaker,
        'focal',
        'iteration',
    ]
    group_columns = [diversity_category_col, 'focal']
    aggregations = {
        'rank': ['count', 'mean', 'min', q1, q2, q3, 'max'],
        equity_tiebreaker: 'mean',
    }

    evaluated = assignment_df[evaluation_columns]
    return evaluated.groupby(group_columns).agg(aggregations)

In [None]:
assignment_dict = dict()
# Maps each equity tiebreaker to the stacked per-iteration summaries.
summary_dict = dict()
for equity_tiebreaker in equity_tiebreaker_list:
    iteration_summaries = []
    for iteration in range(num_iterations):
        filename = filename_template.format(equity_tiebreaker, iteration)
        assignment_df = pd.read_csv(assignments_dir + filename).set_index('studentno')
        assignment_df['iteration'] = iteration
        iteration_summaries.append(get_summary_iteration(assignment_df, equity_tiebreaker))

    # Stack the iterations; each contributes one row per group.
    summary_df = pd.concat(iteration_summaries, axis=0)
    summary_dict[equity_tiebreaker] = summary_df

# Grouping keys of the summaries. They must match the index levels produced by
# get_summary_iteration (diversity category and focal flag): the summaries have
# no 'tiebreaker' level, so listing one makes a later groupby on these keys fail.
group_columns = [diversity_category_col, 'focal']

In [None]:
# Render the stacked summaries, one titled table per tiebreaker.
display(HTML("<h3>Results grouped by focal</h3>" ))
for equity_tiebreaker in equity_tiebreaker_list:
    summary_df = summary_dict[equity_tiebreaker]
    tiebreaker_heading = HTML("<h4>Tiebreaker: %s</h4>" % equity_tiebreaker)
    display(tiebreaker_heading, summary_df)

# Individual Rank Data

In [None]:
def get_rank_iteration(assignment_df, equity_tiebreaker, iteration=None):
    """Return the per-student rank columns of one simulated assignment.

    ``assignment_df`` is augmented in place (see ``augment_assigment``) and
    tagged with the iteration number and the tiebreaker name ('method').

    Parameters
    ----------
    assignment_df : pandas.DataFrame
        Assignment table of a single simulation run.
    equity_tiebreaker : str
        Name of the equity tiebreaker used for the run.
    iteration : int, optional
        Iteration number to record. When omitted, the notebook-level
        variable ``iteration`` is used, which keeps existing two-argument
        calls working; new code should pass it explicitly.

    Returns
    -------
    pandas.DataFrame
        The columns iteration, diversity category, focal, cutoff tiebreaker,
        rank, method, focal block and tiebreaker status.

    Raises
    ------
    NameError
        If ``iteration`` is omitted and no notebook-level ``iteration`` exists.
    """
    if iteration is None:
        # Legacy behaviour: fall back to the loop variable of the calling cell.
        try:
            iteration = globals()['iteration']
        except KeyError:
            raise NameError(
                "name 'iteration' is not defined; pass iteration=... explicitly"
            ) from None

    augment_assigment(assignment_df, equity_tiebreaker)
    assignment_df['iteration'] = iteration
    assignment_df['method'] = equity_tiebreaker

    # Use the shared column-name constants rather than repeating the literals.
    rank_columns = ['iteration', diversity_category_col, 'focal', cutoff_tiebreaker,
                    'rank', 'method', focal_block, tiebreaker_status]
    return assignment_df[rank_columns]
    

In [None]:
# Collect the per-student rank rows of every tiebreaker and iteration.
# The loop variables stay at notebook level under these exact names:
# get_rank_iteration picks up `iteration` from the notebook namespace.
rank_frames = []
for equity_tiebreaker in equity_tiebreaker_list:
    for iteration in range(num_iterations):
        filename = filename_template.format(equity_tiebreaker, iteration)
        assignment_df = pd.read_csv(assignments_dir + filename).set_index('studentno')
        iteration_ranks = get_rank_iteration(assignment_df, equity_tiebreaker)
        # Move the student number from the index into a regular column.
        rank_frames.append(iteration_ranks.reset_index())

rank_results_df = pd.concat(rank_frames, axis=0)
rank_results_df.head()

In [None]:
from src.d00_utils.utils import get_group_value

def get_improvement_over_none(df, equity_tiebreaker_list):
    """Compute each student's change in mean rank relative to the 'none' method.

    Parameters
    ----------
    df : pandas.DataFrame
        Rank rows with at least the columns 'method', 'studentno', 'rank'
        and the tiebreaker status column. Must contain rows for the
        baseline method 'none'.
    equity_tiebreaker_list : iterable of str
        Methods to compare against the baseline. An entry 'none' is skipped.

    Returns
    -------
    pandas.DataFrame
        One row per (method, studentno) with the mean 'rank', the status
        aggregated by ``get_group_value`` (presumably the single status
        shared by a student's rows - confirm against that helper) and
        'change' = method mean rank - baseline mean rank. 'change' is NaN
        for the baseline rows, for methods not in the list, and for students
        without a baseline rank.

    Raises
    ------
    KeyError
        If the baseline 'none' or a listed method has no rows in ``df``.
    """
    df = df.groupby(['method', 'studentno']).agg({'rank': 'mean', tiebreaker_status: get_group_value})
    methods = df.index.get_level_values('method')
    students = df.index.get_level_values('studentno')
    if 'none' not in methods:
        raise KeyError("baseline method 'none' is missing from the rank results")
    baseline_rank = df.loc['none']['rank']

    df['change'] = np.nan
    for equity_tiebreaker in equity_tiebreaker_list:
        if equity_tiebreaker == 'none':
            # The baseline is not compared with itself.
            continue
        is_method = methods == equity_tiebreaker
        if not is_method.any():
            raise KeyError(equity_tiebreaker)
        # Look the baseline up per student so that the subtraction stays
        # aligned even when the two methods cover different students.
        method_baseline = baseline_rank.reindex(students[is_method]).values
        df.loc[is_method, 'change'] = df.loc[is_method, 'rank'].values - method_baseline
    return df.reset_index()

def get_improvement_tp(df, equity_tiebreaker_list):
    """Pair the mean ranks of true-positive students with their baseline ranks.

    For every listed method, the students whose status under that method is
    "TP" contribute two rows: their mean rank under the method (label
    "with tiebreaker") and their mean rank under the baseline method 'none'
    (label "without tiebreaker"). Both rows carry the method name in 'method'.

    Parameters
    ----------
    df : pandas.DataFrame
        Rank rows with at least the columns 'method', 'studentno', 'rank'
        and the tiebreaker status column. Must contain rows for 'none'.
    equity_tiebreaker_list : iterable of str
        Methods to report. An entry 'none' is skipped.

    Returns
    -------
    pandas.DataFrame
        Columns 'studentno', 'rank', 'method' and 'label'; empty when there
        is nothing to report. Students without a baseline rank get NaN in
        their "without tiebreaker" row.
    """
    df = df.groupby(['method', 'studentno']).agg({'rank': 'mean', tiebreaker_status: get_group_value})
    df_none = df.loc['none']
    new_rows = []
    for equity_tiebreaker in equity_tiebreaker_list:
        if equity_tiebreaker == 'none':
            # The baseline itself is never reported as a method.
            continue
        df_eqtb = df.loc[equity_tiebreaker]
        is_true_positive = df_eqtb[tiebreaker_status] == "TP"

        method_rows = df_eqtb.loc[is_true_positive, ['rank']].copy()
        method_rows['method'] = equity_tiebreaker
        method_rows['label'] = "with tiebreaker"

        # reindex (instead of a strict label lookup) tolerates students that
        # are absent from the baseline runs.
        none_rows = df_none[['rank']].reindex(method_rows.index)
        none_rows['method'] = equity_tiebreaker
        none_rows['label'] = "without tiebreaker"

        new_rows += [method_rows.reset_index(), none_rows.reset_index()]

    if not new_rows:
        # pd.concat refuses an empty list; return an empty, well-formed frame.
        return pd.DataFrame(columns=['studentno', 'rank', 'method', 'label'])
    return pd.concat(new_rows, axis=0)
        

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt
sns.set_theme(style="ticks", palette="pastel")

# Change in mean rank relative to the 'none' baseline: one box per method,
# split by tiebreaker status.
df_change = get_improvement_over_none(rank_results_df, equity_tiebreaker_list)

fig, ax = plt.subplots(figsize=(6.8,5.2))
sns.boxplot(data=df_change,
            x="method", y="change",
            hue=tiebreaker_status,
            showfliers=False,
            ax=ax)
sns.despine(offset=10, trim=False)
plt.legend(title='Status', loc=2, bbox_to_anchor=(.95, 1), borderaxespad=0.)
plt.savefig('outputs/boxplot_simulations_change.png')
plt.show()

# Numeric companion of the figure.
change_stats = df_change.groupby(['method', tiebreaker_status])['change'].agg(['mean', 'median', 'count', 'std'])
display(change_stats)

In [None]:
sns.set_theme(style="ticks", palette="pastel")

# Mean rank of the true-positive students with and without the tiebreaker,
# one pair of boxes per method.
df_tp = get_improvement_tp(rank_results_df, equity_tiebreaker_list)

fig, ax = plt.subplots(figsize=(6.8,5.2))
sns.boxplot(data=df_tp,
            x="method", y="rank",
            hue="label",
            showfliers=False,
            ax=ax)
sns.despine(offset=10, trim=False)
plt.legend(title='Status', loc=2, bbox_to_anchor=(.95, 1), borderaxespad=0.)
plt.savefig('outputs/boxplot_simulations_tp.png')
plt.show()

# Numeric companion of the figure.
tp_stats = df_tp.groupby(['method', 'label'])['rank'].agg(['mean', 'median', 'count', 'std'])
display(tp_stats)

In [None]:
# ax = sns.histplot(x="rank", hue="focal", palette=["m", "g"],
#                   data=rank_results_df.groupby(['method', 'focal', 'studentno']).mean().loc['none'].reset_index())

In [None]:
# Distribution of the TP/FP/FN/TN status per method, first as within-method
# proportions and then as raw counts.
x_axis = tiebreaker_status
hue = "method"
histplot_options = dict(x=x_axis, hue=hue, data=rank_results_df,
                        multiple="dodge", shrink=.8, common_norm=False)

ax = sns.histplot(stat="probability", **histplot_options)
plt.savefig('outputs/tiebreaker_distribution_prob.png')
plt.show()

ax = sns.histplot(**histplot_options)
plt.savefig('outputs/tiebreaker_distribution_count.png')
plt.show()

In [None]:
# Per method: distribution of the cutoff tiebreaker label, split by focal flag
# and normalised within each focal group.
x_axis = "cutoff_tiebreaker"
hue = "focal"

for equity_tiebreaker in equity_tiebreaker_list:
    mask = rank_results_df['method'] == equity_tiebreaker
    method_ranks = rank_results_df.loc[mask]
    display(HTML("<h3>Tiebreaker: %s</h3>" % equity_tiebreaker))
    ax = sns.histplot(data=method_ranks, x=x_axis, hue=hue,
                      stat="probability", common_norm=False,
                      multiple="dodge", shrink=.8)
    plt.savefig('outputs/tiebreaker_distribution_%s.png' % equity_tiebreaker)
    plt.show()

# Regression Analysis

In [None]:
import statsmodels.api as sm

# For each method, regress the per-student mean rank on the per-student mean of
# the remaining numeric columns and of the status indicators.
for equity_tiebreaker in equity_tiebreaker_list:
    mask = rank_results_df['method'] == equity_tiebreaker
    df = rank_results_df.loc[mask]

    # One indicator column per status value, cast to float so the per-student
    # average is well defined whatever dtype get_dummies returns.
    status_dummies = pd.get_dummies(df[tiebreaker_status]).astype(float)

    # Average over iterations per student. numeric_only=True skips the string
    # columns (method, cutoff tiebreaker, status), which recent pandas versions
    # refuse to average. One groupby is enough: its result already has a
    # single row per student.
    df = pd.concat([df, status_dummies], axis=1).groupby('studentno').mean(numeric_only=True)

    y = df['rank'].copy()
    x = df.drop(columns=['iteration', 'rank', focal_block, 'focal'])

    model = sm.OLS(y, x)
    results = model.fit()
    display(HTML("<h3>Tiebreaker: %s</h3>" % equity_tiebreaker))
    display(results.summary())

In [None]:
def top3rank(x):
    """Return the share of entries in ``x`` that are at most 3.

    NaN entries fail the comparison, so they count as "not in the top 3"
    rather than being left out of the share.
    """
    within_top3 = x <= 3
    return np.nanmean(within_top3)


def countnan(x):
    """Return how many entries of ``x`` are NaN."""
    missing = np.isnan(x)
    return missing.sum()

# Share of top-3 ranks per method, focal flag and iteration ...
iteration_keys = ['method', 'focal', 'iteration']
top3rank_pct = rank_results_df.groupby(iteration_keys).agg({'rank': top3rank})

# ... then its mean and standard deviation across iterations.
method_keys = ['method', 'focal']
top3rank_pct.groupby(method_keys).agg(['mean', 'std'])

# Save Assignment Summary

In [None]:
# Write one workbook per tiebreaker. Iterating over summary_dict avoids relying
# on whichever `equity_tiebreaker` value an earlier loop happened to leave
# behind, which saved the summary of a single tiebreaker only.
for equity_tiebreaker, summary_df in summary_dict.items():
    # The summaries are indexed by their grouping columns, so aggregate over
    # those index levels across the stacked iterations.
    summary_levels = list(summary_df.index.names)
    with pd.ExcelWriter("outputs/%s.xlsx" % equity_tiebreaker) as writer:
        summary_df.to_excel(writer, sheet_name='raw')
        summary_df.groupby(level=summary_levels).mean().to_excel(writer,
                                                                 sheet_name='mean')
        summary_df.groupby(level=summary_levels).std().to_excel(writer,
                                                                sheet_name='std')