In [4]:
import pandas as pd
import numpy as np
import os
import sys
sys.path.append('../')

# Parameters

In [28]:
# Seed NumPy's global RNG so the random focal-student draws (np.random.random() in isFocal)
# are repeatable.
np.random.seed(1992)
# Key identifying the FRL data set; passed to ClassifierDataApi.get_frl_data and to
# KnapsackClassifier (also used to build the classifier's run_name).
frl_key='tk5'
# Period of student data to load; also used to build the output file name.
period = "1819"
# When True, update_student_data overwrites a tiebreaker column that is already present in the
# saved file; update_student_data resets this flag to False after using it.
recalculate = False
# Directory holding the simulator student CSV. It is concatenated directly with the file name,
# so the trailing slash is required.
output_path = "/share/data/school_choice_equity/simulator_data/student/"

# Load Student Data

In [16]:
from src.d01_data.student_data_api import StudentDataApi

# Load the student records for the configured period and index them by student number.
sda = StudentDataApi()
student_df = sda.get_data(periods_list=[period]).set_index('studentno')
# Keep only rows whose grade is 'KG' (presumably kindergarten applicants -- confirm).
mask = student_df['grade'] == 'KG'
student_df = student_df.loc[mask]
student_df.head()

Unnamed: 0_level_0,r1_ranked_idschool,r1_listed_ranks,r1_programs,grade,r1_randomnumber,randomnumber,requestprogramdesignation,latitude,longitude,r2_ranked_idschool,...,Academic Score,Nhood SES Score,FRL Score,sibling,currentlpsibling,currentlp,msf,aaprek,aa,year
studentno,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
10210002,[420],[1],['GE'],KG,[0.446027141579977],0.348868,0.0,37.747093,-122.429507,,...,0.244898,0.342849,0.031694,[420],,,,[420],[420],2019
10210005,"[435, 670]","[1, 2]","['GE', 'GE']",KG,"[0.9110051502117931, 0.111640986769763]",0.681393,0.0,37.718266,-122.480908,,...,0.365385,0.766113,0.584975,,,,,,[670],2019
10210006,"[729, 537, 680, 876, 729]","[1, 2, 3, 4, 5]","['SE', 'SE', 'SE', 'CE', 'GE']",KG,"[0.41175899240252206, 0.33197490658210305, 0.0...",0.694158,0.0,37.714647,-122.403722,,...,0.380597,0.640319,0.660631,[729],,,,,,2019
10210007,"[786, 569, 735, 435, 549, 413, 589, 479, 718, ...","[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14...","['JE', 'GE', 'GE', 'GE', 'GE', 'GE', 'GE', 'GE...",KG,"[0.9240686684326509, 0.591082980668961, 0.3187...",0.289696,0.0,37.780102,-122.44289,,...,0.022388,0.504279,0.263899,,,,,,[735],2019
10210010,"[796, 420, 505, 722, 589, 618, 876, 718, 493, ...","[1, 2, 3, 4, 5, 6, 7, 8, 9, 10]","['GE', 'SE', 'GE', 'GE', 'GE', 'SE', 'CE', 'GE...",KG,"[0.0774816984940387, 0.826669827283934, 0.8127...",0.200155,0.0,37.75847,-122.442804,,...,0.0,0.288537,0.0,,,,,,[505],2019


# Generate Individual Focal Students

The individual focal students for each block group are going to be generated at random. We are going to use the block groups defined by SFUSD. The available data allows us to get rough estimates of the following probabilities for each block group:

- Probability a student is AALPI: $P(AALPI)$
- Probability a student is FRL: $P(FRL)$
- Probability a student is both AALPI and FRL: $P(AALPI \cap FRL)$
- Probability a student is FRL conditional on the student being AALPI: $$P(FRL \mid AALPI) = \frac{P(AALPI \cap FRL)}{P(AALPI)}$$
- Probability a student is FRL conditional on the student not being AALPI: $$P(FRL \mid \overline{AALPI}) = \frac{P(FRL)-P(AALPI \cap FRL)}{1-P(AALPI)}$$

Since some blocks have a small number of students, we use the posterior mean of the Bernoulli parameter under a uniform prior as our probability estimate. Let $\theta$ be the parameter of the Bernoulli distribution, $m$ the count of interest and $n$ the total count; the probability estimate is then:

$$\hat{\theta} = \mathbb{E}\left[\theta\mid m,n\right]=\frac{m+1}{n+2}$$

For example, we can estimate the probability of a student being AALPI as

$$P(AALPI)=\frac{\textit{counts of AALPI}+1}{\textit{total counts}+2}$$

In [21]:
from src.d00_utils.utils import add_bayesian_bernoulli
from src.d02_intermediate.classifier_data_api import ClassifierDataApi
from src.d00_utils.utils import get_label

def frl_cond_aalpi(row):
    """Return P(FRL | AALPI) for one block-group row.

    Args:
        row: mapping-like object (e.g. a DataFrame row) with the entries
            'probBoth' (P(AALPI and FRL)) and 'probAALPI' (P(AALPI)).

    Returns:
        The ratio probBoth / probAALPI.
    """
    joint_prob = row['probBoth']
    aalpi_prob = row['probAALPI']
    return joint_prob / aalpi_prob
def frl_cond_naalpi(row):
    """Return P(FRL | not AALPI) for one block-group row.

    Args:
        row: mapping-like object (e.g. a DataFrame row) with the entries
            'probFRL' (P(FRL)), 'probBoth' (P(AALPI and FRL)) and
            'probAALPI' (P(AALPI)).

    Returns:
        (probFRL - probBoth) / (1 - probAALPI).
    """
    frl_without_aalpi = row['probFRL'] - row['probBoth']
    not_aalpi_prob = 1. - row['probAALPI']
    return frl_without_aalpi / not_aalpi_prob
    
def isFocal(row, frl_df):
    """Randomly decide whether one student is labelled focal.

    Args:
        row: student record providing 'census_block' (numeric, may be NaN)
            and 'AALPI' (1 if the student is AALPI).
        frl_df: frame indexed by integer block id with the columns
            'probCondAALPI' and 'probCondNAALPI'.

    Returns:
        1 if the student is drawn as focal, otherwise 0. Students with a
        missing block, or a block absent from ``frl_df``, are never focal.
    """
    block_id = row['census_block']
    if np.isnan(block_id):
        # No random number is consumed for students without a block.
        return 0
    block_id = int(block_id)

    # The draw happens before the lookup, so every student with a valid
    # block id consumes exactly one number from NumPy's global RNG.
    draw = np.random.random()
    if block_id not in frl_df.index:
        return 0

    block_probs = frl_df.loc[block_id]
    threshold_col = 'probCondAALPI' if row['AALPI'] == 1 else 'probCondNAALPI'
    return 1 if draw <= block_probs[threshold_col] else 0

In [15]:
def add_focal_students(student_df):
    """Add the 'AALPI' and 'focal' columns to ``student_df`` in place.

    The FRL data for the notebook-level ``frl_key`` is loaded and passed
    through ``add_bayesian_bernoulli`` (presumably this supplies the
    'probAALPI', 'probFRL' and 'probBoth' columns -- confirm in utils), and
    the two conditional probabilities are derived per row. Each student is
    then flagged AALPI from 'resolved_ethnicity' and labelled focal by a
    random draw in ``isFocal``.

    Args:
        student_df: student frame with 'resolved_ethnicity' and
            'census_block' columns; it is modified in place.

    Returns:
        None.
    """
    frl_df = add_bayesian_bernoulli(ClassifierDataApi().get_frl_data(frl_key=frl_key))

    # Conditional FRL probabilities, one value per row of the FRL frame.
    frl_df['probCondAALPI'] = frl_df.apply(frl_cond_aalpi, axis=1)
    frl_df['probCondNAALPI'] = frl_df.apply(frl_cond_naalpi, axis=1)

    aalpi_groups = ['Black or African American', 'Hispanic/Latino', 'Pacific Islander']
    belongs_to_aalpi = student_df['resolved_ethnicity'].isin(aalpi_groups)
    student_df['AALPI'] = belongs_to_aalpi.astype('int64')

    # Needs the 'AALPI' column set just above.
    student_df['focal'] = student_df.apply(lambda student: isFocal(student, frl_df), axis=1)

# Add Equity Tiebreaker Column

In [22]:
def add_equity_tiebreaker(student_df, model, params, tiebreaker):
    """Add a tiebreaker column to ``student_df`` in place.

    Args:
        student_df: student frame with a 'census_block' column; modified in
            place.
        model: classifier object exposing ``get_solution_set(params)``.
        params: argument forwarded unchanged to ``model.get_solution_set``.
        tiebreaker: name of the column to create or overwrite.

    Returns:
        None.
    """
    solution = model.get_solution_set(params)

    def label_for(block):
        # get_label decides the label of a single block given the solution set.
        return get_label(block, solution)

    student_df[tiebreaker] = student_df['census_block'].apply(label_for)

# Update Student Data for Simulations

In [41]:
def get_file_name(period):
    """Build the student CSV file name for ``period``.

    Args:
        period: period identifier, e.g. "1819".

    Returns:
        The file name "student_<period>.csv" (no directory part).
    """
    template = "student_%s.csv"
    return template % period

def check_consistency(student_out, student_df):
    """Check that saved focal labels match the freshly generated ones.

    The comparison is made per student (by index label), so the two frames
    may list the same students in a different row order.

    Args:
        student_out: previously saved student frame with a 'focal' column.
        student_df: newly generated student frame with a 'focal' column.

    Raises:
        Exception: if the frames do not cover the same students, or if any
            student's focal label differs between them.
    """
    saved_focal = student_out['focal']
    new_focal = student_df['focal']

    # Series comparison only works on identically-labelled operands, so
    # align the new labels to the saved order before comparing.
    if not saved_focal.index.equals(new_focal.index):
        same_students = saved_focal.index.sort_values().equals(new_focal.index.sort_values())
        if not same_students:
            raise Exception("Previously saved student data does not cover the same students "
                            "as the student data being generated")
        new_focal = new_focal.reindex(saved_focal.index)

    if (saved_focal != new_focal).any():
        raise Exception("Previous aved student data is not consisten with student data being generated")
        
def update_student_data(student_df, output_path, period, tiebreaker):
    """Return the student frame to save, merged with any existing file.

    If no file exists for ``period`` a copy of ``student_df`` is returned.
    Otherwise the saved file is loaded, checked against ``student_df`` with
    ``check_consistency``, and the ``tiebreaker`` column is copied into it.

    Args:
        student_df: newly generated student frame containing ``tiebreaker``.
        output_path: directory prefix; concatenated directly with the file
            name, so it must end with a path separator.
        period: period identifier used to build the file name.
        tiebreaker: name of the tiebreaker column to add.

    Returns:
        The frame to be written back to disk.

    Raises:
        Exception: if ``tiebreaker`` is already in the saved file and the
            notebook-level ``recalculate`` flag is not set.
    """
    global recalculate
    target = output_path + get_file_name(period)

    if not os.path.isfile(target):
        print("Creating student data:\n %s" % target)
        return student_df.copy()

    print("Loading student data from:\n %s" % target)
    student_out = pd.read_csv(target).set_index('studentno')
    check_consistency(student_out, student_df)

    already_saved = tiebreaker in student_out.columns
    if already_saved and not recalculate:
        raise Exception("Tiebreaker already exists!")

    print("Updateing %s in student data..." % tiebreaker)
    student_out[tiebreaker] = student_df[tiebreaker]
    # The override is single-use: clear it once a column has been updated.
    recalculate = False
    return student_out
        
def save_student_data(student_out, output_path, period):
    """Write the student frame to the CSV file for ``period``.

    Args:
        student_out: student frame to save; its index is written as a
            regular column (``update_student_data`` reads it back as
            'studentno').
        output_path: directory prefix; concatenated directly with the file
            name, so it must end with a path separator.
        period: period identifier used to build the file name.

    Returns:
        None.
    """
    fname = get_file_name(period)
    print("Saving to:\n  %s" % (output_path + fname))
    # index=False: after reset_index() the remaining RangeIndex carries no
    # information, and writing it would add another "Unnamed: 0" column on
    # every load/save round trip.
    student_out.reset_index().to_csv(output_path + fname, index=False)

## Compute new columns

In [34]:
from src.d04_modeling.knapsack_classifier import KnapsackClassifier

# Adds the 'AALPI' and 'focal' columns to student_df in place (random draws, see isFocal).
add_focal_students(student_df)

# Name of the tiebreaker column to create, and the value handed to the model as `params`
# (presumably a false-positive-rate budget of 8% -- confirm against KnapsackClassifier).
tiebreaker = 'knapsack008'
fpr = 0.08
model = KnapsackClassifier(positive_group='nFocal', load=True,
                           frl_key=frl_key, run_name=frl_key+".pkl")

# Adds the tiebreaker column to student_df in place; the last line displays its mean.
add_equity_tiebreaker(student_df, model, params=fpr, tiebreaker=tiebreaker)
student_df[tiebreaker].mean()

0.27302953256405244

In [43]:
# Merge the new tiebreaker column into the saved student file if one exists (otherwise start
# from a copy of student_df). Raises "Tiebreaker already exists!" when the column is already
# in the saved file and `recalculate` is False.
student_out = update_student_data(student_df, output_path, period, tiebreaker)

Loading student data from:
 /share/data/school_choice_equity/simulator_data/student/student_1819.csv


Exception: Tiebreaker already exists!

In [39]:
# Write student_out to the CSV for this period. student_out comes from the update cell, so
# that cell must have completed without raising.
save_student_data(student_out, output_path, period)

Saving to:
  /share/data/school_choice_equity/simulator_data/student/student_1819.csv
