In [1]:
import pandas as pd
import numpy as np

import sys
sys.path.append('../')

# Background

The Stanford research team constructed a new diversity index, as defined by district staff. The diversity index is defined for each census block group and is the average of four scores: an FRL (free or reduced-price lunch) score, a neighborhood SES (socioeconomic status) score, an academic score, and an AALPI (African American, Latinx and Pacific Islander) score.

In [15]:
from src.d01_data.student_data_api import (
    StudentDataApi,
    _block_features,
    _census_block_column,
)

# Load the student table and key it by student number so that a single
# student can be looked up directly with .loc.
student_data_api = StudentDataApi()
df_students = (
    student_data_api
    .get_data()
    .set_index('studentno')
)

# Draw one example student. The fixed seed makes the draw repeatable for the
# same student table.
np.random.seed(1992)
studentno = np.random.choice(df_students.index)

# Census block stored on the example student's record; later cells use it to
# look the block up in the block-level table.
census_block = df_students.loc[studentno, _census_block_column]
print(census_block)

# Cell output: the block-level features attached to the example student.
df_students.loc[studentno, _block_features]

60750352022005


freelunch_prob       0.404494
reducedlunch_prob    0.067416
ctip1                     0.0
HOCidx1              0.293871
HOCidx2               0.21341
HOCidx3              0.331031
AALPI Score          0.078276
Academic Score        0.19382
Nhood SES Score      0.538254
FRL Score            0.365133
Name: 888004210, dtype: object

In [3]:
from src.d01_data.block_data_api import BlockDataApi, _acs_columns

# Block-level table, keyed by the 'Block' column so that individual census
# blocks can be selected with .loc.
# NOTE(review): the meaning of sfha=False is defined by BlockDataApi.get_data
# and is not visible here -- confirm against that API.
block_data_api = BlockDataApi()
df1 = (
    block_data_api
    .get_data(sfha=False)
    .set_index('Block')
)

In [5]:
# Every block-level field recorded for the example student's census block.
# The block id is cast with int() before the lookup (presumably because the
# 'Block' index holds integers -- confirm).
block_acs_metrics = df1.loc[int(census_block)].copy()

# to_string() prints every field rather than pandas' truncated repr.
print(block_acs_metrics.reset_index().to_string())

                                        index    60750352022005
0                                          ID            201546
1                                        AREA          0.007547
2                                        DATA            207295
3                                  Block Type  land - populated
4                                  BlockGroup       60750352022
5                                       Tract        6075035202
6                                      County              6075
7                                       Place            667000
8                                  UnifSchool            634410
9                                        ZIP5               NaN
10                   SF Analysis Neighborhood   Sunset/Parkside
11                               Current ESAA               Key
12                                 Scenario 1               Key
13                                 Scenario 2               Key
14                       CTIP_2013 assig

In [22]:
# Positions 16-24 of the block record are the 2010 census population fields
# (the total count followed by the per-group counts).
population_fields = block_acs_metrics.index[16:25]
print(population_fields.to_list())

['2010 total population count', '2010 Hispanic', '2010 non-Hisp African American', '2010 non-Hisp Amer Indian Alaska Native', '2010 non-Hisp Asian', '2010 non-Hisp Pacific Islander', '2010 non-Hisp White', '2010 non-Hisp Other', '2010 non-Hispanic Mixed Race']


## FRL score

The FRL score measures the percentage of students in block $b\in B$ eligible for free or reduced
price lunch ($FRL\%(b)$) as given by SFUSD Student Nutrition Services, normalized by the maximum
percentage over all blocks:

$$FRLScore(b) = \frac{FRL\%(b)}{\underset{b'\in B}{\max} FRL\%(b')}$$


In [6]:
# TODO: Where can we find the raw FRL data needed to reproduce this score?

# Candidate source: "FRLxEthncity SY16-SY19" - FRL data by block, averaged over 4 years and
# broken down by ethnicity (appears to use the racex categorization, not resolved_ethnicity).

## SES score

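The neighborhood socioeconomic status score ($SESScore(b)$) uses data from the American Community
Survey 5-year estimates 2013-17, including median household income in the block ($HHInc(b)$),
poverty level ($Pov\%(b)$), and adult educational attainment ($BachDeg\%(b)$), as measured by the
percentage of residents 25 years of age or older in the block who have a bachelor's degree.

We first define the SES metric, which combines the three factors after scaling each by its maximum over all blocks (income and educational attainment are inverted, so a higher metric indicates lower socioeconomic status), and then normalize it by its maximum over all blocks to obtain the SES score: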

$$SESMetric(b) = 1 - \frac{HHInc(b)}{\underset{b'\in B}{\max} HHInc(b')} + \frac{Pov\%(b)}{\underset{b'\in B}{\max} Pov\%(b')} + 1 - \frac{BachDeg\%(b)}{\underset{b'\in B}{\max} BachDeg\%(b')}$$

$$SESScore(b) = \frac{SESMetric(b)}{\underset{b'\in B}{\max} SESMetric(b')}$$


In [12]:
# Block-table columns that feed the neighborhood SES score, in the order
# income, poverty, educational attainment.
ses_cols = [
    'ACS 2013-17 est median HH income',
    'ACS 2013-17 est% HH below poverty lvl',
    'ACS 2013-17 % aged 25+ with Bachelors',
]
hhinc_col, pov_col, bachdeg_col = ses_cols

# Cell output: the field documentation for these columns.
block_data_api.get_fields_for_columns(columns=ses_cols)

Unnamed: 0_level_0,Field Description,Data Source,Commenta/Caveats,Unnamed: 4,Unnamed: 5,Unnamed: 6,Unnamed: 7
Field Name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
ACS 2013-17 est median HH income,ACS 2013-17 est median household income $ last...,American Community Survey 5-year estimates 201...,"These are Census tract-based estimates, and of...",,,,
ACS 2013-17 est% HH below poverty lvl,ACS 2013-17 est % of households below poverty ...,American Community Survey 5-year estimates 201...,"These are Census tract-based estimates, and of...",,,,
ACS 2013-17 % aged 25+ with Bachelors,ACS 2013-17 est % share of pop aged 25+ with b...,American Community Survey 5-year estimates 201...,"These are Census tract-based estimates, and of...",,,,


In [None]:
# Maximum of each SES input over all blocks; used to put the three factors on
# a comparable scale.
ses_factors_max = df1[ses_cols].max()

block_ses = (
    # Divide every column by its own maximum (row vector broadcast over blocks).
    (df1[ses_cols] / ses_factors_max.values.reshape(1, -1))
    .rename(columns=dict(zip(ses_cols, ['hhinc', 'pov', 'bachdeg'])))
    # SESMetric(b): income and bachelor's share are inverted, poverty is not.
    .assign(metric=lambda scaled: 1 - scaled['hhinc'] + scaled['pov'] + 1 - scaled['bachdeg'])
    # SESScore(b): the metric normalized by its maximum over all blocks.
    .assign(score=lambda scaled: scaled['metric'] / scaled['metric'].max())
)

# Compare the recomputed score with the value stored on the example
# student's record.
expected = df_students.loc[studentno, 'Nhood SES Score']
result = block_ses.loc[int(census_block), 'score']
assert abs(expected - result) < 1e-6, "%.6f <> %.6f" % (expected, result)

## Academic Score

The block group academic score ($AcademicScore(b)$) measures the percentage of students with
level 1 test scores ($L1\%(b)$), normalized by the maximum percentage over all blocks:

$$AcademicScore(b) = \frac{L1\%(b)}{\underset{b'\in B}{\max} L1\%(b')}$$


In [10]:
# SBAC score counts per level (L1-L4) followed by the total number of test
# takers, as named in the block table.
academic_cols = [
    'num of SBAC L1 scores 4-9 2015-18',
    'num of SBAC L2 scores 4-9 2015-18',
    'num of SBAC L3 scores 4-9 2015-18',
    'num of SBAC L4 scores 4-9 2015-18',
    'ttl num 4-9 test takers 2015-18',
]

# Numerator (level 1 count) and denominator (all test takers) of L1%(b).
l1_col, total_col = academic_cols[0], academic_cols[-1]

# Cell output: the field documentation for these columns.
block_data_api.get_fields_for_columns(columns=academic_cols)

Unnamed: 0_level_0,Field Description,Data Source,Commenta/Caveats,Unnamed: 4,Unnamed: 5,Unnamed: 6,Unnamed: 7
Field Name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
num of SBAC L1 scores 4-9 2015-18,Number of SBAC Level 1 scores grades 4-9 2015-...,LGDR computed using student address database,,,,,
num of SBAC L2 scores 4-9 2015-18,Number of SBAC Level 2 scores grades 4-9 2015-...,LGDR computed using student address database,,,,,
num of SBAC L3 scores 4-9 2015-18,Number of SBAC Level 3 scores grades 4-9 2015-...,LGDR computed using student address database,,,,,
num of SBAC L4 scores 4-9 2015-18,Number of SBAC Level 4 scores grades 4-9 2015-...,LGDR computed using student address database,,,,,
ttl num 4-9 test takers 2015-18,Total number of 4th-9th grade SBAC test takers...,LGDR computed using student address database,,,,,


In [11]:
# Turn every count into a share of the block's test takers (column vector
# broadcast across the count columns). Blocks whose total is zero divide by
# zero here.
block_academics = df1[academic_cols] / df1[total_col].values.reshape(-1, 1)

# AcademicScore(b): level 1 share normalized by its maximum over all blocks.
l1_score_max = block_academics[l1_col].max()
block_academics['score'] = block_academics[l1_col].div(l1_score_max)

# Compare the recomputed score with the value stored on the example
# student's record.
# NOTE(review): in the saved run this assertion fails
# (0.193820 <> 0.131579). The accompanying text describes a block *group*
# score while this cell works per block -- possibly the cause; confirm.
expected = df_students.loc[studentno, 'Academic Score']
result = block_academics.loc[int(census_block), 'score']
assert abs(expected - result) < 1e-6, "%.6f <> %.6f" % (expected, result)

AssertionError: 0.193820 <> 0.131579

## AALPI Score

The AALPI score measures the percentage of students who belong to the historically underserved
ethnic groups (African American, Latinx, and Pacific Islander), $AALPI\%(b)$, normalized by the
maximum percentage over all blocks:

$$AALPIScore(b) = \frac{AALPI\%(b)}{\underset{b'\in B}{\max} AALPI\%(b')}$$


In [24]:
# All 2010 census population-count columns by group. Despite the name, the
# list holds every group, not only the African American, Hispanic and
# Pacific Islander columns.
aalpi_cols = [
    '2010 Hispanic',
    '2010 non-Hisp African American',
    '2010 non-Hisp Amer Indian Alaska Native',
    '2010 non-Hisp Asian',
    '2010 non-Hisp Pacific Islander',
    '2010 non-Hisp White',
    '2010 non-Hisp Other',
    '2010 non-Hispanic Mixed Race',
]

# NOTE: this rebinds total_col, which the academic-score cells set to the
# test-taker total column.
total_col = '2010 total population count'

# Cell output: the field documentation for these columns.
block_data_api.get_fields_for_columns(columns=aalpi_cols)

Unnamed: 0_level_0,Field Description,Data Source,Commenta/Caveats,Unnamed: 4,Unnamed: 5,Unnamed: 6,Unnamed: 7
Field Name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
2010 Hispanic,Census 2010 pop count; Hispanic,from Census 2010 PL 94-171 database,these are block-level counts (not estimates),,,,
2010 non-Hisp African American,Census 2010 pop count; non-Hispanic African A...,from Census 2010 PL 94-171 database,these are block-level counts (not estimates),,,,
2010 non-Hisp Amer Indian Alaska Native,Census 2010 pop count; non-Hispanic Amer Indi...,from Census 2010 PL 94-171 database,these are block-level counts (not estimates),,,,
2010 non-Hisp Asian,Census 2010 pop count; non-Hispanic Asian,from Census 2010 PL 94-171 database,these are block-level counts (not estimates),,,,
2010 non-Hisp Pacific Islander,Census 2010 pop count; non-Hispanic Pacific I...,from Census 2010 PL 94-171 database,these are block-level counts (not estimates),,,,
2010 non-Hisp White,Census 2010 pop count; non-Hispanic White,from Census 2010 PL 94-171 database,these are block-level counts (not estimates),,,,
2010 non-Hisp Other,Census 2010 pop count; non-Hispanic Other,from Census 2010 PL 94-171 database,these are block-level counts (not estimates),,,,
2010 non-Hispanic Mixed Race,Census 2010 pop count; non-Hispanic Mixed Race,from Census 2010 PL 94-171 database,these are block-level counts (not estimates),,,,


In [25]:
# Per-group 2010 population counts plus the total count, for every block.
block_aalpi = df1.loc[:, aalpi_cols + [total_col]].copy()

# Disabled sanity check: the per-group counts summed across each row, for
# comparison with the total population column. The saved output below still
# shows this column from an earlier run.
# block_aalpi['total_check'] = block_aalpi[aalpi_cols].sum(axis=1)

# Cell output: the resulting table.
block_aalpi

Unnamed: 0_level_0,2010 Hispanic,2010 non-Hisp African American,2010 non-Hisp Amer Indian Alaska Native,2010 non-Hisp Asian,2010 non-Hisp Pacific Islander,2010 non-Hisp White,2010 non-Hisp Other,2010 non-Hispanic Mixed Race,2010 total population count,total_check
Block,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
60750604001012,0,0,0,0,0,0,0,0,0,0
60750604001011,152,67,4,343,3,533,25,19,1146,1146
60750179021024,0,0,0,0,0,0,0,0,0,0
60750179021028,0,0,0,0,0,0,0,0,0,0
60750179021013,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...
60750604001007,0,0,0,0,0,0,0,0,0,0
60750604001015,0,0,0,0,0,0,0,0,0,0
60750604001018,0,0,0,0,0,0,0,0,0,0
60750604001032,0,0,0,0,0,0,0,0,0,0


## SES Index

The SES index uses only the socioeconomic and free and reduced price lunch
components of the diversity index, and is defined as follows:

$$SESIndex(b) = \frac{FRLScore(b) + SESMetric(b)}{4}$$
