In [None]:
import pandas as pd
import numpy as np

import sys
sys.path.append('../')

from src.d01_data.student_data_api import StudentDataApi, _census_block_column
from src.d01_data.block_data_api import BlockDataApi
from src.d01_data.abstract_data_api import AbstractDataApi

# Using the student data to observe block diversity features over time

In [None]:
# School-year periods to load, encoded as "<start yy><end yy>": 1415 ... 1920.
periods_list = [f"{start:02d}{start + 1:02d}" for start in range(14, 20)]

# Single API handle, reused by later cells for the student-level data as well.
student_data_api = StudentDataApi()
df_block = student_data_api.get_data_by_block(periods_list=periods_list)

The first thing we wanted to observe is how CTIP1 varied over time. CTIP1 is the "geo proxy" currently used by SFUSD to identify and prioritize students from underserved communities. Below we check the percentage of census blocks with nonzero CTIP1 for each year of available student data, and how the CTIP1 values are distributed across those blocks.

In [None]:
# Boolean selector for rows whose CTIP1 value is strictly positive.
# Kept as `mask` because later cells reuse it to filter `df_block`.
mask = df_block['ctip1'] > 0

# Build the 0/1 indicator directly from the mask. `.at` is a scalar-only
# accessor and rejects a boolean mask on current pandas, so the column is
# assigned in one vectorized step instead (rows where the comparison is
# False, including missing CTIP1, get 0).
df_block['positive_ctip1'] = mask.astype(int)

# The mean of a 0/1 indicator is the share of rows with positive CTIP1 per year.
df_block.groupby('year').agg({'positive_ctip1': ['mean']})

In [None]:
def q25(x):
    """Return the 0.25 quantile (first quartile) of ``x``.

    ``x`` must expose a ``quantile`` method (e.g. a pandas Series). The
    function name is kept short because it is passed to ``agg`` below.
    """
    first_quartile = x.quantile(q=0.25)
    return first_quartile
def q50(x):
    """Return the 0.5 quantile (median) of ``x``.

    ``x`` must expose a ``quantile`` method (e.g. a pandas Series).
    """
    median = x.quantile(q=0.5)
    return median
def q75(x):
    """Return the 0.75 quantile (third quartile) of ``x``.

    ``x`` must expose a ``quantile`` method (e.g. a pandas Series).
    """
    third_quartile = x.quantile(q=0.75)
    return third_quartile

# Per-year distribution summary of CTIP1, restricted to the rows selected by
# `mask` (positive CTIP1). The quartile helpers are passed as callables.
agg_funs = ['count', 'mean', 'std', 'min', q25, q50, q75, 'max']
positive_ctip1_rows = df_block.loc[mask]
positive_ctip1_rows.groupby('year').agg({'ctip1': agg_funs})

**TODO:** What can we observe from the yearly CTIP1 summary above?

We can also aggregate by the census blockgroup.

In [None]:
# Summarize CTIP1 per census block (across all loaded years), again using
# only the rows selected by `mask`.
agg_funs = ['count', 'mean', 'std', 'min', 'max']
block_agg = (
    df_block.loc[mask]
    .groupby(_census_block_column)
    .agg({'ctip1': agg_funs})
)

# Order by the number of observations behind each block's statistics.
block_agg.sort_values(by=('ctip1', 'count'))

**TODO:** What can we observe from the per-block CTIP1 summary above?

We can also sample some blockgroups and see some data aggregated by block. I should note that the diversity features are defined by block, so there is not really any aggregation.

In [None]:
# Distinct ids found in the `_census_block_column` level of the index.
blockgroup_index = df_block.index.get_level_values(_census_block_column).unique()

# Fixed seed so the same sample is drawn on every Restart & Run All.
np.random.seed(101)

# Draw without replacement: the default (replace=True) can pick the same id
# more than once, which yields duplicated rows below and fewer distinct
# blocks than intended. The size is capped so a small index does not raise.
sample_size = min(5, len(blockgroup_index))
blockgroup_ids = np.random.choice(blockgroup_index, size=sample_size, replace=False)

# Show every remaining index level for the sampled ids.
df_block.loc[(blockgroup_ids, slice(None)), ['count', 'ctip1']]

In [None]:
# Student-level records for the same periods as the block-level frame.
df_student = student_data_api.get_data(periods_list=periods_list)
print(df_student.columns)

# Index by (block, year, student) and sort so that the label-based slicing
# on the MultiIndex in the following cells works.
student_index_levels = ['census_block', 'year', 'studentno']
df_student = df_student.set_index(keys=student_index_levels).sort_index()

In [None]:
from src.d01_data.student_data_api import _block_features
# Mean CTIP1 by grade and year for one hard-coded census block id.
example_block_rows = df_student.loc['60750476001007']
example_block_rows[['grade', 'ctip1']].groupby(['grade', 'year']).mean()

In [None]:
# Follow one student across all blocks and years: show the grade together
# with the block-level feature columns listed in `_block_features`.
studentno = 98412005

student_rows = pd.IndexSlice[:, :, studentno]
student_columns = ['grade'] + _block_features
df_student.loc[student_rows, student_columns]