In [None]:
import pandas as pd
import numpy as np

import sys
sys.path.append('../')

from src.d01_data.student_data_api import StudentDataApi, _block_features, _census_block_column, \
_diversity_index_features, _studentno

In [None]:
# Period codes passed to the student data API (presumably one per school year,
# e.g. "1415" for 2014-15 -- TODO confirm against StudentDataApi).
periods_list = ["1415", "1516", "1617", "1718", "1819", "1920"]
student_data_api = StudentDataApi()

df_students = student_data_api.get_data(periods_list)

# Rows whose census block equals the literal string 'NaN' are discarded
# (assumes the API encodes a missing block as text -- TODO confirm).
mask = df_students[_census_block_column] == 'NaN'

# Filter with the boolean mask instead of an in-place, label-based ``drop``:
# exactly the masked rows are removed even if index labels repeat, and the
# frame returned by the API is not mutated. ``.copy()`` yields an independent
# frame so that later column assignments are unambiguous.
df_students = df_students.loc[~mask].copy()

In [None]:
# Name of the derived column holding the per-row diversity index.
diversity_index_col = 'Diversity Index'

# The diversity index of a row is the mean across its diversity feature columns.
diversity_features = df_students[_diversity_index_features]
df_students[diversity_index_col] = diversity_features.mean(axis='columns')

def get_value(x):
    """Return the first element of ``x`` by position.

    ``x`` is expected to support ``.iloc`` (e.g. a pandas Series); an empty
    input raises ``IndexError``.
    """
    first_value = x.iloc[0]
    return first_value

# Take the first diversity index seen in each census block, then normalise
# the per-block values so that they sum to 1.
first_index_per_block = (
    df_students
    .groupby(_census_block_column)[diversity_index_col]
    .agg(get_value)
)
block_prob = first_index_per_block / first_index_per_block.sum()

In [None]:
# Keep only the columns used downstream. ``.copy()`` makes the selection an
# independent frame, so the column assignments made on ``df_students`` later in
# the notebook do not trigger pandas' SettingWithCopyWarning.
keep_columns = [_studentno, diversity_index_col, 'year', _census_block_column] + _block_features
df_students = df_students[keep_columns].copy()
df_students.head()

Let's assume that there are two types of block $b$: (i) blocks with a high probability of containing focal students and (ii) blocks with a low probability of containing focal students. In particular, let's assume that if the `diversity_index` of a block is above the `focal_block_threshold`, then a student coming from that block has probability `p` of being a focal student. Otherwise, the block has a low probability of containing focal students, and a student coming from that block has probability `1-p` of being a focal student. Note that the code below does not apply this threshold rule directly: it uses the smooth mapping `diversity_index ** gamma` as the probability that a student is a focal student.

In [None]:
# Independent working copy holding only the columns needed for the per-year summary.
year_columns = ['year', _census_block_column, diversity_index_col]
df_year = df_students[year_columns].copy()

# Exponent applied to the diversity index when mapping it to a probability.
gamma = 2.5


def prob_focal_given_block(diversity_index):
    """Map a diversity index to the probability of being a focal student.

    The probability is ``diversity_index`` raised to the module-level
    ``gamma``. No clipping is applied, so the result is a valid probability
    only when the input lies in [0, 1].
    """
    focal_probability = np.power(diversity_index, gamma)
    return focal_probability
    
# Per-row probability of being a focal student, computed from the diversity
# index stored in ``df_year`` itself.
df_year['prob'] = df_year[diversity_index_col].apply(prob_focal_given_block)

# Aggregate per year: the sum of the probabilities is the expected number of
# focal students, and the count is the number of rows with a non-missing
# probability.
df_year = (
    df_year
    .groupby(['year'])['prob']
    .agg(['sum', 'count'])
    .rename(columns={'sum': 'nFocalStudents', 'count': 'nTotalStudents'})
)

# Expected share of focal students per year (vectorised column division
# instead of a row-by-row ``apply``).
df_year['pctFocalStudents'] = df_year['nFocalStudents'] / df_year['nTotalStudents']
df_year

In [None]:
# Per-student probability of being a focal student.
df_students['prob'] = df_students[diversity_index_col].apply(prob_focal_given_block)

# Fixed seed so the simulated focal flags are reproducible across runs.
np.random.seed(20210704)

# One Bernoulli draw per student, using that student's probability. The array
# is assigned straight to the column (positional), which does not depend on
# the index labels being unique the way a label-based ``.loc`` assignment does.
df_students['focal'] = np.random.binomial(1, p=df_students['prob'])

# Realised share of focal students per year.
df_students.groupby('year')['focal'].mean()

In [None]:
import matplotlib.pyplot as plt
from matplotlib.ticker import PercentFormatter

fig, ax = plt.subplots()

num_bins = 10
# Mean of the 'focal' flag per (year, census block), restricted to the rows
# whose 'year' value is 2020.
hist_data = df_students.groupby(['year', _census_block_column])['focal'].mean().loc[2020]
# Equal weight per block, normalised by the number of non-missing values so
# that the bar heights add up to 1.
weights = np.ones(hist_data.shape) / hist_data.count()
n, bins, patches = ax.hist(hist_data, num_bins, weights=weights)

# The bar heights are fractions in [0, 1]; render the ticks as percentages so
# that they agree with the y-axis label.
ax.yaxis.set_major_formatter(PercentFormatter(xmax=1))

ax.set_xlabel('Fraction of focal students in blockgroup')
ax.set_ylabel('Percent of blockgroups')

fig.tight_layout()
plt.show()

# Plotting the distribution on a map

In [None]:
import geopandas

# Location and base name of the census 2010 shapefile export.
geodata_path = '/share/data/school_choice/dssg/census2010/'
file_name = 'geo_export_e77bce0b-6556-4358-b36b-36cfcf826a3c'

# File extensions kept for reference; only the '.shp' file is passed to
# ``read_file`` below.
data_types = ['.shp', '.dbf', '.prj', '.shx']

shapefile_path = ''.join([geodata_path, file_name, '.shp'])
sfusd = geopandas.read_file(shapefile_path)

In [None]:
# Display the column names of the frame loaded from the shapefile.
sfusd.columns

In [None]:
# get rid of water
# The original compared 'intptlon10' against the string '-122.8', which is a
# lexicographic test that only selects the intended rows because the
# longitudes are negative, fixed-width text. Convert to numbers and state the
# condition directly: keep shapes whose 'intptlon10' longitude lies east of
# -122.8.
mask = pd.to_numeric(sfusd['intptlon10']) > -122.8

In [None]:
# Draw only the shapes selected by ``mask``.
selected_shapes = sfusd.loc[mask]
selected_shapes.plot()
plt.show()