In [None]:
import pandas as pd
import numpy as np
import geopandas as gpd
import matplotlib.pyplot as plt

import sys
sys.path.append('../')

from src.d01_data.block_data_api import BlockDataApi
from src.d02_intermediate import block_data_cleaning as b_clean

# Understanding the Correlation Between Different Block Data

First we get the data and the classification:

In [None]:
# Load everything the notebook needs through the project's BlockDataApi.
block_data_api = BlockDataApi()
# NOTE(review): sfha=False presumably leaves SFHA data out of the frame, yet a
# later cell reads the "SFHA_ex_Sr" column -- confirm that column is still present.
block_df = block_data_api.get_data(sfha=False)
# Field metadata; later cells look up "Field Description" by "Field Name".
field_descriptions_df = block_data_api.get_fields()
# Classification indexed by group name (e.g. "DEMOGRAPHIC") in later cells.
group_dict = block_data_api.get_classification()

## 1. ACS vs. District Student Counts

The district provides us with counts of students per school year in each block, as well as a breakdown of their ethnicities. Some other sources (ACS, Census) also provide estimates of these measurements. Are those datasets in agreement?

In [None]:
# Show the entry stored under the "DEMOGRAPHIC" key of the classification
# (left as the cell's last expression so the notebook renders it).
group_dict["DEMOGRAPHIC"]

Let's focus on the following columns, which should be highly correlated (despite the time difference):

In [None]:
# Columns compared in this section: one ACS estimate and two district counts.
L = [
    'ACS 2013-17 est% aged 5-14 public sch',
    'number of K8 residents fall 2017',
    'K8stu1517',
]
block_df_filt = block_df[L]

# Print every selected field next to its description from the fields table.
for name in L:
    matches = field_descriptions_df["Field Name"] == name
    description = field_descriptions_df.loc[matches, "Field Description"].iloc[0]
    print(name + ": " + description, "\n")

These columns are not all correlated with one another: the two district columns correlate with each other, but not with the ACS estimate:

In [None]:
# Correlation matrix of the selected columns; rowvar=False tells NumPy that
# each column (not each row) is one variable.
M = block_df_filt.to_numpy()
print(np.corrcoef(M, rowvar=False))

We can see that the discrepancies are very big in some blocks (zero for district, 50% for ACS estimate):

In [None]:
# Render the filtered frame so the per-block values can be compared by eye.
block_df_filt

We can try to convert the ACS estimate into an absolute estimate. For that we can use the census count of the minor (<18 years old) population in 2010, which is the closest value we have. There are several issues with this (different year, different age group), but it should let us understand whether the scales are the problem:

In [None]:
# Extend the filtered frame with the 2010 under-18 count and with the ACS
# percentage scaled by that count.
minors_2010 = block_df["2010 pop less than 18 years old"]
acs_share = block_df_filt["ACS 2013-17 est% aged 5-14 public sch"]
block_df_filt2 = block_df_filt.assign(
    **{
        "2010 pop less than 18 years old": minors_2010,
        "ACS absolute estimate": minors_2010 * acs_share,
    }
)

In [None]:
# Render the extended frame, including the derived "ACS absolute estimate".
block_df_filt2

In [None]:
# Correlation matrix including the derived absolute ACS estimate (last column);
# rowvar=False makes NumPy treat each column as one variable.
M = block_df_filt2.to_numpy()
print(np.corrcoef(M, rowvar=False))

Now the district data (second and third columns) correlate highly (around 86%) with the absolute ACS data (last column). It seems the mismatch was in the units! So we can keep using the ACS data, but we may have to rely on these outdated absolute values.

## 2. Ethnicity

In [None]:
# Show the entry stored under the "ETHNICITY_DETAILED_GROUP" key of the
# classification (last expression of the cell, so the notebook renders it).
group_dict["ETHNICITY_DETAILED_GROUP"]

First we note that some columns are just empty:

In [None]:
# Print the distinct values of each column to check whether it holds any data.
# Series.unique() is used rather than set(Series.values): NaN never compares
# equal to itself, so a set built from a NaN-filled float column can keep one
# NaN per row instead of collapsing to a single entry.
for column in (
    "African American students 2006-2010 K-8",
    "AfAmer1114",
    "AfAmer1517",
    "NH White students 2006-2010 K-8",
):
    print(block_df[column].unique())

We might have more chances with the detailed ethnicity:

In [None]:
# Show the "ETHNICITY_DETAILED_GROUP" entry of the classification again, as a
# reference for the detailed-ethnicity columns used in the next cells.
group_dict["ETHNICITY_DETAILED_GROUP"]

In [None]:
# List the distinct raw values found in the "2013 K-5 Hispanic" column.
hispanic_k5_values = block_df["2013 K-5 Hispanic"].values
print(set(hispanic_k5_values))

The district provides us with data on student ethnicity. From the census, we have data on the total population. Let's evaluate how those numbers correlate. I use the following columns to try to match Hispanic ethnicity:

In [None]:
# Census and district columns used to compare Hispanic population counts.
L = [
    '2010 Hispanic',
    '2010 pop <18 Hispanic',
    "2013 K-5 Hispanic",
    "2013 6-8 Hispanic",
    "2013 9-12 Hispanic",
]
block_df_filt = block_df[L]

# Print every selected field next to its description from the fields table.
for name in L:
    matches = field_descriptions_df["Field Name"] == name
    description = field_descriptions_df.loc[matches, "Field Description"].iloc[0]
    print(name + ": " + description, "\n")

In [None]:
# Swap the "--" placeholder for 0 everywhere in the filtered frame.
# NOTE(review): this treats "--" as a zero count -- confirm that is the
# intended meaning of the placeholder rather than "missing".
placeholder_to_zero = {"--": 0}
block_df_filt = block_df_filt.replace(placeholder_to_zero)

In [None]:
# Correlation matrix of the Hispanic census and district columns;
# rowvar=False makes NumPy treat each column as one variable.
M = block_df_filt.to_numpy()
print(np.corrcoef(M, rowvar=False))

The correlation is high in general, but not the best. Let's see whether aggregating the student columns gives better results:

In [None]:
# Keep the two census columns and add one column holding the row-wise sum of
# the three 2013 student columns (NaN entries are skipped by the sum).
census_columns = ['2010 Hispanic', '2010 pop <18 Hispanic']
student_columns = ["2013 K-5 Hispanic", "2013 6-8 Hispanic", "2013 9-12 Hispanic"]

block_df_filt2 = block_df_filt[census_columns].copy()
temp_df = block_df_filt[student_columns].copy()
block_df_filt2["2013 Hispanic Students"] = temp_df.sum(axis=1, skipna=True)

In [None]:
# Render the frame holding the census columns and the aggregated
# "2013 Hispanic Students" column.
block_df_filt2

In [None]:
# Correlation matrix of the census columns and the aggregated student count;
# rowvar=False makes NumPy treat each column as one variable.
M = block_df_filt2.to_numpy()
print(np.corrcoef(M, rowvar=False))

We see improvements over the breakdown. This leads us to believe that more students of color (Hispanic in this case) may be leaving the public school system as they grow older. That could mean that they moved out of SF, dropped out of school, or went into the private system.

Overall, there is a strong positive correlation between the number of residents and the number of students, regardless of whether we use residents under 18 years old or the total number of residents. But we must be consistent about the scales.

## 3. Columns we plan to use

In [None]:
# Block-data fields this analysis plans to rely on.
columns_selected = [
    "2010 total population count",
    "AALPI all TK5 stu 2017",
    "ACS 2013-17 est median HH income",
    "ACS 2013-17 est% HH below poverty lvl",
    'ACS 2013-17 est% aged5+ Engl "not well"',
    "SFHA_ex_Sr",
    "num of SBAC L1 scores 4-9 2015-18",
]

As discussed above, we need to do some pre-processing. We can start by cleaning the data using the provided method:

In [None]:
# Run the project's cleaning routine on the raw block data; the result is the
# source of every column used in the cells that follow.
# NOTE(review): what clean_block_data changes is not visible here -- see
# src/d02_intermediate/block_data_cleaning.
NEW = b_clean.clean_block_data(block_df)

Now we do the preprocessing with percentages:

In [None]:
# Assemble the analysis frame: the AALPI count plus renamed/derived columns.
# The two ACS percentage columns are scaled by the 2010 total population.
# NOTE(review): the poverty share is labelled per household (HH) but is scaled
# by total population -- confirm this is the intended approximation.
population_2010 = NEW["2010 total population count"]
NEW2 = NEW[["AALPI all TK5 stu 2017"]].assign(
    **{
        "Median HH income": NEW["ACS 2013-17 est median HH income"],
        "Pop. below poverty level": NEW["ACS 2013-17 est% HH below poverty lvl"] * population_2010,
        "Pop. english not well": NEW['ACS 2013-17 est% aged5+ Engl "not well"'] * population_2010,
        "SFHA": NEW["SFHA_ex_Sr"],
        "SBAC L1": NEW["num of SBAC L1 scores 4-9 2015-18"],
    }
)

In [None]:
# Render the assembled analysis frame.
NEW2

In [None]:
# Correlation matrix across all columns of the analysis frame;
# rowvar=False makes NumPy treat each column as one variable.
M = NEW2.to_numpy()
print(np.corrcoef(M, rowvar=False))