In [None]:
import pandas as pd
import numpy as np
import geopandas as gpd
import matplotlib.pyplot as plt

import sys
# Add the parent directory to the module search path so the `src.*` imports
# in the next cell can resolve. The path is relative, so this assumes the
# kernel's working directory sits one level below the project root.
sys.path.append('../')

In [None]:
from src.d01_data.block_data_api import BlockDataApi
from src.d02_intermediate import block_data_cleaning as b_clean
from src.d02_intermediate.classifier_data_api import ClassifierDataApi

# Exploratory Data Analysis of Block Data

Loading the data:

In [None]:
# Instantiate the two project data-access helpers used by the rest of the notebook.
block_data_api = BlockDataApi()
classifier_data_api = ClassifierDataApi()

In [None]:
# Load every table inspected below through BlockDataApi.
# SFHA variant of the block data (compared against the main table in the next section).
SFHA_block_df = block_data_api.get_data(sfha=True)
# Main block data.
block_df = block_data_api.get_data(sfha=False)
# Field descriptions for the main block data ("Field Name", "Field Description", ... columns are read later).
field_descriptions_df = block_data_api.get_fields()
# FRL data and its field descriptions.
# NOTE(review): neither frl_df_raw nor frl_fields is used in the cells visible here — confirm they are still needed.
frl_df_raw = block_data_api.get_data(frl=True)
frl_fields = block_data_api.get_fields(frl=True)

In [None]:
# Block-level frame from ClassifierDataApi; the focal-student columns analysed
# in the "Focal Students Data" section are selected from it.
# NOTE(review): presumably pct_frl=True adds the FRL percentage columns — confirm against the API.
full_data = classifier_data_api.get_block_data(pct_frl=True)

## SFHA Block Data

We want to know whether the SFHA dataset is redundant or whether it provides relevant information. We can see that the information it contains is entirely contained in the main Block dataset, so we can ignore it:

In [None]:
# Column names of the SFHA table, as a plain Python list.
SFHA_fields = SFHA_block_df.columns.tolist()

In [None]:
# Display the SFHA column names.
SFHA_fields

In [None]:
# Restrict both tables to the same identifier/housing columns, then pair the
# rows that share an ID. The overlapping non-key columns come out of the merge
# with pandas' default "_x" (SFHA table) and "_y" (main table) suffixes.
shared_columns = ['ID', 'Block', 'SFHA_Hsng', 'SA_Hsng']
SFHA_block_df_new = SFHA_block_df.filter(items=shared_columns, axis=1)
block_df_new = block_df.filter(items=shared_columns, axis=1)

merge_df = SFHA_block_df_new.merge(block_df_new, on=['ID'], how='inner')
merge_df.head()

Any difference would appear here (can be tried with any other pair of columns):

In [None]:
# Compare the SFHA copy ("_x") and the main-table copy ("_y") of one column,
# row by row, and report the positional index of every row where they differ.
sfha_values = merge_df["SA_Hsng_x"]
block_values = merge_df["SA_Hsng_y"]

# NaN == NaN evaluates to False, so a plain equality test would flag rows that
# are missing in *both* tables as differences; treat those rows as matching.
both_missing = sfha_values.isna() & block_values.isna()
is_same = (sfha_values == block_values) | both_missing

mismatch_positions = [position for position, same in enumerate(is_same) if not same]
for position in mismatch_positions:
    print(position)

# Always print the total so an empty result is explicit rather than silent.
count = len(mismatch_positions)
print("Number of mismatching rows: {}".format(count))

We are good! We do not need to use the SFHA dataset!

## Block Data

In [None]:
# Lift the row-display limit so the whole field-description table is shown.
# The fully qualified key is required: the short form "max_rows" is matched as
# a pattern and is ambiguous in recent pandas releases (it also matches
# "styler.render.max_rows"), which makes set_option raise OptionError.
pd.set_option("display.max_rows", None)
# Show the name, description and source of every field.
description_columns = ["Field Name", "Field Description", "Data Source"]
field_descriptions_df[description_columns]

Grouping the columns in useful "thematic" groups:

1. IDENTIFICATION: Reflects geographic characteristics and id numbers (Census Tracts, area, FIPS, block type)
2. CURRENT: Columns referring to current CTIP assignment and simulations of variations in the existing model by the district
3. POPULATION: Demographic information such as population by age and enrollment in schools, as well as parents' educational level and language status
4. ETHNICITY: Information on ethnicity of residents and students
5. ETHNICITY_DETAILED: Breakdown of ethnicity by grade, detailed ethnic group, and year (district)
6. ETHNICITY_DETAILED_GROUP: Above data but grouped for subsequent grades
7. INCOME: Data on the income and wealth of blocks and families
8. TEST SCORES: Academic data on CST and SBAC
9. HOUSING: San Francisco and Federal Housing Authority information

To retrieve the dictionary we can use the appropriate method in the block api class

In [None]:
# Fetch the classification of columns from the block API.
# NOTE(review): presumably a dict mapping the thematic group names listed above to column names — confirm.
group_dict = block_data_api.get_classification()

In [None]:
# Display the classification returned by the block API.
group_dict

BIG REMARK: Some columns are empty (or non-informative)!

In [None]:
# Report columns that carry no information, i.e. whose values are all identical
# (including all missing). The original loop iterated over `field_list`, a name
# that is not defined in any visible cell, so it failed on a fresh kernel;
# scanning the columns of block_df directly needs no hidden state.
# NOTE(review): if `field_list` was meant to be a narrower selection, restrict the loop accordingly.
for col in block_df.columns:
    # Series.unique() collapses repeated NaNs into one entry, whereas building a
    # set from a float array keeps every NaN as a distinct element and would
    # therefore miss columns that are entirely missing.
    distinct_values = block_df[col].unique()
    if len(distinct_values) <= 1:
        print(col + ": ", set(distinct_values))

Search for a specific description:

In [None]:
# Look up the description of one field by its exact name; the first matching
# row is returned (IndexError if the name does not occur).
name = "NH White students 2006-2010 K-8"
matches_name = field_descriptions_df["Field Name"] == name
field_descriptions_df.loc[matches_name, "Field Description"].iloc[0]

Verifying if the field names and block data all match:

In [None]:
# Column names actually present in the block data.
block_fields = block_df.columns.tolist()

In [None]:
# Field names listed in the field-description table.
field_fields = [field_name for field_name in field_descriptions_df["Field Name"].values]

In [None]:
def Diff(li1, li2):
    """Return the items of ``li1`` that do not occur in ``li2``.

    Duplicates in ``li1`` are reported once. The result keeps the order in
    which items first appear in ``li1``, so the output is the same on every
    run (iterating over a set difference gives no such guarantee).

    Parameters
    ----------
    li1, li2 : iterable of hashable items

    Returns
    -------
    list
        Items present in ``li1`` but absent from ``li2``.
    """
    excluded = set(li2)
    # dict.fromkeys de-duplicates while preserving insertion order.
    return [item for item in dict.fromkeys(li1) if item not in excluded]

# Columns present in the block data that have no entry in the field descriptions.
Diff(block_fields, field_fields)

In [None]:
# Described fields that do not appear as a column in the block data.
Diff(field_fields, block_fields)

The FRL column is a mystery, and so is the DATA column in the block dataset. All other mismatches have been fixed in the initialization of the class.

## Focal Students Data

In [None]:
# Columns used for the focal-student analysis.
# NOTE(review): presumably the "n*" columns are student counts and the "pct*" columns the matching shares — confirm.
focal_columns = ["n", "nFocal", "nAALPI", "nFRL", "nBoth", "pctFocal", "pctAALPI", "pctFRL", "pctBoth"]

# Restrict the block-level frame to those columns.
focal_data = full_data[focal_columns]
# NOTE(review): focal_data_map is not used in the cells visible below — confirm it is still needed.
focal_data_map = classifier_data_api.get_map_df_data(cols=focal_columns)

This focal student dataset is extremely big. The total number of focal students amounts to over 60% of the student body. The intersection students are a better dataset to consider, but it is still too large:

In [None]:
# Column totals over all blocks, then the two shares relative to the total in "n".
s_df = focal_data.sum()

total_students = s_df["n"]
focal_share = s_df["nFocal"] / total_students
intersection_share = s_df["nBoth"] / total_students

print("Percentage of focal students in SF: {:.2%}".format(focal_share))
print("Percentage of intersection focal students in SF: {:.2%}".format(intersection_share))

Most blocks are very heterogeneous. The median focal student percentage in a block is 50%, whereas the median intersectional focal students per block is 3%. This means the classification will have many false positives.

In [None]:
# Per-column median across blocks.
focal_data.median()

We can view the distribution of percentages per block as histograms to understand the trade-offs:

In [None]:
# Histograms of the two percentage columns, stacked in a 2x1 layout with a
# shared y-axis so the panels can be compared directly.
focal_hist_options = dict(
    column=["pctBoth", "pctFocal"],
    grid=False,
    bins=20,
    layout=(2, 1),
    figsize=(20, 30),
    ylabelsize=20,
    xlabelsize=20,
    sharey=True,
)
ax = focal_data.hist(**focal_hist_options)

We can try to remove the zero and one blocks:

In [None]:
# Keep only the blocks that have at least one student counted in "nBoth", then
# read off the median of "pctBoth" over those blocks.
# NOTE(review): this filter drops the zero blocks only; blocks where pctBoth is 1
# are kept — confirm whether they should be removed as well.
has_intersection_students = focal_data["nBoth"] > 0
heterogeneous_focal_data = focal_data.loc[has_intersection_students]
heterogeneous_focal_data.median().loc["pctBoth"]

In [None]:
# Histogram of "pctBoth" restricted to the filtered blocks.
heterogeneous_hist_options = dict(
    column=["pctBoth"],
    grid=False,
    bins=20,
    figsize=(20, 15),
    ylabelsize=20,
    xlabelsize=20,
    sharey=True,
)
ax = heterogeneous_focal_data.hist(**heterogeneous_hist_options)

In [None]:
# TODO: unfinished scratch cell. It previously held only the dangling expression
# `cda.`, which is a SyntaxError (and `cda` is not defined in any visible cell).
# Complete the intended call or delete this cell.