In [None]:
import pandas as pd
import numpy as np
import geopandas as gpd
import matplotlib.pyplot as plt

import sys
sys.path.append('../')

In [None]:
from src.d01_data.block_data_api import BlockDataApi
from src.d02_intermediate import block_data_cleaning as b_clean

# Exploratory Data Analysis of Block Data

Loading the data:

In [None]:
# Instantiate the project's block-data accessor; every loading cell below goes through it.
block_data_api = BlockDataApi()

In [None]:
# User key passed to the FRL loaders; defined once so both calls stay in sync.
# NOTE(review): presumably selects a user-specific data location inside
# BlockDataApi -- confirm against the class.
FRL_USER = "gabriel"

# Block-level tables: the SFHA variant and the main block dataset.
SFHA_block_df = block_data_api.get_data(sfha=True)
block_df = block_data_api.get_data(sfha=False)
# Table of field names / descriptions for the block dataset.
field_descriptions_df = block_data_api.get_fields()
# FRL data and its field table.
frl_df_raw = block_data_api.get_data(frl=True, user=FRL_USER)
frl_fields = block_data_api.get_fields(frl=True, user=FRL_USER)

## SFHA Block Data

We want to know whether the SFHA dataset is redundant or whether it provides additional information. We can see that the information it contains is entirely contained in the main Block dataset, so we can ignore it:

In [None]:
# Column names of the SFHA dataset as a plain Python list.
SFHA_fields = SFHA_block_df.columns.tolist()

In [None]:
# Display the SFHA column names.
SFHA_fields

In [None]:
# Columns compared between the two datasets. `filter` keeps only the listed
# columns that actually exist in each frame.
compare_cols = ['ID', 'Block', 'SFHA_Hsng', 'SA_Hsng']
block_df_new = block_df.filter(items=compare_cols, axis=1)
SFHA_block_df_new = SFHA_block_df.filter(items=compare_cols, axis=1)

# Inner join on ID. Columns present in both frames get pandas' default
# suffixes: `_x` for the SFHA frame (left) and `_y` for the block frame (right).
merge_df = SFHA_block_df_new.merge(block_df_new, how='inner', on='ID')
merge_df.head()

Any difference would appear here (can be tried with any other pair of columns):

In [None]:
# Compare the SFHA copy (`_x`) of the column with the block-data copy (`_y`).
sa_sfha = merge_df["SA_Hsng_x"]
sa_block = merge_df["SA_Hsng_y"]

# Two missing values count as "equal": a plain `==` reports NaN vs NaN as a
# difference, which would flag rows that are empty in both datasets.
both_missing = sa_sfha.isna() & sa_block.isna()
mismatch_mask = ~(sa_sfha.eq(sa_block) | both_missing)

# Report the number of differing rows explicitly, so an empty result is
# visibly "0 mismatches" rather than a silent cell.
count = int(mismatch_mask.sum())
print(f"{count} mismatching rows out of {len(merge_df)}")

# Show the differing rows themselves (empty frame when the datasets agree).
merge_df[mismatch_mask]

We are good! We do not need to use the SFHA dataset!

## Block Data

In [None]:
# Show every row of the field-description table.
# The fully qualified option key is required: the bare "max_rows" pattern also
# matches "styler.render.max_rows" in recent pandas versions, which makes
# `set_option` raise OptionError ("Pattern matched multiple keys").
pd.set_option("display.max_rows", None)
field_descriptions_df[["Field Name", "Field Description", "Data Source"]]

Grouping the columns in useful "thematic" groups:

1. IDENTIFICATION: Reflects geographic characteristics and id numbers (Census Tracts, area, FIPS, block type)
2. CURRENT: Columns referring to current CTIP assignment and simulations of variations in the existing model by the district
3. POPULATION: Demographic information such as population by age and enrollment in schools, also parents' educational level and language status
4. ETHNICITY: Information on ethnicity of residents and students
5. ETHNICITY_DETAILED: Breakdown of ethnicity by grade, detailed ethnic group, and year (district)
6. ETHNICITY_DETAILED_GROUP: Above data but grouped for subsequent grades
7. INCOME: Data referring to income and wealth of block and families
8. TEST SCORES: Academic data on CST and SBAC
9. HOUSING: San Francisco and Federal Housing Authority information

To retrieve the dictionary we can use the appropriate method in the block api class

In [None]:
# Fetch the classification dictionary from the block API.
# NOTE(review): presumably maps each thematic group listed above to its columns -- confirm.
group_dict = block_data_api.get_classification()

In [None]:
# Display the classification dictionary.
group_dict

BIG REMARK: Some columns are empty (or non-informative)!

In [None]:
# Report columns that hold at most one distinct value (empty or constant).
# The loop runs over the block dataset's own columns: `field_list` is not
# defined in any of the cells shown above, so the original loop fails on a
# fresh kernel.
# `nunique(dropna=False)` counts all missing values as ONE value. Building a
# set from a float array instead keeps every NaN as a separate element, so an
# all-NaN column would not be detected.
for col in block_df.columns:
    if block_df[col].nunique(dropna=False) <= 1:
        print(f"{col}: ", set(block_df[col].unique()))

Search for a specific description:

In [None]:
# Look up the description of a single field by its exact name.
name = "NH White students 2006-2010 K-8"
matches_name = field_descriptions_df["Field Name"].eq(name)
# First matching description; raises IndexError when no field has this name.
field_descriptions_df.loc[matches_name, "Field Description"].iloc[0]

Verifying if the field names and block data all match:

In [None]:
# Column names of the block dataset as a plain Python list.
block_fields = block_df.columns.tolist()

In [None]:
# Field names listed in the description table, as a plain Python list.
field_fields = field_descriptions_df["Field Name"].tolist()

In [None]:
def Diff(li1, li2):
    """Return the items of ``li1`` that do not appear in ``li2``.

    Duplicates in ``li1`` are dropped and the first-seen order of ``li1`` is
    kept, so the result is identical on every run. A plain set difference
    returns the same elements but in an arbitrary order that can change
    between kernel restarts.

    Parameters
    ----------
    li1, li2 : iterable of hashable items

    Returns
    -------
    list
        Unique items of ``li1`` missing from ``li2``, in ``li1`` order.
    """
    excluded = set(li2)
    # dict.fromkeys de-duplicates while preserving insertion order.
    return [item for item in dict.fromkeys(li1) if item not in excluded]

# Columns present in the block data but absent from the field-description table.
Diff(block_fields, field_fields)

In [None]:
# Described fields that have no matching column in the block data.
Diff(field_fields, block_fields)

The FRL column is a mystery, and so is the DATA column in the block dataset. All other mismatches have been fixed in the initialization of the class.

## Focal Students Data

In [None]:
# Location of the census-2010 shapefile (absolute path) and its base name.
geodata_path = '/share/data/school_choice/dssg/census2010/'
file_name = 'geo_export_e77bce0b-6556-4358-b36b-36cfcf826a3c'
data_types = ['.shp', '.dbf', '.prj', '.shx']

# Only the .shp path is handed to geopandas.
sfusd_map = gpd.read_file(geodata_path + file_name + data_types[0])
sfusd_map = sfusd_map.set_index('geoid10')

# Keep rows whose interior-point longitude is east of -122.8 and whose
# `awater10` is zero. The longitude is compared numerically: the previous
# string comparison (`< '-122.8'`) selected the same rows only because of how
# values such as '-122.4194155' happen to sort lexicographically.
longitude = pd.to_numeric(sfusd_map['intptlon10'], errors='coerce')
mask = longitude > -122.8
mask &= sfusd_map['awater10'] == 0.0
# get rid of water -- the mask used to be built but never applied, so the
# filter had no effect on the map used below.
sfusd_map = sfusd_map[mask]

Let's add the columns referring to focal students:

In [None]:
# Drop the "Geoid Group" column and rebuild Geoid10 as a string made of a
# leading "0" followed by the integer value, then join the FRL table onto the
# map via geoid10 / Geoid10.
frl_df = frl_df_raw.drop(columns="Geoid Group").assign(
    Geoid10=frl_df_raw['Geoid10'].map(lambda geoid: f"0{int(geoid)}")
)
sfusd_map_focal = sfusd_map.merge(frl_df, left_on='geoid10', right_on='Geoid10')

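We might also want the share of focal students, computed as the ratio of each focal count to the student count (the values are fractions, not multiplied by 100):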

In [None]:
# Ratio of each focal-student count to the total student count. Despite the
# "(%)" in the column names, the stored values are plain ratios (they are not
# multiplied by 100).
students_total = sfusd_map_focal["4YR AVG Student Count"]
ratio_sources = {
    "FRL students (%)": "4YR AVG FRL Count",
    "AALPI students (%)": "4YR AVG Eth Flag Count",
    "Combo students (%)": "4YR AVG Combo Flag Count",
}
for ratio_col, count_col in ratio_sources.items():
    sfusd_map_focal[ratio_col] = sfusd_map_focal[count_col] / students_total

Simple plotting function. The appropriate colormap depends on whether we plot absolute counts or percentages: a diverging colormap such as PRGn is better for absolute counts, while a sequential colormap such as YlOrRd is better for percentages.

In [None]:
def plot_column(column, cmap=None, geo_df=None):
    """Draw a choropleth map of one column.

    Parameters
    ----------
    column : str
        Name of the column to plot; also used as the figure title.
    cmap : str, optional
        Matplotlib colormap name. When omitted it is chosen from the column
        name: "PRGn" if it contains "Count", "YlOrRd" if it contains "%",
        "viridis" otherwise. An explicitly passed colormap is always used
        (previously it was silently overridden for "Count" / "%" columns).
    geo_df : GeoDataFrame, optional
        Frame to plot. Defaults to the notebook-level ``sfusd_map_focal``.
    """
    if geo_df is None:
        # Looked up at call time so the default follows the notebook variable.
        geo_df = sfusd_map_focal

    if cmap is None:
        if "Count" in column:
            cmap = "PRGn"
        elif "%" in column:
            cmap = "YlOrRd"
        else:
            cmap = "viridis"

    fig, ax = plt.subplots(figsize=(30, 30))

    # Rows with a missing value in `column` are drawn in light grey.
    geo_df.plot(column=column, ax=ax, cmap=cmap,
                legend=True, legend_kwds={'orientation': "horizontal"},
                missing_kwds={'color': 'lightgrey'})
    ax.set_title(column, fontsize=50)

    plt.show()

In [None]:
# Map the FRL ratio column.
column = "FRL students (%)"
plot_column(column)

In [None]:
# Map the AALPI ratio column.
column = "AALPI students (%)"
plot_column(column)

In [None]:
# Map the Combo ratio column.
column = "Combo students (%)"
plot_column(column)