In [None]:
import pandas as pd
import numpy as np
import geopandas as gpd
import matplotlib.pyplot as plt

import sys
sys.path.append('../')


In [None]:
from src.d01_data.block_data_api import BlockDataApi
from src.d01_data.student_data_api import StudentDataApi, _block_features, _census_block_column, \
_diversity_index_features

# Common name used for the block-id index/column of every table built in this notebook.
geoid_name = 'geoid'

# Process data
Load data indexed by the `census_block`/`Block`/`Geoid10`
1. Load FRL data
2. Load Block demographics
3. Load Block demographics computed from student data (fill in before computing the demographics from the block data)

## FRL data
This data should be indexed by the column `Geoid10` as type `int64`.

We convert the `group` column into a more coherent index. In the original data, the group id is an integer from `1` to `353` for the blocks that are grouped together, and the GEOID itself for the blocks that stand alone. For some reason, the blocks that are grouped together only have `327` (not `353`) unique group indexes. Because of this, the max value of the new index is `3311` instead of `3285` (the actual number of unique group indexes).

In [None]:
block_data_api = BlockDataApi()

# Load the FRL table, index it by the block GEOID and give the columns short names.
frl_df = block_data_api.get_data(frl=True, user="juan").set_index('Geoid10')
frl_df.index.name = geoid_name
frl_df.columns = ['group', 'n', 'nFRL', 'nAALPI', 'nBoth']

# Express each count as a fraction of the block's total count `n`.
frl_df['pctFRL'] = frl_df['nFRL'] / frl_df['n']
frl_df['pctAALPI'] = frl_df['nAALPI'] / frl_df['n']
frl_df['pctBoth'] = frl_df['nBoth'] / frl_df['n']

# we want to find the blocks that share a group index
# (rows whose `group` value is below 1000 are treated as grouped blocks)
mask = frl_df['group'] < 1000
last_group_index = frl_df.loc[mask, 'group'].max()
# then we generate a new set of group indexes for the standalone blocks that is more coherent
# with the indexes of the grouped blocks: they continue right after the largest grouped index
num_of_new_indexes = np.sum(~mask)
new_group_index = np.arange(num_of_new_indexes) + 1 + last_group_index

# `.at` is a scalar accessor and does not support a boolean mask with an array value;
# `.loc` is the label/boolean indexer meant for this kind of masked assignment.
frl_df.loc[~mask, 'group'] = new_group_index
frl_df.tail()

## Block Demographics

This data should be indexed by the column `Block` as type `int64`.

In [None]:
# Keep only the `BlockGroup` column, keyed by `Block`; rows with a missing value are dropped.
demo_df = (
    block_data_api.get_data()
    .set_index('Block')
    .loc[:, 'BlockGroup']
    .dropna()
    .rename_axis(geoid_name)
)
for summary in (demo_df.shape, demo_df.head()):
    print(summary)

## Student Demographics

This data should be indexed by the column `census_block` as type `int64`.

In [None]:
periods_list = [
    "1415", "1516", "1617",
    "1718", "1819", "1920",
]
student_data_api = StudentDataApi()
df_students = student_data_api.get_data(periods_list)

# Remove, in place, every row whose census block holds the literal string 'NaN'.
mask = df_students[_census_block_column] == 'NaN'
df_students.drop(index=df_students.index[mask], inplace=True)

# NOTE(review): the filter reads `_census_block_column` while the cast reads the literal
# 'census_block' column — presumably the same column; confirm.
df_students[geoid_name] = df_students['census_block'].astype('int64')

In [None]:
def get_group_value(x):
    """Return the element at position 0 of ``x`` (i.e. ``x.iloc[0]``).

    Used as a group-by aggregator: each group is collapsed to its first
    listed value, whatever that value is.
    """
    first_position = 0
    return x.iloc[first_position]

# Collapse the student rows to one row per block id, taking for every diversity-index
# feature the value returned by `get_group_value` for that block's rows.
students_by_block = df_students.groupby(geoid_name)
stud_df = students_by_block[_diversity_index_features].agg(get_group_value)
print(stud_df.shape)
stud_df.head()

In [None]:
# Spot-check: display the FRL row stored under a single block id.
sample_geoid = 60750101001017
frl_df.loc[sample_geoid]

## Join data frames

In [None]:
# Align the student and FRL tables to the index of `demo_df`, then place the three
# tables side by side (ids absent from a table become missing values after reindexing).
block_index = demo_df.index
frames = [
    demo_df.to_frame(),
    stud_df.reindex(block_index),
    frl_df.reindex(block_index),
]
df = pd.concat(frames, axis=1, ignore_index=False)
df.head()

# Create map plots

In [None]:
geodata_path = '/share/data/school_choice/dssg/census2010/'
file_name = 'geo_export_e77bce0b-6556-4358-b36b-36cfcf826a3c'
data_types = ['.shp', '.dbf', '.prj', '.shx']

# Only the first extension (`.shp`) is handed to the reader.
shapefile_path = "".join([geodata_path, file_name, data_types[0]])
sfusd_map = gpd.read_file(shapefile_path)

# Re-key the map by the `geoid10` column cast to int64, under the shared index name.
sfusd_map[geoid_name] = sfusd_map['geoid10'].astype('int64')
sfusd_map.set_index(geoid_name, inplace=True)

In [None]:
pct_cols = ['pctFRL', 'pctAALPI', 'pctBoth']

# Reindex the map to the index of `df`, then append the percentage columns.
block_shapes = sfusd_map.reindex(df.index)
sfusd_map_df = pd.concat([block_shapes, df[pct_cols]], axis=1, ignore_index=False)

In [None]:
def plot_column(df_map, column, cmap="viridis"):
    """Draw ``column`` of ``df_map`` on a 30x30 figure and show it.

    Parameters
    ----------
    df_map : object with a ``.plot`` method accepting ``column``, ``ax``, ``cmap``,
        ``legend``, ``legend_kwds`` and ``missing_kwds`` (presumably a GeoDataFrame —
        confirm against the callers).
    column : str
        Name of the column to colour by; also used as the figure title.
    cmap : str, default "viridis"
        Colormap name. It is overridden with "PRGn" when ``column`` contains
        "Count", otherwise with "YlOrRd" when ``column`` contains "%".
    """
    figure, axis = plt.subplots(figsize=(30, 30))

    # The column name decides the palette before the caller's choice is considered.
    if "Count" in column:
        palette = "PRGn"
    elif "%" in column:
        palette = "YlOrRd"
    else:
        palette = cmap

    plot_options = {
        'column': column,
        'ax': axis,
        'cmap': palette,
        'legend': True,
        'legend_kwds': {'orientation': "horizontal"},
        'missing_kwds': {'color': 'lightgrey'},
    }
    df_map.plot(**plot_options)
    axis.set_title(column, fontsize=50)

    plt.show()

In [None]:
# Map of the `pctFRL` column.
plot_column(df_map=sfusd_map_df, column='pctFRL')

# Solve Knapsack

# Train Logistic Regression