In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import StandardScaler
import requests
import plotly.express as px
from urllib.request import urlopen
import json
import plotly.figure_factory as ff

In [2]:
# Load the joined depression / community-resilience / walkability extract.
df = pd.read_csv('../StartingWithToday/data/joined_depression_cre_walkability.csv')

# Lower-case every column label, then give the two awkwardly named columns
# the names used throughout the rest of the notebook.
df = (
    df.rename(columns=str.lower)
      .rename(columns={'indicator rate value': 'depression_rate',
                       'locationname': 'geoid_tract_20'})
)

Describe the data to identify potentially incorrect values.

In [3]:
# Features to sanity-check; one row per feature in the transposed summary.
summary_cols = [
    'blw_pov_lvl_pe',
    'depression_rate',
    'no_health_ins_pe',
    'gini_ind_inequality_e',
    'no_veh_pe',
    'hs_grad_pe',
    'broadband_pe',
    'median_natwalkind',
]
df[summary_cols].describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
blw_pov_lvl_pe,396.0,12.42525,10.72242,0.2,5.1,9.1,15.825,68.6
depression_rate,396.0,16.28737,3.868286,10.7,13.0,14.5,19.725,30.0
no_health_ins_pe,396.0,7.207071,7.166621,0.0,2.775,5.0,9.2,56.1
gini_ind_inequality_e,396.0,-1683501.0,33501260.0,-666666700.0,0.356125,0.4047,0.47115,0.673
no_veh_pe,396.0,-1683481.0,33501260.0,-666666700.0,5.1,15.7,33.25,83.0
hs_grad_pe,396.0,88.22197,10.11174,18.3,84.175,90.6,95.0,100.0
broadband_pe,396.0,-1683418.0,33501260.0,-666666700.0,78.1,86.9,92.2,100.0
median_natwalkind,396.0,12.61553,3.24584,3.166667,11.0625,13.208333,14.833333,18.833333


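Replace any values less than 0 with `NaN`. These features are not expected to have negative values; the large negative minimum (about -666,666,700) shown in the summary above appears to be a placeholder for missing data rather than a real measurement.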

In [4]:
# Negative values are not valid for these features, so treat them as missing.
# Assign the scalar np.nan (no trailing comma): a trailing comma turns the
# right-hand side into the tuple (np.nan,), which only fits when exactly one
# row matches the mask, whereas a scalar broadcasts to any number of rows.
for col in ['broadband_pe', 'gini_ind_inequality_e', 'no_veh_pe']:
    df.loc[df[col] < 0, col] = np.nan

Generate new variables for display in the maps.

In [5]:
# Tract identifier, stored as text so it can be used as a join key.
df['GEOID'] = df['geoid_tract_20'].astype(str)

# Human-readable label -> source column. Each label becomes a new column
# holding a copy of the source values, for display in the maps.
display_labels = {
    'Percentage of population below poverty line': 'blw_pov_lvl_pe',
    'Depression rate': 'depression_rate',
    'Percentage of households without health insurance': 'no_health_ins_pe',
    'Gini inequality index': 'gini_ind_inequality_e',
    'Percentage of households without vehicles': 'no_veh_pe',
    'Percentage of Black residents': 'nh_black_alone_pe',
    'Percentage of residents with internet access': 'broadband_pe',
    'Percentage of residents with a high school diploma': 'hs_grad_pe',
    'Walkability score': 'median_natwalkind',
}
for label, source_col in display_labels.items():
    df[label] = df[source_col]

Join the data with zip codes.

In [6]:
# Census relationship file linking 2020 ZCTAs to 2020 tracts (pipe-delimited).
zcta = pd.read_csv(
    'https://www2.census.gov/geo/docs/maps-data/data/rel2020/zcta520/tab20_zcta520_tract20_natl.txt',
    sep='|',
).rename(columns=str.lower)

# Tract id as text (to match df['GEOID']), keep only rows that have a ZCTA,
# and reduce to the two columns needed for the join.
zcta = (
    zcta.assign(geoid_tract_20=zcta['geoid_tract_20'].astype(str))
        .dropna(subset=['geoid_zcta5_20'])
        .loc[:, ['geoid_tract_20', 'namelsad_zcta5_20']]
        .rename(columns={'geoid_tract_20': 'GEOID', 'namelsad_zcta5_20': 'Zip Code'})
)

# Keep the second space-separated token of the label as the zip code.
zcta['Zip Code'] = zcta['Zip Code'].map(lambda label: label.split(' ')[1])

In [7]:
# Attach zip codes to each tract. A left join keeps every row of df; a tract
# that matches several rows in zcta yields one output row per match.
# No indicator column is requested: it was only created to be dropped again,
# and asking for it makes the merge fail when df already has a '_merge' column.
df = df.merge(zcta, how='left', on='GEOID')

Add data on the Wards associated with each zip code.

In [8]:
# Zip code -> Ward lookup table.
zips = pd.read_csv('../StartingWithToday/data/zips.csv')
ward_lookup = zips[['Zip Code', 'Ward']]

# Left join so rows whose zip code has no Ward entry are kept (Ward is NaN).
# NOTE(review): indicator=True leaves a '_merge' column in df that is never
# dropped, so it is carried into the needs-score CSVs written later -- confirm
# whether the downstream script needs it before removing.
df = df.merge(ward_lookup, how='left', on='Zip Code', indicator=True)

Format features in the data frame for display.

In [9]:
# Columns formatted for display. All of them -- including 'Zip Code' and
# 'Ward' -- are converted to strings: the next cell relies on missing Ward
# values reading as the text 'nan'.
display_cols = [
    'Percentage of population below poverty line',
    'Depression rate',
    'Percentage of households without health insurance',
    'Gini inequality index',
    'Percentage of households without vehicles',
    'Percentage of Black residents',
    'Percentage of residents with internet access',
    'Percentage of residents with a high school diploma',
    'Walkability score',
    'Zip Code',
    'Ward',
]

# Round to 2 decimals (DataFrame.round leaves non-numeric columns untouched),
# then cast to text. Missing values become the string 'nan'.
df[display_cols] = df[display_cols].round(2).astype(str)

# These three features had invalid values nulled out earlier; show a readable
# placeholder instead of 'nan' for them.
nullable_display_cols = [
    'Gini inequality index',
    'Percentage of households without vehicles',
    'Percentage of residents with internet access',
]
df[nullable_display_cols] = df[nullable_display_cols].replace('nan', 'Data Not Available')

In [10]:
# The column holds a Ward for some rows and a county name for others.
df = df.rename(columns={'Ward': 'Ward/County'})

# Rows in Prince George's County are labelled with the county name.
in_prince_georges = df['countyname'] == "Prince George's"
df.loc[in_prince_georges, 'Ward/County'] = "Prince George's"

# Anything still holding the text 'nan' gets a readable placeholder.
# (Computed after the assignment above so relabelled rows are excluded.)
ward_unknown = df['Ward/County'] == "nan"
df.loc[ward_unknown, 'Ward/County'] = "Data Not Available"

Export data for display in map.

In [11]:
# Columns written to the map's display file, in output order.
export_cols = [
    'GEOID',
    'Percentage of population below poverty line',
    'Depression rate',
    'Percentage of households without health insurance',
    'Gini inequality index',
    'Percentage of households without vehicles',
    'Percentage of Black residents',
    'Percentage of residents with internet access',
    'Percentage of residents with a high school diploma',
    'Walkability score',
    'Zip Code',
    'Ward/County',
    'countyname',
]

# De-duplicate on the exported columns only, then write without the index.
display_data = df[export_cols].drop_duplicates()
display_data.to_csv('./data/display-data.csv', index=False)

In [12]:
# Remove rows that are exact duplicates across every column of df; the first
# occurrence of each is kept and the original index labels are preserved.
df = df.drop_duplicates()

Create a function to generate z-scores of the features needed in the needs score.

In [13]:
def update_df(county, df = df):
    """Return one county's rows with gap features and z-scores added.

    Parameters
    ----------
    county : str
        Value of the ``countyname`` column to select.
    df : pandas.DataFrame, optional
        Source frame. The default is the notebook-level ``df`` as it exists
        when this cell is executed (default arguments are bound at definition
        time), so re-run this cell after changing ``df``.

    Returns
    -------
    pandas.DataFrame
        A copy of the selected rows with three ``*_gap`` columns and eight
        ``z_*`` columns appended. The input frame is not modified.

    Notes
    -----
    Both the gap maxima and the z-scores are computed within the selected
    county only, not across the whole frame.
    """
    # Copy the filtered rows so the new columns are added to an independent
    # frame rather than to a slice of `df` (avoids SettingWithCopyWarning).
    df_cnty = df[df['countyname'] == county].copy()

    # Gap = county maximum minus the tract's value, so that a larger number
    # means the tract is further below the best-scoring tract in the county.
    for col in ['hs_grad_pe', 'broadband_pe', 'median_natwalkind']:
        df_cnty[f'{col}_gap'] = df_cnty[col].max() - df_cnty[col]

    features = ['blw_pov_lvl_pe','depression_rate','no_health_ins_pe','gini_ind_inequality_e',
                'no_veh_pe','hs_grad_pe_gap','broadband_pe_gap','median_natwalkind_gap']

    # Derive the output names from `features` so the two lists cannot drift
    # out of order; each z-score column is named 'z_<feature>'.
    z_cols = [f'z_{name}' for name in features]
    df_cnty[z_cols] = StandardScaler().fit_transform(df_cnty[features])

    return df_cnty

Export the data to a CSV to be ingested by a Python script.

In [14]:
# County name -> output file for the needs-score inputs (written without index).
needs_score_outputs = [
    ('District of Columbia', '../StartingWithToday/data/needs_score_dc.csv'),
    ("Prince George's", '../StartingWithToday/data/needs_score_md.csv'),
]
for county_name, out_path in needs_score_outputs:
    update_df(county_name).to_csv(out_path, index=False)