In [177]:
# Author: Yecheng Huang
# Updates by: Corrina Calanoc
import pandas as pd
import os

# Pull data on depression rates in each census tract in DC and Prince George's (PG) County, MD

**Instructions:**
* Source: https://chronicdata.cdc.gov/500-Cities-Places/PLACES-Local-Data-for-Better-Health-Place-Data-202/eav7-hnsx
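* Saved in Google Drive here: https://drive.google.com/drive/folders/1EbmA0ctsWNaSIuINAJhZ1UVY3F9TGryM
  * Name: `PLACES__Local_Data_for_Better_Health__Census_Tract_Data_2022_release.csv`
  * Download it into the `raw_data` folder (`../raw_data` relative to the directory this notebook runs in), which is where the code below reads it from.
* For more data, visit https://www.cdc.gov/places/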

In [178]:
# Resolve the raw-data directory (a sibling of the current working directory)
# so that the rest of the notebook knows where to pull and save data.
data_folder = os.path.join(os.getcwd(), "..", "raw_data")

# Load the full PLACES census-tract extract and report how many rows it has.
places_csv_path = os.path.join(
    data_folder,
    'PLACES__Local_Data_for_Better_Health__Census_Tract_Data_2022_release.csv',
)
all_tracts_df = pd.read_csv(places_csv_path)
print(len(all_tracts_df))

2161543


In [179]:
# List the distinct Maryland county names to see exactly how each one is spelled in the data.
is_maryland = all_tracts_df['StateAbbr'] == 'MD'
all_tracts_df.loc[is_maryland, 'CountyName'].unique()

array(['Montgomery', 'Allegany', 'Anne Arundel', 'Calvert', 'Carroll',
       'Cecil', 'Charles', 'Frederick', 'Garrett', 'Harford', 'Howard',
       "Prince George's", 'Baltimore City', 'Baltimore', 'Caroline',
       'Dorchester', 'Kent', 'Talbot', 'Worcester', "Queen Anne's",
       'Somerset', "St. Mary's", 'Washington', 'Wicomico'], dtype=object)

In [180]:
# set the subset year
subset_year = 2020

# ----- SET ALL THE FILTERS THAT WE'LL APPLY TO THE DATA -----
year_filter = (all_tracts_df['Year'] == subset_year)
md_filter = (all_tracts_df['StateAbbr'] == 'MD') & (all_tracts_df['CountyName'] == "Prince George's")
dc_filter = (all_tracts_df['StateAbbr'] == 'DC')
location_filter = (md_filter | dc_filter)
# 'Depress' is a plain substring, so regex matching is switched off.
# na=False makes a missing Measure an explicit "no match" instead of
# leaving NaN values in the boolean mask.
measure_filter = all_tracts_df['Measure'].str.contains('Depress', regex=False, na=False)

# subset this to depression data for DC and PG county Maryland
depression_data = all_tracts_df[year_filter & location_filter & measure_filter].copy(deep=True)

# Convert LocationID to a string join key. The walkability and CRE tables key
# tracts by an 11-character id (2-digit state + 3-digit county + 6-digit tract),
# so zero-pad to 11 here as well: if the ids were parsed as integers, a leading
# zero would otherwise be lost and those tracts would silently fail to join.
# DC (11...) and MD (24...) ids are already 11 digits, so they are unchanged.
depression_data['LocationID'] = depression_data['LocationID'].astype(str).str.zfill(11)

# sanity check: row count should equal the number of distinct tracts
print(len(depression_data))
print(depression_data['LocationID'].nunique())
print(depression_data['CountyName'].unique())
print(depression_data.columns)
depression_data.head()

396
396
['District of Columbia' "Prince George's"]
Index(['Year', 'StateAbbr', 'StateDesc', 'CountyName', 'CountyFIPS',
       'LocationName', 'DataSource', 'Category', 'Measure', 'Data_Value_Unit',
       'Data_Value_Type', 'Data_Value', 'Data_Value_Footnote_Symbol',
       'Data_Value_Footnote', 'Low_Confidence_Limit', 'High_Confidence_Limit',
       'TotalPopulation', 'Geolocation', 'LocationID', 'CategoryID',
       'MeasureId', 'DataValueTypeID', 'Short_Question_Text'],
      dtype='object')


Unnamed: 0,Year,StateAbbr,StateDesc,CountyName,CountyFIPS,LocationName,DataSource,Category,Measure,Data_Value_Unit,...,Data_Value_Footnote,Low_Confidence_Limit,High_Confidence_Limit,TotalPopulation,Geolocation,LocationID,CategoryID,MeasureId,DataValueTypeID,Short_Question_Text
409138,2020,DC,District of Columbia,District of Columbia,11001,11001009907,BRFSS,Health Outcomes,Depression among adults aged >=18 years,%,...,,19.6,22.8,2836,POINT (-76.93880536 38.88277252),11001009907,HLTHOUT,DEPRESSION,CrdPrv,Depression
409277,2020,DC,District of Columbia,District of Columbia,11001,11001008802,BRFSS,Health Outcomes,Depression among adults aged >=18 years,%,...,,17.1,19.7,4119,POINT (-76.98664548 38.90394304),11001008802,HLTHOUT,DEPRESSION,CrdPrv,Depression
409333,2020,DC,District of Columbia,District of Columbia,11001,11001005900,BRFSS,Health Outcomes,Depression among adults aged >=18 years,%,...,,20.5,23.9,2998,POINT (-77.01493498 38.89602518),11001005900,HLTHOUT,DEPRESSION,CrdPrv,Depression
409404,2020,DC,District of Columbia,District of Columbia,11001,11001002900,BRFSS,Health Outcomes,Depression among adults aged >=18 years,%,...,,18.2,20.7,3962,POINT (-77.02991535 38.93383163),11001002900,HLTHOUT,DEPRESSION,CrdPrv,Depression
409470,2020,DC,District of Columbia,District of Columbia,11001,11001003302,BRFSS,Health Outcomes,Depression among adults aged >=18 years,%,...,,17.6,19.5,2134,POINT (-77.0112185 38.91392309),11001003302,HLTHOUT,DEPRESSION,CrdPrv,Depression


# Join with Walkability Score
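**Instructions:** Download this file: https://drive.google.com/file/d/1zsM0PqBa5usr1QK64BRZ4Z6-0S0V9kfy/view and save it in the `raw_data` folder as `National Walkability Index Data.csv` (the name the code below reads).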

* Data source: https://catalog.data.gov/dataset/walkability-index

Since this dataset provides walkability scores at the **block** level (more granular than tract) and the rest of our data is at the **tract** level, we take the median score across all blocks within each tract.

In [181]:
# Load the National Walkability Index table.
walk_score = pd.read_csv(os.path.join(data_folder, "National Walkability Index Data.csv"))

# Build the census tract id (location id) by concatenating the zero-padded
# state (2 chars), county (3 chars) and tract (6 chars) codes.
state_part = walk_score['STATEFP'].astype(str).str.zfill(2)
county_part = walk_score['COUNTYFP'].astype(str).str.zfill(3)
tract_part = walk_score['TRACTCE'].astype(str).str.zfill(6)
walk_score['location_id'] = state_part + county_part + tract_part

# Collapse to one row per census tract, once with the median and once with the mean.
walk_index_by_tract = walk_score.groupby(["location_id"])["NatWalkInd"]
median_walk_by_tract = (
    walk_index_by_tract.median()
    .reset_index()
    .rename(columns={'NatWalkInd': 'Median_NatWalkInd'})
)
mean_walk_by_tract = (
    walk_index_by_tract.mean()
    .reset_index()
    .rename(columns={'NatWalkInd': 'Mean_NatWalkInd'})
)

# Join with CRE Equity Data
**Instructions**: Download [this file](https://drive.google.com/file/d/1m_4iSNWri-zay_ltKeTS4ndoosMi7T8r/view?usp=share_link) and then save it in the `raw_data` folder used above as `CRE_Equity_Tract_19.csv`.
* Data source: https://www.census.gov/programs-surveys/community-resilience-estimates/data/supplement.html
  * Granularity = "Tract"

In [182]:
# Read in the CRE equity file (latin1-encoded).
cre_equity = pd.read_csv(os.path.join(data_folder, 'CRE_Equity_Tract_19.csv'), encoding='latin1')

# The last 11 characters of GEO_ID hold the census tract id. Slice them with the
# vectorised .str accessor rather than a per-row lambda: the result is the same
# for string GEO_IDs, and a missing GEO_ID stays missing (and so never matches in
# a join) instead of raising a TypeError.
cre_equity['location_id'] = cre_equity['GEO_ID'].str[-11:]

In [196]:
# Quick inspection of the CRE table. Only the last expression (the first few
# tract names) is displayed; the column list is evaluated but not shown.
cre_equity.columns.tolist()
cre_equity['NAME'].head()

0    Census Tract 201, Autauga County, Alabama
1    Census Tract 202, Autauga County, Alabama
2    Census Tract 203, Autauga County, Alabama
3    Census Tract 204, Autauga County, Alabama
4    Census Tract 205, Autauga County, Alabama
Name: NAME, dtype: object

In [183]:
# Attach the CRE equity columns to each depression row, matching on the census
# tract id; the inner join keeps only tracts present in both tables.
joined_file = pd.merge(
    depression_data,
    cre_equity,
    how='inner',
    left_on='LocationID',
    right_on='location_id',
)

In [184]:
# Add the per-tract median walkability score; the left join keeps every row of
# joined_file even when a tract has no walkability match.
joined_file = pd.merge(
    joined_file,
    median_walk_by_tract,
    how='left',
    left_on='LocationID',
    right_on='location_id',
)
len(joined_file)

396

In [197]:
# Quick inspection of the joined table. Only the last expression (the first few
# tract names) is displayed; the column list is evaluated but not shown.
joined_file.columns.tolist()
joined_file['NAME'].head()

0    Census Tract 99.07, District of Columbia, Dist...
1    Census Tract 88.02, District of Columbia, Dist...
2    Census Tract 59, District of Columbia, Distric...
3    Census Tract 29, District of Columbia, Distric...
4    Census Tract 33.02, District of Columbia, Dist...
Name: NAME, dtype: object

# Clean up for final file format

In [202]:
# Map each source column to its name in the final file. The dict order is also
# the column order of the output.
rename_cols = {
    'LocationID': 'census_tract_id',
    'NAME': 'census_tract_name',
    'Data_Value': 'depressed_perc',
    'NH_Black_alone_PE': 'black_non_hisp_perc',
    'NH_White_alone_PE': 'white_non_hisp_perc',
    'Hispanic_PE': 'hispanic_latino_perc',
    'PRED3_PE': '3_plus_cre_risk_factors_perc',
    'Blw_Pov_Lvl_PE': 'below_poverty_level_perc',
    'No_Health_Ins_PE': 'no_health_insurance_perc',
    'Male_PE': 'male_perc',
    'Female_PE': 'female_perc',
    'GINI_IND_Inequality_E': 'income_inequality_gini_index',
    'HS_Grad_PE': 'hs_grad_perc',
    'No_Veh_PE': 'households_no_vehicle_perc',
    'Broadband_PE': 'households_w_internet_perc',
    'Median_NatWalkInd': 'walkability_score',
}

# Rename first, then keep only the renamed columns (in dict order).
final_columns = list(rename_cols.values())
clean_joined_file = joined_file.rename(columns=rename_cols)[final_columns]

In [206]:
# write file and then add / update to this Google Drive folder: https://drive.google.com/drive/folders/19RfOfSc8TWcXz4FyNEEg8ixJrkYcYJ0W
output_csv_path = os.path.join(data_folder, 'joined_depression_cre.csv')
clean_joined_file.to_csv(output_csv_path, index=False)

# Report the number of rows written and preview the result.
print(len(clean_joined_file))
clean_joined_file.head()

396


Unnamed: 0,census_tract_id,census_tract_name,depressed_perc,black_non_hisp_perc,white_non_hisp_perc,hispanic_latino_perc,3_plus_cre_risk_factors_perc,below_poverty_level_perc,no_health_insurance_perc,male_perc,female_perc,income_inequality_gini_index,hs_grad_perc,households_no_vehicle_perc,households_w_internet_perc,walkability_score
0,11001009907,"Census Tract 99.07, District of Columbia, Dist...",21.1,97.5,0.4,2.0,28.85,19.9,7.6,40.0,60.0,0.428,88.6,54.3,64.4,12.25
1,11001008802,"Census Tract 88.02, District of Columbia, Dist...",18.3,68.7,25.6,2.7,35.38,19.7,4.3,51.1,48.9,0.4983,86.0,34.2,72.8,13.5
2,11001005900,"Census Tract 59, District of Columbia, Distric...",22.1,22.4,50.8,8.6,17.12,27.7,4.6,52.6,47.4,0.5047,90.7,61.1,82.6,16.5
3,11001002900,"Census Tract 29, District of Columbia, Distric...",19.3,19.6,47.1,25.0,13.98,8.2,7.8,50.1,49.9,0.4526,85.6,38.2,88.4,16.166667
4,11001003302,"Census Tract 33.02, District of Columbia, Dist...",18.5,37.6,48.7,5.2,9.09,9.3,1.4,46.3,53.7,0.3496,97.3,12.4,89.9,15.75
