## This notebook creates the following metrics within the Society & Economy domain sourced from CalEnviroScreen:
* Age-adjusted emergency department visits for asthma per 10,000 people
* Age-adjusted emergency department visits for myocardial infarction per 10,000 people
* % of live, singleton births < 5.5 pounds (non-twin, including premature)
* % of population 25 and older with less than a high school education
* % of households where all members 14 and older have some difficult speaking English
* % of population living below 2x federal poverty level
* % of population > 16 years old unemployed and eligible for the workforce
* % of households which are low-income and housing-burdened
* percentile of drinking water 

In [1]:
import pandas as pd
import os
import sys
import math
import numpy as np
import geopandas as gpd

# suppress pandas purely educational warnings
from warnings import simplefilter
simplefilter(action="ignore", category=pd.errors.PerformanceWarning)

sys.path.append(os.path.expanduser('../../'))
from scripts.utils.file_helpers import pull_csv_from_directory, upload_csv_aws, filter_counties
from scripts.utils.write_metadata import append_metadata

In [2]:
# pull .xlsx from aws
enviroscreen_excel = 's3://ca-climate-index/1_pull_data/society_economy/vulnerable_populations/ca_enviro_screen/calenviroscreen.xlsx'
enviroscreen_data = pd.read_excel(enviroscreen_excel,converters={'Census Tract': '{:0>11}'.format})

In [3]:
enviroscreen_data

Unnamed: 0,Census Tract,Total Population,California County,ZIP,Approximate Location,Longitude,Latitude,CES 4.0 Score,CES 4.0 Percentile,CES 4.0 Percentile Range,...,Linguistic Isolation Pctl,Poverty,Poverty Pctl,Unemployment,Unemployment Pctl,Housing Burden,Housing Burden Pctl,Pop. Char.,Pop. Char. Score,Pop. Char. Pctl
0,06019001100,2780,Fresno,93706,Fresno,-119.781696,36.709695,93.183570,100.000000,95-100% (highest scores),...,79.374746,76.0,98.919598,12.8,93.831338,30.3,91.039290,93.155109,9.663213,99.722642
1,06077000700,4680,San Joaquin,95206,Stockton,-121.287873,37.943173,86.653790,99.987393,95-100% (highest scores),...,95.533902,73.2,98.391960,19.8,99.206143,31.2,92.281369,93.165408,9.664281,99.735250
2,06037204920,2751,Los Angeles,90023,Los Angeles,-118.197497,34.017500,82.393909,99.974786,95-100% (highest scores),...,81.553661,62.6,93.391960,6.4,61.530453,20.3,63.967047,83.751814,8.687785,95.789208
3,06019000700,3664,Fresno,93706,Fresno,-119.827707,36.734535,81.327940,99.962179,95-100% (highest scores),...,78.711598,65.7,95.351759,15.7,97.345133,35.4,96.413181,94.641227,9.817371,99.886536
4,06019000200,2689,Fresno,93706,Fresno,-119.805504,36.735491,80.745476,99.949571,95-100% (highest scores),...,86.561104,72.7,98.304020,13.7,95.288912,32.7,94.157161,95.398873,9.895964,99.949571
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8030,06107004000,582,Tulare,93257,Porterville,-118.983849,36.038061,,,,...,,79.6,99.422111,,,,,,,
8031,06109985202,2509,Tuolumne,95327,Unincorporated Tuolumne County area,-120.537071,37.891939,,,,...,,,,,,,,,,
8032,06111001206,778,Ventura,93001,Unincorporated Ventura County area,-119.371944,34.343903,,,,...,,17.1,27.349246,,,24.4,78.466413,,,
8033,06111003012,675,Ventura,93036,Oxnard,-119.180105,34.235076,,,,...,99.553390,96.7,100.000000,,,,,,,


## Now we pull and inspect a separate CalEnviroScreen datafile for drinking water percentiles:
https://oehha.ca.gov/calenviroscreen/indicator/drinking-water-contaminants
* it has the same number of census tracts (unsurprisingly), so we isolate the relevant columns and merge it with the
rest of the CalEnviroScreen data

In [4]:
# pull .xlsx from aws
enviroscreen_water_data = 's3://ca-climate-index/1_pull_data/society_economy/vulnerable_populations/ca_enviro_screen/ces4finaldrinkingwaterdatabytract.xlsx'
enviroscreen_water_data = pd.read_excel(enviroscreen_water_data)
print(len(enviroscreen_water_data))

8035


In [5]:
enviroscreen_water_data.columns

Index(['CensusTract', 'Total Population', 'California County', 'ZIP',
       'Approximate Location', 'Longitude', 'Latitude', 'Drinking Water Score',
       'Drinking Water Score Percentile', 'Arsenic', 'Arsenic Pctl', 'Cadmium',
       'Cadmium Pctl', 'DBCP', 'DBCP Pctl', 'EDB', 'EDB Pctl', 'Gross Alpha',
       'Gross Alpha Pctl', 'HAAs', 'HAAs Pctl', 'HexChrom', 'HexChrom Pctl',
       'Nitrate', 'Nitrate Pctl', 'PCE', 'PCE Pctl', 'Perchlorate',
       'Perchlorate Pctl', 'TCE', 'TCE Pctl', 'TCP', 'TCP Pctl', 'TTHM',
       'TTHM Pctl', 'Lead 90th', 'Lead 90th Pctl', 'MCL_AL Violations',
       'MCL_AL Violations Pctl', 'TCR Violations', 'TCR Violations Pctl'],
      dtype='object')

In [6]:
# Rename and isolate columns so we can merge
# Adjust tract entries to match the base data to merge
enviroscreen_water_data = enviroscreen_water_data.rename(columns={'CensusTract':'Census Tract'})
enviroscreen_water_data['Census Tract'] = enviroscreen_water_data['Census Tract'].astype(str).str.zfill(11)
enviroscreen_water_data = enviroscreen_water_data[['Census Tract', 'Drinking Water Score Percentile']]

In [7]:
# Merge the water percentile data with the rest of the enviroscreen data
merged_enviroscreen_data = pd.merge(enviroscreen_data, enviroscreen_water_data, on='Census Tract', how='left')
merged_enviroscreen_data

Unnamed: 0,Census Tract,Total Population,California County,ZIP,Approximate Location,Longitude,Latitude,CES 4.0 Score,CES 4.0 Percentile,CES 4.0 Percentile Range,...,Poverty,Poverty Pctl,Unemployment,Unemployment Pctl,Housing Burden,Housing Burden Pctl,Pop. Char.,Pop. Char. Score,Pop. Char. Pctl,Drinking Water Score Percentile
0,06019001100,2780,Fresno,93706,Fresno,-119.781696,36.709695,93.183570,100.000000,95-100% (highest scores),...,76.0,98.919598,12.8,93.831338,30.3,91.039290,93.155109,9.663213,99.722642,84.388660
1,06077000700,4680,San Joaquin,95206,Stockton,-121.287873,37.943173,86.653790,99.987393,95-100% (highest scores),...,73.2,98.391960,19.8,99.206143,31.2,92.281369,93.165408,9.664281,99.735250,41.551143
2,06037204920,2751,Los Angeles,90023,Los Angeles,-118.197497,34.017500,82.393909,99.974786,95-100% (highest scores),...,62.6,93.391960,6.4,61.530453,20.3,63.967047,83.751814,8.687785,95.789208,92.531535
3,06019000700,3664,Fresno,93706,Fresno,-119.827707,36.734535,81.327940,99.962179,95-100% (highest scores),...,65.7,95.351759,15.7,97.345133,35.4,96.413181,94.641227,9.817371,99.886536,84.388660
4,06019000200,2689,Fresno,93706,Fresno,-119.805504,36.735491,80.745476,99.949571,95-100% (highest scores),...,72.7,98.304020,13.7,95.288912,32.7,94.157161,95.398873,9.895964,99.949571,84.388660
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8030,06107004000,582,Tulare,93257,Porterville,-118.983849,36.038061,,,,...,79.6,99.422111,,,,,,,,71.150244
8031,06109985202,2509,Tuolumne,95327,Unincorporated Tuolumne County area,-120.537071,37.891939,,,,...,,,,,,,,,,42.725116
8032,06111001206,778,Ventura,93001,Unincorporated Ventura County area,-119.371944,34.343903,,,,...,17.1,27.349246,,,24.4,78.466413,,,,44.810791
8033,06111003012,675,Ventura,93036,Oxnard,-119.180105,34.235076,,,,...,96.7,100.000000,,,,,,,,72.923692


## The data is using older tract data, so we will join it with 2017 Tract data first

In [8]:
# read in CA census tiger file
old_census_path = "s3://ca-climate-index/0_map_data/tl_2017_06_tract/"
ca_old = gpd.read_file(old_census_path)
ca_old = ca_old.rename(columns={"GEOID":"Census Tract"})
ca_old = ca_old[["Census Tract","geometry"]]

In [9]:
old_tract_calenviroscreen_data = pd.merge(ca_old, merged_enviroscreen_data, on="Census Tract")
old_tract_calenviroscreen_data = gpd.GeoDataFrame(old_tract_calenviroscreen_data, geometry="geometry")

## Now call in 2021 census data

In [10]:
# read in CA census tiger file
census_shp_dir = "s3://ca-climate-index/0_map_data/2021_tiger_census_tract/2021_ca_tract/"

ca_boundaries = gpd.read_file(census_shp_dir)
# need to rename columns so we don't have any duplicates in the final geodatabase
column_names = ca_boundaries.columns
ca_boundaries = ca_boundaries.rename(columns={'GEOID':'Census Tract'})
# drop unnecessary columns
ca_boundaries = ca_boundaries[["geometry","Census Tract"]]
ca_boundaries

Unnamed: 0,geometry,Census Tract
0,"POLYGON ((-121.87556 37.39924, -121.87535 37.3...",06085504321
1,"POLYGON ((-121.88886 37.40758, -121.88576 37.4...",06085504410
2,"POLYGON ((-122.02489 37.21683, -122.02459 37.2...",06085507003
3,"POLYGON ((-121.99304 37.22562, -121.99249 37.2...",06085507004
4,"POLYGON ((-121.93167 37.29803, -121.92801 37.3...",06085502204
...,...,...
9124,"POLYGON ((-117.95917 33.92458, -117.95888 33.9...",06059001303
9125,"POLYGON ((-117.95918 33.92820, -117.95831 33.9...",06059001304
9126,"POLYGON ((-117.95056 33.94503, -117.95055 33.9...",06059001401
9127,"POLYGON ((-122.34551 37.96355, -122.34550 37.9...",06013367200


In [11]:
# need to convert to an area-preserving CRS for distance calculations
old_tract_calenviroscreen_data = old_tract_calenviroscreen_data.to_crs(crs=3857) 
ca_boundaries = ca_boundaries.to_crs(crs=3857) 
print(len(ca_boundaries['Census Tract'].unique()))

9129


In [12]:
# first find the tracts which have not changed from 2010 to 2017
# find the indices which correspond to the new boundaries
unchanged_tracts_ca = pd.to_numeric(ca_boundaries['Census Tract']).isin(pd.to_numeric(old_tract_calenviroscreen_data['Census Tract']))
ca_boundaries[unchanged_tracts_ca]

Unnamed: 0,geometry,Census Tract
0,"POLYGON ((-13567125.366 4494902.743, -13567102...",06085504321
1,"POLYGON ((-13568606.361 4496070.766, -13568261...",06085504410
80,"POLYGON ((-13610032.137 4542456.650, -13609960...",06001428301
81,"POLYGON ((-13610111.953 4542843.479, -13610095...",06001428302
82,"POLYGON ((-13134046.655 4012084.036, -13133879...",06059001801
...,...,...
9123,"POLYGON ((-13620661.625 4574732.401, -13620650...",06013366002
9124,"POLYGON ((-13131155.246 4018679.418, -13131122...",06059001303
9125,"POLYGON ((-13131155.692 4019165.343, -13131058...",06059001304
9126,"POLYGON ((-13130196.230 4021423.785, -13130195...",06059001401


In [13]:
# now find the indices which correspond to the original data
unchanged_tracts_old = pd.to_numeric(old_tract_calenviroscreen_data['Census Tract']).isin(pd.to_numeric(ca_boundaries['Census Tract']))
original_df = old_tract_calenviroscreen_data[unchanged_tracts_old]
original_df["Census Tract"] = original_df["Census Tract"].apply(lambda x: '{0:>13}'.format(x))
original_df

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  super().__setitem__(key, value)


Unnamed: 0,Census Tract,geometry,Total Population,California County,ZIP,Approximate Location,Longitude,Latitude,CES 4.0 Score,CES 4.0 Percentile,...,Poverty,Poverty Pctl,Unemployment,Unemployment Pctl,Housing Burden,Housing Burden Pctl,Pop. Char.,Pop. Char. Score,Pop. Char. Pctl,Drinking Water Score Percentile
0,06001442700,"POLYGON ((-13582893.983 4514550.527, -13582891...",3048,Alameda,94536,Fremont,-122.008111,37.537154,12.891690,20.196672,...,5.1,1.645729,3.4,22.566372,3.3,0.392902,27.644782,2.867662,16.729702,8.280255
1,06001442800,"POLYGON ((-13581234.219 4513218.736, -13581226...",2952,Alameda,94538,Fremont,-121.993103,37.529362,21.730953,41.616238,...,10.1,10.439698,4.4,36.439355,9.9,13.561470,47.489498,4.926204,46.457388,8.280255
2,06037204920,"POLYGON ((-13158279.595 4031441.433, -13158278...",2751,Los Angeles,90023,Los Angeles,-118.197497,34.017500,82.393909,99.974786,...,62.6,93.391960,6.4,61.530453,20.3,63.967047,83.751814,8.687785,95.789208,92.531535
3,06037205110,"POLYGON ((-13160149.535 4032331.636, -13160128...",3904,Los Angeles,90023,Los Angeles,-118.214298,34.024506,70.126624,99.104892,...,60.8,92.198492,5.0,44.351900,28.5,87.883397,81.963948,8.502325,94.452849,92.531535
4,06037205120,"POLYGON ((-13160215.432 4031563.223, -13160210...",3548,Los Angeles,90023,Los Angeles,-118.211796,34.018755,75.724241,99.747857,...,82.3,99.623116,3.3,21.108798,26.1,82.788340,80.189551,8.318263,92.713061,92.531535
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8028,06013366002,"POLYGON ((-13620661.625 4574732.401, -13620650...",6627,Contra Costa,94806,San Pablo,-122.350933,37.969004,50.621809,88.464448,...,41.3,71.092965,7.9,74.739719,14.8,37.477820,75.764900,7.859283,87.670197,4.208817
8029,06059001303,"POLYGON ((-13131155.246 4018679.418, -13131122...",5884,Orange,90631,La Habra,-117.951167,33.920901,38.845479,73.260212,...,28.3,51.092965,6.6,63.417491,7.6,5.817490,49.795859,5.165449,49.974786,95.578868
8030,06059001304,"POLYGON ((-13131155.692 4019165.343, -13131058...",3982,Orange,90631,La Habra,-117.945541,33.924438,54.822235,92.372668,...,46.9,78.454774,14.7,96.577303,17.7,52.610900,66.358412,6.883524,74.899143,95.578868
8031,06059001401,"POLYGON ((-13130196.230 4021423.785, -13130195...",4495,Orange,90631,La Habra,-117.941044,33.941997,40.236731,75.365608,...,37.6,65.540201,8.6,78.969287,25.6,81.837769,64.449935,6.685553,72.175996,95.004371


In [14]:
# now we only have to join the remaining tracts
mapped_df = gpd.sjoin_nearest(
    ca_boundaries[~unchanged_tracts_ca], 
    old_tract_calenviroscreen_data[~unchanged_tracts_old], 
    how="inner", distance_col="distances", 
    max_distance=5000
)
mapped_df = mapped_df.rename(columns={'Census Tract_1':'Census Tract'})
# remove unnecessary columns
mapped_df = mapped_df.drop(
    columns=[col for col in mapped_df.columns if col not in original_df.columns]
)
mapped_df

Unnamed: 0,geometry,Total Population,California County,ZIP,Approximate Location,Longitude,Latitude,CES 4.0 Score,CES 4.0 Percentile,CES 4.0 Percentile Range,...,Poverty,Poverty Pctl,Unemployment,Unemployment Pctl,Housing Burden,Housing Burden Pctl,Pop. Char.,Pop. Char. Score,Pop. Char. Pctl,Drinking Water Score Percentile
2,"POLYGON ((-13583749.029 4469373.052, -13583714...",7095,Santa Clara,95030,Unincorporated Santa Clara County area,-121.993428,37.221238,5.889693,4.488149,1-5% (lowest scores),...,6.6,3.542714,4.3,35.020822,10.5,15.855513,13.308393,1.380513,2.660111,19.170726
3,"POLYGON ((-13580203.383 4470602.095, -13580142...",7095,Santa Clara,95030,Unincorporated Santa Clara County area,-121.993428,37.221238,5.889693,4.488149,1-5% (lowest scores),...,6.6,3.542714,4.3,35.020822,10.5,15.855513,13.308393,1.380513,2.660111,19.170726
4,"POLYGON ((-13573372.106 4480730.095, -13572964...",7541,Santa Clara,95126,San Jose,-121.920656,37.304720,15.388020,26.437216,25-30%,...,32.4,57.625628,6.8,65.629880,27.5,85.956907,48.260934,5.006227,47.793747,22.742600
5,"POLYGON ((-13572402.399 4481399.439, -13572401...",7541,Santa Clara,95126,San Jose,-121.920656,37.304720,15.388020,26.437216,25-30%,...,32.4,57.625628,6.8,65.629880,27.5,85.956907,48.260934,5.006227,47.793747,22.742600
6,"POLYGON ((-13571530.874 4483014.269, -13571528...",7541,Santa Clara,95126,San Jose,-121.920656,37.304720,15.388020,26.437216,25-30%,...,32.4,57.625628,6.8,65.629880,27.5,85.956907,48.260934,5.006227,47.793747,22.742600
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8552,"POLYGON ((-13618295.631 4518847.147, -13618278...",7788,San Mateo,94401,San Mateo,-122.322082,37.572721,42.223423,78.340898,75-80%,...,45.2,76.432161,3.4,22.566372,21.8,69.911280,58.792134,6.098655,63.653555,23.441988
8553,"POLYGON ((-13617244.772 4518845.876, -13617243...",7788,San Mateo,94401,San Mateo,-122.322082,37.572721,42.223423,78.340898,75-80%,...,45.2,76.432161,3.4,22.566372,21.8,69.911280,58.792134,6.098655,63.653555,23.441988
8560,"POLYGON ((-13619131.420 4519518.650, -13619130...",7788,San Mateo,94401,San Mateo,-122.322082,37.572721,42.223423,78.340898,75-80%,...,45.2,76.432161,3.4,22.566372,21.8,69.911280,58.792134,6.098655,63.653555,23.441988
9128,"POLYGON ((-13149442.843 4000257.235, -13149307...",133,Los Angeles,90815,Long Beach,-118.118648,33.778436,,,,...,,,,,,,,,,34.732109


In [15]:
# then concatenate the sjoined tracts with the unchanged ones
joined_df = pd.concat([original_df,mapped_df])
joined_df

Unnamed: 0,Census Tract,geometry,Total Population,California County,ZIP,Approximate Location,Longitude,Latitude,CES 4.0 Score,CES 4.0 Percentile,...,Poverty,Poverty Pctl,Unemployment,Unemployment Pctl,Housing Burden,Housing Burden Pctl,Pop. Char.,Pop. Char. Score,Pop. Char. Pctl,Drinking Water Score Percentile
0,06001442700,"POLYGON ((-13582893.983 4514550.527, -13582891...",3048,Alameda,94536,Fremont,-122.008111,37.537154,12.891690,20.196672,...,5.1,1.645729,3.4,22.566372,3.3,0.392902,27.644782,2.867662,16.729702,8.280255
1,06001442800,"POLYGON ((-13581234.219 4513218.736, -13581226...",2952,Alameda,94538,Fremont,-121.993103,37.529362,21.730953,41.616238,...,10.1,10.439698,4.4,36.439355,9.9,13.561470,47.489498,4.926204,46.457388,8.280255
2,06037204920,"POLYGON ((-13158279.595 4031441.433, -13158278...",2751,Los Angeles,90023,Los Angeles,-118.197497,34.017500,82.393909,99.974786,...,62.6,93.391960,6.4,61.530453,20.3,63.967047,83.751814,8.687785,95.789208,92.531535
3,06037205110,"POLYGON ((-13160149.535 4032331.636, -13160128...",3904,Los Angeles,90023,Los Angeles,-118.214298,34.024506,70.126624,99.104892,...,60.8,92.198492,5.0,44.351900,28.5,87.883397,81.963948,8.502325,94.452849,92.531535
4,06037205120,"POLYGON ((-13160215.432 4031563.223, -13160210...",3548,Los Angeles,90023,Los Angeles,-118.211796,34.018755,75.724241,99.747857,...,82.3,99.623116,3.3,21.108798,26.1,82.788340,80.189551,8.318263,92.713061,92.531535
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8552,,"POLYGON ((-13618295.631 4518847.147, -13618278...",7788,San Mateo,94401,San Mateo,-122.322082,37.572721,42.223423,78.340898,...,45.2,76.432161,3.4,22.566372,21.8,69.911280,58.792134,6.098655,63.653555,23.441988
8553,,"POLYGON ((-13617244.772 4518845.876, -13617243...",7788,San Mateo,94401,San Mateo,-122.322082,37.572721,42.223423,78.340898,...,45.2,76.432161,3.4,22.566372,21.8,69.911280,58.792134,6.098655,63.653555,23.441988
8560,,"POLYGON ((-13619131.420 4519518.650, -13619130...",7788,San Mateo,94401,San Mateo,-122.322082,37.572721,42.223423,78.340898,...,45.2,76.432161,3.4,22.566372,21.8,69.911280,58.792134,6.098655,63.653555,23.441988
9128,,"POLYGON ((-13149442.843 4000257.235, -13149307...",133,Los Angeles,90815,Long Beach,-118.118648,33.778436,,,...,,,,,,,,,,34.732109


In [16]:
# select relevant columns
metric_enviroscreen_data = merged_enviroscreen_data[['Census Tract',
                                                'Asthma',
                                                'Low Birth Weight', 
                                                'Cardiovascular Disease', 
                                                'Education', 
                                                'Linguistic Isolation',
                                                'Poverty',
                                                'Unemployment', 
                                                'Housing Burden', 
                                                'Drinking Water Score Percentile'
                                                ]]
calenviroscreen_2019 = joined_df[metric_enviroscreen_data.columns]
calenviroscreen_2019

Unnamed: 0,Census Tract,Asthma,Low Birth Weight,Cardiovascular Disease,Education,Linguistic Isolation,Poverty,Unemployment,Housing Burden,Drinking Water Score Percentile
0,06001442700,34.20,5.91,9.84,2.6,2.5,5.1,3.4,3.3,8.280255
1,06001442800,42.72,7.05,12.38,10.5,9.8,10.1,4.4,9.9,8.280255
2,06037204920,76.10,7.11,20.87,52.2,17.1,62.6,6.4,20.3,92.531535
3,06037205110,76.10,5.68,20.87,60.7,26.3,60.8,5.0,28.5,92.531535
4,06037205120,76.10,5.88,20.87,59.0,22.4,82.3,3.3,26.1,92.531535
...,...,...,...,...,...,...,...,...,...,...
8552,,51.24,5.43,9.70,29.9,24.7,45.2,3.4,21.8,23.441988
8553,,51.24,5.43,9.70,29.9,24.7,45.2,3.4,21.8,23.441988
8560,,51.24,5.43,9.70,29.9,24.7,45.2,3.4,21.8,23.441988
9128,,23.80,,12.66,,,,,,34.732109


In [17]:
# last, get things down to the level of the newer census tracts
calenviroscreen_2019 = calenviroscreen_2019.groupby('Census Tract').mean().reset_index()
calenviroscreen_2019

Unnamed: 0,Census Tract,Asthma,Low Birth Weight,Cardiovascular Disease,Education,Linguistic Isolation,Poverty,Unemployment,Housing Burden,Drinking Water Score Percentile
0,06001400100,15.65,3.85,5.24,3.3,1.2,10.4,,11.2,4.208817
1,06001400200,20.47,4.05,8.14,0.4,0.0,10.6,3.0,4.0,4.208817
2,06001400300,30.88,3.78,8.88,5.6,8.0,10.3,3.9,8.9,4.208817
3,06001400400,49.61,4.44,8.08,4.8,0.9,21.1,2.5,14.8,4.208817
4,06001400500,86.57,3.64,11.13,2.3,1.7,21.9,3.8,14.8,4.208817
...,...,...,...,...,...,...,...,...,...,...
6853,06115040500,42.65,4.73,23.36,32.6,,50.9,13.8,20.1,41.126514
6854,06115040600,44.37,4.78,24.72,30.7,10.2,45.9,7.4,15.8,23.229674
6855,06115040800,35.41,4.82,15.27,13.4,0.0,17.0,8.2,5.3,61.358811
6856,06115040901,50.32,2.87,16.27,17.3,2.8,38.1,6.3,20.2,96.403147


## Adjust previously used dfs that contain county and population data and merge them back in to our data

In [18]:
tract_county = joined_df[['Census Tract', 'California County']]
tract_county = tract_county.rename(columns={'California County':'County'})
tract_county

Unnamed: 0,Census Tract,County
0,06001442700,Alameda
1,06001442800,Alameda
2,06037204920,Los Angeles
3,06037205110,Los Angeles
4,06037205120,Los Angeles
...,...,...
8552,,San Mateo
8553,,San Mateo
8560,,San Mateo
9128,,Los Angeles


In [19]:
population = joined_df[['Census Tract', 'Total Population']]
population

Unnamed: 0,Census Tract,Total Population
0,06001442700,3048
1,06001442800,2952
2,06037204920,2751
3,06037205110,3904
4,06037205120,3548
...,...,...
8552,,7788
8553,,7788
8560,,7788
9128,,133


In [20]:
# Merging data to get population and county data
calenviroscreen_2019_final = pd.merge(tract_county, calenviroscreen_2019, on='Census Tract', how='right')
calenviroscreen_2019_final = pd.merge(calenviroscreen_2019_final, population, on='Census Tract', how='left')
calenviroscreen_2019_final = calenviroscreen_2019_final.rename(columns={'Total Population': 'Total Population 2019'})

In [21]:
calenviroscreen_2019_final

Unnamed: 0,Census Tract,County,Asthma,Low Birth Weight,Cardiovascular Disease,Education,Linguistic Isolation,Poverty,Unemployment,Housing Burden,Drinking Water Score Percentile,Total Population 2019
0,06001400100,Alameda,15.65,3.85,5.24,3.3,1.2,10.4,,11.2,4.208817,3120
1,06001400200,Alameda,20.47,4.05,8.14,0.4,0.0,10.6,3.0,4.0,4.208817,2007
2,06001400300,Alameda,30.88,3.78,8.88,5.6,8.0,10.3,3.9,8.9,4.208817,5051
3,06001400400,Alameda,49.61,4.44,8.08,4.8,0.9,21.1,2.5,14.8,4.208817,4007
4,06001400500,Alameda,86.57,3.64,11.13,2.3,1.7,21.9,3.8,14.8,4.208817,4124
...,...,...,...,...,...,...,...,...,...,...,...,...
6853,06115040500,Yuba,42.65,4.73,23.36,32.6,,50.9,13.8,20.1,41.126514,4052
6854,06115040600,Yuba,44.37,4.78,24.72,30.7,10.2,45.9,7.4,15.8,23.229674,5702
6855,06115040800,Yuba,35.41,4.82,15.27,13.4,0.0,17.0,8.2,5.3,61.358811,4652
6856,06115040901,Yuba,50.32,2.87,16.27,17.3,2.8,38.1,6.3,20.2,96.403147,2720


In [22]:
calenviroscreen_2019_final.to_csv('society_calenviroscreen_metric.csv')

### Function Call
The function below creates new df's for each metric listed below. Some metrics are already in percent from the 2019 data, so those columns are renamed and retained for Cal-CRAI metric. df's are saved as csv's named off of their metric column:

ones that are already in percent from 2019 data
* % of live, singleton births < 5.5 pounds (non-twin, including premature)
* % of population 25 and older with less than a high school education
* % of households where all members 14 and older have some difficult speaking English
* % of population living below 2x federal poverty level
* % of population > 16 years old unemployed and eligible for the workforce
* % of households which are low-income and housing-burdened

metric calculated as a percentile:
* Drinking Water Score Percentile 

The function can also calculate metric per 10,000 people for metrics that have a 'sum of' column rather than pre-baked in percentages:

metrics that have been calculated per 10,000 people:
* Age-adjusted emergency department visits for asthma per 10,000 people
* Age-adjusted emergency department visits for myocardial infarction per 10,000 people

Asthma and cardiovascular percentage can be calculated with 2019 and 2021 as the CalEnviroscreen values are 'Age-adjusted rate of emergency department visits for asthma/cardiovascular disease'

# Calling function for both metric calc types

In [25]:
#@append_metadata
def calenviroscreen_metric_calc(input_csv, columns_to_process, calculate_per_10000=False, export=False, varname=""):
    '''
    Calculates the following metrics sourced from CalEnviroScreen:
    * % of live, singleton births < 5.5 pounds (non-twin, including premature)
    * % of population 25 and older with less than a high school education
    * % of households where all members 14 and older have some difficult speaking English
    * % of population living below 2x federal poverty level
    * % of population > 16 years old unemployed and eligible for the workforce
    * % of households which are low-income and housing-burdened
    * Age-adjusted emergency department visits for asthma per 10,000 people
    * Age-adjusted emergency department visits for myocardial infarction per 10,000 people
    * Drinking Water Score Percentile

    Note
    --------
    Each of the above metrics is calculated separately; please see the corresponding 
    variable name (the same as the filename for this document) to know which one this 
    particular metadata document describes. 
  
    Methods
    --------
    Relevant data columns were isolated and renamed to align with Cal-CRAI metrics.
    Data was from older census tracts, so we merged it with 2017 California Tiger shape files first.
    The data was then set to Cal-CRAI standardized coordinate reference system.
    Data was then spatially joined to nearest 2021 census tract data.
    Extra tracts merged in were given the average value for each metric based on 
    the county they reside in.
    This averaging was also done for missing data in otherwise populated tracts.
    Metrics with % calculations were largely untouched as CalEnviroScreen data had
    those metrics calculated for 2019.
    Metrics with emergency department visits had their values adjusted to reflect
    number of visits per 10,000 people per tract with the 2019 population data.

    Parameters
    ------------
    columns_to_process: list
        list of columns that contain desired metric data
    calculate_per_10000: boolean
        if true, adds columns with calculations for # of visits per 10,000 people
        if false, retains the column but renames to 2019
    varname: string
        Final metric name.
    export: bool
        If True, uploads file to S3.
        If False, just generates metadata file.

    Script
    ------
    cal_enviroscreen_metrics.ipynb

    Note
    ------
    This function assumes users have configured the AWS CLI such that their access key / 
    secret key pair are stored in ~/.aws/credentials. 
    See https://docs.aws.amazon.com/cli/latest/userguide/getting-started-install.html for guidance.
    '''
    merged_df = pd.read_csv(input_csv)
        
    # List to store generated CSV file names
    csv_file_names = []
        
    for column in columns_to_process:
        # Create new DataFrame
        new_df = merged_df[['Census Tract', 'County', 'Total Population 2019']].copy()
        new_df = new_df.rename(columns={'Census Tract': 'census_tract'})  
        # Create new column name
        if column == 'Drinking Water Score Percentile':
            new_column_name = column.replace(' ', '_')
        else:
            new_column_name = column.replace(' ', '_')
            if calculate_per_10000:
                new_column_name += '_related_ED_visits_2019'
                new_column_name_per_10000_people_2019 = new_column_name.replace('_2019', '_per_10000_people_2019')
            else:
                new_column_name += '_percent_2019'

        # Lowercase the column name
        new_column_name = new_column_name.lower()
    
        # Add new column with the calculated name
        if not calculate_per_10000:
            new_df[new_column_name] = merged_df[column]

            print('Data transformation: adding calculation columns for metrics with emergency department visits.')

        else:
            new_df[column] = merged_df[column]
            new_df[new_column_name_per_10000_people_2019] = (merged_df[column] / merged_df['Total Population 2019']) * 10000
            print('Data transformation: adding calculation columns for metrics with emergency department visits.')
        
        if not calculate_per_10000:
            # Define CSV file name based on the new column name
            csv_filename = 'society_vulnerable_' + column.replace(' ', '_').replace('.','').lower() + '_metric.csv'
        else:
            # Define CSV file name based on the new column name
            csv_filename = 'society_vulnerable_' + column.replace(' ', '_').replace('.','').lower() + '_metric.csv'

        # Save the DataFrame to CSV
        new_df.columns = new_df.columns.str.lower()
        new_df.to_csv(csv_filename, index=False)
        
        print(f"Saved DataFrame to: {csv_filename}")
        # Append CSV filename to the list
        csv_file_names.append(csv_filename)
        # Output or further process new DataFrame
        display(new_df)
        
    if export == True:
        bucket_name = 'ca-climate-index'
        directory = '3_fair_data/index_data'
        upload_csv_aws([csv_filename], bucket_name, directory)

    if export == False:
        print(f'{csv_filename} uploaded to AWS.')

In [26]:
input_csv = 'society_calenviroscreen_metric.csv'

# Lists of columns and varnames
columns_to_process_no_10000 = [
    'Low Birth Weight',
    'Education',
    'Linguistic Isolation',
    'Poverty',
    'Unemployment',
    'Housing Burden',
    'Drinking Water Score Percentile'
]
varnames_no_10000 = [
    'society_calenviroscreen_birth_weight', 
    'society_calenviroscreen_low_education', 
    'society_calenviroscreen_nonenglish_speakers',
    'society_calenviroscreen_below_poverty_level',
    'society_calenviroscreen_unemployment',
    'society_calenviroscreen_housing_burdened',
    'society_calenviroscreen_drinking_water_percentile'
]

# Calculate metric without percentages
for col, var in zip(columns_to_process_no_10000, varnames_no_10000):
    print(f"Processing {col} without percentage calculation")
    calenviroscreen_metric_calc(input_csv, [col], calculate_per_10000=False, export=False, varname=var)

varnames_10000 = [
    'society_calenviroscreen_emergency_dept_visits',
    'society_calenviroscreen_emergency_dept_myocardial_visits'
]

# Columns to loop through that include calculating percentages
columns_to_process_per_10000 = [
    'Asthma',
    'Cardiovascular Disease'
]

# Calculate percentages
for col, var in zip(columns_to_process_per_10000, varnames_10000):
    print(f"Processing {col} with percentage calculation")
    calenviroscreen_metric_calc(input_csv, [col], calculate_per_10000=True, export=False, varname='test')

Processing Low Birth Weight without percentage calculation
Data transformation: adding calculation columns for metrics with emergency department visits.
Saved DataFrame to: society_vulnerable_low_birth_weight_metric.csv


Unnamed: 0,census_tract,county,total population 2019,low_birth_weight_percent_2019
0,6001400100,Alameda,3120,3.85
1,6001400200,Alameda,2007,4.05
2,6001400300,Alameda,5051,3.78
3,6001400400,Alameda,4007,4.44
4,6001400500,Alameda,4124,3.64
...,...,...,...,...
6853,6115040500,Yuba,4052,4.73
6854,6115040600,Yuba,5702,4.78
6855,6115040800,Yuba,4652,4.82
6856,6115040901,Yuba,2720,2.87


society_vulnerable_low_birth_weight_metric.csv uploaded to AWS.
Processing Education without percentage calculation
Data transformation: adding calculation columns for metrics with emergency department visits.
Saved DataFrame to: society_vulnerable_education_metric.csv


Unnamed: 0,census_tract,county,total population 2019,education_percent_2019
0,6001400100,Alameda,3120,3.3
1,6001400200,Alameda,2007,0.4
2,6001400300,Alameda,5051,5.6
3,6001400400,Alameda,4007,4.8
4,6001400500,Alameda,4124,2.3
...,...,...,...,...
6853,6115040500,Yuba,4052,32.6
6854,6115040600,Yuba,5702,30.7
6855,6115040800,Yuba,4652,13.4
6856,6115040901,Yuba,2720,17.3


society_vulnerable_education_metric.csv uploaded to AWS.
Processing Linguistic Isolation without percentage calculation
Data transformation: adding calculation columns for metrics with emergency department visits.
Saved DataFrame to: society_vulnerable_linguistic_isolation_metric.csv


Unnamed: 0,census_tract,county,total population 2019,linguistic_isolation_percent_2019
0,6001400100,Alameda,3120,1.2
1,6001400200,Alameda,2007,0.0
2,6001400300,Alameda,5051,8.0
3,6001400400,Alameda,4007,0.9
4,6001400500,Alameda,4124,1.7
...,...,...,...,...
6853,6115040500,Yuba,4052,
6854,6115040600,Yuba,5702,10.2
6855,6115040800,Yuba,4652,0.0
6856,6115040901,Yuba,2720,2.8


society_vulnerable_linguistic_isolation_metric.csv uploaded to AWS.
Processing Poverty without percentage calculation
Data transformation: adding calculation columns for metrics with emergency department visits.
Saved DataFrame to: society_vulnerable_poverty_metric.csv


Unnamed: 0,census_tract,county,total population 2019,poverty_percent_2019
0,6001400100,Alameda,3120,10.4
1,6001400200,Alameda,2007,10.6
2,6001400300,Alameda,5051,10.3
3,6001400400,Alameda,4007,21.1
4,6001400500,Alameda,4124,21.9
...,...,...,...,...
6853,6115040500,Yuba,4052,50.9
6854,6115040600,Yuba,5702,45.9
6855,6115040800,Yuba,4652,17.0
6856,6115040901,Yuba,2720,38.1


society_vulnerable_poverty_metric.csv uploaded to AWS.
Processing Unemployment without percentage calculation
Data transformation: adding calculation columns for metrics with emergency department visits.
Saved DataFrame to: society_vulnerable_unemployment_metric.csv


Unnamed: 0,census_tract,county,total population 2019,unemployment_percent_2019
0,6001400100,Alameda,3120,
1,6001400200,Alameda,2007,3.0
2,6001400300,Alameda,5051,3.9
3,6001400400,Alameda,4007,2.5
4,6001400500,Alameda,4124,3.8
...,...,...,...,...
6853,6115040500,Yuba,4052,13.8
6854,6115040600,Yuba,5702,7.4
6855,6115040800,Yuba,4652,8.2
6856,6115040901,Yuba,2720,6.3


society_vulnerable_unemployment_metric.csv uploaded to AWS.
Processing Housing Burden without percentage calculation
Data transformation: adding calculation columns for metrics with emergency department visits.
Saved DataFrame to: society_vulnerable_housing_burden_metric.csv


Unnamed: 0,census_tract,county,total population 2019,housing_burden_percent_2019
0,6001400100,Alameda,3120,11.2
1,6001400200,Alameda,2007,4.0
2,6001400300,Alameda,5051,8.9
3,6001400400,Alameda,4007,14.8
4,6001400500,Alameda,4124,14.8
...,...,...,...,...
6853,6115040500,Yuba,4052,20.1
6854,6115040600,Yuba,5702,15.8
6855,6115040800,Yuba,4652,5.3
6856,6115040901,Yuba,2720,20.2


society_vulnerable_housing_burden_metric.csv uploaded to AWS.
Processing Drinking Water Score Percentile without percentage calculation
Data transformation: adding calculation columns for metrics with emergency department visits.
Saved DataFrame to: society_vulnerable_drinking_water_score_percentile_metric.csv


Unnamed: 0,census_tract,county,total population 2019,drinking_water_score_percentile
0,6001400100,Alameda,3120,4.208817
1,6001400200,Alameda,2007,4.208817
2,6001400300,Alameda,5051,4.208817
3,6001400400,Alameda,4007,4.208817
4,6001400500,Alameda,4124,4.208817
...,...,...,...,...
6853,6115040500,Yuba,4052,41.126514
6854,6115040600,Yuba,5702,23.229674
6855,6115040800,Yuba,4652,61.358811
6856,6115040901,Yuba,2720,96.403147


society_vulnerable_drinking_water_score_percentile_metric.csv uploaded to AWS.
Processing Asthma with percentage calculation
Data transformation: adding calculation columns for metrics with emergency department visits.
Saved DataFrame to: society_vulnerable_asthma_metric.csv


Unnamed: 0,census_tract,county,total population 2019,asthma,asthma_related_ed_visits_per_10000_people_2019
0,6001400100,Alameda,3120,15.65,50.160256
1,6001400200,Alameda,2007,20.47,101.993024
2,6001400300,Alameda,5051,30.88,61.136409
3,6001400400,Alameda,4007,49.61,123.808335
4,6001400500,Alameda,4124,86.57,209.917556
...,...,...,...,...,...
6853,6115040500,Yuba,4052,42.65,105.256663
6854,6115040600,Yuba,5702,44.37,77.814802
6855,6115040800,Yuba,4652,35.41,76.117799
6856,6115040901,Yuba,2720,50.32,185.000000


society_vulnerable_asthma_metric.csv uploaded to AWS.
Processing Cardiovascular Disease with percentage calculation
Data transformation: adding calculation columns for metrics with emergency department visits.
Saved DataFrame to: society_vulnerable_cardiovascular_disease_metric.csv


Unnamed: 0,census_tract,county,total population 2019,cardiovascular disease,cardiovascular_disease_related_ed_visits_per_10000_people_2019
0,6001400100,Alameda,3120,5.24,16.794872
1,6001400200,Alameda,2007,8.14,40.558047
2,6001400300,Alameda,5051,8.88,17.580677
3,6001400400,Alameda,4007,8.08,20.164712
4,6001400500,Alameda,4124,11.13,26.988361
...,...,...,...,...,...
6853,6115040500,Yuba,4052,23.36,57.650543
6854,6115040600,Yuba,5702,24.72,43.353209
6855,6115040800,Yuba,4652,15.27,32.824592
6856,6115040901,Yuba,2720,16.27,59.816176


society_vulnerable_cardiovascular_disease_metric.csv uploaded to AWS.
