## This notebook creates the following metrics within the Society & Economy domain sourced from CalEnviroScreen:
* Age-adjusted emergency department visits for asthma per 10,000 people
* Age-adjusted emergency department visits for myocardial infarction per 10,000 people
* % of live, singleton births < 5.5 pounds (non-twin, including premature)
* % of population 25 and older with less than a high school education
* % of households where all members 14 and older have some difficult speaking English
* % of population living below 2x federal poverty level
* % of population > 16 years old unemployed and eligible for the workforce
* % of households which are low-income and housing-burdened
* number of impaired waterbodies 

In [1]:
import pandas as pd
import os
import sys
import math
import numpy as np
import geopandas as gpd

# suppress pandas purely educational warnings
from warnings import simplefilter
simplefilter(action="ignore", category=pd.errors.PerformanceWarning)

sys.path.append(os.path.expanduser('../../'))
from scripts.utils.file_helpers import pull_csv_from_directory, upload_csv_aws, filter_counties
from scripts.utils.write_metadata import append_metadata

In [2]:
# pull .xlsx from aws
enviroscreen_excel = 's3://ca-climate-index/1_pull_data/society_economy/vulnerable_populations/ca_enviro_screen/calenviroscreen.xlsx'
enviroscreen_data = pd.read_excel(enviroscreen_excel)

In [3]:
enviroscreen_data

Unnamed: 0,Census Tract,Total Population,California County,ZIP,Approximate Location,Longitude,Latitude,CES 4.0 Score,CES 4.0 Percentile,CES 4.0 Percentile Range,...,Linguistic Isolation Pctl,Poverty,Poverty Pctl,Unemployment,Unemployment Pctl,Housing Burden,Housing Burden Pctl,Pop. Char.,Pop. Char. Score,Pop. Char. Pctl
0,6019001100,2780,Fresno,93706,Fresno,-119.781696,36.709695,93.183570,100.000000,95-100% (highest scores),...,79.374746,76.0,98.919598,12.8,93.831338,30.3,91.039290,93.155109,9.663213,99.722642
1,6077000700,4680,San Joaquin,95206,Stockton,-121.287873,37.943173,86.653790,99.987393,95-100% (highest scores),...,95.533902,73.2,98.391960,19.8,99.206143,31.2,92.281369,93.165408,9.664281,99.735250
2,6037204920,2751,Los Angeles,90023,Los Angeles,-118.197497,34.017500,82.393909,99.974786,95-100% (highest scores),...,81.553661,62.6,93.391960,6.4,61.530453,20.3,63.967047,83.751814,8.687785,95.789208
3,6019000700,3664,Fresno,93706,Fresno,-119.827707,36.734535,81.327940,99.962179,95-100% (highest scores),...,78.711598,65.7,95.351759,15.7,97.345133,35.4,96.413181,94.641227,9.817371,99.886536
4,6019000200,2689,Fresno,93706,Fresno,-119.805504,36.735491,80.745476,99.949571,95-100% (highest scores),...,86.561104,72.7,98.304020,13.7,95.288912,32.7,94.157161,95.398873,9.895964,99.949571
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8030,6107004000,582,Tulare,93257,Porterville,-118.983849,36.038061,,,,...,,79.6,99.422111,,,,,,,
8031,6109985202,2509,Tuolumne,95327,Unincorporated Tuolumne County area,-120.537071,37.891939,,,,...,,,,,,,,,,
8032,6111001206,778,Ventura,93001,Unincorporated Ventura County area,-119.371944,34.343903,,,,...,,17.1,27.349246,,,24.4,78.466413,,,
8033,6111003012,675,Ventura,93036,Oxnard,-119.180105,34.235076,,,,...,99.553390,96.7,100.000000,,,,,,,


## The data is using older tract data, so we will join it with 2017 Tract data first

In [4]:
# read in CA census tiger file
old_census_path = "s3://ca-climate-index/0_map_data/tl_2017_06_tract/"
ca_old = gpd.read_file(old_census_path)
ca_old['Census Tract'] = pd.to_numeric(ca_old.GEOID)
ca_old = ca_old[["Census Tract","geometry"]]

In [5]:
old_tract_calenviroscreen_data = pd.merge(ca_old, enviroscreen_data, on="Census Tract")
old_tract_calenviroscreen_data = gpd.GeoDataFrame(old_tract_calenviroscreen_data, geometry="geometry")

## Now call in 2021 census data

In [6]:
# read in CA census tiger file
census_shp_dir = "s3://ca-climate-index/0_map_data/2021_tiger_census_tract/2021_ca_tract/"

ca_boundaries = gpd.read_file(census_shp_dir)
# need to rename columns so we don't have any duplicates in the final geodatabase
column_names = ca_boundaries.columns
ca_boundaries = ca_boundaries.rename(columns={'GEOID':'Census Tract'})
# drop unnecessary columns
ca_boundaries = ca_boundaries[["geometry","Census Tract"]]
ca_boundaries

Unnamed: 0,geometry,Census Tract
0,"POLYGON ((-121.87556 37.39924, -121.87535 37.3...",06085504321
1,"POLYGON ((-121.88886 37.40758, -121.88576 37.4...",06085504410
2,"POLYGON ((-122.02489 37.21683, -122.02459 37.2...",06085507003
3,"POLYGON ((-121.99304 37.22562, -121.99249 37.2...",06085507004
4,"POLYGON ((-121.93167 37.29803, -121.92801 37.3...",06085502204
...,...,...
9124,"POLYGON ((-117.95917 33.92458, -117.95888 33.9...",06059001303
9125,"POLYGON ((-117.95918 33.92820, -117.95831 33.9...",06059001304
9126,"POLYGON ((-117.95056 33.94503, -117.95055 33.9...",06059001401
9127,"POLYGON ((-122.34551 37.96355, -122.34550 37.9...",06013367200


In [7]:
# need to convert to an area-preserving CRS for distance calculations
old_tract_calenviroscreen_data = old_tract_calenviroscreen_data.to_crs(crs=3857) 
ca_boundaries = ca_boundaries.to_crs(crs=3857) 

In [8]:
# first find the tracts which have not changed from 2010 to 2017
# find the indices which correspond to the new boundaries
unchanged_tracts_ca = pd.to_numeric(ca_boundaries['Census Tract']).isin(pd.to_numeric(old_tract_calenviroscreen_data['Census Tract']))
ca_boundaries[unchanged_tracts_ca]

Unnamed: 0,geometry,Census Tract
0,"POLYGON ((-13567125.366 4494902.743, -13567102...",06085504321
1,"POLYGON ((-13568606.361 4496070.766, -13568261...",06085504410
80,"POLYGON ((-13610032.137 4542456.650, -13609960...",06001428301
81,"POLYGON ((-13610111.953 4542843.479, -13610095...",06001428302
82,"POLYGON ((-13134046.655 4012084.036, -13133879...",06059001801
...,...,...
9123,"POLYGON ((-13620661.625 4574732.401, -13620650...",06013366002
9124,"POLYGON ((-13131155.246 4018679.418, -13131122...",06059001303
9125,"POLYGON ((-13131155.692 4019165.343, -13131058...",06059001304
9126,"POLYGON ((-13130196.230 4021423.785, -13130195...",06059001401


In [9]:
# now find the indices which correspond to the original data
unchanged_tracts_old = pd.to_numeric(old_tract_calenviroscreen_data['Census Tract']).isin(pd.to_numeric(ca_boundaries['Census Tract']))
original_df = old_tract_calenviroscreen_data[unchanged_tracts_old]
original_df["Census Tract"] = original_df["Census Tract"].apply(lambda x: '{0:>11}'.format(x))
original_df

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  super().__setitem__(key, value)


Unnamed: 0,Census Tract,geometry,Total Population,California County,ZIP,Approximate Location,Longitude,Latitude,CES 4.0 Score,CES 4.0 Percentile,...,Linguistic Isolation Pctl,Poverty,Poverty Pctl,Unemployment,Unemployment Pctl,Housing Burden,Housing Burden Pctl,Pop. Char.,Pop. Char. Score,Pop. Char. Pctl
0,6001442700,"POLYGON ((-13582893.983 4514550.527, -13582891...",3048,Alameda,94536,Fremont,-122.008111,37.537154,12.891690,20.196672,...,19.867370,5.1,1.645729,3.4,22.566372,3.3,0.392902,27.644782,2.867662,16.729702
1,6001442800,"POLYGON ((-13581234.219 4513218.736, -13581226...",2952,Alameda,94538,Fremont,-121.993103,37.529362,21.730953,41.616238,...,61.090811,10.1,10.439698,4.4,36.439355,9.9,13.561470,47.489498,4.926204,46.457388
2,6037204920,"POLYGON ((-13158279.595 4031441.433, -13158278...",2751,Los Angeles,90023,Los Angeles,-118.197497,34.017500,82.393909,99.974786,...,81.553661,62.6,93.391960,6.4,61.530453,20.3,63.967047,83.751814,8.687785,95.789208
3,6037205110,"POLYGON ((-13160149.535 4032331.636, -13160128...",3904,Los Angeles,90023,Los Angeles,-118.214298,34.024506,70.126624,99.104892,...,93.382054,60.8,92.198492,5.0,44.351900,28.5,87.883397,81.963948,8.502325,94.452849
4,6037205120,"POLYGON ((-13160215.432 4031563.223, -13160210...",3548,Los Angeles,90023,Los Angeles,-118.211796,34.018755,75.724241,99.747857,...,89.619705,82.3,99.623116,3.3,21.108798,26.1,82.788340,80.189551,8.318263,92.713061
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8028,6013366002,"POLYGON ((-13620661.625 4574732.401, -13620650...",6627,Contra Costa,94806,San Pablo,-122.350933,37.969004,50.621809,88.464448,...,86.439302,41.3,71.092965,7.9,74.739719,14.8,37.477820,75.764900,7.859283,87.670197
8029,6059001303,"POLYGON ((-13131155.246 4018679.418, -13131122...",5884,Orange,90631,La Habra,-117.951167,33.920901,38.845479,73.260212,...,62.660712,28.3,51.092965,6.6,63.417491,7.6,5.817490,49.795859,5.165449,49.974786
8030,6059001304,"POLYGON ((-13131155.692 4019165.343, -13131058...",3982,Orange,90631,La Habra,-117.945541,33.924438,54.822235,92.372668,...,78.995805,46.9,78.454774,14.7,96.577303,17.7,52.610900,66.358412,6.883524,74.899143
8031,6059001401,"POLYGON ((-13130196.230 4021423.785, -13130195...",4495,Orange,90631,La Habra,-117.941044,33.941997,40.236731,75.365608,...,72.256056,37.6,65.540201,8.6,78.969287,25.6,81.837769,64.449935,6.685553,72.175996


In [10]:
# now we only have to join the remaining tracts
mapped_df = gpd.sjoin_nearest(
    ca_boundaries[~unchanged_tracts_ca], 
    old_tract_calenviroscreen_data[~unchanged_tracts_old], 
    how="inner", distance_col="distances", 
    max_distance=5000
)
mapped_df

Unnamed: 0,geometry,Census Tract_left,index_right,Census Tract_right,Total Population,California County,ZIP,Approximate Location,Longitude,Latitude,...,Poverty,Poverty Pctl,Unemployment,Unemployment Pctl,Housing Burden,Housing Burden Pctl,Pop. Char.,Pop. Char. Score,Pop. Char. Pctl,distances
2,"POLYGON ((-13583749.029 4469373.052, -13583714...",06085507003,3797,6085507001,7095,Santa Clara,95030,Unincorporated Santa Clara County area,-121.993428,37.221238,...,6.6,3.542714,4.3,35.020822,10.5,15.855513,13.308393,1.380513,2.660111,0.0
3,"POLYGON ((-13580203.383 4470602.095, -13580142...",06085507004,3797,6085507001,7095,Santa Clara,95030,Unincorporated Santa Clara County area,-121.993428,37.221238,...,6.6,3.542714,4.3,35.020822,10.5,15.855513,13.308393,1.380513,2.660111,0.0
4,"POLYGON ((-13573372.106 4480730.095, -13572964...",06085502204,3829,6085502201,7541,Santa Clara,95126,San Jose,-121.920656,37.304720,...,32.4,57.625628,6.8,65.629880,27.5,85.956907,48.260934,5.006227,47.793747,0.0
5,"POLYGON ((-13572402.399 4481399.439, -13572401...",06085502203,3829,6085502201,7541,Santa Clara,95126,San Jose,-121.920656,37.304720,...,32.4,57.625628,6.8,65.629880,27.5,85.956907,48.260934,5.006227,47.793747,0.0
6,"POLYGON ((-13571530.874 4483014.269, -13571528...",06085501902,3829,6085502201,7541,Santa Clara,95126,San Jose,-121.920656,37.304720,...,32.4,57.625628,6.8,65.629880,27.5,85.956907,48.260934,5.006227,47.793747,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8552,"POLYGON ((-13618295.631 4518847.147, -13618278...",06081605902,5398,6081606200,7788,San Mateo,94401,San Mateo,-122.322082,37.572721,...,45.2,76.432161,3.4,22.566372,21.8,69.911280,58.792134,6.098655,63.653555,0.0
8553,"POLYGON ((-13617244.772 4518845.876, -13617243...",06081606202,5398,6081606200,7788,San Mateo,94401,San Mateo,-122.322082,37.572721,...,45.2,76.432161,3.4,22.566372,21.8,69.911280,58.792134,6.098655,63.653555,0.0
8560,"POLYGON ((-13619131.420 4519518.650, -13619130...",06081605901,5398,6081606200,7788,San Mateo,94401,San Mateo,-122.322082,37.572721,...,45.2,76.432161,3.4,22.566372,21.8,69.911280,58.792134,6.098655,63.653555,0.0
9128,"POLYGON ((-13149442.843 4000257.235, -13149307...",06037578100,1873,6037574700,133,Los Angeles,90815,Long Beach,-118.118648,33.778436,...,,,,,,,,,,0.0


In [11]:
# then concatenate the sjoined tracts with the unchanged ones
joined_df = pd.concat([original_df,mapped_df])
joined_df

Unnamed: 0,Census Tract,geometry,Total Population,California County,ZIP,Approximate Location,Longitude,Latitude,CES 4.0 Score,CES 4.0 Percentile,...,Unemployment Pctl,Housing Burden,Housing Burden Pctl,Pop. Char.,Pop. Char. Score,Pop. Char. Pctl,Census Tract_left,index_right,Census Tract_right,distances
0,6001442700,"POLYGON ((-13582893.983 4514550.527, -13582891...",3048,Alameda,94536,Fremont,-122.008111,37.537154,12.891690,20.196672,...,22.566372,3.3,0.392902,27.644782,2.867662,16.729702,,,,
1,6001442800,"POLYGON ((-13581234.219 4513218.736, -13581226...",2952,Alameda,94538,Fremont,-121.993103,37.529362,21.730953,41.616238,...,36.439355,9.9,13.561470,47.489498,4.926204,46.457388,,,,
2,6037204920,"POLYGON ((-13158279.595 4031441.433, -13158278...",2751,Los Angeles,90023,Los Angeles,-118.197497,34.017500,82.393909,99.974786,...,61.530453,20.3,63.967047,83.751814,8.687785,95.789208,,,,
3,6037205110,"POLYGON ((-13160149.535 4032331.636, -13160128...",3904,Los Angeles,90023,Los Angeles,-118.214298,34.024506,70.126624,99.104892,...,44.351900,28.5,87.883397,81.963948,8.502325,94.452849,,,,
4,6037205120,"POLYGON ((-13160215.432 4031563.223, -13160210...",3548,Los Angeles,90023,Los Angeles,-118.211796,34.018755,75.724241,99.747857,...,21.108798,26.1,82.788340,80.189551,8.318263,92.713061,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8552,,"POLYGON ((-13618295.631 4518847.147, -13618278...",7788,San Mateo,94401,San Mateo,-122.322082,37.572721,42.223423,78.340898,...,22.566372,21.8,69.911280,58.792134,6.098655,63.653555,06081605902,5398.0,6.081606e+09,0.0
8553,,"POLYGON ((-13617244.772 4518845.876, -13617243...",7788,San Mateo,94401,San Mateo,-122.322082,37.572721,42.223423,78.340898,...,22.566372,21.8,69.911280,58.792134,6.098655,63.653555,06081606202,5398.0,6.081606e+09,0.0
8560,,"POLYGON ((-13619131.420 4519518.650, -13619130...",7788,San Mateo,94401,San Mateo,-122.322082,37.572721,42.223423,78.340898,...,22.566372,21.8,69.911280,58.792134,6.098655,63.653555,06081605901,5398.0,6.081606e+09,0.0
9128,,"POLYGON ((-13149442.843 4000257.235, -13149307...",133,Los Angeles,90815,Long Beach,-118.118648,33.778436,,,...,,,,,,,06037578100,1873.0,6.037575e+09,0.0


In [12]:
# select relevant columns
metric_enviroscreen_data = enviroscreen_data[['Census Tract', 
                                              'California County', 
                                              'Total Population', 
                                                'Asthma',
                                                'Low Birth Weight', 
                                                'Cardiovascular Disease', 
                                                'Education', 
                                                'Linguistic Isolation',
                                                'Poverty',
                                                'Unemployment', 
                                                'Housing Burden', 
                                                'Imp. Water Bodies'
                                                ]]
calenviroscreen_2021 = joined_df[metric_enviroscreen_data.columns]
calenviroscreen_2021['Census Tract'] = calenviroscreen_2021['Census Tract'].astype(str)
calenviroscreen_2021 = calenviroscreen_2021.drop(columns={'California County'})

calenviroscreen_2021

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  calenviroscreen_2021['Census Tract'] = calenviroscreen_2021['Census Tract'].astype(str)


Unnamed: 0,Census Tract,Total Population,Asthma,Low Birth Weight,Cardiovascular Disease,Education,Linguistic Isolation,Poverty,Unemployment,Housing Burden,Imp. Water Bodies
0,6001442700,3048,34.20,5.91,9.84,2.6,2.5,5.1,3.4,3.3,0
1,6001442800,2952,42.72,7.05,12.38,10.5,9.8,10.1,4.4,9.9,0
2,6037204920,2751,76.10,7.11,20.87,52.2,17.1,62.6,6.4,20.3,7
3,6037205110,3904,76.10,5.68,20.87,60.7,26.3,60.8,5.0,28.5,7
4,6037205120,3548,76.10,5.88,20.87,59.0,22.4,82.3,3.3,26.1,7
...,...,...,...,...,...,...,...,...,...,...,...
8552,,7788,51.24,5.43,9.70,29.9,24.7,45.2,3.4,21.8,10
8553,,7788,51.24,5.43,9.70,29.9,24.7,45.2,3.4,21.8,10
8560,,7788,51.24,5.43,9.70,29.9,24.7,45.2,3.4,21.8,10
9128,,133,23.80,,12.66,,,,,,0


In [13]:
# Make some adjustments to the new df to ensure all census tracts are populated
calenviroscreen_2021['Census Tract'] = calenviroscreen_2021['Census Tract'].replace('nan', np.nan)
calenviroscreen_2021 = calenviroscreen_2021.dropna(subset=['Census Tract'])
calenviroscreen_2021

Unnamed: 0,Census Tract,Total Population,Asthma,Low Birth Weight,Cardiovascular Disease,Education,Linguistic Isolation,Poverty,Unemployment,Housing Burden,Imp. Water Bodies
0,6001442700,3048,34.20,5.91,9.84,2.6,2.5,5.1,3.4,3.3,0
1,6001442800,2952,42.72,7.05,12.38,10.5,9.8,10.1,4.4,9.9,0
2,6037204920,2751,76.10,7.11,20.87,52.2,17.1,62.6,6.4,20.3,7
3,6037205110,3904,76.10,5.68,20.87,60.7,26.3,60.8,5.0,28.5,7
4,6037205120,3548,76.10,5.88,20.87,59.0,22.4,82.3,3.3,26.1,7
...,...,...,...,...,...,...,...,...,...,...,...
8028,6013366002,6627,107.76,5.65,18.28,30.6,19.9,41.3,7.9,14.8,11
8029,6059001303,5884,47.28,4.17,15.01,20.6,10.2,28.3,6.6,7.6,6
8030,6059001304,3982,46.82,4.96,14.86,26.9,15.8,46.9,14.7,17.7,6
8031,6059001401,4495,47.28,4.91,15.01,19.8,13.3,37.6,8.6,25.6,0


### Pulling in 2022 census population and county data and merging it with CalEnviroScreen data
* This bumps the entries from ~8,000 to ~9,000 with the addition of 2021 census tracts
* Extra tracts do not have CalEnviroScreen data
    * empty data within a metric recieve the average metric value from other populated tracts residing within that county
    * impaired watebody values are rounded as the final metric will represent number of impaired waterbodies

In [14]:
county_tract_pop = "s3://ca-climate-index/0_map_data/ca_tract_county_population_2022.csv"
county_tract_pop = pd.read_csv(county_tract_pop)
county_tract_pop = county_tract_pop.rename(columns={'TRACT': 'Census Tract'})
county_tract_pop = county_tract_pop.drop(columns={'Unnamed: 0', 'COUNTYFP'}, axis=1)
county_tract_pop['Census Tract'] = county_tract_pop['Census Tract'].astype(str)
county_tract_pop

Unnamed: 0,Census Tract,County,Total Population 2021
0,6085504321,Santa Clara,5412
1,6085504410,Santa Clara,4124
2,6085507003,Santa Clara,3074
3,6085507004,Santa Clara,3926
4,6085502204,Santa Clara,3242
...,...,...,...
9124,6059001303,Orange,6515
9125,6059001304,Orange,3565
9126,6059001401,Orange,4756
9127,6013367200,Contra Costa,5869


In [15]:
# Column to check for common entries
column_to_check = 'Census Tract'

# Find the common entries between the two columns
common_entries = pd.Series(list(set(county_tract_pop[column_to_check]).intersection(set(calenviroscreen_2021[column_to_check]))))

# Count the number of common entries
common_count = len(common_entries)

print(f"Number of common entries in '{column_to_check}': {common_count}")

Number of common entries in 'Census Tract': 0


In [16]:
# Adjust entries in Census tract as they need string type and stripping whitespaces
county_tract_pop['Census Tract'] = county_tract_pop['Census Tract'].astype(str).str.strip()
calenviroscreen_2021['Census Tract'] = calenviroscreen_2021['Census Tract'].astype(str).str.strip()

# Merge with corrected data types and stripped whitespaces
merged_df = pd.merge(county_tract_pop, calenviroscreen_2021, on='Census Tract', how='left')

# Now check if there are NaN values in the merged DataFrame
print(merged_df.isna().sum())

merged_df

Census Tract                 0
County                       0
Total Population 2021        0
Total Population          2271
Asthma                    2281
Low Birth Weight          2451
Cardiovascular Disease    2281
Education                 2353
Linguistic Isolation      2538
Poverty                   2335
Unemployment              2547
Housing Burden            2376
Imp. Water Bodies         2271
dtype: int64


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  calenviroscreen_2021['Census Tract'] = calenviroscreen_2021['Census Tract'].astype(str).str.strip()


Unnamed: 0,Census Tract,County,Total Population 2021,Total Population,Asthma,Low Birth Weight,Cardiovascular Disease,Education,Linguistic Isolation,Poverty,Unemployment,Housing Burden,Imp. Water Bodies
0,6085504321,Santa Clara,5412,5574.0,25.79,6.01,9.05,12.2,21.1,17.5,5.0,11.7,0.0
1,6085504410,Santa Clara,4124,4724.0,24.24,5.03,8.70,22.0,21.1,23.1,8.3,17.8,0.0
2,6085507003,Santa Clara,3074,,,,,,,,,,
3,6085507004,Santa Clara,3926,,,,,,,,,,
4,6085502204,Santa Clara,3242,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...
9124,6059001303,Orange,6515,5884.0,47.28,4.17,15.01,20.6,10.2,28.3,6.6,7.6,6.0
9125,6059001304,Orange,3565,3982.0,46.82,4.96,14.86,26.9,15.8,46.9,14.7,17.7,6.0
9126,6059001401,Orange,4756,4495.0,47.28,4.91,15.01,19.8,13.3,37.6,8.6,25.6,0.0
9127,6013367200,Contra Costa,5869,6042.0,107.76,6.39,18.28,24.6,10.5,37.1,6.0,20.7,2.0


## Now we need to fill NaN entries with average values from each desired metric per county

In [30]:
# Columns to fill NaN values
columns_to_fill = [
                    'Asthma',
                    'Low Birth Weight', 
                    'Cardiovascular Disease', 
                    'Education', 
                    'Linguistic Isolation',
                    'Poverty',
                    'Unemployment', 
                    'Housing Burden',
                    'Imp. Water Bodies'
                    ]

# Add a new column indicating whether a value was originally NaN
original_na_flag_column = 'Original_NA_Flag'
merged_df[original_na_flag_column] = np.where(merged_df[columns_to_fill].isna().any(axis=1), 1, 0)

# Compute average values for each column grouped by 'County'
average_values_by_county = merged_df.groupby('County')[columns_to_fill].transform('mean')

# Round the mean for 'Imp. Water Bodies' to the nearest whole number
average_values_by_county['Imp. Water Bodies'] = average_values_by_county['Imp. Water Bodies'].round()

# Fill NaN values in each column with the corresponding average value of that column for the respective 'County'
for column in columns_to_fill:
    na_mask = merged_df[column].isna()
    merged_df.loc[na_mask, column] = average_values_by_county.loc[na_mask, column]

print(len(merged_df))
merged_df.head(5)

9129


Unnamed: 0,Census Tract,County,Total Population 2021,Total Population 2019,Asthma,Low Birth Weight,Cardiovascular Disease,Education,Linguistic Isolation,Poverty,Unemployment,Housing Burden,Imp. Water Bodies,Original_NA_Flag
0,6085504321,Santa Clara,5412,5574.0,25.79,6.01,9.05,12.2,21.1,17.5,5.0,11.7,0.0,0
1,6085504410,Santa Clara,4124,4724.0,24.24,5.03,8.7,22.0,21.1,23.1,8.3,17.8,0.0,0
2,6085507003,Santa Clara,3074,,33.249401,5.023595,9.67988,12.325826,11.064134,17.793413,4.223708,14.783133,2.0,0
3,6085507004,Santa Clara,3926,,33.249401,5.023595,9.67988,12.325826,11.064134,17.793413,4.223708,14.783133,2.0,0
4,6085502204,Santa Clara,3242,,33.249401,5.023595,9.67988,12.325826,11.064134,17.793413,4.223708,14.783133,2.0,0


## Code below to check the averages per county
    * doesnt show the rounded impaired waterbodies however

In [18]:
# Prompt the user to input the county name
county_name = input("Enter the name of the county: ")

# Filter the dataframe for the specified county
county_data = merged_df[merged_df['County'] == county_name]

# Print out the average values for the specified county
print(f"Average values for {county_name}:")
for column in columns_to_fill:
    avg_value = county_data[column].mean()
    print(f"{column}: {avg_value}")


Average values for Fresno:
Asthma: 81.77613095238097
Low Birth Weight: 5.9941818181818185
Cardiovascular Disease: 13.860714285714288
Education: 24.634131736526946
Linguistic Isolation: 10.581987577639751
Poverty: 46.05357142857143
Unemployment: 9.042073170731708
Housing Burden: 18.93855421686747
Imp. Water Bodies: 0.64


Check how many missing entries within the 'County' column as it is the column we use to link missing tracts with other counties

In [19]:
missing_count = merged_df['County'].isna().sum()
print("Number of missing entries in the California County column:", missing_count)

Number of missing entries in the California County column: 0


In [26]:
merged_df = merged_df.rename(columns={'Total Population':'Total Population 2019'})
merged_df.to_csv('society_calenviroscreen_metric.csv', index=False)

### Function Call
The function below creates new df's for each metric listed below. Some metrics are already in percent from the 2019 data, so those columns are renamed and retained for Cal-CRAI metric. df's are saved as csv's named off of their metric column:

ones that are already in percent from 2019 data
* % of live, singleton births < 5.5 pounds (non-twin, including premature)
* % of population 25 and older with less than a high school education
* % of households where all members 14 and older have some difficult speaking English
* % of population living below 2x federal poverty level
* % of population > 16 years old unemployed and eligible for the workforce
* % of households which are low-income and housing-burdened

ones that have a sum we do not want as a percentage
* number of impaired waterbodies

The function can also calculate metric per 10,000 people for metrics that have a 'sum of' column rather than pre-baked in percentages:

metrics that have been calculated for metrics per 10,000 have columns for 2019 and 2021 populations
* Age-adjusted emergency department visits for asthma per 10,000 people
* Age-adjusted emergency department visits for myocardial infarction per 10,000 people

Asthma and cardiovascular percentage can be calculated with 2019 and 2021 as the CalEnviroscreen values are 'Age-adjusted rate of emergency department visits for asthma/cardiovascular disease'

# Calling function for both metric calc types

In [27]:
#@append_metadata
def calenviroscreen_metric_calc(input_csv, columns_to_process, calculate_per_10000=False, export=False, varname=""):
    '''
    Calculates the following metrics sourced from CalEnviroScreen:
    * % of live, singleton births < 5.5 pounds (non-twin, including premature)
    * % of population 25 and older with less than a high school education
    * % of households where all members 14 and older have some difficult speaking English
    * % of population living below 2x federal poverty level
    * % of population > 16 years old unemployed and eligible for the workforce
    * % of households which are low-income and housing-burdened
    * Age-adjusted emergency department visits for asthma per 10,000 people
    * Age-adjusted emergency department visits for myocardial infarction per 10,000 people
    * Number of impaired waterbodies

    Note
    --------
    Each of the above metrics is calculated separately; please see the corresponding 
    variable name (the same as the filename for this document) to know which one this 
    particular metadata document describes. 
  
    Methods
    --------
    Relevant data columns were isolated and renamed to align with Cal-CRAI metrics.
    Data was from older census tracts, so we merged it with 2017 California Tiger shape files first.
    The data was then set to Cal-CRAI standardized coordinate reference system.
    Data was then spatially joined to nearest 2021 census tract data.
    Extra tracts merged in were given the average value for each metric based on 
    the county they reside in.
    This averaging was also done for missing data in otherwise populated tracts.
    Metrics with % calculations were largely untouched as CalEnviroScreen data had
    those metrics calculated for 2019.
    Metrics with emergency department visits had their values adjusted to reflect
    number of visits per 10,000 people per tract with 2019 and 2021 population data.

    Parameters
    ------------
    columns_to_process: list
        list of columns that contain desired metric data
    calculate_per_10000: boolean
        if true, adds columns with calculations for # of visits per 10,000 people
        if false, retains the column but renames to 2019
    varname: string
        Final metric name.
    export: bool
        If True, uploads file to S3.
        If False, just generates metadata file.

    Script
    ------
    cal_enviroscreen_metrics.ipynb

    Note
    ------
    This function assumes users have configured the AWS CLI such that their access key / 
    secret key pair are stored in ~/.aws/credentials. 
    See https://docs.aws.amazon.com/cli/latest/userguide/getting-started-install.html for guidance.
    '''
    merged_df = pd.read_csv(input_csv)
        
    # List to store generated CSV file names
    csv_file_names = []
        
    for column in columns_to_process:
        # Create new DataFrame
        new_df = merged_df[['Census Tract', 'County', 'Total Population 2019', 'Total Population 2021']].copy()
        new_df = new_df.rename(columns={'Census Tract': 'census_tract'})  
        # Create new column name
        if column == 'Imp. Water Bodies':
            new_column_name = 'sum_' + column.replace(' ', '_').replace('.', '')
        else:
            new_column_name = column.replace(' ', '_')
            if calculate_per_10000:
                new_column_name += '_related_ED_visits_2019'
                new_column_name_per_10000_people_2019 = new_column_name.replace('_2019', '_per_10000_people_2019')
                new_column_name_per_10000_people_2021 = new_column_name.replace('_2019', '_per_10000_people_2021')
            else:
                new_column_name += '_percent_2019'

        # Lowercase the column name
        new_column_name = new_column_name.lower()
    
        # Add new column with the calculated name
        if not calculate_per_10000:
            new_df[new_column_name] = merged_df[column]
            new_df = new_df.drop(columns='Total Population 2021')

            print('Data transformation: adding calculation columns for metrics with emergency department visits.')

        else:
            new_df['Total Population 2021'] = merged_df['Total Population 2021']  # Only add this column if calculating percentage
            new_df[column] = merged_df[column]
            new_df[new_column_name_per_10000_people_2019] = (merged_df[column] / merged_df['Total Population 2019']) * 10000
            new_df[new_column_name_per_10000_people_2021] = (merged_df[column] / merged_df['Total Population 2021']) * 10000
            print('Data transformation: adding calculation columns for metrics with emergency department visits.')
        
        if not calculate_per_10000:
            # Define CSV file name based on the new column name
            csv_filename = 'society_vulnerable_' + column.replace(' ', '_').replace('.','').lower() + '_metric.csv'
        else:
            # Define CSV file name based on the new column name
            csv_filename = 'society_vulnerable_' + column.replace(' ', '_').replace('.','').lower() + '_metric.csv'

        # Save the DataFrame to CSV
        new_df.columns = new_df.columns.str.lower()
        new_df.to_csv(csv_filename, index=False)
        
        print(f"Saved DataFrame to: {csv_filename}")
        # Append CSV filename to the list
        csv_file_names.append(csv_filename)
        # Output or further process new DataFrame
        display(new_df)
        
    if export == True:
        bucket_name = 'ca-climate-index'
        directory = '3_fair_data/index_data'
        upload_csv_aws([csv_filename], bucket_name, directory)

    if export == False:
        print(f'{csv_filename} uploaded to AWS.')

In [32]:
input_csv = 'society_calenviroscreen_metric.csv'

# Lists of columns and varnames
columns_to_process_no_10000 = [
    'Low Birth Weight',
    'Education',
    'Linguistic Isolation',
    'Poverty',
    'Unemployment',
    'Housing Burden',
    'Imp. Water Bodies'
]
varnames_no_10000 = [
    'society_calenviroscreen_birth_weight', 
    'society_calenviroscreen_low_education', 
    'society_calenviroscreen_nonenglish_speakers',
    'society_calenviroscreen_below_poverty_level',
    'society_calenviroscreen_unemployment',
    'society_calenviroscreen_housing_burdened',
    'society_calenviroscreen_impaired_waterbodies'
]

# Calculate metric without percentages
for col, var in zip(columns_to_process_no_10000, varnames_no_10000):
    print(f"Processing {col} without percentage calculation")
    calenviroscreen_metric_calc(input_csv, [col], calculate_per_10000=False, export=False, varname=var)

varnames_10000 = [
    'society_calenviroscreen_emergency_dept_visits',
    'society_calenviroscreen_emergency_dept_myocardial_visits'
]

# Columns to loop through that include calculating percentages
columns_to_process_per_10000 = [
    'Asthma',
    'Cardiovascular Disease'
]

# Calculate percentages
for col, var in zip(columns_to_process_per_10000, varnames_10000):
    print(f"Processing {col} with percentage calculation")
    calenviroscreen_metric_calc(input_csv, [col], calculate_per_10000=True, export=True, varname=var)

Processing Low Birth Weight without percentage calculation
Data transformation: adding calculation columns for metrics with emergency department visits.
Saved DataFrame to: society_vulnerable_low_birth_weight_metric.csv


Unnamed: 0,census_tract,county,total population 2019,low_birth_weight_percent_2019
0,6085504321,Santa Clara,5574.0,6.010000
1,6085504410,Santa Clara,4724.0,5.030000
2,6085507003,Santa Clara,,5.023595
3,6085507004,Santa Clara,,5.023595
4,6085502204,Santa Clara,,5.023595
...,...,...,...,...
9124,6059001303,Orange,5884.0,4.170000
9125,6059001304,Orange,3982.0,4.960000
9126,6059001401,Orange,4495.0,4.910000
9127,6013367200,Contra Costa,6042.0,6.390000


society_vulnerable_low_birth_weight_metric.csv uploaded to AWS.
Processing Education without percentage calculation
Data transformation: adding calculation columns for metrics with emergency department visits.
Saved DataFrame to: society_vulnerable_education_metric.csv


Unnamed: 0,census_tract,county,total population 2019,education_percent_2019
0,6085504321,Santa Clara,5574.0,12.200000
1,6085504410,Santa Clara,4724.0,22.000000
2,6085507003,Santa Clara,,12.325826
3,6085507004,Santa Clara,,12.325826
4,6085502204,Santa Clara,,12.325826
...,...,...,...,...
9124,6059001303,Orange,5884.0,20.600000
9125,6059001304,Orange,3982.0,26.900000
9126,6059001401,Orange,4495.0,19.800000
9127,6013367200,Contra Costa,6042.0,24.600000


society_vulnerable_education_metric.csv uploaded to AWS.
Processing Linguistic Isolation without percentage calculation
Data transformation: adding calculation columns for metrics with emergency department visits.
Saved DataFrame to: society_vulnerable_linguistic_isolation_metric.csv


Unnamed: 0,census_tract,county,total population 2019,linguistic_isolation_percent_2019
0,6085504321,Santa Clara,5574.0,21.100000
1,6085504410,Santa Clara,4724.0,21.100000
2,6085507003,Santa Clara,,11.064134
3,6085507004,Santa Clara,,11.064134
4,6085502204,Santa Clara,,11.064134
...,...,...,...,...
9124,6059001303,Orange,5884.0,10.200000
9125,6059001304,Orange,3982.0,15.800000
9126,6059001401,Orange,4495.0,13.300000
9127,6013367200,Contra Costa,6042.0,10.500000


society_vulnerable_linguistic_isolation_metric.csv uploaded to AWS.
Processing Poverty without percentage calculation
Data transformation: adding calculation columns for metrics with emergency department visits.
Saved DataFrame to: society_vulnerable_poverty_metric.csv


Unnamed: 0,census_tract,county,total population 2019,poverty_percent_2019
0,6085504321,Santa Clara,5574.0,17.500000
1,6085504410,Santa Clara,4724.0,23.100000
2,6085507003,Santa Clara,,17.793413
3,6085507004,Santa Clara,,17.793413
4,6085502204,Santa Clara,,17.793413
...,...,...,...,...
9124,6059001303,Orange,5884.0,28.300000
9125,6059001304,Orange,3982.0,46.900000
9126,6059001401,Orange,4495.0,37.600000
9127,6013367200,Contra Costa,6042.0,37.100000


society_vulnerable_poverty_metric.csv uploaded to AWS.
Processing Unemployment without percentage calculation
Data transformation: adding calculation columns for metrics with emergency department visits.
Saved DataFrame to: society_vulnerable_unemployment_metric.csv


Unnamed: 0,census_tract,county,total population 2019,unemployment_percent_2019
0,6085504321,Santa Clara,5574.0,5.000000
1,6085504410,Santa Clara,4724.0,8.300000
2,6085507003,Santa Clara,,4.223708
3,6085507004,Santa Clara,,4.223708
4,6085502204,Santa Clara,,4.223708
...,...,...,...,...
9124,6059001303,Orange,5884.0,6.600000
9125,6059001304,Orange,3982.0,14.700000
9126,6059001401,Orange,4495.0,8.600000
9127,6013367200,Contra Costa,6042.0,6.000000


society_vulnerable_unemployment_metric.csv uploaded to AWS.
Processing Housing Burden without percentage calculation
Data transformation: adding calculation columns for metrics with emergency department visits.
Saved DataFrame to: society_vulnerable_housing_burden_metric.csv


Unnamed: 0,census_tract,county,total population 2019,housing_burden_percent_2019
0,6085504321,Santa Clara,5574.0,11.700000
1,6085504410,Santa Clara,4724.0,17.800000
2,6085507003,Santa Clara,,14.783133
3,6085507004,Santa Clara,,14.783133
4,6085502204,Santa Clara,,14.783133
...,...,...,...,...
9124,6059001303,Orange,5884.0,7.600000
9125,6059001304,Orange,3982.0,17.700000
9126,6059001401,Orange,4495.0,25.600000
9127,6013367200,Contra Costa,6042.0,20.700000


society_vulnerable_housing_burden_metric.csv uploaded to AWS.
Processing Imp. Water Bodies without percentage calculation
Data transformation: adding calculation columns for metrics with emergency department visits.
Saved DataFrame to: society_vulnerable_imp_water_bodies_metric.csv


Unnamed: 0,census_tract,county,total population 2019,sum_imp_water_bodies
0,6085504321,Santa Clara,5574.0,0.0
1,6085504410,Santa Clara,4724.0,0.0
2,6085507003,Santa Clara,,2.0
3,6085507004,Santa Clara,,2.0
4,6085502204,Santa Clara,,2.0
...,...,...,...,...
9124,6059001303,Orange,5884.0,6.0
9125,6059001304,Orange,3982.0,6.0
9126,6059001401,Orange,4495.0,0.0
9127,6013367200,Contra Costa,6042.0,2.0


society_vulnerable_imp_water_bodies_metric.csv uploaded to AWS.
Processing Asthma with percentage calculation
Data transformation: adding calculation columns for metrics with emergency department visits.
Saved DataFrame to: society_vulnerable_asthma_metric.csv


Unnamed: 0,census_tract,county,total population 2019,total population 2021,asthma,asthma_related_ed_visits_per_10000_people_2019,asthma_related_ed_visits_per_10000_people_2021
0,6085504321,Santa Clara,5574.0,5412,25.790000,46.268389,47.653363
1,6085504410,Santa Clara,4724.0,4124,24.240000,51.312447,58.777886
2,6085507003,Santa Clara,,3074,33.249401,,108.163309
3,6085507004,Santa Clara,,3926,33.249401,,84.690273
4,6085502204,Santa Clara,,3242,33.249401,,102.558301
...,...,...,...,...,...,...,...
9124,6059001303,Orange,5884.0,6515,47.280000,80.353501,72.570990
9125,6059001304,Orange,3982.0,3565,46.820000,117.579106,131.332398
9126,6059001401,Orange,4495.0,4756,47.280000,105.183537,99.411270
9127,6013367200,Contra Costa,6042.0,5869,107.760000,178.351539,183.608792


society_vulnerable_asthma_metric.csv uploaded to AWS
Processing Cardiovascular Disease with percentage calculation
Data transformation: adding calculation columns for metrics with emergency department visits.
Saved DataFrame to: society_vulnerable_cardiovascular_disease_metric.csv


Unnamed: 0,census_tract,county,total population 2019,total population 2021,cardiovascular disease,cardiovascular_disease_related_ed_visits_per_10000_people_2019,cardiovascular_disease_related_ed_visits_per_10000_people_2021
0,6085504321,Santa Clara,5574.0,5412,9.050000,16.236096,16.722099
1,6085504410,Santa Clara,4724.0,4124,8.700000,18.416596,21.096023
2,6085507003,Santa Clara,,3074,9.679880,,31.489526
3,6085507004,Santa Clara,,3926,9.679880,,24.655834
4,6085502204,Santa Clara,,3242,9.679880,,29.857743
...,...,...,...,...,...,...,...
9124,6059001303,Orange,5884.0,6515,15.010000,25.509857,23.039140
9125,6059001304,Orange,3982.0,3565,14.860000,37.317931,41.683029
9126,6059001401,Orange,4495.0,4756,15.010000,33.392659,31.560135
9127,6013367200,Contra Costa,6042.0,5869,18.280000,30.254882,31.146703


society_vulnerable_cardiovascular_disease_metric.csv uploaded to AWS
