### Cal-CRAI Metric Calculation for: Economic Health
* Hachman Index

In [1]:
import pandas as pd
import os
import sys
import numpy as np
import geopandas as gpd

# suppress pandas purely educational warnings
from warnings import simplefilter
simplefilter(action="ignore", category=pd.errors.PerformanceWarning)

sys.path.append(os.path.expanduser('../../'))
from scripts.utils.file_helpers import pull_csv_from_directory, upload_csv_aws, filter_counties, data_stats_check
from scripts.utils.write_metadata import append_metadata

In [2]:
# Define a function to display data for a specific county
def display_county_data(df, county_col, county_name):
    county_data = df[df[county_col] == county_name]
    if county_data.empty:
        print(f"No data found for {county_name}")
    else:
        print(f"Data for {county_name}:")
        display(county_data)  

In [3]:
# pull csv from aws
bucket_name = 'ca-climate-index'
aws_dir = '2a_subset/society_economy/economic_health/bureau_labor_statistics/employment_hachman_index/'

pull_csv_from_directory(bucket_name, aws_dir, search_zipped=False)

aws_dir = '1_pull_data/society_economy/economic_health/'
pull_csv_from_directory(bucket_name, aws_dir, search_zipped=False)

Saved DataFrame as 'employment_data_hachman_subset.csv'
Saved DataFrame as 'ACSDT5Y2022.B19083-Column-Metadata.csv'
Saved DataFrame as 'ACSDT5Y2022.B19083-Data.csv'
Saved DataFrame as 'ACSST5Y2022.S1901-Column-Metadata.csv'


  df = pd.read_csv(csv_data)


Saved DataFrame as 'ACSST5Y2022.S1901-Data.csv'


  df = pd.read_csv(csv_data)


Saved DataFrame as 'allhlcn22.csv'
Saved DataFrame as 'data_layout.csv'


In [4]:
hachman_data = pd.read_csv('employment_data_hachman_subset.csv')

In [5]:
county_tract = "s3://ca-climate-index/0_map_data/ca_tract_county_population_2022.csv"
ca_county_tract = pd.read_csv(county_tract)
ca_county_tract = ca_county_tract.rename(columns={'Census Tract': 'GEO_ID'})
ca_county_tract = ca_county_tract.drop(columns={'Unnamed: 0', 'COUNTYFP', 'County', 'Total Population 2021'})

### Hachman Index Metric
$$
HI = \frac{1}{Σ_j (\ \left(\frac{E_si}{E_ri}\right) \cdot E_si)}
$$

{E_si} is share of area economic indicator in industry i -- county level <br>
{E_ri} is share of regions economic indicator in industry i -- state level

Several levels of data clean-up need to occur to calculate the Hachman Index. 
* Clean-up the County naming, dropping all statewide metrics
* Drop counts for "Service-providing" and "Goods-producing" -- these sum other columns! 

In [6]:
len(hachman_data)

1074

In [7]:
hachman_data.head(5)

Unnamed: 0,Area\nCode,St,Cnty,Own,NAICS,Year,Qtr,Area Type,St Name,Area,Ownership,Industry,Annual Average Status Code,Annual Average Establishment Count,Annual Average Employment,Annual Total Wages,Annual Average Weekly Wage,Annual Average Pay,Employment Location Quotient Relative to U.S.,Total Wage Location Quotient Relative to U.S.
0,6000,6,0,0,10,2022,A,State,California,California -- Statewide,Total Covered,"10 Total, all industries",,1706672,17903539,1511706499616,1624,84436,1.0,1.0
1,6000,6,0,1,10,2022,A,State,California,California -- Statewide,Federal Government,"10 Total, all industries",,3286,247795,23655167888,1836,95463,0.73,0.62
2,6000,6,0,2,10,2022,A,State,California,California -- Statewide,State Government,"10 Total, all industries",,13784,492218,47749398000,1866,97009,0.91,1.0
3,6000,6,0,3,10,2022,A,State,California,California -- Statewide,Local Government,"10 Total, all industries",,19344,1724972,135804850033,1514,78729,1.04,1.11
4,6000,6,0,5,10,2022,A,State,California,California -- Statewide,Private,"10 Total, all industries",,1670259,15438555,1304497083695,1625,84496,1.01,1.0


Drop the following rows:
* "101 Goods-producing" (it's a summary of the 101X categories)
* "102 Service-providing" (it's a summary of the 102X categories)

In [36]:
# create a Boolean mask for the rows to remove
mask101 = hachman_data['Industry'] == '101 Goods-producing'
mask102 = hachman_data['Industry'] == '102 Service-providing'

# select all rows except the ones that contain either
hachman_data_cleaned = hachman_data[~mask101]
hachman_data_cleaned = hachman_data_cleaned[~mask102]
hachman_data_cleaned

  hachman_data_cleaned = hachman_data_cleaned[~mask102]


Unnamed: 0,Area\nCode,St,Cnty,Own,NAICS,Year,Qtr,Area Type,St Name,Area,Ownership,Industry,Annual Average Status Code,Annual Average Establishment Count,Annual Average Employment,Annual Total Wages,Annual Average Weekly Wage,Annual Average Pay,Employment Location Quotient Relative to U.S.,Total Wage Location Quotient Relative to U.S.
0,6000,6,0,0,10,2022,A,State,California,California -- Statewide,Total Covered,"10 Total, all industries",,1706672,17903539,1511706499616,1624,84436,1.00,1.00
1,6000,6,0,1,10,2022,A,State,California,California -- Statewide,Federal Government,"10 Total, all industries",,3286,247795,23655167888,1836,95463,0.73,0.62
2,6000,6,0,2,10,2022,A,State,California,California -- Statewide,State Government,"10 Total, all industries",,13784,492218,47749398000,1866,97009,0.91,1.00
3,6000,6,0,3,10,2022,A,State,California,California -- Statewide,Local Government,"10 Total, all industries",,19344,1724972,135804850033,1514,78729,1.04,1.11
4,6000,6,0,5,10,2022,A,State,California,California -- Statewide,Private,"10 Total, all industries",,1670259,15438555,1304497083695,1625,84496,1.01,1.00
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1069,6999,6,999,5,1024,2022,A,County,California,"Unknown Or Undefined, California",Private,1024 Professional and business services,,26188,236341,25491212708,2074,107858,3.05,2.03
1070,6999,6,999,5,1025,2022,A,County,California,"Unknown Or Undefined, California",Private,1025 Education and health services,,4401,30314,2854407977,1811,94162,0.38,0.34
1071,6999,6,999,5,1026,2022,A,County,California,"Unknown Or Undefined, California",Private,1026 Leisure and hospitality,,2121,10276,932147336,1744,90713,0.19,0.33
1072,6999,6,999,5,1027,2022,A,County,California,"Unknown Or Undefined, California",Private,1027 Other services,N,3031,0,0,0,0,0.00,0.00


Taking a look at the entries within area
* separating California entries as we will need that data also
* getting rid of all 'County, California' portions of each entry

In [37]:
# Get unique entries in 'Column1'
unique_entries = hachman_data_cleaned['Area'].unique()
print(unique_entries)

['California -- Statewide' 'Alameda County, California'
 'Alpine County, California' 'Amador County, California'
 'Butte County, California' 'Calaveras County, California'
 'Colusa County, California' 'Contra Costa County, California'
 'Del Norte County, California' 'El Dorado County, California'
 'Fresno County, California' 'Glenn County, California'
 'Humboldt County, California' 'Imperial County, California'
 'Inyo County, California' 'Kern County, California'
 'Kings County, California' 'Lake County, California'
 'Lassen County, California' 'Los Angeles County, California'
 'Madera County, California' 'Marin County, California'
 'Mariposa County, California' 'Mendocino County, California'
 'Merced County, California' 'Modoc County, California'
 'Mono County, California' 'Monterey County, California'
 'Napa County, California' 'Nevada County, California'
 'Orange County, California' 'Placer County, California'
 'Plumas County, California' 'Riverside County, California'
 'Sacramento 

## Adjust the columns and entries within for future cleaning

In [38]:
hachman_data_cleaned = hachman_data_cleaned[['Area', 'Industry', 'Annual Average Employment']]
# Remove any mention of 'county' within the legalAgencyName column
hachman_data_cleaned.loc[:,'Area'] = hachman_data_cleaned['Area'].str.replace(' -- Statewide', '', case=False)
hachman_data_cleaned.loc[:,'Area'] = hachman_data_cleaned['Area'].str.replace(' County, California', '', case=False)

unique_entries = hachman_data_cleaned['Area'].unique()
hachman_data_cleaned = hachman_data_cleaned.rename(columns={'Area':'County'})

In [39]:
hachman_data_cleaned

Unnamed: 0,County,Industry,Annual Average Employment
0,California,"10 Total, all industries",17903539
1,California,"10 Total, all industries",247795
2,California,"10 Total, all industries",492218
3,California,"10 Total, all industries",1724972
4,California,"10 Total, all industries",15438555
...,...,...,...
1069,"Unknown Or Undefined, California",1024 Professional and business services,236341
1070,"Unknown Or Undefined, California",1025 Education and health services,30314
1071,"Unknown Or Undefined, California",1026 Leisure and hospitality,10276
1072,"Unknown Or Undefined, California",1027 Other services,0


## Eliminate any row with '10 Total, all industries' as their totals are inconsistent with observed values
* will calculate totals with the industry employment values

In [40]:
# Eliminate rows where the Industry column has the value '10 Total, all industries'
industry_clean_df = hachman_data_cleaned[hachman_data_cleaned['Industry'] != '10 Total, all industries']
industry_clean_df.loc[:,'Annual Average Employment'] = industry_clean_df['Annual Average Employment'].str.replace(',', '').astype(float)

industry_clean_df

Unnamed: 0,County,Industry,Annual Average Employment
6,California,1011 Natural resources and mining,436965.0
7,California,1012 Construction,912359.0
8,California,1013 Manufacturing,1335789.0
10,California,"1021 Trade, transportation, and utilities",3121930.0
11,California,1022 Information,605857.0
...,...,...,...
1069,"Unknown Or Undefined, California",1024 Professional and business services,236341.0
1070,"Unknown Or Undefined, California",1025 Education and health services,30314.0
1071,"Unknown Or Undefined, California",1026 Leisure and hospitality,10276.0
1072,"Unknown Or Undefined, California",1027 Other services,0.0


In [13]:
display_county_data(industry_clean_df, 'County', 'Alpine')

Data for Alpine:


Unnamed: 0,County,Industry,Annual Average Employment
42,Alpine,1012 Construction,0.0
43,Alpine,1013 Manufacturing,0.0
45,Alpine,"1021 Trade, transportation, and utilities",0.0
46,Alpine,1022 Information,0.0
47,Alpine,1023 Financial activities,25.0
48,Alpine,1024 Professional and business services,42.0
49,Alpine,1025 Education and health services,22.0
50,Alpine,1026 Leisure and hospitality,294.0
51,Alpine,1027 Other services,21.0


## Make a new df with the total employment for each county (and California as a whole)

In [14]:
total_emp_county = industry_clean_df.groupby('County')['Annual Average Employment'].sum().reset_index()
total_emp_county = total_emp_county.rename(columns={'Annual Average Employment': 'Total County Employed'})
total_emp_county.head(5)

Unnamed: 0,County,Total County Employed
0,Alameda,682308.0
1,Alpine,404.0
2,Amador,7545.0
3,Butte,63694.0
4,Calaveras,7788.0


## Run county filter on the data to isolate non-California counties from the dataset

In [15]:
filtered_hachman_data, omitted_data = filter_counties(industry_clean_df, 'County')

print('Counties kept:', len(filtered_hachman_data))
print('Omitted data entries:', len(omitted_data))

Counties kept: 633
Omitted data entries: 22


## Keep California data and place within its own df as it is needed to calculate reference values

In [16]:
# Get rid of the Unknown entries
mask = omitted_data['County'] == 'Unknown Or Undefined, California'
california_employ_data = omitted_data[~mask]
california_employ_data.rename(columns={'Annual Average Employment': 'Industry Employed CA', 'County': 'State'}, inplace=True)

# Now we have a df that holds all CA state employment per industry
california_employ_data

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  california_employ_data.rename(columns={'Annual Average Employment': 'Industry Employed CA', 'County': 'State'}, inplace=True)


Unnamed: 0,State,Industry,Industry Employed CA
6,California,1011 Natural resources and mining,436965.0
7,California,1012 Construction,912359.0
8,California,1013 Manufacturing,1335789.0
10,California,"1021 Trade, transportation, and utilities",3121930.0
11,California,1022 Information,605857.0
12,California,1023 Financial activities,841564.0
13,California,1024 Professional and business services,2860674.0
14,California,1025 Education and health services,2854971.0
15,California,1026 Leisure and hospitality,1929160.0
16,California,1027 Other services,537319.0


In [17]:
filtered_hachman_data.rename(columns={'Annual Average Employment': 'Industry Employed County'}, inplace=True)
filtered_hachman_data

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  filtered_hachman_data.rename(columns={'Annual Average Employment': 'Industry Employed County'}, inplace=True)


Unnamed: 0,County,Industry,Industry Employed County
24,Alameda,1011 Natural resources and mining,959.0
25,Alameda,1012 Construction,47512.0
26,Alameda,1013 Manufacturing,97583.0
28,Alameda,"1021 Trade, transportation, and utilities",135720.0
29,Alameda,1022 Information,18269.0
...,...,...,...
1051,Yuba,1024 Professional and business services,1265.0
1052,Yuba,1025 Education and health services,3615.0
1053,Yuba,1026 Leisure and hospitality,1893.0
1054,Yuba,1027 Other services,194.0


In [18]:
display_county_data(industry_clean_df, 'County', 'Alpine')

Data for Alpine:


Unnamed: 0,County,Industry,Annual Average Employment
42,Alpine,1012 Construction,0.0
43,Alpine,1013 Manufacturing,0.0
45,Alpine,"1021 Trade, transportation, and utilities",0.0
46,Alpine,1022 Information,0.0
47,Alpine,1023 Financial activities,25.0
48,Alpine,1024 Professional and business services,42.0
49,Alpine,1025 Education and health services,22.0
50,Alpine,1026 Leisure and hospitality,294.0
51,Alpine,1027 Other services,21.0


## Merge the two datasets together based on 'Industry' so we have a single df with county and state employment per industry

In [19]:
merge_data = pd.merge(filtered_hachman_data, california_employ_data, on='Industry', how='left')
merge_data

Unnamed: 0,County,Industry,Industry Employed County,State,Industry Employed CA
0,Alameda,1011 Natural resources and mining,959.0,California,436965.0
1,Alameda,1012 Construction,47512.0,California,912359.0
2,Alameda,1013 Manufacturing,97583.0,California,1335789.0
3,Alameda,"1021 Trade, transportation, and utilities",135720.0,California,3121930.0
4,Alameda,1022 Information,18269.0,California,605857.0
...,...,...,...,...,...
628,Yuba,1024 Professional and business services,1265.0,California,2860674.0
629,Yuba,1025 Education and health services,3615.0,California,2854971.0
630,Yuba,1026 Leisure and hospitality,1893.0,California,1929160.0
631,Yuba,1027 Other services,194.0,California,537319.0


## Merge once again with the new df and the total employment values per county
* now we have a df with employment per industry for the state and its counties
* also have total employees per county across all industries

In [20]:
hachman_denominator = pd.merge(merge_data, total_emp_county, on='County', how='left')

new_column_order = ['County', 
                    'Industry', 
                    'Industry Employed County',
                    'Total County Employed', 
                    'State', 
                    'Industry Employed CA']

# Rearrange the DataFrame columns
hachman_denominator = hachman_denominator[new_column_order]
hachman_denominator

Unnamed: 0,County,Industry,Industry Employed County,Total County Employed,State,Industry Employed CA
0,Alameda,1011 Natural resources and mining,959.0,682308.0,California,436965.0
1,Alameda,1012 Construction,47512.0,682308.0,California,912359.0
2,Alameda,1013 Manufacturing,97583.0,682308.0,California,1335789.0
3,Alameda,"1021 Trade, transportation, and utilities",135720.0,682308.0,California,3121930.0
4,Alameda,1022 Information,18269.0,682308.0,California,605857.0
...,...,...,...,...,...,...
628,Yuba,1024 Professional and business services,1265.0,12264.0,California,2860674.0
629,Yuba,1025 Education and health services,3615.0,12264.0,California,2854971.0
630,Yuba,1026 Leisure and hospitality,1893.0,12264.0,California,1929160.0
631,Yuba,1027 Other services,194.0,12264.0,California,537319.0


## Add another column with the total number of employed in California

In [21]:
hachman_denominator['Total State Employment'] = 15438558.0 
hachman_denominator

Unnamed: 0,County,Industry,Industry Employed County,Total County Employed,State,Industry Employed CA,Total State Employment
0,Alameda,1011 Natural resources and mining,959.0,682308.0,California,436965.0,15438558.0
1,Alameda,1012 Construction,47512.0,682308.0,California,912359.0,15438558.0
2,Alameda,1013 Manufacturing,97583.0,682308.0,California,1335789.0,15438558.0
3,Alameda,"1021 Trade, transportation, and utilities",135720.0,682308.0,California,3121930.0,15438558.0
4,Alameda,1022 Information,18269.0,682308.0,California,605857.0,15438558.0
...,...,...,...,...,...,...,...
628,Yuba,1024 Professional and business services,1265.0,12264.0,California,2860674.0,15438558.0
629,Yuba,1025 Education and health services,3615.0,12264.0,California,2854971.0,15438558.0
630,Yuba,1026 Leisure and hospitality,1893.0,12264.0,California,1929160.0,15438558.0
631,Yuba,1027 Other services,194.0,12264.0,California,537319.0,15438558.0


Calculate the Esi and Eri values
* Esi = (county employment in industry i / total county employment for all industries)
* Eri = (state employment in industry i / total state employment for all industries)

Then we divide Esi by Eri, and multiply by Esi to create our hachman denominator column

In [22]:
# county score per industry -- fraction of county employment in industry
hachman_denominator['county_industry_frac'] = hachman_denominator['Industry Employed County'] / hachman_denominator['Total County Employed']

# state score per industry -- fraction of state employment in industry
hachman_denominator['state_industry_frac'] = hachman_denominator['Industry Employed CA'] / hachman_denominator['Total State Employment']
hachman_denominator

Unnamed: 0,County,Industry,Industry Employed County,Total County Employed,State,Industry Employed CA,Total State Employment,county_industry_frac,state_industry_frac
0,Alameda,1011 Natural resources and mining,959.0,682308.0,California,436965.0,15438558.0,0.001406,0.028303
1,Alameda,1012 Construction,47512.0,682308.0,California,912359.0,15438558.0,0.069634,0.059096
2,Alameda,1013 Manufacturing,97583.0,682308.0,California,1335789.0,15438558.0,0.143019,0.086523
3,Alameda,"1021 Trade, transportation, and utilities",135720.0,682308.0,California,3121930.0,15438558.0,0.198913,0.202216
4,Alameda,1022 Information,18269.0,682308.0,California,605857.0,15438558.0,0.026775,0.039243
...,...,...,...,...,...,...,...,...,...
628,Yuba,1024 Professional and business services,1265.0,12264.0,California,2860674.0,15438558.0,0.103147,0.185294
629,Yuba,1025 Education and health services,3615.0,12264.0,California,2854971.0,15438558.0,0.294765,0.184925
630,Yuba,1026 Leisure and hospitality,1893.0,12264.0,California,1929160.0,15438558.0,0.154354,0.124957
631,Yuba,1027 Other services,194.0,12264.0,California,537319.0,15438558.0,0.015819,0.034804


## Calculate the Hachman denominator score per industry

In [23]:
hachman_denominator['scores'] = (hachman_denominator['county_industry_frac'] / hachman_denominator['state_industry_frac']) * hachman_denominator['county_industry_frac']
hachman_denominator

Unnamed: 0,County,Industry,Industry Employed County,Total County Employed,State,Industry Employed CA,Total State Employment,county_industry_frac,state_industry_frac,scores
0,Alameda,1011 Natural resources and mining,959.0,682308.0,California,436965.0,15438558.0,0.001406,0.028303,0.000070
1,Alameda,1012 Construction,47512.0,682308.0,California,912359.0,15438558.0,0.069634,0.059096,0.082052
2,Alameda,1013 Manufacturing,97583.0,682308.0,California,1335789.0,15438558.0,0.143019,0.086523,0.236405
3,Alameda,"1021 Trade, transportation, and utilities",135720.0,682308.0,California,3121930.0,15438558.0,0.198913,0.202216,0.195664
4,Alameda,1022 Information,18269.0,682308.0,California,605857.0,15438558.0,0.026775,0.039243,0.018269
...,...,...,...,...,...,...,...,...,...,...
628,Yuba,1024 Professional and business services,1265.0,12264.0,California,2860674.0,15438558.0,0.103147,0.185294,0.057419
629,Yuba,1025 Education and health services,3615.0,12264.0,California,2854971.0,15438558.0,0.294765,0.184925,0.469848
630,Yuba,1026 Leisure and hospitality,1893.0,12264.0,California,1929160.0,15438558.0,0.154354,0.124957,0.190667
631,Yuba,1027 Other services,194.0,12264.0,California,537319.0,15438558.0,0.015819,0.034804,0.007190


In [24]:
hachman_denominator.loc[hachman_denominator.County == 'Sierra']

Unnamed: 0,County,Industry,Industry Employed County,Total County Employed,State,Industry Employed CA,Total State Employment,county_industry_frac,state_industry_frac,scores
491,Sierra,1011 Natural resources and mining,0.0,243.0,California,436965.0,15438558.0,0.0,0.028303,0.0
492,Sierra,1012 Construction,28.0,243.0,California,912359.0,15438558.0,0.115226,0.059096,0.22467
493,Sierra,1013 Manufacturing,0.0,243.0,California,1335789.0,15438558.0,0.0,0.086523,0.0
494,Sierra,"1021 Trade, transportation, and utilities",38.0,243.0,California,3121930.0,15438558.0,0.156379,0.202216,0.120931
495,Sierra,1022 Information,0.0,243.0,California,605857.0,15438558.0,0.0,0.039243,0.0
496,Sierra,1023 Financial activities,0.0,243.0,California,841564.0,15438558.0,0.0,0.054511,0.0
497,Sierra,1024 Professional and business services,10.0,243.0,California,2860674.0,15438558.0,0.041152,0.185294,0.00914
498,Sierra,1025 Education and health services,76.0,243.0,California,2854971.0,15438558.0,0.312757,0.184925,0.528956
499,Sierra,1026 Leisure and hospitality,88.0,243.0,California,1929160.0,15438558.0,0.36214,0.124957,1.049521
500,Sierra,1027 Other services,3.0,243.0,California,537319.0,15438558.0,0.012346,0.034804,0.004379


## Now we sum the Hachman denominator industry values together per county

In [41]:
hachman_denominator_sum = hachman_denominator.groupby('County')['scores'].sum().reset_index()
hachman_denominator_sum.head()

Unnamed: 0,County,scores
0,Alameda,1.079017
1,Alpine,4.460338
2,Amador,1.215431
3,Butte,1.16825
4,Calaveras,1.239384


## Take the reciprical for each county score to get the final Hachman index value

In [26]:
hachman_denominator_sum['hachman_index'] = 1 / hachman_denominator_sum['scores']
hachman_denominator_sum.columns = hachman_denominator_sum.columns.str.lower()
hachman_denominator_sum = hachman_denominator_sum.applymap(lambda s: s.lower() if type(s) == str else s)
hachman_denominator_sum

Unnamed: 0,county,scores,hachman_index
0,alameda,1.079017,0.926769
1,alpine,4.460338,0.224198
2,amador,1.215431,0.822754
3,butte,1.16825,0.855981
4,calaveras,1.239384,0.806853
5,colusa,5.351706,0.186856
6,contra costa,1.086963,0.919995
7,del norte,1.375589,0.726961
8,el dorado,1.204412,0.83028
9,fresno,1.377575,0.725913


In [27]:
# should not exceed 1
data_stats_check(hachman_denominator_sum, 'hachman_index')

Calculating stats on hachman_index...
Data min:  0.18685629935584028
Data max:  0.9557300478674501
Data mean:  0.6905868625224101




## Merge with California census tracts

In [28]:
# read in CA census tiger file
ca_tract_county = "s3://ca-climate-index/0_map_data/ca_tracts_county.csv"
ca_tract_county = gpd.read_file(ca_tract_county)
ca_tract_county = ca_tract_county.drop(columns={'field_1', 'geometry', 'COUNTYFP'})
ca_tract_county.columns = ca_tract_county.columns.str.lower()
ca_tract_county = ca_tract_county.applymap(lambda s: s.lower() if type(s) == str else s)

ca_tract_county

Unnamed: 0,tract,county
0,06085504321,santa clara
1,06085504410,santa clara
2,06085507003,santa clara
3,06085507004,santa clara
4,06085502204,santa clara
...,...,...
9124,06059001303,orange
9125,06059001304,orange
9126,06059001401,orange
9127,06013367200,contra costa


In [29]:
hachman_metric = pd.merge(ca_tract_county, hachman_denominator_sum, on='county', how='left')
hachman_metric

Unnamed: 0,tract,county,scores,hachman_index
0,06085504321,santa clara,1.267918,0.788695
1,06085504410,santa clara,1.267918,0.788695
2,06085507003,santa clara,1.267918,0.788695
3,06085507004,santa clara,1.267918,0.788695
4,06085502204,santa clara,1.267918,0.788695
...,...,...,...,...
9124,06059001303,orange,1.070369,0.934257
9125,06059001304,orange,1.070369,0.934257
9126,06059001401,orange,1.070369,0.934257
9127,06013367200,contra costa,1.086963,0.919995


In [30]:
display_county_data(hachman_metric, 'county', 'yuba')

Data for yuba:


Unnamed: 0,tract,county,scores,hachman_index
2013,6115040500,yuba,1.275293,0.784133
2014,6115040600,yuba,1.275293,0.784133
2936,6115040100,yuba,1.275293,0.784133
2937,6115040800,yuba,1.275293,0.784133
2938,6115040902,yuba,1.275293,0.784133
2939,6115040901,yuba,1.275293,0.784133
2940,6115040400,yuba,1.275293,0.784133
4236,6115040301,yuba,1.275293,0.784133
4303,6115040302,yuba,1.275293,0.784133
5835,6115041102,yuba,1.275293,0.784133


In [42]:
# Save as a csv for function call
hachman_metric.to_csv('society_economy_hachman_metric.csv', index=False)

## Function Call

In [32]:
@append_metadata
def hachman_index_upload(input_csv, export=False, varname=''):
    '''
    Uploads the calculated Public Safety Power Shutoff (PSPS) metric to S3 bucket. The metrics is:
    Frequency of PSPS events per California census tract.

    Data for this metric was sourced from the US Bureau of Labor Statistics at:
    https://www.bls.gov/cew/downloadable-data-files.htm

    Hachman Index methodology was followed from:
    https://d36oiwf74r1rap.cloudfront.net/wp-content/uploads/ERG-Hachman-RB-Mar2023.pdf

    Methods
    -------
    Data Collection: 
    Gathered employment data at the industry level for California and its counties.

    Hachman Score Calculation:
    Calculated new total employment values based on industry-specific employment data due to inconsistencies in source data.
    Computed each county's employment per industry as a proportion of its total employment.
    Computed California's employment per industry as a proportion of its total employment.
    Divided each county's industry employment proportion by California's corresponding proportion, then multiplied by the county's proportion.
    Summed these values for each county.
    Took the reciprocal of each county's sum to obtain the Hachman Index value.

    Integration with Census Data:
    Merged the Hachman Index values with 2021 California Census data to assign each census tract the Hachman value of its county.

    Parameters
    ----------
    input_csv: string
        csv Hachman calculated data 
    export: True/False boolean
        False = will not upload resulting df containing CAL CRAI Hachman metric to AWS
        True = will upload resulting df containing CAL CRAI Hachman metric to AWS

    Script
    ------
    society_economic_metrics_hachman.ipynb

    Note:
    This function assumes users have configured the AWS CLI such that their access key / secret key pair are 
    stored in ~/.aws/credentials.
    See https://docs.aws.amazon.com/cli/latest/userguide/getting-started-install.html for guidance.
    '''
    print('Data transformation: New total employment values calculated for California and its counties.')
    print('Data transformation: Dropped the following columns as they summarized counts from other industries: 101 Goods-producing and 102 Service-providing')
    print('Data transformation: Removed unknown or uncategorized entries within the county column.')
    print('Data transformation: Isolated relevant columns and created new ones resulting for Hachman calculations.')
    print('Data transformation: Resulting Hachman calculation per county was extrapolated to California census tracts.')
 
    if export == True:
        bucket_name = 'ca-climate-index'
        directory = '3_fair_data/index_data'
        export_filename = [input_csv]
        upload_csv_aws(export_filename, bucket_name, directory)

    if export == False:
        print(f'{input_csv} uploaded to AWS.')
 
'''   if os.path.exists(input_csv):
        os.remove(input_csv)'''

'   if os.path.exists(input_csv):\n        os.remove(input_csv)'

In [33]:
input_csv = 'society_economy_hachman_metric.csv'
varname = 'society_bls_hachman'

hachman_index_upload(input_csv, export=True, varname='test') #varname)