### Cal-CRAI Metric Calculation for: Economic Health
* Hachman Index

In [1]:
import pandas as pd
import os
import sys
import numpy as np

# suppress pandas purely educational warnings
from warnings import simplefilter
simplefilter(action="ignore", category=pd.errors.PerformanceWarning)

sys.path.append(os.path.expanduser('../../'))
from scripts.utils.file_helpers import pull_csv_from_directory, upload_csv_aws, filter_counties
from scripts.utils.write_metadata import append_metadata

In [2]:
# Define a function to display data for a specific county
def display_county_data(df, county_name):
    county_data = df[df['County'] == county_name]
    if county_data.empty:
        print(f"No data found for {county_name}")
    else:
        print(f"Data for {county_name}:")
        display(county_data)  

In [3]:
# pull csv from aws
bucket_name = 'ca-climate-index'
aws_dir = '2a_subset/society_economy/economic_health/bureau_labor_statistics/employment_hachman_index/'

pull_csv_from_directory(bucket_name, aws_dir, search_zipped=False)

aws_dir = '1_pull_data/society_economy/economic_health/'
pull_csv_from_directory(bucket_name, aws_dir, search_zipped=False)

Saved DataFrame as 'employment_data_hachman_subset.csv'
Saved DataFrame as 'ACSDT5Y2022.B19083-Column-Metadata.csv'
Saved DataFrame as 'ACSDT5Y2022.B19083-Data.csv'
Saved DataFrame as 'ACSST5Y2022.S1901-Column-Metadata.csv'


  df = pd.read_csv(csv_data)


Saved DataFrame as 'ACSST5Y2022.S1901-Data.csv'


  df = pd.read_csv(csv_data)


Saved DataFrame as 'allhlcn22.csv'
Saved DataFrame as 'data_layout.csv'


In [4]:
hachman_data = pd.read_csv('employment_data_hachman_subset.csv')

In [5]:
county_tract = "s3://ca-climate-index/0_map_data/ca_tract_county_population_2022.csv"
ca_county_tract = pd.read_csv(county_tract)
ca_county_tract = ca_county_tract.rename(columns={'Census Tract': 'GEO_ID'})
ca_county_tract = ca_county_tract.drop(columns={'Unnamed: 0', 'COUNTYFP', 'County', 'Total Population 2021'})

### Hachman Index Metric
$$
HI = \frac{1}{Σ_j (\ \left(\frac{E_si}{E_ri}\right) \cdot E_si)}
$$

{E_si} is share of area economic indicator in industry i -- county level <br>
{E_ri} is share of regions economic indicator in industry i -- state level

Several levels of data clean-up need to occur to calculate the Hachman Index. 
* Clean-up the County naming, dropping all statewide metrics
* Drop counts for "Service-providing" and "Goods-producing" -- these sum other columns! 

In [29]:
hachman_data

Unnamed: 0,Area\nCode,St,Cnty,Own,NAICS,Year,Qtr,Area Type,St Name,Area,Ownership,Industry,Annual Average Status Code,Annual Average Establishment Count,Annual Average Employment,Annual Total Wages,Annual Average Weekly Wage,Annual Average Pay,Employment Location Quotient Relative to U.S.,Total Wage Location Quotient Relative to U.S.
0,6000,6,0,0,10,2022,A,State,California,California -- Statewide,Total Covered,"10 Total, all industries",,1706672,17903539,1511706499616,1624,84436,1.00,1.00
1,6000,6,0,1,10,2022,A,State,California,California -- Statewide,Federal Government,"10 Total, all industries",,3286,247795,23655167888,1836,95463,0.73,0.62
2,6000,6,0,2,10,2022,A,State,California,California -- Statewide,State Government,"10 Total, all industries",,13784,492218,47749398000,1866,97009,0.91,1.00
3,6000,6,0,3,10,2022,A,State,California,California -- Statewide,Local Government,"10 Total, all industries",,19344,1724972,135804850033,1514,78729,1.04,1.11
4,6000,6,0,5,10,2022,A,State,California,California -- Statewide,Private,"10 Total, all industries",,1670259,15438555,1304497083695,1625,84496,1.01,1.00
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1069,6999,6,999,5,1024,2022,A,County,California,"Unknown Or Undefined, California",Private,1024 Professional and business services,,26188,236341,25491212708,2074,107858,3.05,2.03
1070,6999,6,999,5,1025,2022,A,County,California,"Unknown Or Undefined, California",Private,1025 Education and health services,,4401,30314,2854407977,1811,94162,0.38,0.34
1071,6999,6,999,5,1026,2022,A,County,California,"Unknown Or Undefined, California",Private,1026 Leisure and hospitality,,2121,10276,932147336,1744,90713,0.19,0.33
1072,6999,6,999,5,1027,2022,A,County,California,"Unknown Or Undefined, California",Private,1027 Other services,N,3031,0,0,0,0,0.00,0.00


In [6]:
hachman_data.head()

Unnamed: 0,Area\nCode,St,Cnty,Own,NAICS,Year,Qtr,Area Type,St Name,Area,Ownership,Industry,Annual Average Status Code,Annual Average Establishment Count,Annual Average Employment,Annual Total Wages,Annual Average Weekly Wage,Annual Average Pay,Employment Location Quotient Relative to U.S.,Total Wage Location Quotient Relative to U.S.
0,6000,6,0,0,10,2022,A,State,California,California -- Statewide,Total Covered,"10 Total, all industries",,1706672,17903539,1511706499616,1624,84436,1.0,1.0
1,6000,6,0,1,10,2022,A,State,California,California -- Statewide,Federal Government,"10 Total, all industries",,3286,247795,23655167888,1836,95463,0.73,0.62
2,6000,6,0,2,10,2022,A,State,California,California -- Statewide,State Government,"10 Total, all industries",,13784,492218,47749398000,1866,97009,0.91,1.0
3,6000,6,0,3,10,2022,A,State,California,California -- Statewide,Local Government,"10 Total, all industries",,19344,1724972,135804850033,1514,78729,1.04,1.11
4,6000,6,0,5,10,2022,A,State,California,California -- Statewide,Private,"10 Total, all industries",,1670259,15438555,1304497083695,1625,84496,1.01,1.0


Taking a look at the entries within area
* separating California entries as we will need that data also
* getting rid of all 'County, California' portions of each entry

In [7]:
# Get unique entries in 'Column1'
unique_entries = hachman_data['Area'].unique()
#print(unique_entries)

In [8]:
hachman_data_cleaned = hachman_data[['Area', 'Industry', 'Annual Average Employment']]
# Remove any mention of 'county' within the legalAgencyName column
hachman_data_cleaned['Area'] = hachman_data_cleaned['Area'].str.replace(' -- Statewide', '', case=False)
hachman_data_cleaned['Area'] = hachman_data_cleaned['Area'].str.replace(' County, California', '', case=False)

unique_entries = hachman_data_cleaned['Area'].unique()
hachman_data_cleaned = hachman_data_cleaned.rename(columns={'Area':'County'})

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  hachman_data_cleaned['Area'] = hachman_data_cleaned['Area'].str.replace(' -- Statewide', '', case=False)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  hachman_data_cleaned['Area'] = hachman_data_cleaned['Area'].str.replace(' County, California', '', case=False)


In [9]:
hachman_data_cleaned

Unnamed: 0,County,Industry,Annual Average Employment
0,California,"10 Total, all industries",17903539
1,California,"10 Total, all industries",247795
2,California,"10 Total, all industries",492218
3,California,"10 Total, all industries",1724972
4,California,"10 Total, all industries",15438555
...,...,...,...
1069,"Unknown Or Undefined, California",1024 Professional and business services,236341
1070,"Unknown Or Undefined, California",1025 Education and health services,30314
1071,"Unknown Or Undefined, California",1026 Leisure and hospitality,10276
1072,"Unknown Or Undefined, California",1027 Other services,0


In [10]:
display_county_data(hachman_data_cleaned, 'Alameda')

Data for Alameda:


Unnamed: 0,County,Industry,Annual Average Employment
18,Alameda,"10 Total, all industries",787020
19,Alameda,"10 Total, all industries",8304
20,Alameda,"10 Total, all industries",25618
21,Alameda,"10 Total, all industries",70792
22,Alameda,"10 Total, all industries",682306
23,Alameda,101 Goods-producing,146054
24,Alameda,1011 Natural resources and mining,959
25,Alameda,1012 Construction,47512
26,Alameda,1013 Manufacturing,97583
27,Alameda,102 Service-providing,536253


There are multiple entries of '10 Total, all industries' within the Industry column for each county (and CA as a whole)
* the first entry per county is the largest and sum of all others, indicating total employment for that county
* the code below retains the first/largest number of this bunch and makes a new df containing these values
which will be used in the Hachman index further below

In [11]:
df = pd.DataFrame(hachman_data_cleaned)

# Filter for '10 Total, all industries' in the 'Industry' column
df_filtered = df[df['Industry'] == '10 Total, all industries']

# Remove commas and convert 'Annual Average Employment' to numeric
df_filtered['Total County Employment'] = df_filtered['Annual Average Employment'].str.replace(',', '').astype(float)

# Group by Area and get the row with the max 'Annual Average Employment'
max_area_values = df_filtered.loc[df_filtered.groupby('County')['Total County Employment'].idxmax()]
max_area_values = max_area_values.drop(columns=['Industry','Annual Average Employment'])
max_area_values

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_filtered['Total County Employment'] = df_filtered['Annual Average Employment'].str.replace(',', '').astype(float)


Unnamed: 0,County,Total County Employment
18,Alameda,787020.0
36,Alpine,668.0
52,Amador,12114.0
70,Butte,78536.0
88,Calaveras,10626.0
0,California,17903539.0
106,Colusa,8901.0
124,Contra Costa,367582.0
142,Del Norte,8104.0
160,El Dorado,57308.0


Now that we have a new df with the total employments, we can get rid of those entries within our main df

In [12]:
# create a Boolean mask for the rows to remove
mask = hachman_data_cleaned['Industry'] == '10 Total, all industries'
# select all rows except the ones that contain '10 Total, all industries' which are state and county employment totals, not needed
hachman_data_cleaned = hachman_data_cleaned[~mask]

Running our county function filter to separate CA counties and other entries

In [13]:
filtered_hachman_data, omitted_data = filter_counties(hachman_data_cleaned, 'County')

print('Counties kept:', len(filtered_hachman_data))
print('Omitted data entries:', len(omitted_data))

Counties kept: 749
Omitted data entries: 26


In [14]:
filtered_hachman_data.rename(columns={'Annual Average Employment': 'Industry Employed County'}, inplace=True)
filtered_hachman_data

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  filtered_hachman_data.rename(columns={'Annual Average Employment': 'Industry Employed County'}, inplace=True)


Unnamed: 0,County,Industry,Industry Employed County
23,Alameda,101 Goods-producing,146054
24,Alameda,1011 Natural resources and mining,959
25,Alameda,1012 Construction,47512
26,Alameda,1013 Manufacturing,97583
27,Alameda,102 Service-providing,536253
...,...,...,...
1051,Yuba,1024 Professional and business services,1265
1052,Yuba,1025 Education and health services,3615
1053,Yuba,1026 Leisure and hospitality,1893
1054,Yuba,1027 Other services,194


Looking at the other entries, will isolate for just California data, which will be used in the Hachman calculation

In [15]:
omitted_data

Unnamed: 0,County,Industry,Annual Average Employment
5,California,101 Goods-producing,2685113
6,California,1011 Natural resources and mining,436965
7,California,1012 Construction,912359
8,California,1013 Manufacturing,1335789
9,California,102 Service-providing,12753442
10,California,"1021 Trade, transportation, and utilities",3121930
11,California,1022 Information,605857
12,California,1023 Financial activities,841564
13,California,1024 Professional and business services,2860674
14,California,1025 Education and health services,2854971


In [16]:
# Get rid of the Unknown entries
mask = omitted_data['County'] == 'Unknown Or Undefined, California'
california_employ_data = omitted_data[~mask]
california_employ_data.rename(columns={'Annual Average Employment': 'Industry Employed CA', 'County': 'State'}, inplace=True)

# Now we have a df that holds all CA state employment per industry
california_employ_data

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  california_employ_data.rename(columns={'Annual Average Employment': 'Industry Employed CA', 'County': 'State'}, inplace=True)


Unnamed: 0,State,Industry,Industry Employed CA
5,California,101 Goods-producing,2685113
6,California,1011 Natural resources and mining,436965
7,California,1012 Construction,912359
8,California,1013 Manufacturing,1335789
9,California,102 Service-providing,12753442
10,California,"1021 Trade, transportation, and utilities",3121930
11,California,1022 Information,605857
12,California,1023 Financial activities,841564
13,California,1024 Professional and business services,2860674
14,California,1025 Education and health services,2854971


Merge the county and california employment datasets together based on industry so we have county employment and state employment per industry

In [17]:
# Merge county industry employment with state industry employment
merged_data = filtered_hachman_data.merge(california_employ_data, on='Industry')
merged_data

Unnamed: 0,County,Industry,Industry Employed County,State,Industry Employed CA
0,Alameda,101 Goods-producing,146054,California,2685113
1,Alpine,101 Goods-producing,13,California,2685113
2,Amador,101 Goods-producing,1865,California,2685113
3,Butte,101 Goods-producing,11325,California,2685113
4,Calaveras,101 Goods-producing,1500,California,2685113
...,...,...,...,...,...
744,Tulare,1029 Unclassified,5,California,1970
745,Tuolumne,1029 Unclassified,0,California,1970
746,Ventura,1029 Unclassified,17,California,1970
747,Yolo,1029 Unclassified,11,California,1970


Merge the result to the total employment values based on county
* result gives us an additional column with total county values

In [18]:
further_merged_data = merged_data.merge(max_area_values, on='County')
further_merged_data

Unnamed: 0,County,Industry,Industry Employed County,State,Industry Employed CA,Total County Employment
0,Alameda,101 Goods-producing,146054,California,2685113,787020.0
1,Alameda,1011 Natural resources and mining,959,California,436965,787020.0
2,Alameda,1012 Construction,47512,California,912359,787020.0
3,Alameda,1013 Manufacturing,97583,California,1335789,787020.0
4,Alameda,102 Service-providing,536253,California,12753442,787020.0
...,...,...,...,...,...,...
744,Yuba,1024 Professional and business services,1265,California,2860674,20314.0
745,Yuba,1025 Education and health services,3615,California,2854971,20314.0
746,Yuba,1026 Leisure and hospitality,1893,California,1929160,20314.0
747,Yuba,1027 Other services,194,California,537319,20314.0


Add another column with the total employment value in California

In [19]:
hachman_denominator = pd.DataFrame(further_merged_data)
hachman_denominator['Total State Employment'] = 17903539.0

hachman_denominator

Unnamed: 0,County,Industry,Industry Employed County,State,Industry Employed CA,Total County Employment,Total State Employment
0,Alameda,101 Goods-producing,146054,California,2685113,787020.0,17903539.0
1,Alameda,1011 Natural resources and mining,959,California,436965,787020.0,17903539.0
2,Alameda,1012 Construction,47512,California,912359,787020.0,17903539.0
3,Alameda,1013 Manufacturing,97583,California,1335789,787020.0,17903539.0
4,Alameda,102 Service-providing,536253,California,12753442,787020.0,17903539.0
...,...,...,...,...,...,...,...
744,Yuba,1024 Professional and business services,1265,California,2860674,20314.0,17903539.0
745,Yuba,1025 Education and health services,3615,California,2854971,20314.0,17903539.0
746,Yuba,1026 Leisure and hospitality,1893,California,1929160,20314.0,17903539.0
747,Yuba,1027 Other services,194,California,537319,20314.0,17903539.0


Calculate the Esi and Eri values
* Esi = (county employment in industry i / total county employment for all industries)
* Eri = (state employment in industry i / total state employment for all industries)

Then we divide Esi by Eri, and multiply by Esi to create our hachman denominator column

In [20]:
hachman_denominator['Industry Employed CA'] = hachman_denominator['Industry Employed CA'].str.replace(',', '').astype(float)
hachman_denominator['Industry Employed County'] = hachman_denominator['Industry Employed County'].str.replace(',', '').astype(float)

In [22]:
# county score per industry -- fraction of county employment in industry
hachman_denominator['county_industry_frac'] = hachman_denominator['Industry Employed County'] / hachman_denominator['Total County Employment']

# state score per industry -- fraction of state employment in industry
hachman_denominator['state_industry_frac'] = hachman_denominator['Industry Employed CA'] / hachman_denominator['Total State Employment']
hachman_denominator

Unnamed: 0,County,Industry,Industry Employed County,State,Industry Employed CA,Total County Employment,Total State Employment,county_industry_frac,state_industry_frac
0,Alameda,101 Goods-producing,146054.0,California,2685113.0,787020.0,17903539.0,0.185579,0.149977
1,Alameda,1011 Natural resources and mining,959.0,California,436965.0,787020.0,17903539.0,0.001219,0.024407
2,Alameda,1012 Construction,47512.0,California,912359.0,787020.0,17903539.0,0.060369,0.050960
3,Alameda,1013 Manufacturing,97583.0,California,1335789.0,787020.0,17903539.0,0.123990,0.074610
4,Alameda,102 Service-providing,536253.0,California,12753442.0,787020.0,17903539.0,0.681372,0.712342
...,...,...,...,...,...,...,...,...,...
744,Yuba,1024 Professional and business services,1265.0,California,2860674.0,20314.0,17903539.0,0.062272,0.159783
745,Yuba,1025 Education and health services,3615.0,California,2854971.0,20314.0,17903539.0,0.177956,0.159464
746,Yuba,1026 Leisure and hospitality,1893.0,California,1929160.0,20314.0,17903539.0,0.093187,0.107753
747,Yuba,1027 Other services,194.0,California,537319.0,20314.0,17903539.0,0.009550,0.030012


In [23]:
hachman_denominator['scores'] = (hachman_denominator['county_industry_frac'] / hachman_denominator['state_industry_frac']) * hachman_denominator['county_industry_frac']
hachman_denominator

Unnamed: 0,County,Industry,Industry Employed County,State,Industry Employed CA,Total County Employment,Total State Employment,county_industry_frac,state_industry_frac,scores
0,Alameda,101 Goods-producing,146054.0,California,2685113.0,787020.0,17903539.0,0.185579,0.149977,0.229632
1,Alameda,1011 Natural resources and mining,959.0,California,436965.0,787020.0,17903539.0,0.001219,0.024407,0.000061
2,Alameda,1012 Construction,47512.0,California,912359.0,787020.0,17903539.0,0.060369,0.050960,0.071517
3,Alameda,1013 Manufacturing,97583.0,California,1335789.0,787020.0,17903539.0,0.123990,0.074610,0.206052
4,Alameda,102 Service-providing,536253.0,California,12753442.0,787020.0,17903539.0,0.681372,0.712342,0.651748
...,...,...,...,...,...,...,...,...,...,...
744,Yuba,1024 Professional and business services,1265.0,California,2860674.0,20314.0,17903539.0,0.062272,0.159783,0.024269
745,Yuba,1025 Education and health services,3615.0,California,2854971.0,20314.0,17903539.0,0.177956,0.159464,0.198593
746,Yuba,1026 Leisure and hospitality,1893.0,California,1929160.0,20314.0,17903539.0,0.093187,0.107753,0.080590
747,Yuba,1027 Other services,194.0,California,537319.0,20314.0,17903539.0,0.009550,0.030012,0.003039


In [28]:
hachman_denominator.loc[hachman_denominator.County == 'Sierra']

Unnamed: 0,County,Industry,Industry Employed County,State,Industry Employed CA,Total County Employment,Total State Employment,county_industry_frac,state_industry_frac,scores
581,Sierra,101 Goods-producing,50.0,California,2685113.0,562.0,17903539.0,0.088968,0.149977,0.052777
582,Sierra,1011 Natural resources and mining,0.0,California,436965.0,562.0,17903539.0,0.0,0.024407,0.0
583,Sierra,1012 Construction,28.0,California,912359.0,562.0,17903539.0,0.049822,0.05096,0.04871
584,Sierra,1013 Manufacturing,0.0,California,1335789.0,562.0,17903539.0,0.0,0.07461,0.0
585,Sierra,102 Service-providing,216.0,California,12753442.0,562.0,17903539.0,0.384342,0.712342,0.20737
586,Sierra,"1021 Trade, transportation, and utilities",38.0,California,3121930.0,562.0,17903539.0,0.067616,0.174375,0.026219
587,Sierra,1022 Information,0.0,California,605857.0,562.0,17903539.0,0.0,0.03384,0.0
588,Sierra,1023 Financial activities,0.0,California,841564.0,562.0,17903539.0,0.0,0.047005,0.0
589,Sierra,1024 Professional and business services,10.0,California,2860674.0,562.0,17903539.0,0.017794,0.159783,0.001982
590,Sierra,1025 Education and health services,76.0,California,2854971.0,562.0,17903539.0,0.135231,0.159464,0.114681


Now we sum the Hachman denominator values together per county
* Group by the county and sum Hachman denominator

In [24]:
hachman_denominator_sum = hachman_denominator.groupby('County')['scores'].sum().reset_index()
hachman_denominator_sum

Unnamed: 0,County,scores
0,Alameda,1.82186
1,Alpine,2.470772
2,Amador,1.013326
3,Butte,1.653901
4,Calaveras,1.396509
5,Colusa,4.715549
6,Contra Costa,1.869199
7,Del Norte,0.868434
8,El Dorado,1.750104
9,Fresno,1.924815


In [26]:
hachman_denominator_sum['hachman_index'] = 1 / hachman_denominator_sum['scores']
hachman_denominator_sum

Unnamed: 0,County,scores,hi,hachman_index
0,Alameda,1.82186,0.54889,0.54889
1,Alpine,2.470772,0.404732,0.404732
2,Amador,1.013326,0.986849,0.986849
3,Butte,1.653901,0.604631,0.604631
4,Calaveras,1.396509,0.716071,0.716071
5,Colusa,4.715549,0.212064,0.212064
6,Contra Costa,1.869199,0.534988,0.534988
7,Del Norte,0.868434,1.151498,1.151498
8,El Dorado,1.750104,0.571395,0.571395
9,Fresno,1.924815,0.51953,0.51953


### Final answer in hachman_calculation df, the numbers are way too small

In [None]:
hachman_calculation

### The code below is running the calculation steps together

In [None]:
merged_data = filtered_hachman_data.merge(california_employ_data, on='Industry')
denominator = pd.DataFrame(merged_data)
merged_data['EMP_COUNTY'] = merged_data['EMP_COUNTY'].str.replace(',', '').astype(float)
merged_data['EMP_CA'] = merged_data['EMP_CA'].str.replace(',', '').astype(float)

denominator['emp_county_div_emp_ca'] = (merged_data['EMP_COUNTY'] / merged_data['EMP_CA']) * merged_data['EMP_COUNTY']

pd.set_option('display.float_format', '{:,.2f}'.format)

denominator_county_sum = denominator.groupby('County')['emp_county_div_emp_ca'].sum().reset_index()
denominator_county_sum.head()

hachman_calculation = pd.DataFrame(denominator_county_sum)
hachman_calculation['hachman_index_value'] = 1 / denominator_county_sum['emp_county_div_emp_ca']
hachman_calculation
