### Cal-CRAI Metric Calculation for: Economic Health
* Hachman Index

In [1]:
import pandas as pd
import os
import sys
import numpy as np

# suppress pandas purely educational warnings
from warnings import simplefilter
simplefilter(action="ignore", category=pd.errors.PerformanceWarning)

sys.path.append(os.path.expanduser('../../'))
from scripts.utils.file_helpers import pull_csv_from_directory, upload_csv_aws, filter_counties
from scripts.utils.write_metadata import append_metadata

In [2]:
# Define a function to display data for a specific county
def display_county_data(df, county_name):
    county_data = df[df['County'] == county_name]
    if county_data.empty:
        print(f"No data found for {county_name}")
    else:
        print(f"Data for {county_name}:")
        display(county_data)  

In [3]:
# pull csv from aws
bucket_name = 'ca-climate-index'
aws_dir = '2a_subset/society_economy/economic_health/bureau_labor_statistics/employment_hachman_index/'

pull_csv_from_directory(bucket_name, aws_dir, search_zipped=False)

aws_dir = '1_pull_data/society_economy/economic_health/'
pull_csv_from_directory(bucket_name, aws_dir, search_zipped=False)

Saved DataFrame as 'employment_data_hachman_subset.csv'
Saved DataFrame as 'ACSDT5Y2022.B19083-Column-Metadata.csv'
Saved DataFrame as 'ACSDT5Y2022.B19083-Data.csv'
Saved DataFrame as 'ACSST5Y2022.S1901-Column-Metadata.csv'


  df = pd.read_csv(csv_data)


Saved DataFrame as 'ACSST5Y2022.S1901-Data.csv'


  df = pd.read_csv(csv_data)


Saved DataFrame as 'allhlcn22.csv'
Saved DataFrame as 'data_layout.csv'


In [4]:
hachman_data = pd.read_csv('employment_data_hachman_subset.csv')

In [5]:
county_tract = "s3://ca-climate-index/0_map_data/ca_tract_county_population_2022.csv"
ca_county_tract = pd.read_csv(county_tract)
ca_county_tract = ca_county_tract.rename(columns={'Census Tract': 'GEO_ID'})
ca_county_tract = ca_county_tract.drop(columns={'Unnamed: 0', 'COUNTYFP', 'County', 'Total Population 2021'})

### Hachman Index Metric
$$
HI = \frac{1}{Σ_j (\ \left(\frac{E_si}{E_ri}\right) \cdot E_si)}
$$

{E_si} is share of area economic indicator in industry i -- county level <br>
{E_ri} is share of regions economic indicator in industry i -- state level

Several levels of data clean-up need to occur to calculate the Hachman Index. 
* Clean-up the County naming, dropping all statewide metrics
* Drop counts for "Service-providing" and "Goods-producing" -- these sum other columns! 

In [6]:
len(hachman_data)

1074

In [7]:
hachman_data.head(18)

Unnamed: 0,Area\nCode,St,Cnty,Own,NAICS,Year,Qtr,Area Type,St Name,Area,Ownership,Industry,Annual Average Status Code,Annual Average Establishment Count,Annual Average Employment,Annual Total Wages,Annual Average Weekly Wage,Annual Average Pay,Employment Location Quotient Relative to U.S.,Total Wage Location Quotient Relative to U.S.
0,6000,6,0,0,10,2022,A,State,California,California -- Statewide,Total Covered,"10 Total, all industries",,1706672,17903539,1511706499616,1624,84436,1.0,1.0
1,6000,6,0,1,10,2022,A,State,California,California -- Statewide,Federal Government,"10 Total, all industries",,3286,247795,23655167888,1836,95463,0.73,0.62
2,6000,6,0,2,10,2022,A,State,California,California -- Statewide,State Government,"10 Total, all industries",,13784,492218,47749398000,1866,97009,0.91,1.0
3,6000,6,0,3,10,2022,A,State,California,California -- Statewide,Local Government,"10 Total, all industries",,19344,1724972,135804850033,1514,78729,1.04,1.11
4,6000,6,0,5,10,2022,A,State,California,California -- Statewide,Private,"10 Total, all industries",,1670259,15438555,1304497083695,1625,84496,1.01,1.0
5,6000,6,0,5,101,2022,A,State,California,California -- Statewide,Private,101 Goods-producing,,153194,2685113,252628954106,1809,94085,1.01,1.03
6,6000,6,0,5,1011,2022,A,State,California,California -- Statewide,Private,1011 Natural resources and mining,,17518,436965,19761965357,870,45226,2.02,1.14
7,6000,6,0,5,1012,2022,A,State,California,California -- Statewide,Private,1012 Construction,,90502,912359,75509315341,1592,82763,0.99,0.93
8,6000,6,0,5,1013,2022,A,State,California,California -- Statewide,Private,1013 Manufacturing,,45174,1335789,157357673408,2265,117801,0.88,1.08
9,6000,6,0,5,102,2022,A,State,California,California -- Statewide,Private,102 Service-providing,,1517065,12753442,1051868129589,1586,82477,1.0,0.99


Drop the following rows:
* "101 Goods-producing" (it's a summary of the 101X categories)
* "102 Service-providing" (it's a summary of the 102X categories)

In [8]:
# create a Boolean mask for the rows to remove
mask101 = hachman_data['Industry'] == '101 Goods-producing'
mask102 = hachman_data['Industry'] == '102 Service-providing'

# select all rows except the ones that contain either
hachman_data_cleaned = hachman_data[~mask101]
hachman_data_cleaned = hachman_data_cleaned[~mask102]
hachman_data_cleaned

  hachman_data_cleaned = hachman_data_cleaned[~mask102]


Unnamed: 0,Area\nCode,St,Cnty,Own,NAICS,Year,Qtr,Area Type,St Name,Area,Ownership,Industry,Annual Average Status Code,Annual Average Establishment Count,Annual Average Employment,Annual Total Wages,Annual Average Weekly Wage,Annual Average Pay,Employment Location Quotient Relative to U.S.,Total Wage Location Quotient Relative to U.S.
0,6000,6,0,0,10,2022,A,State,California,California -- Statewide,Total Covered,"10 Total, all industries",,1706672,17903539,1511706499616,1624,84436,1.00,1.00
1,6000,6,0,1,10,2022,A,State,California,California -- Statewide,Federal Government,"10 Total, all industries",,3286,247795,23655167888,1836,95463,0.73,0.62
2,6000,6,0,2,10,2022,A,State,California,California -- Statewide,State Government,"10 Total, all industries",,13784,492218,47749398000,1866,97009,0.91,1.00
3,6000,6,0,3,10,2022,A,State,California,California -- Statewide,Local Government,"10 Total, all industries",,19344,1724972,135804850033,1514,78729,1.04,1.11
4,6000,6,0,5,10,2022,A,State,California,California -- Statewide,Private,"10 Total, all industries",,1670259,15438555,1304497083695,1625,84496,1.01,1.00
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1069,6999,6,999,5,1024,2022,A,County,California,"Unknown Or Undefined, California",Private,1024 Professional and business services,,26188,236341,25491212708,2074,107858,3.05,2.03
1070,6999,6,999,5,1025,2022,A,County,California,"Unknown Or Undefined, California",Private,1025 Education and health services,,4401,30314,2854407977,1811,94162,0.38,0.34
1071,6999,6,999,5,1026,2022,A,County,California,"Unknown Or Undefined, California",Private,1026 Leisure and hospitality,,2121,10276,932147336,1744,90713,0.19,0.33
1072,6999,6,999,5,1027,2022,A,County,California,"Unknown Or Undefined, California",Private,1027 Other services,N,3031,0,0,0,0,0.00,0.00


Taking a look at the entries within area
* separating California entries as we will need that data also
* getting rid of all 'County, California' portions of each entry

In [9]:
# Get unique entries in 'Column1'
unique_entries = hachman_data_cleaned['Area'].unique()
# print(unique_entries)

In [10]:
hachman_data_cleaned = hachman_data_cleaned[['Area', 'Industry', 'Annual Average Employment']]
# Remove any mention of 'county' within the legalAgencyName column
hachman_data_cleaned['Area'] = hachman_data_cleaned['Area'].str.replace(' -- Statewide', '', case=False)
hachman_data_cleaned['Area'] = hachman_data_cleaned['Area'].str.replace(' County, California', '', case=False)

unique_entries = hachman_data_cleaned['Area'].unique()
hachman_data_cleaned = hachman_data_cleaned.rename(columns={'Area':'County'})

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  hachman_data_cleaned['Area'] = hachman_data_cleaned['Area'].str.replace(' -- Statewide', '', case=False)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  hachman_data_cleaned['Area'] = hachman_data_cleaned['Area'].str.replace(' County, California', '', case=False)


In [11]:
hachman_data_cleaned

Unnamed: 0,County,Industry,Annual Average Employment
0,California,"10 Total, all industries",17903539
1,California,"10 Total, all industries",247795
2,California,"10 Total, all industries",492218
3,California,"10 Total, all industries",1724972
4,California,"10 Total, all industries",15438555
...,...,...,...
1069,"Unknown Or Undefined, California",1024 Professional and business services,236341
1070,"Unknown Or Undefined, California",1025 Education and health services,30314
1071,"Unknown Or Undefined, California",1026 Leisure and hospitality,10276
1072,"Unknown Or Undefined, California",1027 Other services,0


In [12]:
display_county_data(hachman_data_cleaned, 'Alameda')

Data for Alameda:


Unnamed: 0,County,Industry,Annual Average Employment
18,Alameda,"10 Total, all industries",787020
19,Alameda,"10 Total, all industries",8304
20,Alameda,"10 Total, all industries",25618
21,Alameda,"10 Total, all industries",70792
22,Alameda,"10 Total, all industries",682306
24,Alameda,1011 Natural resources and mining,959
25,Alameda,1012 Construction,47512
26,Alameda,1013 Manufacturing,97583
28,Alameda,"1021 Trade, transportation, and utilities",135720
29,Alameda,1022 Information,18269


There are multiple entries of '10 Total, all industries' within the Industry column for each county (and CA as a whole)
* The first entry per county is the largest and sum of all other "10 Total, all industry" rows, indicating total employment for that county
   * The first is government + private industry; the last is only industry
* The code below retains the last number of this group and makes a new df containing these values

In [13]:
df = pd.DataFrame(hachman_data_cleaned)

# Filter for '10 Total, all industries' in the 'Industry' column
df_filtered = df[df['Industry'] == '10 Total, all industries']

# Remove commas and convert 'Annual Average Employment' to numeric
df_filtered['Total County Employment'] = df_filtered['Annual Average Employment'].str.replace(',', '').astype(float)

# Group by Area and get the last row
max_area_values = df_filtered.loc[df_filtered.groupby('County')['Total County Employment'].nlargest(2)]
max_area_values = max_area_values.drop(columns=['Industry','Annual Average Employment'])
max_area_values

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_filtered['Total County Employment'] = df_filtered['Annual Average Employment'].str.replace(',', '').astype(float)


KeyError: "None of [Index([787020.0, 682306.0,    668.0,    441.0,  12114.0,   7544.0,  78536.0,\n        63692.0,  10626.0,   7788.0,\n       ...\n        16981.0,  11889.0, 515383.0, 507797.0, 328204.0, 283653.0, 108921.0,\n        75092.0,  20314.0,  12296.0],\n      dtype='float64', length=120)] are in the [index]"

Now that we have a new df with the total employments, we can get rid of those entries within our main df

In [None]:
# create a Boolean mask for the rows to remove
mask = hachman_data_cleaned['Industry'] == '10 Total, all industries'
# select all rows except the ones that contain '10 Total, all industries' which are state and county employment totals, not needed
hachman_data_cleaned = hachman_data_cleaned[~mask]

Running our county function filter to separate CA counties and other entries

In [None]:
filtered_hachman_data, omitted_data = filter_counties(hachman_data_cleaned, 'County')

print('Counties kept:', len(filtered_hachman_data))
print('Omitted data entries:', len(omitted_data))

In [None]:
filtered_hachman_data.rename(columns={'Annual Average Employment': 'Industry Employed County'}, inplace=True)
filtered_hachman_data

Looking at the other entries, will isolate for just California data, which will be used in the Hachman calculation

In [None]:
omitted_data

In [None]:
# Get rid of the Unknown entries
mask = omitted_data['County'] == 'Unknown Or Undefined, California'
california_employ_data = omitted_data[~mask]
california_employ_data.rename(columns={'Annual Average Employment': 'Industry Employed CA', 'County': 'State'}, inplace=True)

# Now we have a df that holds all CA state employment per industry
california_employ_data

Merge the county and california employment datasets together based on industry so we have county employment and state employment per industry

In [None]:
# Merge county industry employment with state industry employment
merged_data = filtered_hachman_data.merge(california_employ_data, on='Industry')
merged_data

Merge the result to the total employment values based on county
* result gives us an additional column with total county values

In [None]:
further_merged_data = merged_data.merge(max_area_values, on='County')
further_merged_data

Add another column with the total employment value in California

In [None]:
hachman_denominator = pd.DataFrame(further_merged_data)
hachman_denominator['Total State Employment'] = 15438555.0 # from state employment private, total all industries count [index 4], excludes government

hachman_denominator

Calculate the Esi and Eri values
* Esi = (county employment in industry i / total county employment for all industries)
* Eri = (state employment in industry i / total state employment for all industries)

Then we divide Esi by Eri, and multiply by Esi to create our hachman denominator column

In [None]:
hachman_denominator['Industry Employed CA'] = hachman_denominator['Industry Employed CA'].str.replace(',', '').astype(float)
hachman_denominator['Industry Employed County'] = hachman_denominator['Industry Employed County'].str.replace(',', '').astype(float)

In [None]:
# county score per industry -- fraction of county employment in industry
hachman_denominator['county_industry_frac'] = hachman_denominator['Industry Employed County'] / hachman_denominator['Total County Employment']

# state score per industry -- fraction of state employment in industry
hachman_denominator['state_industry_frac'] = hachman_denominator['Industry Employed CA'] / hachman_denominator['Total State Employment']
hachman_denominator

In [None]:
hachman_denominator['scores'] = (hachman_denominator['county_industry_frac'] / hachman_denominator['state_industry_frac']) * hachman_denominator['county_industry_frac']
hachman_denominator

In [None]:
hachman_denominator.loc[hachman_denominator.County == 'Sierra']

Now we sum the Hachman denominator values together per county
* Group by the county and sum Hachman denominator

In [None]:
hachman_denominator_sum = hachman_denominator.groupby('County')['scores'].sum().reset_index()
hachman_denominator_sum

In [None]:
hachman_denominator_sum['hachman_index'] = 1 / hachman_denominator_sum['scores']
hachman_denominator_sum

### Final answer in hachman_calculation df, the numbers are way too small

In [None]:
hachman_calculation

### The code below is running the calculation steps together

In [None]:
merged_data = filtered_hachman_data.merge(california_employ_data, on='Industry')
denominator = pd.DataFrame(merged_data)
merged_data['EMP_COUNTY'] = merged_data['EMP_COUNTY'].str.replace(',', '').astype(float)
merged_data['EMP_CA'] = merged_data['EMP_CA'].str.replace(',', '').astype(float)

denominator['emp_county_div_emp_ca'] = (merged_data['EMP_COUNTY'] / merged_data['EMP_CA']) * merged_data['EMP_COUNTY']

pd.set_option('display.float_format', '{:,.2f}'.format)

denominator_county_sum = denominator.groupby('County')['emp_county_div_emp_ca'].sum().reset_index()
denominator_county_sum.head()

hachman_calculation = pd.DataFrame(denominator_county_sum)
hachman_calculation['hachman_index_value'] = 1 / denominator_county_sum['emp_county_div_emp_ca']
hachman_calculation
