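# Cal-CRAI Metric Calculation for: Society & Economy / Social Services
This notebook calculates six metrics sourced from the United States Census Bureau (2021 County Business Patterns)

* Number of blood and organ banks per 10,000 people
* Number of health and personal care stores per 10,000 people
* Number of heavy and civil engineering construction establishments per 10,000 people
* Number of hospitals per 10,000 people
* Number of personal and household goods repair and maintenance establishments per 10,000 people
* Number of specialty trade contractors per 10,000 people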

In [2]:
import pandas as pd
import os
import sys

# suppress pandas purely educational warnings
from warnings import simplefilter
simplefilter(action="ignore", category=pd.errors.PerformanceWarning)

sys.path.append(os.path.expanduser('../../'))
from scripts.utils.file_helpers import pull_csv_from_directory, upload_csv_aws, filter_counties
from scripts.utils.write_metadata import append_metadata

In [3]:
# Location of the raw Census Bureau social-services extract on AWS S3
bucket_name = 'ca-climate-index'
aws_dir = '1_pull_data/society_economy/social_services/census_bureau_soc_services/listed_soc_services/'

# Fetch the csv file(s) under aws_dir; the saved file names are printed below
pull_csv_from_directory(
    bucket_name,
    aws_dir,
    search_zipped=False,
)

Saved DataFrame as 'multiple_social_services_2021_CB2100CBP.csv'
Saved DataFrame as 'multiple_social_services_2021_CB2100CBP_metadata.csv'


In [4]:
# Read the raw social-services csv that the previous cell saved locally
social_services_data = pd.read_csv('multiple_social_services_2021_CB2100CBP.csv')
print(len(social_services_data))

# The data is now in memory, so the local copy of the download can be deleted.
# (The preview of the first rows is shown by the next cell.)
os.remove('multiple_social_services_2021_CB2100CBP.csv')

1067


In [5]:
# Preview the first five rows; row 0 still holds the descriptive column labels
social_services_data.head()

Unnamed: 0,GEO_ID,NAME,GEO_ID_F,NAICS2017,NAICS2017_F,NAICS2017_LABEL,LFO,LFO_LABEL,EMPSZES,EMPSZES_LABEL,...,PAYANN_N_F,PAYQTR1,PAYQTR1_F,PAYQTR1_N,PAYQTR1_N_F,EMP,EMP_F,EMP_N,EMP_N_F,Unnamed: 25
0,Geographic identifier code,Geographic Area Name,Geo Footnote,2017 NAICS code,2017 NAICS Footnote,Meaning of NAICS Code,Legal form of organization code,Meaning of Legal form of organization code,Employment size of establishments code,Meaning of Employment size of establishments code,...,Flag for Noise range for annual payroll,"First-quarter payroll ($1,000)",Flag for first-quarter payroll,Noise range for first-quarter payroll,Flag for Noise range for first-quarter payroll,Number of employees,Flag for number of employees,Noise range for number of employees,Flag for Noise range for number of employees,
1,0500000US06001,"Alameda County, California",,237,,Heavy and civil engineering construction,001,All establishments,001,All establishments,...,H,165683,,H,H,6088,,H,H,
2,0500000US06001,"Alameda County, California",,237,,Heavy and civil engineering construction,001,All establishments,210,Establishments with less than 5 employees,...,N,N,N,N,N,N,N,N,N,
3,0500000US06001,"Alameda County, California",,237,,Heavy and civil engineering construction,001,All establishments,220,Establishments with 5 to 9 employees,...,N,N,N,N,N,N,N,N,N,
4,0500000US06001,"Alameda County, California",,237,,Heavy and civil engineering construction,001,All establishments,230,Establishments with 10 to 19 employees,...,N,N,N,N,N,N,N,N,N,


In [6]:
# Row 0 of the raw file carries the descriptive label for every coded column
new_headers = social_services_data.iloc[0]

# Map each coded column name to its label, then discard the label row itself
social_services_data = social_services_data.rename(columns=new_headers).drop(index=0)

In [7]:
# Only these four labelled columns are used in the rest of the notebook
metric_input_columns = [
    'Geographic Area Name',
    'Meaning of NAICS Code',
    'Meaning of Employment size of establishments code',
    'Number of establishments',
]
social_services_columns = social_services_data[metric_input_columns]
social_services_columns

Unnamed: 0,Geographic Area Name,Meaning of NAICS Code,Meaning of Employment size of establishments code,Number of establishments
1,"Alameda County, California",Heavy and civil engineering construction,All establishments,113
2,"Alameda County, California",Heavy and civil engineering construction,Establishments with less than 5 employees,35
3,"Alameda County, California",Heavy and civil engineering construction,Establishments with 5 to 9 employees,13
4,"Alameda County, California",Heavy and civil engineering construction,Establishments with 10 to 19 employees,19
5,"Alameda County, California",Heavy and civil engineering construction,Establishments with 20 to 49 employees,21
...,...,...,...,...
1062,"Yuba County, California",Specialty trade contractors,Establishments with less than 5 employees,47
1063,"Yuba County, California",Specialty trade contractors,Establishments with 5 to 9 employees,12
1064,"Yuba County, California",Specialty trade contractors,Establishments with 10 to 19 employees,8
1065,"Yuba County, California",Health and personal care stores,All establishments,8


In [8]:
# Work on an independent copy so the new column is not written into a slice
social_services_columns = social_services_columns.copy()

# County label = the text before the first comma of 'Geographic Area Name'
# (e.g. "Alameda County, California" -> "Alameda County"), with the
# ' County' suffix then stripped case-insensitively
county_label = social_services_columns['Geographic Area Name'].str.split(',').str[0]
social_services_columns['county'] = county_label.str.replace(' County', '', case=False)

# The full geographic name is redundant once 'county' exists
social_services_county = social_services_columns.drop(columns='Geographic Area Name')

social_services_county.head()

Unnamed: 0,Meaning of NAICS Code,Meaning of Employment size of establishments code,Number of establishments,county
1,Heavy and civil engineering construction,All establishments,113,Alameda
2,Heavy and civil engineering construction,Establishments with less than 5 employees,35,Alameda
3,Heavy and civil engineering construction,Establishments with 5 to 9 employees,13,Alameda
4,Heavy and civil engineering construction,Establishments with 10 to 19 employees,19,Alameda
5,Heavy and civil engineering construction,Establishments with 20 to 49 employees,21,Alameda


In [9]:
# Keep only the 'All establishments' totals; the other rows split the same
# count by employment-size class. The mask is built from the frame being
# filtered (not from social_services_columns), so the selection does not rely
# on index alignment between two frames and the cell can be re-run safely.
all_establishments = (
    social_services_county['Meaning of Employment size of establishments code'] == 'All establishments'
)
social_services_county = social_services_county[all_establishments].copy()
social_services_county.head()

Unnamed: 0,Meaning of NAICS Code,Meaning of Employment size of establishments code,Number of establishments,county
1,Heavy and civil engineering construction,All establishments,113,Alameda
8,Specialty trade contractors,All establishments,1793,Alameda
17,Health and personal care stores,All establishments,370,Alameda
23,Blood and organ banks,All establishments,7,Alameda
24,Hospitals,All establishments,42,Alameda


In [10]:
# filter_counties is a project helper that returns two frames. Presumably the
# first keeps rows whose 'county' value is a California county and the second
# holds the rows that were dropped — TODO confirm in scripts/utils/file_helpers.py
filtered_social_services, omitted_df = filter_counties(social_services_county, 'county')

In [11]:
# Row counts of the kept and the omitted frame, one per line
print(len(filtered_social_services), len(omitted_df), sep='\n')

251
0


In [12]:
# Show the county-filtered 'All establishments' rows (pandas truncates the display)
filtered_social_services

Unnamed: 0,Meaning of NAICS Code,Meaning of Employment size of establishments code,Number of establishments,county
1,Heavy and civil engineering construction,All establishments,113,Alameda
8,Specialty trade contractors,All establishments,1793,Alameda
17,Health and personal care stores,All establishments,370,Alameda
23,Blood and organ banks,All establishments,7,Alameda
24,Hospitals,All establishments,42,Alameda
...,...,...,...,...
1056,Hospitals,All establishments,3,Yolo
1057,Personal and household goods repair and mainte...,All establishments,8,Yolo
1059,Heavy and civil engineering construction,All establishments,10,Yuba
1061,Specialty trade contractors,All establishments,68,Yuba


In [13]:
# Every remaining row is an 'All establishments' total, so the size-class
# label no longer carries information
further_filtered_social_services = filtered_social_services.drop(
    'Meaning of Employment size of establishments code', axis=1
)
further_filtered_social_services

Unnamed: 0,Meaning of NAICS Code,Number of establishments,county
1,Heavy and civil engineering construction,113,Alameda
8,Specialty trade contractors,1793,Alameda
17,Health and personal care stores,370,Alameda
23,Blood and organ banks,7,Alameda
24,Hospitals,42,Alameda
...,...,...,...
1056,Hospitals,3,Yolo
1057,Personal and household goods repair and mainte...,8,Yolo
1059,Heavy and civil engineering construction,10,Yuba
1061,Specialty trade contractors,68,Yuba


In [14]:
# Reshape from long to wide: one row per county, one column per NAICS category,
# with 'county' restored as an ordinary column
adjusted_social_services = (
    further_filtered_social_services
    .pivot(index='county', columns='Meaning of NAICS Code', values='Number of establishments')
    .reset_index()
)

# Clear the columns-axis label left behind by the pivot, then lowercase every column name
adjusted_social_services.columns.name = None
adjusted_social_services.columns = list(
    map(lambda label: str(label).lower(), adjusted_social_services.columns)
)

adjusted_social_services.head()

Unnamed: 0,county,blood and organ banks,health and personal care stores,heavy and civil engineering construction,hospitals,personal and household goods repair and maintenance,specialty trade contractors
0,Alameda,7.0,370,113,42.0,84.0,1793
1,Amador,,4,9,,3.0,59
2,Butte,,52,22,5.0,4.0,353
3,Calaveras,,7,9,,5.0,113
4,Colusa,,3,4,,,22


## Our metrics are per 10,000 people, so we need to add our population data
* these metrics are at the county level, so we need county-level population estimates

In [15]:
# County population estimates (2022 file) stored with the index map data on S3
county_pop = "s3://ca-climate-index/0_map_data/county_est_pop_2022.csv"

# The csv has a leftover 'Unnamed: 0' column; discard it straight after loading
ca_pop_county = pd.read_csv(county_pop).drop('Unnamed: 0', axis=1)
ca_pop_county.head()

Unnamed: 0,county,est_total_pop
0,Alameda,1663823
1,Alpine,1515
2,Amador,40577
3,Butte,213605
4,Calaveras,45674


In [16]:
# Right join: keep every county present in the establishment counts and attach
# its population estimate
merged_county_social_services = ca_pop_county.merge(
    adjusted_social_services, how='right', on='county'
)
merged_county_social_services.head()

Unnamed: 0,county,est_total_pop,blood and organ banks,health and personal care stores,heavy and civil engineering construction,hospitals,personal and household goods repair and maintenance,specialty trade contractors
0,Alameda,1663823,7.0,370,113,42.0,84.0,1793
1,Amador,40577,,4,9,,3.0,59
2,Butte,213605,,52,22,5.0,4.0,353
3,Calaveras,45674,,7,9,,5.0,113
4,Colusa,21811,,3,4,,,22


In [17]:

# Start the result frame from the two identifier columns
social_services_per_10000 = merged_county_social_services.loc[:, ['county', 'est_total_pop']].copy()

# Everything that is not an identifier column is an establishment count
id_columns = ('county', 'est_total_pop')
count_columns = [name for name in merged_county_social_services.columns if name not in id_columns]

# Counts may arrive as text; coerce them to numbers in place (unparseable values become NaN)
for name in count_columns:
    merged_county_social_services[name] = pd.to_numeric(
        merged_county_social_services[name], errors='coerce'
    )

# Rate per 10,000 residents = count / county population * 10000
population = merged_county_social_services['est_total_pop']
for name in count_columns:
    rate_name = f"{name} per 10000 people"
    if rate_name in social_services_per_10000.columns:
        # a column with this name already exists; leave it untouched
        continue
    social_services_per_10000[rate_name] = (merged_county_social_services[name] / population) * 10000

# Display the resulting DataFrame
social_services_per_10000.head()

Unnamed: 0,county,est_total_pop,blood and organ banks per 10000 people,health and personal care stores per 10000 people,heavy and civil engineering construction per 10000 people,hospitals per 10000 people,personal and household goods repair and maintenance per 10000 people,specialty trade contractors per 10000 people
0,Alameda,1663823,0.042072,2.223794,0.679159,0.252431,0.504861,10.776387
1,Amador,40577,,0.98578,2.218005,,0.739335,14.540257
2,Butte,213605,,2.4344,1.029938,0.234077,0.187262,16.52583
3,Calaveras,45674,,1.532601,1.970486,,1.094715,24.740553
4,Colusa,21811,,1.375453,1.833937,,,10.086654


In [18]:
# Tract-to-county lookup stored with the index map data on S3
tract_county_data = "s3://ca-climate-index/0_map_data/ca_tracts_county.csv"

# Standardise the key column names and drop the columns that are not needed
county_tract = (
    pd.read_csv(tract_county_data)
    .rename(columns={'TRACT': 'census_tract', 'County': 'county'})
    .drop(columns=['Unnamed: 0', 'COUNTYFP'])
)
county_tract

Unnamed: 0,census_tract,county
0,6085504321,Santa Clara
1,6085504410,Santa Clara
2,6085507003,Santa Clara
3,6085507004,Santa Clara
4,6085502204,Santa Clara
...,...,...
9124,6059001303,Orange
9125,6059001304,Orange
9126,6059001401,Orange
9127,6013367200,Contra Costa


In [19]:
# Left join: every tract is kept and receives the values of its county
social_services_metric_tract = county_tract.merge(
    social_services_per_10000, how='left', on='county'
)
social_services_metric_tract

Unnamed: 0,census_tract,county,est_total_pop,blood and organ banks per 10000 people,health and personal care stores per 10000 people,heavy and civil engineering construction per 10000 people,hospitals per 10000 people,personal and household goods repair and maintenance per 10000 people,specialty trade contractors per 10000 people
0,6085504321,Santa Clara,1916831.0,,2.201550,0.506044,0.083471,0.511261,11.837246
1,6085504410,Santa Clara,1916831.0,,2.201550,0.506044,0.083471,0.511261,11.837246
2,6085507003,Santa Clara,1916831.0,,2.201550,0.506044,0.083471,0.511261,11.837246
3,6085507004,Santa Clara,1916831.0,,2.201550,0.506044,0.083471,0.511261,11.837246
4,6085502204,Santa Clara,1916831.0,,2.201550,0.506044,0.083471,0.511261,11.837246
...,...,...,...,...,...,...,...,...,...
9124,6059001303,Orange,3175227.0,0.050390,3.561950,0.894424,0.119676,0.834586,14.890274
9125,6059001304,Orange,3175227.0,0.050390,3.561950,0.894424,0.119676,0.834586,14.890274
9126,6059001401,Orange,3175227.0,0.050390,3.561950,0.894424,0.119676,0.834586,14.890274
9127,6013367200,Contra Costa,1162648.0,,1.840626,0.903111,0.197824,0.498861,13.125211


In [20]:
# Spot check on one county: the single county-level row ...
county_rows = social_services_per_10000['county'] == 'Colusa'
colusa = social_services_per_10000[county_rows]
print('single colusa metric data before merging to census:')
display(colusa)

# ... should reappear unchanged on every Colusa tract after the merge
tract_rows = social_services_metric_tract['county'] == 'Colusa'
colusa = social_services_metric_tract[tract_rows]
print('all colusa tracts after merging:')
display(colusa)

single colusa metric data before merging to census:


Unnamed: 0,county,est_total_pop,blood and organ banks per 10000 people,health and personal care stores per 10000 people,heavy and civil engineering construction per 10000 people,hospitals per 10000 people,personal and household goods repair and maintenance per 10000 people,specialty trade contractors per 10000 people
4,Colusa,21811,,1.375453,1.833937,,,10.086654


all colusa tracts after merging:


Unnamed: 0,census_tract,county,est_total_pop,blood and organ banks per 10000 people,health and personal care stores per 10000 people,heavy and civil engineering construction per 10000 people,hospitals per 10000 people,personal and household goods repair and maintenance per 10000 people,specialty trade contractors per 10000 people
4304,6011000200,Colusa,21811.0,,1.375453,1.833937,,,10.086654
4305,6011000500,Colusa,21811.0,,1.375453,1.833937,,,10.086654
4306,6011000100,Colusa,21811.0,,1.375453,1.833937,,,10.086654
4307,6011000400,Colusa,21811.0,,1.375453,1.833937,,,10.086654
7109,6011000302,Colusa,21811.0,,1.375453,1.833937,,,10.086654
7110,6011000301,Colusa,21811.0,,1.375453,1.833937,,,10.086654


In [21]:
# Names of the csv files written by this cell, in column order
csv_file_names = []

# Identifier columns are shared by every output file and are not metrics themselves
non_metric_columns = ['county', 'census_tract', 'est_total_pop']

for column in social_services_metric_tract.columns:
    if column in non_metric_columns:
        continue

    # One output frame per metric: tract id, county, and that metric's values
    new_df = social_services_metric_tract[['census_tract', 'county']].copy()
    new_df[column] = social_services_metric_tract[column]

    # File name: spaces become underscores and 'per_10000_people' is removed,
    # which leaves a trailing underscore that joins onto 'metric.csv'
    new_column_name = column.replace(' ', '_')
    cleaned_column_name = new_column_name.replace('per_10000_people', '')
    csv_filename = 'society_' + cleaned_column_name + 'metric.csv'

    # Save the DataFrame to CSV and record the file name (the list was
    # previously declared but never filled)
    new_df.to_csv(csv_filename, index=False)
    csv_file_names.append(csv_filename)

    print(f"Saved DataFrame to: {csv_filename}")

Saved DataFrame to: society_blood_and_organ_banks_metric.csv
Saved DataFrame to: society_health_and_personal_care_stores_metric.csv
Saved DataFrame to: society_heavy_and_civil_engineering_construction_metric.csv
Saved DataFrame to: society_hospitals_metric.csv
Saved DataFrame to: society_personal_and_household_goods_repair_and_maintenance_metric.csv
Saved DataFrame to: society_specialty_trade_contractors_metric.csv


In [25]:
# Read one of the exported metric files back in to check what was written to disk
blood_organ = pd.read_csv('society_blood_and_organ_banks_metric.csv')
blood_organ

Unnamed: 0,census_tract,county,blood and organ banks per 10000 people
0,6085504321,Santa Clara,
1,6085504410,Santa Clara,
2,6085507003,Santa Clara,
3,6085507004,Santa Clara,
4,6085502204,Santa Clara,
...,...,...,...
9124,6059001303,Orange,0.050390
9125,6059001304,Orange,0.050390
9126,6059001401,Orange,0.050390
9127,6013367200,Contra Costa,


## Function call

In [22]:
# Bucket that receives the finished metric csv files
bucket_name = 'ca-climate-index'

# NOTE(review): `varname`, the docstring and the printed 'Data transformation' lines are
# presumably consumed by the append_metadata decorator — confirm in scripts/utils/write_metadata.py
@append_metadata
def social_services_metric_calc(input_csv, export=False, varname=""):
    '''
    The function calculates the social services metrics sourced from the United States Census Bureau:
    County Business Patterns. The metrics include:
    
    * # of blood and organ banks per 10,000 people
    * # of hospitals per 10,000 people
    * # of health and personal care store facilities per 10,000 people
    * # of heavy and civil engineering construction establishments per 10,000 people
    * # of specialty trade contractors per 10,000 people
    * # of household appliance stores and repair establishments per 10,000 people

    Parameters
    ----------
    input_csv: string
        Name of one calculated metric csv file
    export: bool, optional
        True to upload the csv to AWS, False otherwise.
    varname: string, optional
        Final metric name. Not referenced inside the function body.

    Methods
    --------
    Relevant columns for Cal-CRAI metrics were isolated from the original dataset.
    Rows were limited to the 'All establishments' totals.
    Specific entries were omitted to isolate for the CA population.
    Entries within rows were converted to columns for better metric entry/visualization.
    Establishment counts were divided by county population estimates and multiplied by 10,000.
    Cal-CRAI tracts were merged in with the data, so each tract carries the values of its county.
    Metric columns were isolated to their own dataframe and uploaded to AWS.

    Script
    ------
    society_social_business.ipynb

    Note
    ------
    This function assumes users have configured the AWS CLI such that their access key / 
    secret key pair are stored in ~/.aws/credentials. 
    See https://docs.aws.amazon.com/cli/latest/userguide/getting-started-install.html for guidance.
    '''
    print('Data transformation: eliminate excess headers and columns not relevant to metric calculation')
    print('Data transformation: rename and adjust column entires to maintain cleaning standardization')
    print('Data transformation: flatten data so metric variables become columns')
    print('Data transformation: merge data to 2022 population data to calculate per 10,000 people portion of metric')
    print('Data transformation: merge data to 2021 tract data to get 9,129 tracts ')
    if export:
        # Destination bucket, set locally so the function does not depend on cell order
        bucket_name = 'ca-climate-index'
        # upload_csv_aws is given a list of file names, so the single name is wrapped
        upload_csv_aws([input_csv], bucket_name, '3_fair_data/index_data')
        # The local csv is not removed here; it stays on disk after the upload

In [23]:
# Metric csv files written earlier in this notebook; paired by position with `varnames`
input_csv = [
'society_blood_and_organ_banks_metric.csv',
'society_health_and_personal_care_stores_metric.csv',
'society_heavy_and_civil_engineering_construction_metric.csv',
'society_hospitals_metric.csv',
'society_personal_and_household_goods_repair_and_maintenance_metric.csv',
'society_specialty_trade_contractors_metric.csv'
]

# Metric names passed as `varname`, in the same order as `input_csv`
varnames = [
    'society_census_business_blood_organ_banks',
    'society_census_business_health_store_facilities',
    'society_census_business_engineering_construction',
    'society_census_business_hospitals',
    'society_census_business_appliance_repair_establishments',
    'society_census_business_specialty_trade'
]

# zip() silently stops at the shorter list, so make sure no file is left without a name
assert len(input_csv) == len(varnames), 'input_csv and varnames must have the same number of entries'

# Process the data and export; the loop variable avoids shadowing the stdlib `csv` module name
for csv_file, var in zip(input_csv, varnames):
    social_services_metric_calc(csv_file, export=True, varname=var)