# Cal-CRAI Metric Calculation for: 
This notebook calculates one metric sourced from the Environmental Protection Agency

* Percentage of days with unhealthy or worse air quality days per county between 1980-2022

In [1]:
import pandas as pd
import os
import sys

# suppress pandas purely educational warnings
from warnings import simplefilter
simplefilter(action="ignore", category=pd.errors.PerformanceWarning)

sys.path.append(os.path.expanduser('../../'))
from scripts.utils.file_helpers import pull_csv_from_directory, upload_csv_aws, filter_counties
from scripts.utils.write_metadata import append_metadata

In [2]:
# pull csv from aws
bucket_name = 'ca-climate-index'
aws_dir = '1_pull_data/society_economy/social_services/census_bureau_soc_services/listed_soc_services/'

pull_csv_from_directory(bucket_name, aws_dir, search_zipped=False)

Saved DataFrame as 'multiple_social_services_2021_CB2100CBP.csv'
Saved DataFrame as 'multiple_social_services_2021_CB2100CBP_metadata.csv'


In [3]:
# read in social service data
social_services_data = pd.read_csv('multiple_social_services_2021_CB2100CBP.csv')
print(len(social_services_data))
social_services_data.head(5)
os.remove('multiple_social_services_2021_CB2100CBP.csv')

1067


In [4]:
# drop duplicates, original dataset had a repeat year
# filtered_air_quality = air_quality.drop_duplicates(subset=['Year', 'County'])
social_services_data.head(5)

Unnamed: 0,GEO_ID,NAME,GEO_ID_F,NAICS2017,NAICS2017_F,NAICS2017_LABEL,LFO,LFO_LABEL,EMPSZES,EMPSZES_LABEL,...,PAYANN_N_F,PAYQTR1,PAYQTR1_F,PAYQTR1_N,PAYQTR1_N_F,EMP,EMP_F,EMP_N,EMP_N_F,Unnamed: 25
0,Geographic identifier code,Geographic Area Name,Geo Footnote,2017 NAICS code,2017 NAICS Footnote,Meaning of NAICS Code,Legal form of organization code,Meaning of Legal form of organization code,Employment size of establishments code,Meaning of Employment size of establishments code,...,Flag for Noise range for annual payroll,"First-quarter payroll ($1,000)",Flag for first-quarter payroll,Noise range for first-quarter payroll,Flag for Noise range for first-quarter payroll,Number of employees,Flag for number of employees,Noise range for number of employees,Flag for Noise range for number of employees,
1,0500000US06001,"Alameda County, California",,237,,Heavy and civil engineering construction,001,All establishments,001,All establishments,...,H,165683,,H,H,6088,,H,H,
2,0500000US06001,"Alameda County, California",,237,,Heavy and civil engineering construction,001,All establishments,210,Establishments with less than 5 employees,...,N,N,N,N,N,N,N,N,N,
3,0500000US06001,"Alameda County, California",,237,,Heavy and civil engineering construction,001,All establishments,220,Establishments with 5 to 9 employees,...,N,N,N,N,N,N,N,N,N,
4,0500000US06001,"Alameda County, California",,237,,Heavy and civil engineering construction,001,All establishments,230,Establishments with 10 to 19 employees,...,N,N,N,N,N,N,N,N,N,


In [5]:
# Set the first row as new column names
new_headers = social_services_data.iloc[0]

# Replace the headers with the first row values
social_services_data = social_services_data.rename(columns=new_headers)

# Remove the first row (if needed)
social_services_data = social_services_data.drop(0)

In [6]:
social_services_columns = social_services_data[['Geographic Area Name', 'Meaning of NAICS Code', 'Meaning of Employment size of establishments code', 'Number of establishments']]
social_services_columns

Unnamed: 0,Geographic Area Name,Meaning of NAICS Code,Meaning of Employment size of establishments code,Number of establishments
1,"Alameda County, California",Heavy and civil engineering construction,All establishments,113
2,"Alameda County, California",Heavy and civil engineering construction,Establishments with less than 5 employees,35
3,"Alameda County, California",Heavy and civil engineering construction,Establishments with 5 to 9 employees,13
4,"Alameda County, California",Heavy and civil engineering construction,Establishments with 10 to 19 employees,19
5,"Alameda County, California",Heavy and civil engineering construction,Establishments with 20 to 49 employees,21
...,...,...,...,...
1062,"Yuba County, California",Specialty trade contractors,Establishments with less than 5 employees,47
1063,"Yuba County, California",Specialty trade contractors,Establishments with 5 to 9 employees,12
1064,"Yuba County, California",Specialty trade contractors,Establishments with 10 to 19 employees,8
1065,"Yuba County, California",Health and personal care stores,All establishments,8


In [9]:
# Create a copy of the DataFrame
social_services_columns = social_services_columns.copy()

# Create a new column 'county' by extracting characters after the first '-'
social_services_columns['county'] = social_services_columns['Geographic Area Name'].str.split(',').str[0]

# Remove any mention of 'county' within the legalAgencyName column
social_services_columns['county'] = social_services_columns['county'].str.replace(' County', '', case=False)

# Drop the 'Geographic Area Name' column
social_services_county = social_services_columns.drop('Geographic Area Name', axis=1)

social_services_county.head(5)

Unnamed: 0,Meaning of NAICS Code,Meaning of Employment size of establishments code,Number of establishments,county
1,Heavy and civil engineering construction,All establishments,113,Alameda
2,Heavy and civil engineering construction,Establishments with less than 5 employees,35,Alameda
3,Heavy and civil engineering construction,Establishments with 5 to 9 employees,13,Alameda
4,Heavy and civil engineering construction,Establishments with 10 to 19 employees,19,Alameda
5,Heavy and civil engineering construction,Establishments with 20 to 49 employees,21,Alameda


In [10]:
social_services_county = social_services_county[social_services_columns['Meaning of Employment size of establishments code'] == 'All establishments'].copy()
social_services_county.head(50)

Unnamed: 0,Meaning of NAICS Code,Meaning of Employment size of establishments code,Number of establishments,county
1,Heavy and civil engineering construction,All establishments,113,Alameda
8,Specialty trade contractors,All establishments,1793,Alameda
17,Health and personal care stores,All establishments,370,Alameda
23,Blood and organ banks,All establishments,7,Alameda
24,Hospitals,All establishments,42,Alameda
32,Personal and household goods repair and mainte...,All establishments,84,Alameda
36,Heavy and civil engineering construction,All establishments,9,Amador
39,Specialty trade contractors,All establishments,59,Amador
44,Health and personal care stores,All establishments,4,Amador
45,Personal and household goods repair and mainte...,All establishments,3,Amador


In [11]:
filtered_social_services, omitted_df = filter_counties(social_services_county, 'county')

In [12]:
print(len(filtered_social_services))
print(len(omitted_df))

251
0


In [22]:
filtered_social_services

Unnamed: 0,Meaning of NAICS Code,Meaning of Employment size of establishments code,Number of establishments,county
1,Heavy and civil engineering construction,All establishments,113,Alameda
8,Specialty trade contractors,All establishments,1793,Alameda
17,Health and personal care stores,All establishments,370,Alameda
23,Blood and organ banks,All establishments,7,Alameda
24,Hospitals,All establishments,42,Alameda
...,...,...,...,...
1056,Hospitals,All establishments,3,Yolo
1057,Personal and household goods repair and mainte...,All establishments,8,Yolo
1059,Heavy and civil engineering construction,All establishments,10,Yuba
1061,Specialty trade contractors,All establishments,68,Yuba


## Our metrics are per 10,000 people, so we need to add our population data
* these metrics are at the county level so we need need county level population estimates

In [23]:
tracts_county = "s3://ca-climate-index/0_map_data/ca_tracts_county.csv"
ca_tracts_county = pd.read_csv(tracts_county)
#ca_tracts_county

In [24]:
est_pop = "s3://ca-climate-index/0_map_data/cri_acs_demographic_estimated_population.csv"
ca_est_pop = pd.read_csv(est_pop)
ca_est_pop = ca_est_pop.rename(columns={'Census_Tract': 'TRACT'})

#ca_est_pop

In [25]:
merged_df = pd.merge(ca_tracts_county, ca_est_pop[['TRACT', 'est_total_pop']], on='TRACT', how='left')
county_est_pop_2021 = merged_df.groupby('County')['est_total_pop'].sum().reset_index()
county_est_pop_2021 = county_est_pop_2021.rename(columns={'County':'county'})

county_est_pop_2021.head()

Unnamed: 0,county,est_total_pop
0,Alameda,1663823
1,Alpine,1515
2,Amador,40577
3,Butte,213605
4,Calaveras,45674


In [29]:
county_est_pop_2021['est_total_pop'] = county_est_pop_2021['est_total_pop'].astype(float)

In [32]:
from pandas.api.types import is_float_dtype

In [33]:
is_float_dtype(county_est_pop_2021['est_total_pop'])

True

In [27]:
county_est_pop_2021.to_csv('county_est_pop_2021.csv')

## Uploading our new estimated population per county csv data

In [28]:
county_est_pop_2021.to_csv('county_est_pop_2021.csv')
bucket_name = 'ca-climate-index'
directory = '0_map_data'
upload_csv_aws('county_est_pop_2021.csv', bucket_name, directory)

FileNotFoundError: [Errno 2] No such file or directory: 'c'

## Function call

In [42]:
import numpy as np
import pandas as pd

def make_social_service_metrics(df, export=False): 
    # Read in social service data
    df = pd.read_csv('multiple_social_services_2021_CB2100CBP.csv', header=1)
    
    # # Set the first row as new column names
    # new_headers = df.iloc[0]

    # # Replace the headers with the first row values
    # df = df.rename(columns=new_headers)

    # # Remove the first row (if needed)
    # df = df.drop(0)
        
    social_services_columns = df[['Geographic Area Name', 'Meaning of NAICS Code', 'Meaning of Employment size of establishments code', 'Number of establishments']]
    
    # Create a copy of the DataFrame
    social_services_columns = social_services_columns.copy()

    # Create a new column 'county' by extracting characters after the first '-'
    social_services_columns['county'] = social_services_columns['Geographic Area Name'].str.split(',').str[0]

    # Remove any mention of 'county' within the legalAgencyName column
    social_services_columns['county'] = social_services_columns['county'].str.replace(' County', '', case=False)

    # Drop the 'Geographic Area Name' column
    social_services_county = social_services_columns.drop('Geographic Area Name', axis=1)
    
    social_services_county = social_services_county[social_services_columns['Meaning of Employment size of establishments code'] == 'All establishments'].copy()

    filtered_social_services, omitted_df = filter_counties(social_services_county, 'county')
      
    # Get unique values in the column
    unique_values = filtered_social_services['Meaning of NAICS Code'].unique()

    selected_columns = ['county', 'Meaning of NAICS Code', 'Number of establishments']

    # Create a dictionary to store DataFrames
    dfs = {}

    # Iterate over unique values and create a new DataFrame for each
    for value in unique_values:
        # Filter rows where the value matches and where 'Meaning of Employment size of establishments code' is 'All establishments'
        df_filtered = social_services_columns[(social_services_columns['Meaning of NAICS Code'] == value) & 
                                            (social_services_columns['Meaning of Employment size of establishments code'] == 'All establishments')][selected_columns].copy()
        
        # Add to the dictionary if any rows are filtered
        if not df_filtered.empty:
            dfs[value] = df_filtered

    # Merge with population data
    tracts_county = "s3://ca-climate-index/0_map_data/county_est_pop_2021.csv"
    ca_tracts_county = pd.read_csv(tracts_county)
    for key, df in dfs.items():
        # Merge the dataframe with population data
        df = pd.merge(df, ca_tracts_county[['county', 'est_total_pop']], on='county', how='left')
        # Update the original dictionary value with the modified dataframe
        dfs[key] = df
    
    # Process each dataframe
    for key, df in dfs.items():
        # Convert 'est_total_pop' column to numeric
        df['est_total_pop'] = df['est_total_pop'].astype(float)
        # df['est_total_pop'] = pd.to_numeric(df['est_total_pop'], errors='coerce')
        # Filter out non-numeric and zero values
        df = df[(pd.notnull(df['est_total_pop'])) & (df['est_total_pop'] != 0)]
        # Check if 'est_total_pop' column contains any non-numeric values
        # if df['est_total_pop'].dtype != np.number:
        if is_float_dtype(county_est_pop_2021['est_total_pop']) == False:
            print(f"Warning: Non-numeric values found in 'est_total_pop' column for {key}. Skipping calculations.")
            continue
        # Create a new column for each dataframe
        new_column_name = 'establishments_per_10000_people'
        df[new_column_name] = (df['Number of establishments'] / df['est_total_pop']) * 10000
        # Update the original dictionary value with the modified dataframe
        dfs[key] = df

    # Export data if export is True
    if export:
        for key, df in dfs.items():
            filename = f'society_social_services_{key.replace(" ", "_")}_metric.csv'
            print("Filename:", filename)  # Print the filename for debugging purposes
            df.to_csv(filename)
            
            # Define AWS upload parameters
            bucket_name = 'ca-climate-index'
            directory = '3_fair_data/index_data'
            
            # Upload to AWS
            upload_csv_aws(filename, bucket_name, directory)
            
            # Remove local file
            #if os.path.exists(filename):
            #    os.remove(filename)

# pull csv from aws
bucket_name = 'ca-climate-index'
aws_dir = '1_pull_data/society_economy/social_services/census_bureau_soc_services/listed_soc_services/'
pull_csv_from_directory(bucket_name, aws_dir, search_zipped=False)

# read in social service data
social_services_data = pd.read_csv('multiple_social_services_2021_CB2100CBP.csv')
make_social_service_metrics(social_services_data, export=True)

Saved DataFrame as 'multiple_social_services_2021_CB2100CBP.csv'
Saved DataFrame as 'multiple_social_services_2021_CB2100CBP_metadata.csv'
Filename: society_social_services_Heavy_and_civil_engineering_construction_metric.csv


FileNotFoundError: [Errno 2] No such file or directory: 's'

In [62]:
# pull csv from aws
bucket_name = 'ca-climate-index'
aws_dir = '1_pull_data/society_economy/social_services/census_bureau_soc_services/listed_soc_services/'

pull_csv_from_directory(bucket_name, aws_dir, search_zipped=False)

# read in social service data
social_services_data = pd.read_csv('multiple_social_services_2021_CB2100CBP.csv')
make_social_service_metrics(social_services_data, export=True)

Saved DataFrame as 'multiple_social_services_2021_CB2100CBP.csv'
Saved DataFrame as 'multiple_social_services_2021_CB2100CBP_metadata.csv'


  if df['est_total_pop'].dtype != np.number:
  if df['est_total_pop'].dtype != np.number:
  if df['est_total_pop'].dtype != np.number:
  if df['est_total_pop'].dtype != np.number:
  if df['est_total_pop'].dtype != np.number:
  if df['est_total_pop'].dtype != np.number:


Filename: society_social_services_Heavy_and_civil_engineering_construction_metric.csv
society_social_services_heavy_and_civil_engineering_construction_metric.csv uploaded to AWS
Filename: society_social_services_Specialty_trade_contractors_metric.csv
society_social_services_specialty_trade_contractors_metric.csv uploaded to AWS
Filename: society_social_services_Health_and_personal_care_stores_metric.csv
society_social_services_health_and_personal_care_stores_metric.csv uploaded to AWS
Filename: society_social_services_Blood_and_organ_banks_metric.csv
society_social_services_blood_and_organ_banks_metric.csv uploaded to AWS
Filename: society_social_services_Hospitals_metric.csv
society_social_services_hospitals_metric.csv uploaded to AWS
Filename: society_social_services_Personal_and_household_goods_repair_and_maintenance_metric.csv
society_social_services_personal_and_household_goods_repair_and_maintenance_metric.csv uploaded to AWS
