## Cal-CRAI Metric Calculation
Domain: Society & Economy \
Indicator: Social Services

This notebook calculates six metrics, sourced from the United States Census Bureau - County Business Patterns:
* Metric 1: Number of blood and organ banks per 10,000 people
* Metric 2: Number of hospitals per 10,000 people
* Metric 3: Number of health and personal care store facilities per 10,000 people
* Metric 4: Number of heavy and civil engineering construction establishments per 10,000 people
* Metric 5: Number of specialty trade contractors per 10,000 people
* Metric 6: Number of household appliance stores and repair establishments per 10,000 people

In [1]:
import pandas as pd
import os
import sys

# suppress pandas purely educational warnings
from warnings import simplefilter
simplefilter(action="ignore", category=pd.errors.PerformanceWarning)

sys.path.append(os.path.expanduser('../../'))
from scripts.utils.file_helpers import pull_csv_from_directory, upload_csv_aws, filter_counties
from scripts.utils.write_metadata import append_metadata

In [None]:
# pull csv from aws
# NOTE(review): presumably this downloads the CSV(s) found under `aws_dir` into the
# current working directory (the next cell reads one by bare file name) — confirm
# against scripts/utils/file_helpers.py
bucket_name = 'ca-climate-index'
aws_dir = '1_pull_data/society_economy/social_services/census_bureau_soc_services/listed_soc_services/'

# search_zipped=False: presumably tells the helper not to look inside zip archives — confirm
pull_csv_from_directory(bucket_name, aws_dir, search_zipped=False)

In [None]:
# read in social service data, then delete the local copy of the csv
social_services_csv = 'multiple_social_services_2021_CB2100CBP.csv'
social_services_data = pd.read_csv(social_services_csv)
print(len(social_services_data))

# the table is held in memory from here on, so the local file is no longer needed
os.remove(social_services_csv)

In [None]:
# preview the first five rows of the table as read from the csv
social_services_data.head(5)

In [5]:
# The first data row carries the descriptive column labels (e.g. 'Geographic Area Name',
# used below); keep it as a mapping from current column name -> descriptive label
new_headers = social_services_data.iloc[0]

# Apply those labels as the headers, then discard the row they came from (index label 0)
social_services_data = social_services_data.rename(columns=new_headers).drop(index=0)

In [None]:
# Keep only the four fields needed to build the metrics
social_services_columns = social_services_data[
    [
        'Geographic Area Name',
        'Meaning of NAICS Code',
        'Meaning of Employment size of establishments code',
        'Number of establishments',
    ]
]
social_services_columns

In [None]:
# Work on an independent copy so adding a column does not touch the selection it came from
social_services_columns = social_services_columns.copy()

# Build 'county' from 'Geographic Area Name': keep the text before the first comma,
# then strip the word ' County' (case-insensitive) so only the bare county name remains
social_services_columns['county'] = (
    social_services_columns['Geographic Area Name']
    .str.split(',')
    .str[0]
    .str.replace(' County', '', case=False)
)

# 'Geographic Area Name' is no longer needed once 'county' exists
social_services_county = social_services_columns.drop(columns='Geographic Area Name')

social_services_county.head(5)

In [None]:
# Keep only the rows that report the total over every employment-size class.
# The mask is built from the frame being filtered (not from a sibling frame), so the
# selection does not depend on two DataFrames happening to share the same index.
all_sizes = social_services_county['Meaning of Employment size of establishments code'] == 'All establishments'
social_services_county = social_services_county[all_sizes].copy()
social_services_county.head()

In [None]:
# Spot check: show the rows for Colusa county
social_services_county[social_services_county['county'] == 'Colusa']

In [9]:
# Split the rows with the project helper `filter_counties`, keyed on the 'county' column.
# NOTE(review): presumably the first frame holds rows whose county is a California
# county and the second holds the rows that were left out — confirm against file_helpers
filtered_social_services, omitted_df = filter_counties(social_services_county, 'county')

In [None]:
# row counts of the two frames returned by filter_counties
print(len(filtered_social_services))
print(len(omitted_df))

In [None]:
# display the first frame returned by filter_counties
filtered_social_services

In [None]:
# The employment-size column is not used in the pivot below, so drop it
further_filtered_social_services = filtered_social_services.drop(
    'Meaning of Employment size of establishments code', axis=1
)
further_filtered_social_services

In [None]:
# Spot check: show the Colusa rows after the size column was dropped
further_filtered_social_services[further_filtered_social_services['county'] == 'Colusa']

In [None]:
# Reshape from long to wide: one row per county, one column per NAICS category,
# with the number of establishments as the cell value. `reset_index` turns
# 'county' back into an ordinary column.
adjusted_social_services = further_filtered_social_services.pivot(
    index='county',
    columns='Meaning of NAICS Code',
    values='Number of establishments',
).reset_index()

# Clear the name that pivot leaves on the columns axis, then lower-case every label
adjusted_social_services.columns.name = None
adjusted_social_services.columns = [
    str(label).lower() for label in adjusted_social_services.columns
]

adjusted_social_services.head()

### Our metrics are per 10,000 people, so we need to add our population data
* These metrics are at the county level, so we need county-level population estimates.

In [None]:
# County population estimates (2022 file) stored on S3
county_pop = "s3://ca-climate-index/0_map_data/county_est_pop_2022.csv"

# 'Unnamed: 0' is discarded right after reading (presumably a saved index column)
ca_pop_county = pd.read_csv(county_pop).drop('Unnamed: 0', axis=1)
ca_pop_county.head()

In [None]:
# Attach population to each county. The right join keeps every county present in the
# establishment table, even when it has no matching population row (population is then NaN).
merged_county_social_services = ca_pop_county.merge(
    adjusted_social_services, on='county', how='right'
)
merged_county_social_services.head()

In [None]:
# Columns that identify a row rather than hold an establishment count
id_columns = ['county', 'est_total_pop']

# The output starts with the identifier columns only
social_services_per_10000 = merged_county_social_services[id_columns].copy()

# First pass: coerce every count column to numeric in place;
# entries that cannot be parsed as numbers become NaN
for column in merged_county_social_services.columns:
    if column in id_columns:
        continue
    merged_county_social_services[column] = pd.to_numeric(
        merged_county_social_services[column], errors='coerce'
    )

# Second pass: add one "<name> per 10000 people" column for each count column
for column in merged_county_social_services.columns:
    if column in id_columns:
        continue
    new_column_name = f"{column} per 10000 people"
    if new_column_name in social_services_per_10000.columns:
        continue
    rate = merged_county_social_services[column] / merged_county_social_services['est_total_pop']
    social_services_per_10000[new_column_name] = rate * 10000

# Show the first rows of the result
social_services_per_10000.head()

In [None]:
# Tract-to-county lookup table stored on S3
tract_county_data = "s3://ca-climate-index/0_map_data/ca_tracts_county.csv"

# Read, rename the key columns to the names used in this notebook, and drop
# the two columns that are not needed for the merge
county_tract = (
    pd.read_csv(tract_county_data)
    .rename(columns={'TRACT': 'census_tract', 'County': 'county'})
    .drop(columns=['Unnamed: 0', 'COUNTYFP'])
)
county_tract

In [None]:
# Give every tract the values of its county; the left join keeps all tracts,
# leaving NaN where a county has no metric row
social_services_metric_tract = county_tract.merge(
    social_services_per_10000, on='county', how='left'
)
social_services_metric_tract

In [None]:
# Sanity check on one county: the county-level row before the tract merge ...
is_colusa_county = social_services_per_10000['county'] == 'Colusa'
colusa = social_services_per_10000[is_colusa_county]
print('single colusa metric data before merging to census:')
display(colusa)

# ... and the tract-level rows that carry the same county after the merge
is_colusa_tract = social_services_metric_tract['county'] == 'Colusa'
colusa = social_services_metric_tract[is_colusa_tract]
print('all colusa tracts after merging:')
display(colusa)

In [None]:
# List to store generated CSV file names (filled in as each file is written)
csv_file_names = []

# Columns that identify a tract rather than hold a metric
non_metric_columns = ['county', 'census_tract', 'est_total_pop']

# Write one CSV per metric column, each holding tract, county and that metric
for column in social_services_metric_tract.columns:
    if column in non_metric_columns:
        continue

    # Create new DataFrame with the identifier columns plus the current metric
    new_df = social_services_metric_tract[['census_tract', 'county']].copy()
    new_df[column] = social_services_metric_tract[column]

    # e.g. 'hospitals per 10000 people' -> 'hospitals_per_10000_people' -> 'hospitals_'
    new_column_name = column.replace(' ', '_')
    cleaned_column_name = new_column_name.replace('per_10000_people', '')

    # The trailing underscore left by the step above separates the name from 'metric.csv'
    csv_filename = 'society_' + cleaned_column_name + 'metric.csv'

    # Save the DataFrame to CSV and record the file name
    new_df.to_csv(csv_filename, index=False)
    csv_file_names.append(csv_filename)

    print(f"Saved DataFrame to: {csv_filename}")

In [None]:
# read one of the CSVs written by the export loop back in and display it as a check
blood_organ = pd.read_csv('society_blood_and_organ_banks_metric.csv')
blood_organ

## Function call

In [22]:
# S3 bucket that stores the Cal-CRAI data
bucket_name = 'ca-climate-index'

@append_metadata
def social_services_metric_calc(input_csv, export=False, varname=""):
    '''
    Prints the processing steps for one social services metric and, if requested,
    uploads its csv to AWS.

    The social services metrics are sourced from the United States Census Bureau:
    County Business Patterns. The metrics include:

    * # of blood and organ banks per 10,000 people
    * # of hospitals per 10,000 people
    * # of health and personal care store facilities per 10,000 people
    * # of heavy and civil engineering construction establishments per 10,000 people
    * # of specialty trade contractors per 10,000 people
    * # of household appliance stores and repair establishments per 10,000 people

    Parameters
    ----------
    input_csv: str
        file name of a single calculated metric csv
    export: bool, optional
        True to upload the csv to AWS, False otherwise.
    varname: str, optional
        name of the metric variable. It is not used inside this function;
        presumably it is read by the append_metadata decorator (confirm in
        scripts/utils/write_metadata.py).

    Methods
    --------
    Relevant columns for Cal-CRAI metrics were isolated from the original dataset.
    Specific entries were omitted to isolate for the CA population.
    Entries within rows were converted to columns for better metric entry/visualization.
    Cal-CRAI tracts were merged in with the data, missing data from the extra tracts 
    were given values for each metric column based on the average values from matching
    county entries.
    Metric columns were isolated to their own dataframe and uploaded to AWS.
    Missing data is kept as is (i.e., "NaN") to avoid misrepresentation by setting to zero.

    Script
    ------
    society_social_business.ipynb

    Note
    ------
    This function assumes users have configured the AWS CLI such that their access key / 
    secret key pair are stored in ~/.aws/credentials. 
    See https://docs.aws.amazon.com/cli/latest/userguide/getting-started-install.html for guidance.
    ''' 
    print('Data transformation: eliminate excess headers and columns not relevant to metric calculation.')
    print('Data transformation: rename and adjust column entires to maintain cleaning standardization.')
    print('Data transformation: flatten data so metric variables become columns.')
    print('Data transformation: merge data to 2022 population data to calculate per 10,000 people portion of metric.')
    print('Data transformation: merge data to 2021 tract data to get 9,129 tracts.')

    if export:
        # upload the csv to the Cal-CRAI bucket; the helper takes a list of file names
        bucket_name = 'ca-climate-index'
        upload_csv_aws([input_csv], bucket_name, '3_fair_data/index_data')
        # report the upload only after it has actually been requested
        print(f'{input_csv} uploaded to AWS.')
    else:
        print(f'{input_csv} not uploaded to AWS (export=False).')

    # NOTE: the local csv is not removed by this function

In [23]:
# Metric csv files to process, by file name
input_csv = [
    'society_blood_and_organ_banks_metric.csv',
    'society_health_and_personal_care_stores_metric.csv',
    'society_heavy_and_civil_engineering_construction_metric.csv',
    'society_hospitals_metric.csv',
    'society_personal_and_household_goods_repair_and_maintenance_metric.csv',
    'society_specialty_trade_contractors_metric.csv',
]

# Variable name passed along with each file; listed in the same order as `input_csv`
varnames = [
    'society_census_business_blood_organ_banks',
    'society_census_business_health_store_facilities',
    'society_census_business_engineering_construction',
    'society_census_business_hospitals',
    'society_census_business_appliance_repair_establishments',
    'society_census_business_specialty_trade',
]

# Call the metric function once per (file, variable name) pair with export enabled
for csv_name, variable_name in zip(input_csv, varnames):
    social_services_metric_calc(csv_name, export=True, varname=variable_name)