## Cal-CRAI Metric Calculation
Domain: Society & Economy \
Indicator: Economic health

This notebook calculates two metrics, sourced from the United States Census Bureau - American Community Survey:
* Metric 1: Gini Index
* Metric 2: Median Income

In [2]:
import pandas as pd
import os
import sys
import numpy as np

# suppress pandas PerformanceWarning messages, which are purely informational
from warnings import simplefilter
simplefilter(action="ignore", category=pd.errors.PerformanceWarning)

sys.path.append(os.path.expanduser('../../'))
from scripts.utils.file_helpers import pull_csv_from_directory, upload_csv_aws, filter_counties
from scripts.utils.write_metadata import append_metadata

In [2]:
# Define a function to display data for a specific census tract (matched on GEO_ID)
def display_tract_data(df, tract_num):
    """Print and display the rows of *df* whose 'GEO_ID' equals *tract_num*.

    Parameters
    ----------
    df : pandas.DataFrame
        Table containing a 'GEO_ID' column.
    tract_num :
        Value compared against 'GEO_ID' with ``==``.

    Prints a "No data found" message when no row matches; otherwise prints a
    header and renders the matching rows with the notebook's ``display``.
    """
    matching_rows = df.loc[df['GEO_ID'] == tract_num]

    # Happy path first: show the matches and stop.
    if not matching_rows.empty:
        print(f"Data for {tract_num}:")
        display(matching_rows)
        return

    print(f"No data found for {tract_num}")

In [None]:
# Download the input CSVs from the project AWS bucket, one directory at a time.
bucket_name = 'ca-climate-index'

# NOTE(review): the first directory is a Hachman-index subset; it does not look
# related to the Gini / median income files read below -- confirm it is needed.
for aws_dir in (
    '2a_subset/society_economy/economic_health/bureau_labor_statistics/employment_hachman_index/',
    '1_pull_data/society_economy/economic_health/',
):
    pull_csv_from_directory(bucket_name, aws_dir, search_zipped=False)

In [None]:
# Load the two ACS tables from the working directory: B19083 (Gini index)
# and S1901 (income).
# NOTE(review): presumably these files were placed here by the AWS pull in the
# previous cell -- confirm.
gini_index_data = pd.read_csv('ACSDT5Y2022.B19083-Data.csv')
median_income_data = pd.read_csv('ACSST5Y2022.S1901-Data.csv')

In [None]:
# California tract lookup table: read it from S3, rename the tract column to
# 'GEO_ID' so it can serve as the merge key, and drop the columns that are not
# carried into the metric outputs.
county_tract = "s3://ca-climate-index/0_map_data/ca_tract_county_population_2021.csv"
ca_county_tract = (
    pd.read_csv(county_tract)
    .rename(columns={'Census Tract': 'GEO_ID'})
    .drop(columns=['Unnamed: 0', 'COUNTYFP', 'County', 'Total Population 2021'])
)
ca_county_tract

### Gini Index Metric

In [None]:
# Preview the raw Gini index table exactly as read from the CSV
gini_index_data

In [None]:
# Drop the columns that are not needed for the metric and give the estimate
# column a descriptive name.
gini_index_data_cleaned = gini_index_data.drop(columns=['Unnamed: 4', 'B19083_001M', 'NAME'])
gini_index_data_cleaned = gini_index_data_cleaned.rename(columns={'B19083_001E': 'est_gini_index'})

# Skip the first row (a non-data row) and take an explicit copy, so the GEO_ID
# assignment below writes to an independent DataFrame instead of a slice of the
# original (assigning into a slice triggers pandas' SettingWithCopyWarning).
gini_index_data_cleaned = gini_index_data_cleaned.iloc[1:].copy()

# Keep only the characters from position 10 onward so GEO_ID can be matched
# against the tract identifiers in the CA tract table.
gini_index_data_cleaned['GEO_ID'] = gini_index_data_cleaned['GEO_ID'].str[10:]

gini_index_data_cleaned

In [None]:
# GEO_ID in the cleaned ACS table is a string, so cast the tract identifiers
# to str as well; otherwise the join keys would not compare equal.
ca_county_tract['GEO_ID'] = ca_county_tract['GEO_ID'].astype(str)

# Right join: every row of the CA tract table is kept, with the Gini estimate
# attached where a matching GEO_ID exists.
gini_merged_data = gini_index_data_cleaned.merge(ca_county_tract, how='right', on='GEO_ID')
gini_merged_data

In [9]:
# Write the merged Gini table to a local CSV; this filename is later passed to
# economy_metric.
# NOTE(review): to_csv is called with its defaults, so the DataFrame index is
# also written as an unnamed first column -- confirm that is intended.
gini_merged_data.to_csv('society_economy_gini_metric.csv')

### Median Income Metric

In [None]:
columns_to_keep = ['GEO_ID', 'S1901_C01_012E']

# Select only the columns listed above (original column order is preserved),
# rename the estimate column, and skip the first row (a non-data row).
# The explicit copy makes the GEO_ID assignment below write to an independent
# DataFrame rather than a slice (avoids pandas' SettingWithCopyWarning).
keep_mask = median_income_data.columns.isin(columns_to_keep)
median_income_data_cleaned = (
    median_income_data.loc[:, keep_mask]
    .rename(columns={'S1901_C01_012E': 'est_median_income_dollars'})
    .iloc[1:]
    .copy()
)

# Keep only the characters from position 10 onward so GEO_ID can be matched
# against the tract identifiers in the CA tract table.
median_income_data_cleaned['GEO_ID'] = median_income_data_cleaned['GEO_ID'].str[10:]
median_income_data_cleaned

In [None]:
# GEO_ID in the cleaned ACS table is a string, so make sure the tract
# identifiers are strings too before joining.
ca_county_tract['GEO_ID'] = ca_county_tract['GEO_ID'].astype(str)

# Right join: every row of the CA tract table is kept, with the median income
# estimate attached where a matching GEO_ID exists.
median_merged_data = median_income_data_cleaned.merge(ca_county_tract, how='right', on='GEO_ID')

# Show the joined table
median_merged_data

In [12]:
# Write the merged median income table to a local CSV; this filename is later
# passed to economy_metric.
# NOTE(review): to_csv is called with its defaults, so the DataFrame index is
# also written as an unnamed first column -- confirm that is intended.
median_merged_data.to_csv('society_economy_median_income_metric.csv')

### Function call for both metrics

In [13]:
@append_metadata
def economy_metric(input_csv, export=False, varname=''):
    '''
    Calculates the Median income and Gini index data metrics and uploads to AWS.
    Data is sourced from United States Census Bureau: American Community Survey (ACS).
    Data codes: S1901 and B19083.

    Methods
    -------
    Rows without data were omitted, relevant columns were isolated and renamed.
    California census tract data was merged with the dataset based on tract.
    
    Parameters
    ----------
    input_csv: string
        csv economic data 
    export: True/False boolean
        False = will not upload resulting df containing CAL CRAI economy metrics to AWS
        True = will upload resulting df containing CAL CRAI economy metrics to AWS
    varname: string
        not used inside this function; presumably consumed by the
        append_metadata decorator (confirm against write_metadata)

    Script
    ------
    society_economic_metrics.ipynb

    Note:
    This function assumes users have configured the AWS CLI such that their access key / secret key pair are stored in ~/.aws/credentials.
    See https://docs.aws.amazon.com/cli/latest/userguide/getting-started-install.html for guidance.
    The local input_csv file is deleted at the end of the call whether or not it was uploaded.
    '''
    print('Data transformation: non-data rows were removed.')
    print('Data transformation: data columns were renamed and isolated for metric relevancy.')
    print('Data transformation: data was merged with CA census tract data.')

    bucket_name = 'ca-climate-index'
    directory = '3_fair_data/index_data'
    # upload_csv_aws is given a list of file names, so wrap the single path
    export_filename = [input_csv]

    # Report the status that matches what actually happened: the original
    # printed "uploaded to AWS" only when export was False.
    if export:
        upload_csv_aws(export_filename, bucket_name, directory)
        print(f'{export_filename} uploaded to AWS.')
    else:
        print(f'{export_filename} was not uploaded to AWS (export=False).')

    # Clean up the local copy of the metric file
    if os.path.exists(input_csv):
        os.remove(input_csv)

In [32]:
# Local metric CSVs written earlier in this notebook
input_csv = ['society_economy_gini_metric.csv',
            'society_economy_median_income_metric.csv'
            ]

# Variable names passed as `varname`, paired positionally with input_csv
varnames = [
    'society_acs_gini',
    'society_acs_income'
]

# Run economy_metric once per (file, variable name) pair with export enabled
for csv, var in zip(input_csv, varnames):
    economy_metric(csv, export=True, varname=var)