# Cal-CRAI Domain Score: Governance

**Order of operations**: 
1) Metric handling \
   1a - Retrieve data \
   1b - Min-max standardization \
   1c - Set resilience orientation (metrics where a larger value represents greater vulnerability are subtracted from 1, so that a larger value always corresponds to greater resilience)

2) Calculate indicators \
   2a - Min-max standardization \
   2b - Finalize domain score
   
3) Visualize, save, and export domain score dataframe

In [None]:
import pandas as pd
import os
import sys
import warnings

# suppress pandas purely educational warnings
from warnings import simplefilter
simplefilter(action="ignore", category=pd.errors.PerformanceWarning)

sys.path.append(os.path.expanduser('../../'))
from scripts.utils.file_helpers import pull_csv_from_directory, upload_csv_aws
from scripts.utils.write_metadata import append_metadata
from scripts.utils.cal_crai_plotting import plot_domain_score, plot_region_domain
from scripts.utils.cal_crai_calculations import (handle_outliers, min_max_standardize, process_domain_csv_files, 
                                        compute_averaged_indicators, compute_summed_indicators, indicator_dicts, 
                                        add_census_tracts, domain_summary_stats)

## Step 1: Metric level
### 1a) Retrieve metric files and process

In [None]:
# AWS bucket and directory that hold the index data files
bucket_name = 'ca-climate-index'
aws_dir = '3_fair_data/index_data/'

# Pull the CSV files; 'aws_csvs' is the folder the processing step reads from
pull_csv_from_directory(
    bucket_name,
    aws_dir,
    output_folder='aws_csvs',
    search_zipped=False,
    print_name=False,
)

### Process and merge Governance metric files together

In [None]:
# Governance-specific prefix, reused below to build folder and file names
domain_prefix = 'governance_'

# Inputs and outputs for the processing helper
input_folder = 'aws_csvs'
output_folder = f'{domain_prefix}folder'
meta_csv = '../utils/calcrai_metrics.csv'
merged_output_file = f'concatenate_{domain_prefix}metrics.csv'

# The returned dictionary is used in step 1c to look up the 'vulnerable' metrics
metric_vulnerable_resilient_dict = process_domain_csv_files(
    domain_prefix,
    input_folder,
    output_folder,
    meta_csv,
    merged_output_file,
)

### Take a look at the resulting dictionary
We will use this later to refactor 'resilient' metrics

In [None]:
# Display the dictionary returned by process_domain_csv_files
metric_vulnerable_resilient_dict

### Now, take a look at the merged singular csv file

In [None]:
# Load the merged metrics CSV named by `merged_output_file`, then display it
processed_governance_df = pd.read_csv(filepath_or_buffer=merged_output_file)
processed_governance_df

### 1b) Min-max standardization
Metrics are min-max standardized on 0.01 to 0.99 scale.

In [None]:
# Standardize every column except the 'GEOID' identifier
columns_to_process = [
    name for name in processed_governance_df.columns if name != 'GEOID'
]
min_max_metrics = min_max_standardize(
    processed_governance_df,
    columns_to_process,
)

Isolate for GEOID and standardized columns exclusively

In [None]:
# Keep columns whose name contains 'GEOID', followed by those containing 'standardized'
words = ['GEOID', 'standardized']
selected_columns = [
    column
    for word in words
    for column in min_max_metrics.columns
    if word in column
]
min_max_standardized_governance_metrics_df = min_max_metrics[selected_columns]

In [None]:
# Display the dataframe restricted to GEOID and the standardized columns
min_max_standardized_governance_metrics_df

### 1c) Set resilience orientation
* High values indicate resiliency
* Low values indicate vulnerability

Some metrics indicate a community's vulnerability rather than its resilience. For example, 'percent_sampled_wells_contaminated' represents a community's vulnerability to water contamination. For this metric, the higher the number, the more vulnerable the community. We identify these 'vulnerable' metrics with our `metric_vulnerable_resilient_dict` dictionary and subtract their standardized values from 1 so that all high values indicate resiliency.

In [None]:
# Display the dictionary again; its 'vulnerable' entry is used in the next cell
metric_vulnerable_resilient_dict

In [None]:
# Metric names listed under 'vulnerable' in the dictionary
vulnerable_columns = metric_vulnerable_resilient_dict['vulnerable']

# A dataframe column counts as vulnerable when its name contains
# any of those metric names as a substring
vulnerable_columns_in_df = [
    column
    for column in min_max_standardized_governance_metrics_df.columns
    if any(metric_name in column for metric_name in vulnerable_columns)
]

# Work on a copy so the standardized dataframe itself is not modified
adjusted_vulnerable_df = min_max_standardized_governance_metrics_df.copy()

# Invert the vulnerable columns (1 - value) so high values indicate resilience
flipped_values = 1 - adjusted_vulnerable_df.loc[:, vulnerable_columns_in_df]
adjusted_vulnerable_df.loc[:, vulnerable_columns_in_df] = flipped_values

In [None]:
# Display the dataframe after the vulnerable columns were subtracted from 1
adjusted_vulnerable_df

## Step 2: Calculate Indicators
Loop to go through df columns and average metrics that belong within an indicator based off of the metric to indicator dictionary

In [None]:
# Display the domain prefix without its last character (the trailing underscore
# of 'governance_'); this value is passed to indicator_dicts in the next cell
domain_prefix[:-1]

In [None]:
# Indicator dictionary for this domain, looked up by the prefix minus its trailing underscore
governance_indicator_map = indicator_dicts(domain_prefix[:-1])

# Combine the orientation-adjusted metrics into indicator columns
averaged_indicators_governance_systems = compute_averaged_indicators(
    adjusted_vulnerable_df,
    governance_indicator_map,
)

# display the resulting dataframe to inspect the indicator values
averaged_indicators_governance_systems

Save Indicator dataframe as a csv

In [None]:
# Name the indicator file after the domain prefix and write it without the index
indicator_filename = f'{domain_prefix}domain_averaged_indicators.csv'
averaged_indicators_governance_systems.to_csv(indicator_filename, index=False)

Sum the indicator columns together to calculate the domain score
* essentially summing all columns except for 'GEOID'

In [None]:
# Every indicator column (everything except 'GEOID') goes into the sum
columns_to_sum = [
    name
    for name in averaged_indicators_governance_systems.columns
    if name != 'GEOID'
]
summed_indicators_governance_systems = compute_summed_indicators(
    df=averaged_indicators_governance_systems,
    columns_to_sum=columns_to_sum,
    domain_prefix=domain_prefix,
)

### 2a) Min-max standardize the summed columns

In [None]:
# Standardize every column of the summed dataframe except 'GEOID'
columns_to_process = [
    name
    for name in summed_indicators_governance_systems.columns
    if name != 'GEOID'
]
min_max_domain = min_max_standardize(
    summed_indicators_governance_systems,
    columns_to_process,
)

In [None]:
# Display the output of min_max_standardize for the summed indicator columns
min_max_domain

### 2b) Finalize domain score
* Isolate to census tract and summed standardized columns
* Rename tract to GEOID for merging
* Rename domain score column
* Add a zero at the beginning of the GEOID to match census tract that will be merged

In [None]:
# Keep only the tract identifier and the standardized domain sum
standardized_sum_column = 'summed_indicators_governance_domain_min_max_standardized'
governance_domain_score = min_max_domain[['GEOID', standardized_sum_column]].copy()


def _format_geoid(value):
    """Prefix the GEOID with '0' and drop any trailing '.0'-style decimal part."""
    text = '0' + str(value)
    if '.' in text:
        # e.g. a GEOID stored as a float prints with a decimal part; remove it
        text = text.rstrip('0').rstrip('.')
    return text


# GEOID handling
governance_domain_score['GEOID'] = governance_domain_score['GEOID'].map(_format_geoid)

# Give the score column its final name
governance_domain_score = governance_domain_score.rename(
    columns={standardized_sum_column: 'governance_domain_score'}
)

In [None]:
# Display the finalized dataframe (GEOID and governance_domain_score columns)
governance_domain_score

## Step 3: Visualize, save, and export domain score


Let's look at some summary statistics for this domain:

In [None]:
# Summary statistics for the governance domain score column
domain_summary_stats(
    governance_domain_score,
    'governance_domain_score',
)

Now let's visualize the entire domain!

In [None]:
# Plot the governance domain score
plot_domain_score(
    governance_domain_score,
    column_to_plot='governance_domain_score',
    domain=domain_prefix,
)

### We can also visualize specific areas!
We call the `plot_region_domain` function, which has a number of parameters:

df : DataFrame
        A DataFrame containing the data you want to plot, which must include the column 'GEOID' to match with the census tract data.
    
    counties_to_plot : list of str, optional
        A list of county FIPS codes (as strings) to plot. If None, no counties will be plotted.
        Example: ['037', '071', '065', '029', '111'].
    
    region : str, optional
        A predefined region to plot. Options: 'bay_area', 'central_region', 'inland_deserts', 'north_central', 'northern', or 'south_coast'.
        If specified, this will override `counties_to_plot`.
    
    plot_all : bool, optional
        If True, plots all counties in California. Overrides `counties_to_plot` and `region`.
    
    savefig : bool, optional
        If True, the plot will be saved as a PNG file. Default is False.
    
    font_color : str, optional
        Color of the font for county labels. Default is 'black'.
    
    domain : str, optional
        The domain name used for labeling and column names. Default is 'society_economy_'.
    
    domain_label_map : dict, optional
        A dictionary to map the domain variable to a more readable label. Example: {'society_economy_': 'Society and Economy Domain'}
    
    vmin : int, optional
        set the minimum bounds of the color gradient
        default is 0
    
    vmax : int, optional
        set the maximum bounds of the color gradient
        default is 3
    
    column_to_plot : str
        name of the column to be plotted
    
    cmap : str, optional
        name of the cmap to be used, 'Greens' is the default
    
    intro_title : str, optional
        first portion of the figure title, default is 'Domain Score'

    Dictionary of county labels
    county_labels = {
        '001': 'Alameda', '003': 'Alpine', '005': 'Amador', '007': 'Butte', '009': 'Calaveras',
        '011': 'Colusa', '013': 'Contra Costa', '015': 'Del Norte', '017': 'El Dorado', '019': 'Fresno',
        '021': 'Glenn', '023': 'Humboldt', '025': 'Imperial', '027': 'Inyo', '029': 'Kern',
        '031': 'Kings', '033': 'Lake', '035': 'Lassen', '037': 'Los Angeles', '039': 'Madera',
        '041': 'Marin', '043': 'Mariposa', '045': 'Mendocino', '047': 'Merced', '049': 'Modoc',
        '051': 'Mono', '053': 'Monterey', '055': 'Napa', '057': 'Nevada', '059': 'Orange',
        '061': 'Placer', '063': 'Plumas', '065': 'Riverside', '067': 'Sacramento', '069': 'San Benito',
        '071': 'San Bernardino', '073': 'San Diego', '075': 'San Francisco', '077': 'San Joaquin',
        '079': 'San Luis Obispo', '081': 'San Mateo', '083': 'Santa Barbara', '085': 'Santa Clara',
        '087': 'Santa Cruz', '089': 'Shasta', '091': 'Sierra', '093': 'Siskiyou', '095': 'Solano',
        '097': 'Sonoma', '099': 'Stanislaus', '101': 'Sutter', '103': 'Tehama', '105': 'Trinity',
        '107': 'Tulare', '109': 'Tuolumne', '111': 'Ventura', '113': 'Yolo', '115': 'Yuba'
    }

You can plot a domain's score by region, by specific county/counties, or for the entirety of CA with labels.
Below are a few examples of each of these plotting scenarios.

In [None]:
# Plot the predefined 'inland_deserts' region; savefig=False means no PNG is saved
governance_label_map = {domain_prefix: 'Governance Domain'}
plot_region_domain(
    governance_domain_score,
    column_to_plot='governance_domain_score',
    domain=domain_prefix,
    domain_label_map=governance_label_map,
    region='inland_deserts',
    savefig=False,
    font_color='black',
)

In [None]:
# Plot the predefined 'central_region' region; savefig=False means no PNG is saved
governance_label_map = {domain_prefix: 'Governance Domain'}
plot_region_domain(
    governance_domain_score,
    column_to_plot='governance_domain_score',
    domain=domain_prefix,
    domain_label_map=governance_label_map,
    region='central_region',
    savefig=False,
    font_color='black',
)

In [None]:
# Plot selected counties by FIPS code ('003' is Alpine in the county label list)
list_of_counties = ['003']
governance_label_map = {domain_prefix: 'Governance Domain'}
plot_region_domain(
    governance_domain_score,
    column_to_plot='governance_domain_score',
    domain=domain_prefix,
    domain_label_map=governance_label_map,
    counties_to_plot=list_of_counties,
    savefig=False,
    font_color='black',
)

In [None]:
# Plot all California counties (plot_all=True); savefig=False means no PNG is saved
governance_label_map = {domain_prefix: 'Governance Domain'}
plot_region_domain(
    governance_domain_score,
    column_to_plot='governance_domain_score',
    domain=domain_prefix,
    domain_label_map=governance_label_map,
    plot_all=True,
    savefig=False,
    font_color='black',
)

## Export the final domain csv file

In [None]:
# Name the domain score file after the domain prefix and write it without the index
domain_filename = f'{domain_prefix}domain_score.csv'
governance_domain_score.to_csv(domain_filename, index=False)

## Upload the indicator and domain score csv files to AWS

In [None]:
# Destination bucket and directory for the exported files
bucket_name = 'ca-climate-index'
directory = '3_fair_data/index_data'

# The indicator file and the domain score file written earlier in the notebook
files_upload = (indicator_filename, domain_filename)

# upload_csv_aws is passed a list, so each file is wrapped and uploaded in its own call
for csv_name in files_upload:
    upload_csv_aws([csv_name], bucket_name, directory)