## Individually plot all 110 Cal-CRAI metric files
Step 1) Pull, clean, and merge data
* each metric is processed the same way it is for each domain index
    * grouped together by shared GEOID column
    * cleaned for infinite values, obsolete island tract, NaN GEOID values

Step 2) Process data
* min-max standardize each metric column
* perform vulnerability adjustment
    * isolate 'vulnerable high' metrics
    * subtract their standardized values from 1 so they are 'resilience high' (high values are more resilient)

Step 3) Plot each metric
* each plot has its full metric name as its title
* 0-1 resilience scale, higher values are more resilient
* consistent color scheme
* all resulting .png files are stored in a single folder

In [None]:
import pandas as pd
import os
import sys
import glob
import numpy as np
import geopandas as gpd
import matplotlib.pyplot as plt
from shapely.geometry import box
import re  # For regex operations
import textwrap

# suppress pandas purely educational warnings
from warnings import simplefilter
simplefilter(action="ignore", category=pd.errors.PerformanceWarning)

sys.path.append(os.path.expanduser('../../'))
from scripts.utils.file_helpers import pull_csv_from_directory
from scripts.utils.calculate_index import (min_max_standardize, 
                                        add_census_tracts)
pd.set_option('display.max_columns', 500)

## Step 1) Pull, clean, and merge data

### Pull

In [None]:
# Download every metric CSV under the index-data prefix of the AWS bucket
# into the local 'aws_csvs' folder.
bucket_name = 'ca-climate-index'
aws_dir = '3_fair_data/index_data/'

pull_csv_from_directory(
    bucket_name,
    aws_dir,
    output_folder='aws_csvs',
    search_zipped=False,
    print_name=False,
)

### Merge & Clean

In [None]:
def process_all_csv_files(input_folder, output_folder, meta_csv, merged_output_file):
    '''
    Clean every metric CSV listed in the metadata file and merge them on GEOID.

    Each CSV in `input_folder` whose name appears in the metadata's
    'Metric file name' column has its index-like (incremental) columns removed,
    is written to `output_folder`, and is deleted from `input_folder`.
    All CSVs found in `output_folder` are then outer-merged on a shared GEOID
    column, keeping only the last column of each file.
    Rows with a NaN GEOID are removed, GEOID is converted to a zero-prefixed
    string, the uninhabited island tract is given NaN metric values, and
    infinite values (if any) are replaced with NaN.

    Parameters
    ----------
    input_folder: str
        Name of the folder that is storing all metric csv files
    output_folder: str
        Name of the folder that receives the cleaned per-metric csv files.
    meta_csv: str
        Local path to the metadata csv. It must contain the columns
        'Metric file name', 'Metric' and
        'High value result (vulnerable or resilient)'.
    merged_output_file: str
        Desired name of merged output csv file.

    Returns
    -------
    metric_last_column_dict: dict
        Maps each metric name to the name of the last column of its csv file.
    resilience_vulnerability_dict: dict
        {'resilient': [...], 'vulnerable': [...]} lists of metric names grouped
        by what a high value means according to the metadata.

    Raises
    ------
    ValueError
        If `output_folder` contains no csv files to merge.
    '''

    def is_incremental(series):
        """Check if a column is incremental (starts from 0 or 1 and increases by 1)."""
        # Guard against empty columns: series.iloc[0] would raise IndexError
        if series.empty or series.dtype not in [np.int64, np.float64]:
            return False
        diff = series.diff().dropna()  # Difference between consecutive values
        return bool((diff == 1).all() and series.iloc[0] in [0, 1])

    def count_infinite(frame):
        """Count +/-inf entries across the numeric columns of `frame`."""
        numeric = frame.select_dtypes(include=[np.number])
        return int(np.isinf(numeric).sum().sum())

    # Create the output folder if it doesn't exist
    os.makedirs(output_folder, exist_ok=True)

    # Load the metadata CSV and keep the columns needed to describe each file
    metadata = pd.read_csv(meta_csv)
    direction_column = 'High value result (vulnerable or resilient)'
    metric_files = metadata[['Metric file name', 'Metric', direction_column]]

    # Metric name -> last column name of its CSV file
    metric_last_column_dict = {}

    # Metric names grouped by the meaning of a high value
    resilience_vulnerability_dict = {'resilient': [], 'vulnerable': []}

    # Find all CSV files whose name is listed in the metadata
    known_file_names = set(metric_files['Metric file name'])
    source_files = [file for file in glob.glob(os.path.join(input_folder, '*.csv'))
                    if os.path.basename(file) in known_file_names]

    for file in source_files:
        file_name = os.path.basename(file)

        # Metadata row(s) for the current file; the first match is used
        metric_info = metric_files.loc[metric_files['Metric file name'] == file_name]
        metric_name = metric_info['Metric'].values[0]
        high_value_result = metric_info[direction_column].values[0]

        csv_df = pd.read_csv(file)

        # Detect and remove incremental (index-like) columns
        incremental_cols = [col for col in csv_df.columns if is_incremental(csv_df[col])]
        csv_df = csv_df.drop(columns=incremental_cols)

        # The last remaining column is treated as the metric column
        metric_last_column_dict[metric_name] = csv_df.columns[-1]

        # str() keeps a missing (NaN) metadata entry from raising on .lower();
        # entries that are neither 'resilient' nor 'vulnerable' are not grouped
        direction = str(high_value_result).strip().lower()
        if direction in resilience_vulnerability_dict:
            resilience_vulnerability_dict[direction].append(metric_name)

        # Save the cleaned CSV to the output folder and remove the original
        csv_df.to_csv(os.path.join(output_folder, file_name), index=False)
        os.remove(file)

    print(f"Processed and saved {len(source_files)} CSV files within all domains.")
    print('\nMetric dictionary created and called: metric_last_column_dict')

    # --- Additional Processing: Merging CSV Files ---

    # Every CSV in the output folder is merged (including any left by earlier runs)
    csv_files = glob.glob(os.path.join(output_folder, '*.csv'))
    if not csv_files:
        raise ValueError(f"No CSV files found in '{output_folder}' to merge.")

    # Column names that are treated as the census tract identifier
    rename_cols = ['GEO_ID', 'GEOID', 'tract', 'TRACT', 'Census_Tract', 'census_tract', 'USCB_GEOID', 'Unnamed: 0']

    # None (rather than an empty frame) marks "nothing merged yet", so a first
    # file with zero rows is not silently replaced by the second one
    merged_df = None

    for file in csv_files:
        df = pd.read_csv(file)

        # Rename the first identifier column that is present to 'GEOID'
        for col in rename_cols:
            if col in df.columns:
                df = df.rename(columns={col: 'GEOID'})
                break

        # Keep only the 'GEOID' and the last column from each file
        last_column = df.columns[-1]
        df = df[['GEOID', last_column]]

        if merged_df is None:
            merged_df = df
        else:
            merged_df = pd.merge(merged_df, df, on='GEOID', how='outer')

    # Drop rows where 'GEOID' is NaN
    merged_df = merged_df.dropna(subset=['GEOID'])

    # Convert census tract to string and eliminate scientific notation default
    merged_df['GEOID'] = merged_df['GEOID'].apply(lambda x: '{:.0f}'.format(x))

    # Convert all values within the island tract (near San Francisco) to NaN, as it is uninhabited
    # (compared before the leading zero is added, hence the 10-digit form)
    island_tract = '6075980401'
    is_island = merged_df['GEOID'] == island_tract
    merged_df.loc[is_island, merged_df.columns != 'GEOID'] = np.nan

    # Check if all entries within the island tract are NaN
    island_row = merged_df.loc[is_island]
    if island_row.drop(columns='GEOID').isnull().all().all():
        print(f"\nAll entries within the island tract ({island_tract}) are NaN.")
    else:
        print(f"\nSome entries within the island tract ({island_tract}) are not NaN.")

    # Add the leading zero; GEOID is already a plain digit string here,
    # so no decimal-suffix stripping is needed
    merged_df['GEOID'] = merged_df['GEOID'].apply(lambda x: '0' + str(x))

    print(f"\nNumber of infinite entries in the DataFrame: {count_infinite(merged_df)}")
    print('Replacing infinite entries (if any) with NaN')

    # Replace infinite values with NaN
    merged_df = merged_df.replace([np.inf, -np.inf], np.nan)

    # Recount on the cleaned frame itself, not on a snapshot taken before the replacement
    print(f"Number of infinite entries in the DataFrame after replacement: {count_infinite(merged_df)}")

    print("\nFile processing complete, dataframe will now be saved as a .csv")

    # Save the merged DataFrame to a CSV file
    merged_df.to_csv(merged_output_file, index=False)

    print(f"Processed CSV saved as {merged_output_file}")

    return metric_last_column_dict, resilience_vulnerability_dict


In [None]:
# Clean the downloaded metric CSVs and merge them into a single file
input_folder = 'aws_csvs'
output_folder = 'output_folder'
meta_csv = '../utils/calcrai_metrics.csv'
merged_output_file = 'all_domain_files.csv'

metric_last_column_dict, resilience_vulnerability_dict = process_all_csv_files(
    input_folder=input_folder,
    output_folder=output_folder,
    meta_csv=meta_csv,
    merged_output_file=merged_output_file,
)

In [None]:
# Reload the merged metrics table written by process_all_csv_files.
# NOTE: no dtype is given, so pandas infers GEOID as numeric and the leading
# zero stored in the file is dropped; a later cell re-adds it.
all_metrics = pd.read_csv('all_domain_files.csv')

In [None]:
# Display the merged metrics table for a visual check
all_metrics

## Step 2) Process data

### Min-max standardize each metric column

In [None]:
# Every column except the GEOID key is a metric to be min-max standardized
is_metric_column = all_metrics.columns != 'GEOID'
columns_to_process = all_metrics.columns[is_metric_column].tolist()
min_max_metrics = min_max_standardize(all_metrics, columns_to_process)

In [None]:
# Keep the GEOID key plus every column whose name ends in 'standardized'
words = ['GEOID', 'standardized']

selected_columns = []
for word in words:
    # 'standardized' must be a suffix; any other word may match anywhere in the name
    pattern = r'standardized$' if word == 'standardized' else word
    selected_columns += [
        name for name in min_max_metrics.columns if re.search(pattern, name)
    ]

# Subset of the standardized table restricted to the selected columns
min_max_standardized_all_metrics_df = min_max_metrics[selected_columns]


In [None]:
# Display the GEOID + standardized-metric table for a visual check
min_max_standardized_all_metrics_df

In [None]:
# Rebuild GEOID as a zero-prefixed string.
# The column is replaced through `assign` (which returns a new DataFrame)
# instead of `.loc[:, 'GEOID'] = ...`: the frame is a column subset of
# `min_max_metrics`, and writing strings into its numeric GEOID column in place
# triggers SettingWithCopyWarning and pandas' incompatible-dtype deprecation.
# NOTE: re-running this cell prepends another '0' to every GEOID.
geoid_as_text = min_max_standardized_all_metrics_df['GEOID'].astype(str)

# Strip a float-style suffix (e.g. '.0') if the column was parsed as float
geoid_as_text = geoid_as_text.apply(lambda x: x.rstrip('0').rstrip('.') if '.' in x else x)

# Add the leading zero
min_max_standardized_all_metrics_df = min_max_standardized_all_metrics_df.assign(
    GEOID='0' + geoid_as_text
)
min_max_standardized_all_metrics_df.head()

In [None]:
#resilience_vulnerability_dict

### Vulnerability adjustment

In [None]:
# Metrics for which a high value means *more vulnerable*
vulnerable_metric_names = resilience_vulnerability_dict['vulnerable']

# Translate those metric names into their standardized column names
vulnerable_columns_in_df = []
for metric_name in vulnerable_metric_names:
    if metric_name in metric_last_column_dict:
        vulnerable_columns_in_df.append(
            f"{metric_last_column_dict[metric_name]}_min_max_standardized"
        )

# Work on a copy so the standardized table itself is left untouched
adjusted_vulnerable_df = min_max_standardized_all_metrics_df.copy()

for column_name in vulnerable_columns_in_df:
    if column_name not in adjusted_vulnerable_df.columns:
        print(f"Column '{column_name}' not found in DataFrame for adjustment.")
        continue
    # Flip the 0-1 scale so that high values mean more resilient
    adjusted_vulnerable_df[column_name] = 1 - adjusted_vulnerable_df[column_name]

In [None]:
# Display the table after the vulnerable columns have been flipped (1 - value)
adjusted_vulnerable_df

### Rename all columns to their metric name

In [None]:
# Build {standardized column name -> full metric name} for the columns present
rename_mapping_all = {}
for metric_name, source_column in metric_last_column_dict.items():
    standardized_column = f"{source_column}_min_max_standardized"
    if standardized_column in adjusted_vulnerable_df.columns:
        rename_mapping_all[standardized_column] = metric_name

# Apply the mapping so each metric column carries its full metric name
adjusted_vulnerablility_renamed_df = adjusted_vulnerable_df.rename(columns=rename_mapping_all)
adjusted_vulnerablility_renamed_df.head()

## Step 3) Visualize

### Merge the df and census tracts and convert the geometry to our uniformly used coordinate reference system (4269)

In [None]:
# Attach census tract geometries to the renamed metric table.
# NOTE(review): add_census_tracts is a project helper not shown here; the
# plotting function below expects its result to carry 'geometry' and
# 'COUNTYFP' columns - confirm against scripts/utils/calculate_index.py.
gdf = add_census_tracts(adjusted_vulnerablility_renamed_df)

## Function to plot metrics individually

In [None]:
def plot_region_individual_metrics(gdf, counties_to_plot=None, region=None, plot_all=False, savefig=False, font_color='black', domain='society_economy_', domain_label_map=None, output_folder='output_plots'):
    """
    Plots one resilience map per metric column for all of California, a predefined region, or selected counties.

    Every column of `gdf` that is not a census-tract attribute (GEOID, COUNTYFP, geometry, ...)
    is drawn as its own map on a fixed 0-1 colour scale with county boundaries overlaid.
    Tracts intersecting a small bounding box west of San Francisco are excluded from all maps.

    Parameters:
    -----------
    gdf : GeoDataFrame
        Data to plot. Must contain a 'geometry' column and a 'COUNTYFP' column; all columns
        other than the census-tract attribute columns are plotted.

    counties_to_plot : list of str, optional
        A list of county FIPS codes (as strings) to plot. If None (and neither `region` nor
        `plot_all` is given), all tracts in `gdf` are plotted.
        Example: ['037', '071', '065', '029', '111'].

    region : str, optional
        A predefined region to plot. Options: 'bay_area', 'central_region', 'inland_deserts', 'north_central', 'northern', or 'south_coast'.
        If specified, this will override `counties_to_plot`. An unknown region name prints a
        message and returns without plotting.

    plot_all : bool, optional
        If True, plots all counties in California. Overrides `counties_to_plot` and `region`.

    savefig : bool, optional
        If True, each plot will be saved as a PNG file in `output_folder`. Default is False.

    font_color : str, optional
        Accepted for backward compatibility; not used by this function.

    domain : str, optional
        Accepted for backward compatibility; not used by this function.

    domain_label_map : dict, optional
        Accepted for backward compatibility; not used by this function.

    output_folder : str, optional
        The folder where the plot files should be saved. Default is 'output_plots'.

    Returns:
    --------
    None
        Shows one figure per metric column and, if `savefig` is True, saves it as
        '<sanitized column name>_plot.png' in `output_folder`.
    """

    # Dictionary of county labels (only the FIPS keys are used, for plot_all)
    county_labels = {
        '001': 'Alameda', '003': 'Alpine', '005': 'Amador', '007': 'Butte', '009': 'Calaveras',
        '011': 'Colusa', '013': 'Contra Costa', '015': 'Del Norte', '017': 'El Dorado', '019': 'Fresno',
        '021': 'Glenn', '023': 'Humboldt', '025': 'Imperial', '027': 'Inyo', '029': 'Kern',
        '031': 'Kings', '033': 'Lake', '035': 'Lassen', '037': 'Los Angeles', '039': 'Madera',
        '041': 'Marin', '043': 'Mariposa', '045': 'Mendocino', '047': 'Merced', '049': 'Modoc',
        '051': 'Mono', '053': 'Monterey', '055': 'Napa', '057': 'Nevada', '059': 'Orange',
        '061': 'Placer', '063': 'Plumas', '065': 'Riverside', '067': 'Sacramento', '069': 'San Benito',
        '071': 'San Bernardino', '073': 'San Diego', '075': 'San Francisco', '077': 'San Joaquin',
        '079': 'San Luis Obispo', '081': 'San Mateo', '083': 'Santa Barbara', '085': 'Santa Clara',
        '087': 'Santa Cruz', '089': 'Shasta', '091': 'Sierra', '093': 'Siskiyou', '095': 'Solano',
        '097': 'Sonoma', '099': 'Stanislaus', '101': 'Sutter', '103': 'Tehama', '105': 'Trinity',
        '107': 'Tulare', '109': 'Tuolumne', '111': 'Ventura', '113': 'Yolo', '115': 'Yuba'
    }

    # Define the regional groups of counties
    regions = {
        'bay_area': ['001', '013', '041', '055', '081', '085', '087', '075', '095', '097'],
        'central_region': ['019', '029', '031', '039', '043', '047', '053', '069', '079', '099', '107', '109'],
        'inland_deserts': ['025', '027', '051', '065', '071'],
        'north_central': ['067', '077', '017', '033', '057', '061', '091', '101', '063', '113', '115'],
        'northern': ['015', '023', '035', '045', '049', '093', '089', '103', '105'],
        'south_coast': ['037', '059', '073', '083', '111']
    }

    # Set counties_to_plot and the title prefix based on the plot_all flag or region
    if plot_all:
        counties_to_plot = list(county_labels.keys())
        title_prefix = 'Resiliency Index of All Counties in California \n'
    elif region:
        if region not in regions:
            # A misspelled region used to plot the whole state under a region title
            print(f"Unknown region '{region}'. Valid options: {', '.join(regions)}")
            return
        counties_to_plot = regions[region]
        region_name = region.replace('_', ' ').title()  # Capitalize the region name for display
        title_prefix = f'Resiliency Index of California\'s {region_name} \n'
    else:
        title_prefix = 'Resiliency Index of Selected Counties \n'

    # Convert to GeoDataFrame with the correct CRS if necessary
    df2_filtered = gpd.GeoDataFrame(gdf, geometry='geometry', crs=4269)

    # Define the bounding box to exclude (xmin, ymin, xmax, ymax)
    exclusion_box = box(-123.2, 37.6, -122.8, 37.85)

    # Exclude features within the bounding box
    df2_filtered = df2_filtered[~df2_filtered.intersects(exclusion_box)]

    # Restrict the map to the requested counties. With plot_all every tract is
    # kept as-is; with no selection at all the full frame is plotted as before.
    if not plot_all and counties_to_plot is not None:
        # assumes COUNTYFP holds 3-digit county FIPS codes - TODO confirm
        county_codes = df2_filtered['COUNTYFP'].astype(str).str.zfill(3)
        df2_filtered = df2_filtered[county_codes.isin(list(counties_to_plot))]

    # Check for invalid geometries
    invalid_geometries = df2_filtered[~df2_filtered['geometry'].is_valid]
    print("Number of invalid geometries:", len(invalid_geometries))

    # Nothing left to draw after the exclusion box / county filter
    if df2_filtered.empty:
        print('No valid geometries. Cannot plot.')
        return

    # Merge the tract geometries of each county into one county outline
    county_boundaries = df2_filtered.dissolve(by='COUNTYFP')['geometry']

    if len(county_boundaries) == 0:
        print('No valid geometries. Cannot plot.')
        return

    # The output directory is only needed when figures are written to disk
    if savefig:
        os.makedirs(output_folder, exist_ok=True)

    # Census-tract attribute columns that are not metrics
    excluded_columns = {
        'GEOID', 'STATEFP', 'COUNTYFP', 'TRACTCE',
        'NAME', 'NAMELSAD', 'MTFCC', 'FUNCSTAT',
        'ALAND', 'AWATER', 'INTPTLAT', 'INTPTLON', 'geometry'
    }

    def sanitize_column_name(column_name):
        """Turn a metric name into a string that is safe to use as a file name."""
        # Replace specific characters with desired words
        column_name = column_name.replace('%', 'percent')
        column_name = column_name.replace('#', 'number')
        column_name = column_name.replace('<', 'less than')
        column_name = column_name.replace('>', 'greater than')
        # Replace other invalid characters with '_'
        return re.sub(r'[<>:"/\\|?*]', '_', column_name)

    max_title_length = 46

    # Iterate over each column in the gdf that is not in the excluded list
    for column in gdf.columns:
        if column in excluded_columns:
            continue

        print(f"Plotting for column: {column}")

        # Create the plot
        fig, ax = plt.subplots(1, 1, figsize=(10, 10))
        df2_filtered.plot(column=column,
                          ax=ax,
                          vmin=0, vmax=1,
                          legend=True,
                          cmap='Purples',
                          missing_kwds={
                              "color": "grey",
                              "label": "Missing values"
                          },
                          legend_kwds={'label': 'Resilience (larger values are more resilient)',
                                       'orientation': 'horizontal', 'shrink': 0.5, 'pad': 0.05}
                          )

        # Plot county boundaries
        county_boundaries.boundary.plot(ax=ax, linewidth=0.55, edgecolor='black')

        # Set the plot title using the current column name
        title = f"{title_prefix} {column.replace('_', ' ').title()}"
        if len(title) > max_title_length:
            # Use textwrap to avoid splitting words
            title = "\n".join(textwrap.wrap(title, width=max_title_length, break_long_words=False))
        ax.set_title(title, fontsize=13)

        # Optionally save the figure
        if savefig:
            file_name = f"{sanitize_column_name(column)}_plot.png"
            output_path = os.path.join(output_folder, file_name)
            fig.savefig(output_path, dpi=300)
            print(f"Plot saved: {output_path}")

        # Display the plot, then release it so figures do not accumulate
        # in memory across the loop
        plt.show()
        plt.close(fig)

    # Only claim files were written when saving was actually requested
    if savefig:
        print(f"All plots have been saved to {output_folder}")


## Plot all metrics

In [None]:
# Draw one statewide map per metric and save each as a PNG
plot_region_individual_metrics(
    gdf,
    plot_all=True,
    savefig=True,
    domain='Individual Metric',
    output_folder='plots_directory',
)

In [None]:
# Folder the plots were written to by the plotting call above.
# (An empty string here makes os.listdir raise FileNotFoundError.)
output_folder = 'plots_directory'

# Count the regular files in the directory; a missing folder counts as zero
if os.path.isdir(output_folder):
    file_count = len([f for f in os.listdir(output_folder) if os.path.isfile(os.path.join(output_folder, f))])
else:
    file_count = 0
    print(f"Output folder '{output_folder}' does not exist; no plots found.")

# Print the result
print(f"Number of files in the output folder: {file_count}")