## Analyze CCI project climate risk columns

This notebook pulls the resulting .csv file from 'cci_crosswalk_indicators_climate_mitigation.ipynb' and subsets for data with populated 'Climate_Risk_Mitigation' column entries. We further subset the data to view the 'Climate Adaptation' column that is native to the original CCI dataset. 

Individual climate risks from the 'Climate_Risk_Mitigation' column are utilized to split the data into five respective dataframes for future analysis specific to each risk.

Repeating projects are grouped together within these dataframes; the financial columns are summed and each dataframe is sorted in descending order of total project cost.

In [16]:
# Import useful libraries
import os
import boto3
import pandas as pd
import itertools
import re
from IPython.display import display, HTML

In [None]:
# Where the crosswalk output lives in S3. Note that `directory` holds the
# full object key (path + file name), not just a folder.
bucket_name = 'ca-climate-index'
directory = '0_map_data/crosswalk_data/final_cci_project_indicators_and_climate_risk_with_contextual_columns.csv'

# Save the object into the working directory under the key's own file name
local_filename = os.path.basename(directory)

# Client used for the download
s3_client = boto3.client('s3')

print('Pulling file')
s3_client.download_file(Bucket=bucket_name, Key=directory, Filename=local_filename)
print('File pulled')

In [None]:
# Load the CSV pulled from S3 into a DataFrame
crosswalk_data = pd.read_csv('final_cci_project_indicators_and_climate_risk_with_contextual_columns.csv')

In [None]:
# Display the crosswalk table as loaded, before any filtering
crosswalk_data

### Subset for rows with non-NaN entries in the 'Climate_Risk_Mitigation' column

In [5]:
# Keep only the rows whose 'Climate_Risk_Mitigation' entry is populated (not NaN)
climate_risk_crosswalk_data = crosswalk_data.dropna(subset=['Climate_Risk_Mitigation'])

In [None]:
# Preview the first rows of the climate-risk subset
climate_risk_crosswalk_data.head()

In [None]:
# List every available column name so the ones worth keeping can be picked
climate_risk_crosswalk_data.columns.tolist()

In [None]:
# Columns carried forward into the per-risk analysis.
# NOTE(review): 'Climate  Adaptation' is spelled with two spaces here —
# presumably to match the header in the CSV; confirm before "correcting" it.
keep_columns = [
    # cost and count figures
    'Total Project Cost', 'Project Count',
    # classification / project descriptors
    'SECTOR', 'CATEGORY', 'ACTION', 'Project Type', 'Sub Program Name',
    # free-text benefit descriptions and the low-income flag
    'Other Project Benefits Description',
    'Disadvantaged Community Benefits Description',
    'Is Low Income Communities',
    # climate columns
    'Climate  Adaptation', 'Climate_Risk_Mitigation',
]

# Restrict the climate-risk rows to just those columns
subset_crosswalk_data = climate_risk_crosswalk_data.loc[:, keep_columns]

# Row count first, then a preview of the reduced table
print(len(subset_crosswalk_data))
subset_crosswalk_data.head()

In [None]:
def count_entries(dataframe):
    """Summarise how well populated each column of ``dataframe`` is.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Table to inspect.

    Returns
    -------
    tuple of (pandas.Series, pandas.Series)
        First the number of non-NaN cells per column, then the number of
        distinct non-NaN values per column. Both are indexed by column name.
    """
    populated_per_column = dataframe.count()      # count() skips NaN cells
    distinct_per_column = dataframe.nunique()     # NaN is not counted as a value
    return populated_per_column, distinct_per_column

# Report, per column, how many cells are populated and how many distinct values occur
non_nan_entries, unique_entries = count_entries(subset_crosswalk_data)
print("Total number of non-nan entries per column:", non_nan_entries, sep="\n")
print("\nUnique entries per column:", unique_entries, sep="\n")

In [None]:
# Split each project's comma-separated 'Climate_Risk_Mitigation' entry into one
# row per individual risk, trimming the whitespace left around each label.
subset_crosswalk_data_expanded = (
    subset_crosswalk_data
    .assign(
        Climate_Risk_Mitigation=subset_crosswalk_data['Climate_Risk_Mitigation'].str.split(',')
    )
    .explode('Climate_Risk_Mitigation')
    .assign(
        Climate_Risk_Mitigation=lambda expanded: expanded['Climate_Risk_Mitigation'].str.strip()
    )
)

# Discard empty labels (e.g. from a trailing or doubled comma); otherwise they
# would surface below as a meaningless '' risk with its own DataFrame.
has_label = subset_crosswalk_data_expanded['Climate_Risk_Mitigation'].fillna('').ne('')
subset_crosswalk_data_expanded = subset_crosswalk_data_expanded[has_label]

# explode() repeats the source row's index label on every row it produces, so
# an (index label, risk) pair that occurs twice means one project listed the
# same risk twice. Keep the first occurrence so its cost and count are not
# double counted later.
# NOTE(review): relies on subset_crosswalk_data having a unique index — true
# for the default index produced by read_csv; confirm if that ever changes.
is_repeat = pd.MultiIndex.from_arrays(
    [subset_crosswalk_data_expanded.index,
     subset_crosswalk_data_expanded['Climate_Risk_Mitigation']]
).duplicated()
subset_crosswalk_data_expanded = subset_crosswalk_data_expanded[~is_repeat]

# One DataFrame per individual risk. A single groupby pass replaces one boolean
# filter per risk; sort=False keeps the risks in order of first appearance.
split_dfs = {
    mitigation: risk_rows
    for mitigation, risk_rows in subset_crosswalk_data_expanded.groupby(
        'Climate_Risk_Mitigation', sort=False
    )
}

# Show a compact row count per risk rather than dumping every DataFrame
pd.Series({mitigation: len(risk_rows) for mitigation, risk_rows in split_dfs.items()}, name='rows')


In [None]:
# Inspect the rows assigned to the wildfire risk as an example of one split
split_dfs['wildfire mitigation']


In [None]:
# Columns that together identify one kind of project within a risk
group_keys = ['SECTOR', 'CATEGORY', 'ACTION', 'Project Type']


def _top_two_descriptions(values):
    """Join the two most frequent non-null descriptions in a group with '; '."""
    return '; '.join(values.dropna().value_counts().index[:2])


def _percent_low_income(flags):
    """Sum of the flag column over the number of rows in the group, times 100.

    Every row counts in the denominator, including rows whose flag is NaN.
    """
    return (flags.sum() / len(flags)) * 100


def _unique_adaptations(values):
    """Join every distinct non-null adaptation entry in a group with '; '."""
    return '; '.join(values.dropna().unique())


# Per-risk results: the grouped summary table and the total project count
grouped_dfs = {}
total_project_counts = {}

for mitigation_type, df in split_dfs.items():
    grouped_df = (
        # dropna=False keeps projects with a missing grouping value; by default
        # groupby silently drops those rows, shrinking every sum and total below.
        df.groupby(group_keys, as_index=False, dropna=False)
        .agg({
            'Project Count': 'sum',
            'Total Project Cost': 'sum',
            'Other Project Benefits Description': _top_two_descriptions,
            'Disadvantaged Community Benefits Description': _top_two_descriptions,
            'Is Low Income Communities': _percent_low_income,
            'Climate  Adaptation': _unique_adaptations,
        })
        # Most expensive project groups first
        .sort_values(by='Total Project Cost', ascending=False)
        .rename(columns={'Is Low Income Communities': 'Percentage of Projects Low Income Communities'})
    )

    # Total project count, with a grouped count of zero treated as one project.
    # NOTE(review): the substitution is applied to the grouped sums, not to the
    # individual rows — confirm that is the intended level.
    project_counts = grouped_df['Project Count']
    total_project_count = project_counts.mask(project_counts.eq(0), 1).sum()

    grouped_dfs[mitigation_type] = grouped_df
    total_project_counts[mitigation_type] = total_project_count

# Output the total project counts for each mitigation type
for mitigation_type, count in total_project_counts.items():
    print(f"Number of total projects for {mitigation_type}: {count}")

# grouped_dfs now contains the processed DataFrames for each mitigation type

In [65]:
# Bind each risk's grouped summary table to its own name for later analysis
crosswalk_slr = grouped_dfs['sea level rise mitigation']
crosswalk_inland_flooding = grouped_dfs['inland flooding mitigation']
# The original, misspelled name is kept as an alias so existing references still work
crosswalk_inland_flodding = crosswalk_inland_flooding
crosswalk_drought = grouped_dfs['drought mitigation']
crosswalk_wildfire = grouped_dfs['wildfire mitigation']
crosswalk_extreme_heat = grouped_dfs['extreme heat mitigation']

In [66]:
# Write the wildfire summary to a CSV in the working directory, without the index column
crosswalk_wildfire.to_csv('crosswalk_wildfire.csv', index=False)

In [None]:
# Display the wildfire summary table that was just exported
crosswalk_wildfire