In [1]:
import os
import boto3
import pandas as pd

In [2]:
# Where the crosswalk workbook lives in S3
bucket_name = 'ca-climate-index'
directory = '0_map_data/crosswalk_data/CCI_Projects_Project_Category_Update_02142024.xlsm'

# The workbook is saved into the working directory under its own file name
local_filename = os.path.basename(directory)

# S3 client used for the download
s3_client = boto3.client('s3')

print('Pulling file')
s3_client.download_file(bucket_name, directory, local_filename)
print('File pulled')

Pulling file


In [None]:
# Load the downloaded workbook (first sheet, pandas defaults) into a DataFrame
crosswalk_data = pd.read_excel(
    io='CCI_Projects_Project_Category_Update_02142024.xlsm',
)

## How many rows within the original dataset?

In [None]:
# Render every column when a frame is displayed
pd.set_option('display.max_columns', None)

row_count = crosswalk_data.shape[0]
print('Number of rows within dataset:', row_count)
display(crosswalk_data)

## Display all columns

In [None]:
# Materialise the column names once, then report how many there are and what they are
column_names = crosswalk_data.columns.tolist()
print('Number of columns:', len(column_names))
display(column_names)

## Selecting columns relevant to initial analysis

In [None]:
# Columns kept for the initial analysis, grouped by what they describe
relevant_columns = [
    # program-level fields
    'Program Name', 'Program Description', 'Sub Program Name',
    # project-level fields
    'Project Type', 'Project Description',
    # crosswalk classification fields
    'SECTOR', 'CATEGORY', 'ACTION',
    # location and totals
    'Census Tract', 'Total Project GHGReductions', 'Project Count',
]

In [None]:
# Keep every row, but only the columns named in relevant_columns
data_of_interest = crosswalk_data.loc[:, relevant_columns]

In [None]:
# Optional display overrides, left disabled; uncomment to change how frames are rendered
# pd.set_option('display.max_columns', None)  # To display all columns
# pd.set_option('display.max_rows', None)     # To display all rows

# Render the column subset selected above
display(data_of_interest)

## Create a climate risk dictionary to scan through data based on dictionary values

In [None]:
# Maps each climate-risk mitigation category to the words/phrases that signal it.
# Terms are matched case-insensitively as substrings of the text being scanned.
climate_risk_dict = {
    # Every term needs its own trailing comma: adjacent string literals are silently
    # concatenated by Python (e.g. 'prescribed burning' 'firefighting' becomes one term).
    'wildfire mitigation': ['wildfire', 'prescribed fire', 'fire prevention', 'burn', 'controlled burn', 'controlled_burning', 
                            'prescribed burn', 'prescribed burning', 'firefighting', 'reforest', 'reforestation', 'vegetation management', 
                            'roadside brushing', 'fuel break', 'fuel reduction', 'ignition', 'crown', 'fuel load', 'Fire and Forest Management'],
    
    'sea level rise mitigation': ['sea level rise', 'slr', 'seawall', 'seawalls', 'shoreline', 'wetland', 'mangrove', 'coastal'],
    
    'extreme heat mitigation': ['extreme heat', 'shade', 'shading', 'cooling center', 'cooling centers', 'heat-resistant', 
                                'heat resistant', 'heat reducing', 'heat-reducing'],
    
    'drought mitigation': ['drought', 'irrigation', 'soil moisture', 'rainwater harvest', 'rainwater harvesting', 'water storage', 
                           'water allocation', 'water management', 'soil health', 'soil management', 'organic matter', 'water efficiency'],
    
    'inland flooding mitigation': ['flooding', 'runoff', 'inland flood', 'inland flooding', 'floodplain', 'flood proof', 'floodproofing', 
                                   'elevated flood', 'flood barrier', 'flood barriers', 'drainage', 'riparian', 'stormwater'],
    
    'greenhouse gas mitigation': ['ghg', 'GHG', 'greenhouse gas', 'emission', 'emissions', 'carbon sequestration', 'electrification', 
                                  'carbon capture', 'solar power', 'renewable energy', 'wind energy', 'hydroelectricity', 'geothermal energy', 
                                  'biomass energy', 'Energy-efficiency']
}

## How many total and unique entries for each column? Will help decide which column to start with

In [None]:
def count_entries(dataframe):
    """Summarise how populated and how varied each column of ``dataframe`` is.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Frame to summarise.

    Returns
    -------
    tuple of (pandas.Series, pandas.Series)
        ``dataframe.count()`` (non-null entries per column) followed by
        ``dataframe.nunique()`` (distinct values per column).
    """
    return dataframe.count(), dataframe.nunique()

# Per-column totals and distinct counts for the selected columns
total_entries, unique_entries = count_entries(data_of_interest)
print("Total entries per column:", total_entries, sep="\n")
print("\nUnique entries per column:", unique_entries, sep="\n")


## Deciding to loop through 'Project Description' first, as it has a large amount of variation that captures many datasets and makes the most practical sense for filtering climate keywords

In [None]:
def add_climate_risk_column(df, keyword_dict, output_csv=None):
    """Tag each row of ``df`` with the climate risks whose terms occur in its 'Project Description'.

    ``df`` is modified in place; three columns are (re)created:

    * ``Climate_Risk_Mitigation`` - comma-separated risk categories found, listed in the
      order the categories appear in ``keyword_dict`` (so the label is stable between runs).
    * ``Detected_Climate_Risk_Mitigation_Keyword`` - comma-separated terms that triggered
      the match; terms differing only by case (e.g. 'ghg'/'GHG') are listed once.
    * ``Repeat_Project_Description_Count`` - number of rows sharing the row's description.

    Rows with a missing description get empty strings and a repeat count of 0.
    Matching is a case-insensitive substring test.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a 'Project Description' column.
    keyword_dict : dict[str, list[str]]
        Risk category -> list of words/phrases that indicate it.
    output_csv : str, optional
        When given, the tagged frame is written to this local CSV path and uploaded to the
        'ca-climate-index' bucket under '0_map_data/crosswalk_data/'.

    Returns
    -------
    None
        Summary counts are printed rather than returned.
    """
    descriptions = df['Project Description']

    # A missing value counts as one "unique" entry here, as Series.unique() reports it
    total_unique_descriptions = len(descriptions.unique())

    # Rows per distinct description; value_counts() leaves out missing values
    description_counts = descriptions.value_counts().to_dict()

    # Lower-case every search term once rather than once per row
    search_terms = {
        keyword: [(value, value.lower()) for value in values]
        for keyword, values in keyword_dict.items()
    }

    # Row-level hit count per category, and the distinct descriptions behind those hits
    keyword_counter = {keyword: 0 for keyword in keyword_dict}
    unique_keyword_counts = {keyword: set() for keyword in keyword_dict}

    # Scan each distinct description once; the result is broadcast to its rows afterwards
    risks_by_description = {}
    detected_by_description = {}
    for description, repeat_count in description_counts.items():
        # str() keeps non-text cells (e.g. numbers) from raising on .lower()
        text = str(description).lower()
        keywords_found = []
        detected_values = []
        seen_terms = set()
        for keyword, terms in search_terms.items():
            detected = [value for value, lowered in terms if lowered in text]
            if not detected:
                continue
            keywords_found.append(keyword)
            # Every row sharing this description counts as a hit for the category
            keyword_counter[keyword] += repeat_count
            unique_keyword_counts[keyword].add(description)
            for value in detected:
                lowered_value = value.lower()
                if lowered_value not in seen_terms:
                    seen_terms.add(lowered_value)
                    detected_values.append(value)
        risks_by_description[description] = ', '.join(keywords_found)
        detected_by_description[description] = ', '.join(detected_values)

    # Broadcast the per-description results back onto every row
    df['Climate_Risk_Mitigation'] = descriptions.map(
        lambda description: risks_by_description.get(description, ''))
    df['Detected_Climate_Risk_Mitigation_Keyword'] = descriptions.map(
        lambda description: detected_by_description.get(description, ''))
    df['Repeat_Project_Description_Count'] = descriptions.map(
        lambda description: description_counts.get(description, 0)).astype(int)

    # Print keyword counts
    print("Keyword Counts:")
    for keyword, count in keyword_counter.items():
        print(f"{keyword}: {count}")
    print('')
    # Print total unique descriptions count
    print(f"Total Unique Project Descriptions: {total_unique_descriptions}")
    print('')

    # Print unique counts from each keyword
    print("Unique Counts from Each Keyword:")
    for keyword, unique_count in unique_keyword_counts.items():
        print(f"{keyword}: {len(unique_count)}")
        print('')

    # Save DataFrame as CSV if output_csv is provided
    if output_csv:
        df.to_csv(output_csv, index=False)
        print(f"DataFrame saved as {output_csv}")
        print('')

        s3_client = boto3.client('s3')
        bucket_name = 'ca-climate-index'
        directory = f'0_map_data/crosswalk_data/{output_csv}'

        print(f'Uploading {output_csv} to AWS')
        with open(output_csv, 'rb') as file:
            s3_client.upload_fileobj(file, bucket_name, directory)
        # Reported only after the file handle is closed and the upload call has returned
        print(f'Upload complete! File is in {directory}')

## Testing function on whole dataset, the function will:
- loop through each 'Project Description' and look for words/phrases in our climate risk dictionary and append to keyword counter
- total keywords are counted
- the number of unique 'Project Description' entries is counted, as are the climate risk keys associated with those unique entries
- makes new columns, including 'Repeat_Project_Description_Count' and 'Detected_Climate_Risk_Mitigation_Keyword', to add more context and improve dictionary keywords

## The cell below runs the function but also adds a few things:
- makes a data preview, just selecting relevant columns that were made and help interpret Project Description screening results
- orders the data in descending order of the Repeat Project Description Count to show Project Descriptions with multiple entries first (to make sure the dictionary is properly assigning large entries the correct climate risk)

In [None]:
# Tag crosswalk_data in place; pass a CSV name as a third argument to also save and upload the result
add_climate_risk_column(crosswalk_data, climate_risk_dict) #, 'climate_risk_attributed_crosswalk_data.csv')

# Show full cell text so long descriptions are not truncated
pd.set_option('display.max_colwidth', None)

preview_columns = [
    'Project Description',
    'Repeat_Project_Description_Count',
    'Detected_Climate_Risk_Mitigation_Keyword',
    'Climate_Risk_Mitigation',
]
data_preview = crosswalk_data[preview_columns]

# Keep only rows where at least one climate risk was detected
has_risk = data_preview['Climate_Risk_Mitigation'] != ''
data_preview_filtered = data_preview[has_risk]

# Most frequently repeated descriptions first
data_preview_filtered_sorted = data_preview_filtered.sort_values(
    by='Repeat_Project_Description_Count', ascending=False
)

# One row per (repeat count, description) combination
data_preview_filtered_unique = data_preview_filtered_sorted.drop_duplicates(
    subset=['Repeat_Project_Description_Count', 'Project Description']
)

#display(data_preview_filtered_unique)
display(data_preview_filtered_unique[:10])

In [None]:
# Draw a spot-check sample of up to 100 rows.
# random_state pins the draw so a re-run yields the same rows; min() keeps the cell from
# raising when the frame holds fewer than 100 rows.
crosswalk_data_sample = crosswalk_data.sample(n=min(100, len(crosswalk_data)), random_state=42)
# crosswalk_data_sample.head(5)

---
Adding in tests to understand where more than one risk is assigned
- If 2 are provided, but one is GHG --> assign the category to the associated climate risk (i.e., "greenhouse gas mitigation, sea level rise mitigation" should end up as "sea level rise mitigation")
   - 654 instances
- If 2+ climate risks are assigned, need manual intervention to identify climate risk to be final assigned
   - Strip out all instances of "greenhouse gas mitigation" to reduce # of manual intervention
   - Identify the "main" or "priority" risk denoted in the project description

In [None]:
# Rows labelled with exactly two risks (one comma) where one of them is greenhouse gas mitigation
risk_labels = crosswalk_data['Climate_Risk_Mitigation']
has_two_risks = risk_labels.str.count(',') == 1
includes_ghg = risk_labels.str.contains('greenhouse gas mitigation')
multi_risk = crosswalk_data.loc[has_two_risks & includes_ghg]

# Narrow to the columns needed to judge the assignment
preview_columns = [
    'Project Description',
    'Repeat_Project_Description_Count',
    'Detected_Climate_Risk_Mitigation_Keyword',
    'Climate_Risk_Mitigation',
]
data_preview = multi_risk[preview_columns]

print(len(data_preview))
display(data_preview)

## Eliminating 'greenhouse gas mitigation' entries when other climate risks present

In [None]:
# Create a copy of the DataFrame to avoid modifying the original data
crosswalk_data_copy = crosswalk_data.copy()

# Filter rows containing 'greenhouse gas mitigation'
multi_risk = crosswalk_data_copy.loc[(crosswalk_data_copy['Climate_Risk_Mitigation'].str.count(',') == 1) & 
                                (crosswalk_data_copy['Climate_Risk_Mitigation'].str.contains('greenhouse gas mitigation'))]

# Replace 'greenhouse gas mitigation' with an empty string
multi_risk['Climate_Risk_Mitigation'] = multi_risk['Climate_Risk_Mitigation'].str.replace('greenhouse gas mitigation', '')

# Remove any remaining commas
multi_risk['Climate_Risk_Mitigation'] = multi_risk['Climate_Risk_Mitigation'].str.replace(',', '')

# Clean-up view for easier access
data_preview = multi_risk[['Project Description', 'Repeat_Project_Description_Count', 'Detected_Climate_Risk_Mitigation_Keyword', 'Climate_Risk_Mitigation']]

# Use the index of multi_risk to update the corresponding rows in crosswalk_data
crosswalk_data_copy.update(multi_risk)

# Display the updated DataFrame
#print(crosswalk_data)

print(len(data_preview))
display(data_preview)

In [None]:
# Create a copy of the DataFrame to avoid modifying the original data
test = crosswalk_data_copy.copy()

# Filter rows with three or more commas in 'Climate_Risk_Mitigation' column
multi_risk = test.loc[(test['Climate_Risk_Mitigation'].str.count(',') >= 1)]

# Iterate over the climate risk dictionary to filter rows and update the DataFrame
for mitigation_type, keywords in climate_risk_dict.items():
    # Create a boolean mask to filter rows containing any of the keywords
    mask = test['CATEGORY'].str.contains('|'.join(keywords), case=False) | \
           test['SECTOR'].str.contains('|'.join(keywords), case=False)
    
    # Filter rows based on the mask
    filtered_rows = multi_risk[mask]
    
    # Update the 'Climate_Risk_Mitigation' column for the filtered rows
    filtered_rows['Climate_Risk_Mitigation'] = mitigation_type
    
    # Use the index of filtered rows to update the corresponding rows in crosswalk_data_copy
    test.update(filtered_rows)



# Display the updated DataFrame
display(test)


In [None]:
cleaned_crosswalk_data = crosswalk_data_copy.copy()

# Row labels (index values of crosswalk_data_copy) whose final category was chosen by hand
sea_level_rise_rows = [15693, 62317, 74956, 89774]
inland_flooding_rows = [60114, 60160, 74973, 75775, 89750, 89847, 89918, 91016, 128903, 60253, 60265, 60292, 89679]
drought_rows = [41034, 75775, 89918, 113037, 119459, 75188, 75107, 75814, 89828, 89846, 109976, 110600]
wildfire_rows = [89455, 90049, 110288, 110291, 110294, 110297, 110298, 110303, 110305, 110333, 110337, 110339, 110347, 110361, 110368, 110372, 110447, 110466, 111867, 111874, 116515, 116520, 116541, 116543, 116548, 116582, 116583, 119503, 119516, 119518, 119554, 124581, 124582, 127999, 128030, 128063, 128083, 128144, 128262, 128280, 110373, 119509, 62321, 75163, 75165, 75166, 75167, 75168, 75169, 75170, 75171, 75172, 75173, 75174]
extreme_heat_rows = [75775]
greenhouse_gas_rows = [60117, 89961, 113029, 113036, 110373, 113030, 113026]
unclassified = [60109, 112994, 116130, 116131, 116132, 116133, 116685, 116843, 119568]

# Create a dictionary mapping mitigation types to their corresponding rows
mitigation_mapping = {
    'sea level rise mitigation': sea_level_rise_rows,
    'inland flooding mitigation': inland_flooding_rows,
    'drought mitigation': drought_rows,
    'wildfire mitigation': wildfire_rows,
    'extreme heat mitigation': extreme_heat_rows,
    'greenhouse gas mitigation': greenhouse_gas_rows,
    'unclassified': unclassified
}

# Some labels appear in more than one list; the category applied last (dictionary order)
# is the one that sticks. Report those rows so the choice is visible rather than silent.
assigned_type = {}
conflicting_rows = {}
for mitigation_type, rows_to_update in mitigation_mapping.items():
    for row_index in rows_to_update:
        previous_type = assigned_type.get(row_index)
        if previous_type is not None and previous_type != mitigation_type:
            conflicting_rows.setdefault(row_index, [previous_type]).append(mitigation_type)
        assigned_type[row_index] = mitigation_type
for row_index, candidate_types in conflicting_rows.items():
    print(f"Row {row_index} is listed under {candidate_types}; keeping '{candidate_types[-1]}'")

# Apply the overrides one category at a time.
# Assigning to a label that is not in the index would append a new, mostly empty row,
# so labels missing from the frame are reported and skipped instead.
known_labels = cleaned_crosswalk_data.index
for mitigation_type, rows_to_update in mitigation_mapping.items():
    present_rows = [row_index for row_index in rows_to_update if row_index in known_labels]
    missing_rows = [row_index for row_index in rows_to_update if row_index not in known_labels]
    if missing_rows:
        print(f"Skipping '{mitigation_type}' overrides for row labels not in the data: {missing_rows}")
    cleaned_crosswalk_data.loc[present_rows, 'Climate_Risk_Mitigation'] = mitigation_type

# Rows that still carry more than one risk after the manual overrides
multi_risk = cleaned_crosswalk_data.loc[(cleaned_crosswalk_data['Climate_Risk_Mitigation'].str.count(',') >= 1)]
print(len(multi_risk))
multi_risk

In [None]:
# identify how many have 3+ risks assigned
multi_risk = test.loc[(test['Climate_Risk_Mitigation'].str.count(',') == 1)]
data_preview = multi_risk[['Project Description', 'CATEGORY', 'SECTOR', 'Repeat_Project_Description_Count', 'Detected_Climate_Risk_Mitigation_Keyword', 'Climate_Risk_Mitigation']]
print(len(data_preview))
pd.set_option('display.max_rows', None)  
#data_preview

In [None]:
# identify how many have 3+ risks assigned
multi_risk = crosswalk_data_copy.loc[(crosswalk_data_copy['Climate_Risk_Mitigation'].str.count(',') >= 3)]
data_preview = multi_risk[['Project Description', 'CATEGORY', 'SECTOR', 'Repeat_Project_Description_Count', 'Detected_Climate_Risk_Mitigation_Keyword', 'Climate_Risk_Mitigation']]
print(len(data_preview))
data_preview

## 60 instances where there are 3 risks assigned -- most have ghg included and can be reduced to 2 to isolate between, but not all
## 3 instances where there are 4 risks assigned

In [None]:
# Pull one row by position (not by index label) and show its SECTOR value
row_position = 60230
specific_row = test.iloc[row_position]
display(specific_row.loc[['SECTOR']])