In [1]:
import os
import boto3
import pandas as pd

In [2]:
# Initialize the S3 client
s3_client = boto3.client('s3')

# Bucket name and file paths
bucket_name = 'ca-climate-index'
directory = '0_map_data/crosswalk_data/CCI_Projects_Project_Category_Update_02142024.xlsm'

print('Pulling file')
s3_client.download_file(bucket_name, directory, 'CCI_Projects_Project_Category_Update_02142024.xlsm')
print('File pulled')

Pulling file
File pulled


In [3]:
crosswalk_data = pd.read_excel('CCI_Projects_Project_Category_Update_02142024.xlsm')

## How many rows within the original dataset?

In [4]:
pd.set_option('display.max_columns', None)
print('Number of rows within dataset:', len(crosswalk_data))
#display(crosswalk_data)

Number of rows within dataset: 133698


## Display all columns

In [5]:
print('Number of columns:', len(crosswalk_data.columns.tolist()))
display(crosswalk_data.columns.tolist())

Number of columns: 130


['Project IDNumber',
 'Reporting Cycle Name',
 'Agency Name',
 'Program Name',
 'Program Description',
 'Sub Program Name',
 'Record Type',
 'Project Name',
 'Project Type',
 'Project Description',
 'SECTOR',
 'CATEGORY',
 'ACTION',
 'Census Tract',
 'Address',
 'Lat Long',
 'Senate\nDistrict',
 'Assembly\nDistrict',
 'County',
 'Total Project Cost',
 'Total Program GGRFFunding',
 'Project Life Years',
 'Total Project GHGReductions',
 'Annual Project GHGReductions',
 'Project Count',
 'Fiscal Year Funding Project',
 'Is Benefit Disadvantaged Communities',
 'Disadvantaged Community Criteria',
 'Disadvantaged Community Need',
 'Disadvantaged Community Census Tracts',
 'Total GGRFDisadvantaged Community Funding',
 'Disadvantaged Community Benefits Description',
 'Funding Benefiting Disadvantaged Communities',
 'Estimated Num Vehicles In Service',
 'Funding Within Disadvantage Communities',
 'Other Project Benefits Description',
 'VMTReductions',
 'Number Of Housing Units',
 'Number Of Aff

## Selecting columns relevant to initial analysis

In [6]:
relevant_columns = [
    'Program Name',
    'Program Description',
    'Sub Program Name',
    'Project Type',
    'Project Description',
    'SECTOR',
    'CATEGORY',
    'ACTION',
    'Census Tract',
    'Total Project GHGReductions',
    'Project Count'
]

In [7]:
data_of_interest = crosswalk_data[relevant_columns]

In [8]:
# Set display options to show all columns and rows
# pd.set_option('display.max_columns', None)  # To display all columns
# pd.set_option('display.max_rows', None)     # To display all rows

# Now display data_of_interest
display(data_of_interest)

Unnamed: 0,Program Name,Program Description,Sub Program Name,Project Type,Project Description,SECTOR,CATEGORY,ACTION,Census Tract,Total Project GHGReductions,Project Count
0,Low Carbon Transportation,Provides mobile source incentives to reduce GH...,Clean Vehicle Rebate Project,,CVRP promotes clean vehicle adoption in Califo...,"Zero-Emission Vehicles, Equipment, and Infrast...",Light-Duty Vehicles,Consumer Incentives,6.001400e+09,50.0,7.0
1,Low Carbon Transportation,Provides mobile source incentives to reduce GH...,Clean Vehicle Rebate Project,,CVRP promotes clean vehicle adoption in Califo...,"Zero-Emission Vehicles, Equipment, and Infrast...",Light-Duty Vehicles,Consumer Incentives,6.001400e+09,41.0,7.0
2,Low Carbon Transportation,Provides mobile source incentives to reduce GH...,Clean Vehicle Rebate Project,,CVRP promotes clean vehicle adoption in Califo...,"Zero-Emission Vehicles, Equipment, and Infrast...",Light-Duty Vehicles,Consumer Incentives,6.001400e+09,97.0,14.0
3,Low Carbon Transportation,Provides mobile source incentives to reduce GH...,Clean Vehicle Rebate Project,,CVRP promotes clean vehicle adoption in Califo...,"Zero-Emission Vehicles, Equipment, and Infrast...",Light-Duty Vehicles,Consumer Incentives,6.001400e+09,68.0,10.0
4,Low Carbon Transportation,Provides mobile source incentives to reduce GH...,Clean Vehicle Rebate Project,,CVRP promotes clean vehicle adoption in Califo...,"Zero-Emission Vehicles, Equipment, and Infrast...",Light-Duty Vehicles,Consumer Incentives,6.001400e+09,29.0,4.0
...,...,...,...,...,...,...,...,...,...,...,...
133693,Climate Resilience Planning,Works with local governments on adaptation pla...,Climate Change Adaptation and Coastal Resilien...,State Operations,BCDC's Regulatory staff conducts comprehensive...,Climate Adaptation and Resilience,Wetland Restoration and Management,Permitting,,0.0,1.0
133694,Climate Resilience Planning,Works with local governments on adaptation pla...,Climate Change Adaptation and Coastal Resilien...,State Operations,BCDC improves expertise on climate change issu...,Climate Adaptation and Resilience,Sea Level Rise,Capacity-Building,,0.0,1.0
133695,Climate Resilience Planning,Works with local governments on adaptation pla...,Climate Change Adaptation and Coastal Resilien...,State Operations,BCDC's Regulatory staff conducts comprehensive...,Climate Adaptation and Resilience,Wetland Restoration and Management,Permitting,,0.0,1.0
133696,,,,,,,,,,,


## Create a climate risk dictionary to scan through data based on dictionary values

In [9]:
climate_risk_dict = {
    'wildfire mitigation': ['wildfire', 'prescribed fire', 'fire prevention', 'burn', 'controlled burn', 'controlled_burning', 
                            'prescribed burn', 'prescribed burning' 'firefighting', 'reforest', 'reforestation', 'vegetation management', 
                            'roadside brushing', 'fuel break', 'fuel reduction', 'ignition', 'crown', 'fuel load', 'Fire and Forest Management'],
    
    'sea level rise mitigation': ['sea level rise', 'slr', 'seawall', 'seawalls', 'shoreline', 'wetland', 'mangrove', 'coastal'],
    
    'extreme heat mitigation': ['extreme heat', 'shade', 'shading', 'cooling center', 'cooling centers', 'heat-resistant', 
                                'heat resistant', 'heat reducing', 'heat-reducing'],
    
    'drought mitigation': ['drought', 'irrigation', 'soil moisture', 'rainwater harvest', 'rainwater harvesting', 'water storage', 
                           'water allocation', 'water management', 'soil health', 'soil management', 'organic matter', 'water efficiency'],
    
    'inland flooding mitigation': ['flooding', 'runoff', 'inland flood', 'inland flooding', 'floodplain', 'flood proof', 'floodproofing', 
                                   'elevated flood', 'flood barrier', 'flood barriers', 'drainage', 'riparian', 'stormwater'],
    
    'greenhouse gas mitigation': ['ghg', 'GHG', 'greenhouse gas', 'emission', 'emissions', 'carbon sequestration', 'electrification', 
                                  'carbon capture', 'solar power', 'renewable energy', 'wind energy', 'hydroelectricity', 'geothermal energy', 
                                  'biomass energy', 'Energy-efficiency']
}

## How many total and unique entries for each column? Will help decide which column to start with

In [10]:
def count_entries(dataframe):
    total_entries = dataframe.count()
    unique_entries = dataframe.nunique()
    return total_entries, unique_entries

total_entries, unique_entries = count_entries(data_of_interest)
print("Total entries per column:")
print(total_entries)
print("\nUnique entries per column:")
print(unique_entries)


Total entries per column:
Program Name                   133696
Program Description            133694
Sub Program Name               133696
Project Type                    19903
Project Description            133696
SECTOR                         133697
CATEGORY                       133697
ACTION                         133697
Census Tract                   114823
Total Project GHGReductions    133696
Project Count                  133696
dtype: int64

Unique entries per column:
Program Name                     39
Program Description              19
Sub Program Name                 76
Project Type                    963
Project Description            5235
SECTOR                           25
CATEGORY                         77
ACTION                           95
Census Tract                   7978
Total Project GHGReductions    2604
Project Count                   243
dtype: int64


## Deciding to loop through 'Project Description' first as it has a large number of variation to capture many datasets, and makes most practical scense for filtering climate keywords

In [11]:
def add_climate_risk_column(df, keyword_dict, output_csv=None):
    # Initialize new columns to store climate risk mitigation keywords, detected values, repeat counts, and total unique descriptions
    df['Climate_Risk_Mitigation'] = ''
    df['Detected_Climate_Risk_Mitigation_Keyword'] = ''
    df['Repeat_Project_Description_Count'] = 0

    # Initialize a counter for each keyword
    keyword_counter = {keyword: 0 for keyword in keyword_dict}

    # Create a dictionary to store the repeat count for each unique project description
    description_counts = {}

    # Create a dictionary to store the unique count for each keyword
    unique_keyword_counts = {keyword: set() for keyword in keyword_dict}

    # Iterate through each unique description
    unique_descriptions = df['Project Description'].unique()
    total_unique_descriptions = len(unique_descriptions)

    for description in unique_descriptions:
        # Find all rows with this description
        description_rows = df[df['Project Description'] == description]
        repeat_count = len(description_rows)
        # Update the repeat count for this description
        description_counts[description] = repeat_count

        # Iterate through each row with this description
        for index, row in description_rows.iterrows():
            keywords_found = set()  # To store unique keywords found in each row
            detected_values = []    # To store the detected values for each row
            # Iterate through each keyword in the dictionary
            for keyword, values in keyword_dict.items():
                # Check if any value of the keyword is present in the description (case-insensitive)
                detected = [val for val in values if val.lower() in description.lower()]
                if detected:
                    keywords_found.add(keyword)
                    keyword_counter[keyword] += 1
                    detected_values.extend(detected)
                    # Add the description to unique count for this keyword
                    unique_keyword_counts[keyword].add(description)

            # Update the 'Climate_Risk_Mitigation' column with unique keywords found
            df.at[index, 'Climate_Risk_Mitigation'] = ', '.join(keywords_found)
            # Update the 'Detected_Values' column with detected values
            df.at[index, 'Detected_Climate_Risk_Mitigation_Keyword'] = ', '.join(detected_values)
            # Update the 'Repeat_Project_Description_Count' column with the repeat count for this description
            df.at[index, 'Repeat_Project_Description_Count'] = repeat_count

    # Print keyword counts
    print("Keyword Counts:")
    for keyword, count in keyword_counter.items():
        print(f"{keyword}: {count}")
    print('')
    # Print total unique descriptions count
    print(f"Total Unique Project Descriptions: {total_unique_descriptions}")
    print('')

    # Print unique counts from each keyword
    print("Unique Counts from Each Keyword:")
    for keyword, unique_count in unique_keyword_counts.items():
        print(f"{keyword}: {len(unique_count)}")
        print('')
    # Save DataFrame as CSV if output_csv is provided
    if output_csv:
        df.to_csv(output_csv, index=False)
        print(f"DataFrame saved as {output_csv}")
        print('')
        # Initialize the S3 client
        s3_client = boto3.client('s3')

        # Bucket name and file paths
        bucket_name = 'ca-climate-index'
        directory = f'0_map_data/crosswalk_data/{output_csv}'
        # Upload the CSV file to S3
        print(f'Uploading {output_csv} to AWS')
        with open(output_csv, 'rb') as file:
            s3_client.upload_fileobj(file, bucket_name, directory)
            print(f'Upload complete! File is in {directory}')

## Testing function on whole dataset, the function will:
- loop through each 'Project Description' and look for words/phrases in our climate risk dictionary and append to keyword counter
- total keywords are counted
- number of unique 'Project Description' entries are counted, so are climate risk keys associate with those unique entries
- makes two new columns: 'Repeat Project Description Count' and 'Detected Climate Risk Mitigation Keyword' to add more context and improve dictionary keywords

## The cell below runs the function but also adds a few things:
- makes a data preview, just selecting relevant columns that were made and help interpret Project Description screening results
- orders the data in decending order from the Repeat Project Description Count to show Project Descriptions with multiple entries first (make sure dictionary is properly assigning large entries with correct climate risk)

In [12]:
add_climate_risk_column(crosswalk_data, climate_risk_dict) #, 'climate_risk_attributed_crosswalk_data.csv')
pd.set_option('display.max_colwidth', None)
data_preview = crosswalk_data[['Project Description', 'Repeat_Project_Description_Count', 'Detected_Climate_Risk_Mitigation_Keyword', 'Climate_Risk_Mitigation']]

# Filter the DataFrame to show only rows with entries in the 'Climate_Risk_Mitigation' column
data_preview_filtered = data_preview[data_preview['Climate_Risk_Mitigation'] != '']

# Sort the DataFrame based on 'Repeat_Project_Description_Count' in descending order
data_preview_filtered_sorted = data_preview_filtered.sort_values(by='Repeat_Project_Description_Count', ascending=False)

# Drop duplicates based on both 'Repeat_Project_Description_Count' and 'Project Description' to keep only one row per unique combination
data_preview_filtered_unique = data_preview_filtered_sorted.drop_duplicates(subset=['Repeat_Project_Description_Count', 'Project Description'])

#display(data_preview_filtered_unique)
display(data_preview_filtered_unique[:10])

Keyword Counts:
wildfire mitigation: 1338
sea level rise mitigation: 93
extreme heat mitigation: 186
drought mitigation: 2584
inland flooding mitigation: 211
greenhouse gas mitigation: 106865

Total Unique Project Descriptions: 5236

Unique Counts from Each Keyword:
wildfire mitigation: 755

sea level rise mitigation: 61

extreme heat mitigation: 130

drought mitigation: 923

inland flooding mitigation: 114

greenhouse gas mitigation: 858



Unnamed: 0,Project Description,Repeat_Project_Description_Count,Detected_Climate_Risk_Mitigation_Keyword,Climate_Risk_Mitigation
0,"CVRP promotes clean vehicle adoption in California by offering rebates from $1,000 to $7,502 for the purchase or lease of new, eligible zero-emission vehicles, including electric, plug-in hybrid electric and fuel cell vehicles.",82945,emission,greenhouse gas mitigation
101616,"Reduces agricultural sector emissions by providing incentives for agricultural harvesting equipment, heavy-duty trucks, agricultural pump engines, tractors, and other equipment used in agricultural operations",7787,"emission, emissions",greenhouse gas mitigation
114750,"Enhanced Fleet Modernization Program Plus-up Pilot Project (EFMP Plus-up) operates in conjunction with EFMP, the voluntary car scrap and replacement program implemented by ARB and local air districts in coordination with the Bureau of Automotive Repair. EFMP Plus-up provides additional incentives above the base EFMP incentive for lower-income consumers living in ZIP codes containing a disadvantaged community census tract who retire older vehicles and replace them with cleaner used or new hybrid, plug-in hybrid, or zero-emission vehicles. Funded with auction proceeds in 2014-15, 2015-16, and 2016-17. 2014‑15 and 2015-16 funding was limited to the San Joaquin Valley and South Coast air districts; 2016-17 funding is available to any air district that implements a vehicle scrap and replacement program that meets the minimum requirements established in ARB’s EFMP regulation and requests to participate.",6191,emission,greenhouse gas mitigation
62483,"Funding as per CAP Incentives Program, Carl Moyer Program and Proposition 1B Program to reduce air pollution and greenhouse gas emissions",3587,"greenhouse gas, emission, emissions",greenhouse gas mitigation
130682,"Clean Cars 4 All operates in conjunction with EFMP, the voluntary car scrap and replacement program implemented by CARB and local air districts in coordination with the Bureau of Automotive Repair. CC4A provides additional incentives above the base EFMP incentive for lower-income consumers living in ZIP codes containing a disadvantaged community census tract who retire older vehicles and replace them with cleaner used or new hybrid, plug-in hybrid, or zero-emission vehicles. Funded with auction proceeds in 2014-15, 2015-16, and 2016-17. 2014‑15 and 2015-16 funding was limited to the San Joaquin Valley and South Coast air districts; 2021-2022 funding was available to any air district that implements a vehicle scrap and replacement program that meets the minimum requirements established in CARB's CC4A regulation and requests to participate.",1816,emission,greenhouse gas mitigation
43245,EFMP Plus-up provides additional incentives above the base EFMP incentive for lower-income consumers living in ZIP codes containing a disadvantaged community census tract who replace older vehicles with cleaner used or new low/zero emission vehicles.,1308,emission,greenhouse gas mitigation
118081,"Provides combination of a vehicle price buy-down with a low-interest loan to lower-income consumers to purchase a new or used hybrid, plug-in hybrid, or zero-emission vehicle. Plug-in hybrid and zero-emission vehicle buyers can receive an EVSE incentive. Financial and vehicle education are provided for consumer protection.",1136,emission,greenhouse gas mitigation
128811,"The project will provide water efficiency devices that also conserve energy, including front-loading clothes washers, bathroom and kitchen faucet aerators and low flow showerheads. The project will also provide education, outreach, and customer assistance to residents living in the DAC area of the Grantee's service area.",1101,water efficiency,drought mitigation
115016,"$5 million grant to offer low-income residents low cost loans (up to 8 percent APR) and vehicle cost buy-down grants (up to $5,000) for the purchase of used or new clean vehicles (hybrid, plug-in hybrid, battery and fuel-cell zero-emission).",529,emission,greenhouse gas mitigation
61274,"Provide incentive rebates for single-family homeowners in Fresno, Clovis, and Reedley (Fresno County) to replace turf grass with Central Valley-appropriate drought tolerant landscapes.",261,drought,drought mitigation


---
Adding in tests to understand where more than one risk is assigned
- If 2 are provided, but one is GHG --> assign the category to the associated climate risk (i.e., "greenhouse gas mitigation, sea level rise mitigation" should end up as "sea level rise mitigation")
   - 654 instances
- If 2+ climate risks are assigned, need manual intervention to identify climate risk to be final assigned
   - Strip out all instances of "greenhouse gas mitigation" to reduce # of manual intervention
   - Identify the "main" or "priority" risk denoted in the project description

In [13]:
# crosswalk_data

## Eliminating 'greenhouse gas mitigation' entries when other climate risks present

In [14]:
# Create a copy of the DataFrame to avoid modifying the original data
crosswalk_data_copy = crosswalk_data.copy()

# Filter rows containing 'greenhouse gas mitigation'
multi_risk = crosswalk_data_copy.loc[(crosswalk_data_copy['Climate_Risk_Mitigation'].str.count(',') == 1) & 
                                (crosswalk_data_copy['Climate_Risk_Mitigation'].str.contains('greenhouse gas mitigation'))]

# Replace 'greenhouse gas mitigation' with an empty string in the 'Climate_Risk_Mitigation' column
crosswalk_data_copy.loc[multi_risk.index, 'Climate_Risk_Mitigation'] = multi_risk['Climate_Risk_Mitigation'].str.replace('greenhouse gas mitigation', '')

# Remove any remaining commas
crosswalk_data_copy.loc[multi_risk.index, 'Climate_Risk_Mitigation'] = crosswalk_data_copy.loc[multi_risk.index, 'Climate_Risk_Mitigation'].str.replace(',', '')

# Clean-up view for easier access
data_preview = multi_risk[['Project Description', 'Repeat_Project_Description_Count', 'Detected_Climate_Risk_Mitigation_Keyword', 'Climate_Risk_Mitigation']]

# Display the updated DataFrame
#print(crosswalk_data)

print(len(data_preview))
display(data_preview)

654


Unnamed: 0,Project Description,Repeat_Project_Description_Count,Detected_Climate_Risk_Mitigation_Keyword,Climate_Risk_Mitigation
15690,"Construction of a 700 ac Whale's Mouth Wetland and restoration of 1,000 acres of Belly Wetland. Permanent palustrine emergent wetlands will sequester GHG, provide co-benefits (subsidence reversal, improved levee stability, wildlife habitat).",1,"wetland, ghg, GHG","greenhouse gas mitigation, sea level rise mitigation"
15692,Restore 80 acres of Childs Meadow using cost-effective Beaver Dam Analogues and riparian fencing. Restoration actions are designed to increase carbon sequestration and provide co-benefits.,1,"riparian, carbon sequestration","greenhouse gas mitigation, inland flooding mitigation"
15694,Restore 61 acres of tidal salt marsh and 5 acres of a perennial grassland buffer in the southern area of Elkhorn Slough. The project is designed to restore coastal wetlands to reduce GHGs and improve important estuarine habitat.,1,"wetland, coastal, ghg, GHG","greenhouse gas mitigation, sea level rise mitigation"
15801,Installation of a solar power plant to power well pumps and installation of soil moisture monitoring hardware. Irrigation scheduling management plan will be implemented using existing soil moisture data and CIMIS data from closest station.,1,"irrigation, soil moisture, solar power","greenhouse gas mitigation, drought mitigation"
15808,"Existing well will be retrofitted with solar powered pump, solar array and water storage tanks sufficient to support the wells daily output and catch storm water. Installation of soil moisture monitoring devices.",1,"soil moisture, water storage, solar power","greenhouse gas mitigation, drought mitigation"
...,...,...,...,...
133466,"Fuels Reduction Crews implement treatments on forest, woodland, shrubland, and grass landscapes to reduce fuel mass and fire-risk characteristics, and thereby, wildfire potential for California communities and ecosystems. Treatments consist primarily of fuel reduction, fuel breaks, prescribed burns, and right of way clearance. Crews use manual and mechanical means to remove and rearrange surface and ladder fuels, thin stands, conduct prescribed burns, and execute combinations thereof, to lower the rates of fire spread, duration and intensity of fire, fuel ignitability, and ignition of tree crowns, and/or to create defensible spaces that enable more effective fire suppression efforts. Crews' roles may include support to prepare sites for prescribed burns, targeted activities such as tree felling, limbing and bucking, and road maintenance such as erosion control, road assessment, and road grading, as well as responding to incidents. In addition to conducting these activities, Fuels Reduction Crews undergo training, perform project planning and administration, assist with surveys, participate in special projects, and contribute to facility and equipment maintenance and improvement. California's Greenhouse Gas Reduction Fund funds the Fuels Reduction Crews, including the establishment, enhancement, repair and provision of the facilities in which they operate and the supplying of associated operational needs.",27,"wildfire, burn, prescribed burn, fuel break, fuel reduction, ignition, crown, greenhouse gas","greenhouse gas mitigation, wildfire mitigation"
133467,"Fuels Reduction Crews and Fuels Heavy Fire Equipment Operators implement treatments on forest, woodland, shrubland, and grass landscapes to reduce fuel mass and fire-risk characteristics, and thereby, wildfire potential for California communities and ecosystems. Treatments consist primarily of fuel reduction, fuel breaks, prescribed burns, and right of way clearance. Crews use manual and mechanical means to remove and rearrange surface and ladder fuels, thin stands, conduct prescribed burns, and execute combinations thereof, to lower the rates of fire spread, duration and intensity of fire, fuel ignitability, and ignition of tree crowns, and/or to create defensible spaces that enable more effective fire suppression efforts. Crews' roles may include support to prepare sites for prescribed burns, targeted activities such as tree felling, limbing and bucking, and road maintenance such as erosion control, road assessment, and road grading, as well as responding to incidents. In addition to conducting these activities, Fuels Reduction Crews undergo training, perform project planning and administration, assist with surveys, participate in special projects, and contribute to facility and equipment maintenance and improvement. California's Greenhouse Gas Reduction Fund funds the Fuels Reduction Crews, including the establishment, enhancement, repair and provision of the facilities in which they operate and the supplying of associated operational needs.",30,"wildfire, burn, prescribed burn, fuel break, fuel reduction, ignition, crown, greenhouse gas","greenhouse gas mitigation, wildfire mitigation"
133499,Cultural burning awareness training incorporates tribal ecological knowledge and experience from cultural burners to develop State employee understanding of the connections to our environment and the culture of tribal nations. California's Greenhouse Gas Reduction Fund funding contributed to training conducted by the Cache Creek Conservancy and California Wildlife Foundation.,1,"burn, greenhouse gas","greenhouse gas mitigation, wildfire mitigation"
133549,"City of El Centro will hire two separate contractors to develop, under city supervision, an Advanced Tree Inventory and Management System (ATIM) to assist in developing an effective urban forest management plan. 500 drought tolerant trees will be planted and maintained across the county in the cities of El Centro, Calexico, Brawley, Imperial and Holtville. Education programs will be developed to ensure that citizens are involved and offer job training and opportunities. This project will support and build upon the GHG tracking and sequestration benefits from an existing grant administered by Tree San Diego, a local nonprofit group.",10,"drought, ghg, GHG","greenhouse gas mitigation, drought mitigation"


## Look through other entries with multiple detected climate risk mitigations (not selected for greenhouse gas mitigation)
* further filter by running keyword dictionary filter across the 'CATEGORY' and 'SECTOR' columns
* attribute just the newly found climate mitigation to the climate risk mitigation column

In [15]:
# Filter rows with three or more commas in 'Climate_Risk_Mitigation' column
multi_risk = crosswalk_data_copy.loc[crosswalk_data_copy['Climate_Risk_Mitigation'].str.count(',') >= 1]

# Iterate over the climate risk dictionary to filter rows and update the DataFrame
for mitigation_type, keywords in climate_risk_dict.items():
    # Create a boolean mask to filter rows containing any of the keywords
    mask = crosswalk_data_copy['CATEGORY'].str.contains('|'.join(keywords), case=False) | \
           crosswalk_data_copy['SECTOR'].str.contains('|'.join(keywords), case=False)
    
    # Filter rows based on the mask
    filtered_rows = multi_risk[mask]
    
    # Update the 'Climate_Risk_Mitigation' column for the filtered rows
    crosswalk_data_copy.loc[filtered_rows.index, 'Climate_Risk_Mitigation'] = mitigation_type

# Display the updated DataFrame
# display(crosswalk_data_copy)


  filtered_rows = multi_risk[mask]
  filtered_rows = multi_risk[mask]
  filtered_rows = multi_risk[mask]
  filtered_rows = multi_risk[mask]
  filtered_rows = multi_risk[mask]
  filtered_rows = multi_risk[mask]


In [16]:
# identify how many have 3+ risks assigned
multi_risk = crosswalk_data_copy.loc[(crosswalk_data_copy['Climate_Risk_Mitigation'].str.count(',') >= 1)]
data_preview = multi_risk[['Project Description', 'CATEGORY', 'SECTOR', 'Repeat_Project_Description_Count', 'Detected_Climate_Risk_Mitigation_Keyword', 'Climate_Risk_Mitigation']]
print(len(data_preview))
pd.set_option('display.max_rows', None)  
#data_preview

85


## Manually go through remaining sources that have multiple climate risk mitigation entries and didnt get further filtered with other column subsetting
* get their row number, read the project description, and attribute the number to one of the climate risks
* descriptions that seemingly address 2+ climate risk mitigations somewhat equally are give both risk mitigations

In [23]:
cleaned_crosswalk_data = crosswalk_data_copy.copy()

# Define the rows to update based on the specified criteria
sea_level_rise_rows = [15693, 62317, 74956, 89774]
inland_flooding_rows = [60114, 60160, 74973, 75775, 89750, 89847, 89918, 91016, 128903, 60253, 60265, 60292, 89679]
drought_rows = [41034, 75775, 89918, 113037, 119459, 75188, 75107, 75814, 89828, 89846, 109976, 110600]
wildfire_rows = [89455, 90049, 110288, 110291, 110294, 110297, 110298, 110303, 110305, 110333, 110337, 110339, 110347, 110361, 110368, 110372, 110447, 110466, 111867, 111874, 116515, 116520, 116541, 116543, 116548, 116582, 116583, 119503, 119516, 119518, 119554, 124581, 124582, 127999, 128030, 128063, 128083, 128144, 128262, 128280, 110373, 119509, 62321, 75163, 75165, 75166, 75167, 75168, 75169, 75170, 75171, 75172, 75173, 75174]
extreme_heat_rows = [75775]
greenhouse_gas_rows = [60117, 89961, 113029, 113036, 110373, 113030, 113026, 60109]
unclassified = [112994, 116130, 116131, 116132, 116133, 116685, 116843, 119568]

# Create a dictionary mapping mitigation types to their corresponding rows
mitigation_mapping = {
    'sea level rise mitigation': sea_level_rise_rows,
    'inland flooding mitigation': inland_flooding_rows,
    'drought mitigation': drought_rows,
    'wildfire mitigation': wildfire_rows,
    'extreme heat mitigation': extreme_heat_rows,
    'greenhouse gas mitigation': greenhouse_gas_rows,
    'unclassified': unclassified
}

# Iterate through the mitigation types and their corresponding rows
for mitigation_type, rows_to_update in mitigation_mapping.items():
    # Update the 'Climate_Risk_Mitigation' column for each row
    for row_index in rows_to_update:
        cleaned_crosswalk_data.loc[row_index, 'Climate_Risk_Mitigation'] = mitigation_type

multi_risk = cleaned_crosswalk_data.loc[(cleaned_crosswalk_data['Climate_Risk_Mitigation'].str.count(',') >= 1)]
# How many rows that have multiple climate risk mitigation entries
print('Number of rows with multiple climate risk mitigations:', len(multi_risk))
#multi_risk


Number of rows with multiple climate risk mitigations: 33


## Re-tally the final counts per climate risk after the additional automated and manual filtering

In [24]:
def count_climate_risk_mitigation(df, keyword_dict, output_csv=None):

    # Initialize a counter for each keyword
    keyword_counter = {keyword: 0 for keyword in keyword_dict}

    # Create a dictionary to store the repeat count for each unique project description
    description_counts = {}

    # Create a dictionary to store the unique count for each keyword
    unique_keyword_counts = {keyword: set() for keyword in keyword_dict}

    # Iterate through each unique description
    unique_descriptions = df['Project Description'].unique()
    total_unique_descriptions = len(unique_descriptions)

    for description in unique_descriptions:
        # Find all rows with this description
        description_rows = df[df['Project Description'] == description]
        repeat_count = len(description_rows)
        # Update the repeat count for this description
        description_counts[description] = repeat_count

        # Iterate through each row with this description
        for index, row in description_rows.iterrows():
            # Iterate through each keyword in the dictionary
            for keyword, values in keyword_dict.items():
                # Check if any value of the keyword is present in the description (case-insensitive)
                detected = [val for val in values if val.lower() in description.lower()]
                if detected:
                    keyword_counter[keyword] += 1
                    # Add the description to unique count for this keyword
                    unique_keyword_counts[keyword].add(description)

            # Update the 'Repeat_Project_Description_Count' column with the repeat count for this description
            df.at[index, 'Repeat_Project_Description_Count'] = repeat_count

    # Print keyword counts
    print("Keyword Counts:")
    for keyword, count in keyword_counter.items():
        print(f"{keyword}: {count}")
    print('')
    # Print total unique descriptions count
    print(f"Total Unique Project Descriptions: {total_unique_descriptions}")
    print('')

    # Print unique counts from each keyword
    print("Unique Counts from Each Keyword:")
    for keyword, unique_count in unique_keyword_counts.items():
        print(f"{keyword}: {len(unique_count)}")
        print('')
    # Save DataFrame as CSV if output_csv is provided
    if output_csv:
        df.to_csv(output_csv, index=False)
        print(f"DataFrame saved as {output_csv}")
        print('')
        # Initialize the S3 client
        s3_client = boto3.client('s3')

        # Bucket name and file paths
        bucket_name = 'ca-climate-index'
        directory = f'0_map_data/crosswalk_data/{output_csv}'
        # Upload the CSV file to S3
        print(f'Uploading {output_csv} to AWS')
        with open(output_csv, 'rb') as file:
            s3_client.upload_fileobj(file, bucket_name, directory)
            print(f'Upload complete! File is in {directory}')

In [25]:
count_climate_risk_mitigation(cleaned_crosswalk_data, climate_risk_dict)

Keyword Counts:
wildfire mitigation: 1338
sea level rise mitigation: 93
extreme heat mitigation: 186
drought mitigation: 2584
inland flooding mitigation: 211
greenhouse gas mitigation: 106865

Total Unique Project Descriptions: 5236

Unique Counts from Each Keyword:
wildfire mitigation: 755

sea level rise mitigation: 61

extreme heat mitigation: 130

drought mitigation: 923

inland flooding mitigation: 114

greenhouse gas mitigation: 858



In [20]:
len(cleaned_crosswalk_data)

133698

## Get rid of columns used for analysis, create csv, and upload to AWS

In [27]:
# Drop the specified columns from cleaned_crosswalk_data to create a new DataFrame
final_crosswalk_data = cleaned_crosswalk_data.drop(columns=['Detected_Climate_Risk_Mitigation_Keyword', 'Repeat_Project_Description_Count'])

output_csv = 'crosswalk_climate_risk_mitigation_column.csv'

final_crosswalk_data.to_csv(output_csv, index=False)
print(f"DataFrame saved as {output_csv}")
print('')
# Initialize the S3 client
s3_client = boto3.client('s3')

# Bucket name and file paths
bucket_name = 'ca-climate-index'
directory = f'0_map_data/crosswalk_data/{output_csv}'
# Upload the CSV file to S3
print(f'Uploading {output_csv} to AWS')
with open(output_csv, 'rb') as file:
    s3_client.upload_fileobj(file, bucket_name, directory)
    print(f'Upload complete! File is in {directory}')

DataFrame saved as crosswalk_climate_risk_mitigation_column.csv

Uploading crosswalk_climate_risk_mitigation_column.csv to AWS
Upload complete! File is in 0_map_data/crosswalk_data/crosswalk_climate_risk_mitigation_column.csv


## View Unclassified rows
* entries that were caught by one of the keywords but doesnt seemingly belong to any

In [26]:
# Filter rows in multi_risk DataFrame that belong to the 'unclassified' category
unclassified_rows = cleaned_crosswalk_data[cleaned_crosswalk_data['Climate_Risk_Mitigation'] == 'unclassified']

# Display the rows in the 'unclassified' category
print(len(unclassified_rows))
display(unclassified_rows[:1])

8


Unnamed: 0,Project IDNumber,Reporting Cycle Name,Agency Name,Program Name,Program Description,Sub Program Name,Record Type,Project Name,Project Type,Project Description,SECTOR,CATEGORY,ACTION,Census Tract,Address,Lat Long,Senate\nDistrict,Assembly\nDistrict,County,Total Project Cost,Total Program GGRFFunding,Project Life Years,Total Project GHGReductions,Annual Project GHGReductions,Project Count,Fiscal Year Funding Project,Is Benefit Disadvantaged Communities,Disadvantaged Community Criteria,Disadvantaged Community Need,Disadvantaged Community Census Tracts,Total GGRFDisadvantaged Community Funding,Disadvantaged Community Benefits Description,Funding Benefiting Disadvantaged Communities,Estimated Num Vehicles In Service,Funding Within Disadvantage Communities,Other Project Benefits Description,VMTReductions,Number Of Housing Units,Number Of Affordable Housing Units,Estimated Number Of Trees To Be Planted,Energy Cost Savings,Estimated Energy Saved KWH,Estimated Energy Saved Therms,Estimated Water Saved Gallons,Estimated Energy Generated KWH,Estimated Fuel Use Reduction Gal,Vouchers Benefiting Disadvantaged Communities,Number Of Rebates Issued,Rebates Within Disadvantaged Communities,Date Operational,Project Completion Date,Date Imported,Funding Recipient,AB1550Choice,Buffer Amount,Buffer Count,Is AB1550Buffer Region,CESVersion,CESVersion Calc,DAC1550Amount,DAC1550Count,Is Benefit DAC1550Communities,DACTable,FG17Comm Need,FG17Comm Need Qual,DACCommunity Benefit Critieria Met,Date Selected For Award,Low Income Amount,Low Income Count,Low Income Household Amount,Low Income Household Count,Is Low Income Communities,Potential Buffer Amount,Potential Buffer Count,Potential DAC1550Amount,Potential DAC1550Count,Potential Low Income Amount,Potential Low Income Count,Estimated Acres Preserved,Estimated Acres Restored,Estimated Acres Treated,Estimated Diverted From Landfills Tons,Renewable Fuel Generation Tons,Wood Burning Reduction Cords,Estimated Ridership Increases,Black Carbon Reductions Pounds,StateEW_DPM,StateEW_Nox,StateEW_PM25,StateEW_ROG,Diesel Pm Reductions Pounds,NOx Reductions Pounds,Pm25Reductions Pounds,Rog Reductions Pounds,Estimated Total Recycling Tons,Estimated Waste Digested Tons,Reclaimed Food Tons,NUMBER OF PLANS,POLLINATOR ACREAGE,RESEARCH GRANT,SCIENCE ADVANCEMENT,SOIL BENEFIT,TRAVEL COST SAVINGS,EST ENERGY GEN SCF,EST SOURCE RED TONS,FUEL TREATMENT NUM,EDUCATION EVENT NUM,ENERGY AUDIT BUILDINGS,EST DIVERT LANDFILLS TONS,EST FUEL GEN GAL,Voucher ID,Voucher Name,Voucher Description,Direct Jobs Fte,Indirect Jobs Fte,Induced Jobs Fte,Climate Adaptation,Community Engagement,Compost Produced Tons,Compost Produced Tons Yr,Net Density DUA,Applicants Assisted,Invasive Cover 12 Months,Invasive Cover 36 Months,Project Acreage,IS IAE,Intermediary Admin Expenses Calc,PRIMARY_FUNDING_RECIPIENT_TYPE,TRIBAL AFFILIATION,PROJECT PARTNERS,Climate_Risk_Mitigation,Detected_Climate_Risk_Mitigation_Keyword,Repeat_Project_Description_Count
112994,170561HS,2018 Semi-Annual,California Department of Food and Agriculture,Climate Smart Agriculture,"Invests in competitive projects that reduce GHGs through increased efficiency in the agricultural sector. The program is comprised of four components, the Dairy Digester Research and Development Program, Alternative Manure Management, Healthy Soils Program and the State Water Efficiency and Enhancement Program.",Healthy Soils Program,AWARD-IMP,True Grass Farms,Implementation of conservation management practices on agricultural land,True Grass Farms will integrate trees and carbon sequestering potential to the currently regenerative managed coastal prairie which promotes and show case for future farms the potential of a diversified agricultural production zone. The Recipient will apply the following practices: 1) Compost application to grazed rangeland to five acres; 2) CPS 391 - Riparian forest buffer by replacing a strip of grassland near water ways for seven acres and one acres of large container / one acres of small container / five acres of medium to large cuttings; 3) CPS 422 - Hedgerow planting (linear wind break) on 1200 ft; 4) CPS 381 - Sylvo pasture – for seven acres.,Climate smart agriculture,Healthy Soils Practices,Commercial Incentive,,"2711 Whitaker Bluff Rd, Tomales, 94971, Marin","38.2731989, -122.9239341",2,12,Marin,65583.0,49955.0,3,120.0,0.0,1.0,2016-17,0.0,,,,,Project reduced exposure to dust and aiborne particles to residents,,0.0,,water and air quality protection; food production,0.0,0.0,0.0,345.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2021-12-31,12/01/2019,2022-09-23 14:59:43.957,-,,0.0,0.0,0.0,3.0,1550.0,0.0,0.0,0.0,Healthy Soils,D,Reduce exposure to local environmental contaminants,A,2017-12-05,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,49955.0,1.0,0.0,14.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,,,0.0,0.0,0.0,,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,,,unclassified,"coastal, riparian",1
