### Cal-CRAI metric calculation: Crop Loss
This notebook calculates crop loss metrics across 2 different climate risks:
* Drought/crop loss: average # of acres lost from drought per year
* Drought/crop loss: average cost of crop loss from drought per year
* Heat/crop loss: average # of acres lost from extreme heat per year
* Heat/crop loss: average cost of crop loss from extreme heat per year

In [1]:
import geopandas as gpd
import s3fs
import pandas as pd
import boto3
import dask_geopandas
import dask.dataframe as dd
import matplotlib.pyplot as plt
import os
import sys
import numpy as np

sys.path.append(os.path.expanduser('../../'))
from scripts.utils.file_helpers import pull_csv_from_directory, upload_csv_aws
from scripts.utils.write_metadata import append_metadata

In [2]:
# read in CA census tiger file
ca_tract_county = "s3://ca-climate-index/0_map_data/ca_tracts_county.csv"
ca_tract_county = gpd.read_file(ca_tract_county)
ca_tract_county = ca_tract_county.drop(columns={'field_1', 'geometry'})
ca_tract_county.columns = ca_tract_county.columns.str.lower()
ca_tract_county = ca_tract_county.applymap(lambda s: s.lower() if type(s) == str else s)

ca_tract_county

Unnamed: 0,tract,countyfp,county
0,06085504321,085,santa clara
1,06085504410,085,santa clara
2,06085507003,085,santa clara
3,06085507004,085,santa clara
4,06085502204,085,santa clara
...,...,...,...
9124,06059001303,059,orange
9125,06059001304,059,orange
9126,06059001401,059,orange
9127,06013367200,013,contra costa


In [3]:
# pull csv from aws
bucket_name = 'ca-climate-index'
aws_dir = '1_pull_data/climate_risk/extreme_heat/loss/usda/usda_crop_loss_heat_files/'

pull_csv_from_directory(bucket_name, aws_dir, search_zipped=False)

Saved DataFrame as 'usda_crop_loss_merged.csv'


In [4]:
all_events = pd.read_csv('all_noaa_storm_events_ca.csv')
all_events.head(5)

Unnamed: 0,BEGIN_YEARMONTH,BEGIN_DAY,BEGIN_TIME,END_YEARMONTH,END_DAY,END_TIME,EPISODE_ID,EVENT_ID,STATE,STATE_FIPS,...,END_RANGE,END_AZIMUTH,END_LOCATION,BEGIN_LAT,BEGIN_LON,END_LAT,END_LON,EPISODE_NARRATIVE,EVENT_NARRATIVE,DATA_SOURCE
0,199707,27,2032,199707,27,2032,1048394,5617146,CALIFORNIA,6,...,,,BLYTHE,33.62,-114.6,33.62,-114.6,,Thunderstorm wind gust was measured at the Bly...,PDC
1,199707,28,1700,199707,28,1700,2070479,5622118,CALIFORNIA,6,...,,,HAPPY CAMP,,,,,,FOREST SERVICE REPORTED A FLASH FLOOD ON GRANI...,PDC
2,199707,27,1600,199707,27,1800,2070480,5622119,CALIFORNIA,6,...,8.0,S,ETNA,41.33,-122.9,41.33,-122.9,,,PDC
3,199707,27,1600,199707,27,1800,2070480,5622120,CALIFORNIA,6,...,4.0,S,ETNA,,,,,,1.5 INCHES OF RAIN IN 2 HRS RESULTED IN MINOR ...,PDC
4,199707,30,1600,199707,30,1600,2070481,5622121,CALIFORNIA,6,...,2.0,S,FT JONES,,,,,,2 INCHES OF RAIN IN 30 MIN. RESULTED IN 2 FT. ...,PDC


In [5]:
all_events.columns

Index(['BEGIN_YEARMONTH', 'BEGIN_DAY', 'BEGIN_TIME', 'END_YEARMONTH',
       'END_DAY', 'END_TIME', 'EPISODE_ID', 'EVENT_ID', 'STATE', 'STATE_FIPS',
       'YEAR', 'MONTH_NAME', 'EVENT_TYPE', 'CZ_TYPE', 'CZ_FIPS', 'CZ_NAME',
       'WFO', 'BEGIN_DATE_TIME', 'CZ_TIMEZONE', 'END_DATE_TIME',
       'INJURIES_DIRECT', 'INJURIES_INDIRECT', 'DEATHS_DIRECT',
       'DEATHS_INDIRECT', 'DAMAGE_PROPERTY', 'DAMAGE_CROPS', 'SOURCE',
       'MAGNITUDE', 'MAGNITUDE_TYPE', 'FLOOD_CAUSE', 'CATEGORY', 'TOR_F_SCALE',
       'TOR_LENGTH', 'TOR_WIDTH', 'TOR_OTHER_WFO', 'TOR_OTHER_CZ_STATE',
       'TOR_OTHER_CZ_FIPS', 'TOR_OTHER_CZ_NAME', 'BEGIN_RANGE',
       'BEGIN_AZIMUTH', 'BEGIN_LOCATION', 'END_RANGE', 'END_AZIMUTH',
       'END_LOCATION', 'BEGIN_LAT', 'BEGIN_LON', 'END_LAT', 'END_LON',
       'EPISODE_NARRATIVE', 'EVENT_NARRATIVE', 'DATA_SOURCE'],
      dtype='object')

In [32]:
# isolate for relevant columns
columns = ['EVENT_ID','YEAR', 'EVENT_TYPE','CZ_TYPE','CZ_FIPS',
           'DAMAGE_CROPS','BEGIN_LAT','BEGIN_LON',
          'BEGIN_RANGE','END_RANGE']
noaa_storm_columns = all_events[columns]
noaa_storm_columns

Unnamed: 0,EVENT_ID,YEAR,EVENT_TYPE,CZ_TYPE,CZ_FIPS,DAMAGE_CROPS,BEGIN_LAT,BEGIN_LON,BEGIN_RANGE,END_RANGE
0,5617146,1997,Thunderstorm Wind,C,65,,33.6200,-114.6000,,
1,5622118,1997,Flash Flood,C,93,,,,,
2,5622119,1997,Hail,C,93,,41.3300,-122.9000,8.0,8.0
3,5622120,1997,Flash Flood,C,93,,,,4.0,4.0
4,5622121,1997,Flash Flood,C,93,,,,2.0,2.0
...,...,...,...,...,...,...,...,...,...,...
26959,983437,2020,Excessive Heat,Z,566,0.00K,,,,
26960,920678,2020,High Wind,Z,568,0.00K,,,,
26961,916656,2020,Tornado,C,35,0.00K,39.7313,-120.1284,3.0,3.0
26962,916709,2020,Tornado,C,35,0.00K,39.7239,-120.1303,32.0,32.0


In [13]:
# checking how many nan entries are within one of the lat columns
nan_count = noaa_storm_columns.BEGIN_LAT.isna().sum()
print(f"Number of NaN values in BEGIN_LAT: {nan_count}")

Number of NaN values in BEGIN_LAT: 20517


In [14]:
# view all entries within the event type columns
noaa_storm_columns.EVENT_TYPE.unique()

array(['Thunderstorm Wind', 'Flash Flood', 'Hail', 'Wildfire',
       'High Surf', 'Heavy Rain', 'Dust Storm', 'Heat', 'Tornado',
       'Lightning', 'Heavy Snow', 'Flood', 'Winter Storm', 'Rip Current',
       'Funnel Cloud', 'High Wind', 'Waterspout', 'Cold/Wind Chill',
       'Winter Weather', 'Storm Surge/Tide', 'Strong Wind', 'Dust Devil',
       'Dense Fog', 'Tropical Storm', 'Frost/Freeze', 'Ice Storm',
       'Coastal Flood', 'Debris Flow', 'Avalanche', 'Blizzard', 'Drought',
       'Extreme Cold/Wind Chill', 'Tsunami', 'Excessive Heat',
       'Dense Smoke', 'Sneakerwave', 'Freezing Fog',
       'Astronomical Low Tide'], dtype=object)

### Heat Metrics

In [19]:
# select for relevent flood related events
heat_types = [    
    'Excessive Heat',  
    'Heat',
]
cz_type_county = ['C']

# filter for heat events based on event type
heat_events = noaa_storm_columns[noaa_storm_columns['EVENT_TYPE'].isin(heat_types)]
heat_events

Unnamed: 0,EVENT_ID,YEAR,EVENT_TYPE,CZ_TYPE,CZ_FIPS,DAMAGE_CROPS,BEGIN_LAT,BEGIN_LON
15,5609937,1997,Heat,Z,46,,,
24,5615459,1997,Heat,Z,48,,,
25,5615460,1997,Heat,Z,61,,,
26,5615461,1997,Heat,Z,62,,,
27,5615462,1997,Heat,Z,57,,,
...,...,...,...,...,...,...,...,...
26955,983439,2020,Excessive Heat,Z,569,0.00K,,
26956,983440,2020,Excessive Heat,Z,570,0.00K,,
26957,983441,2020,Excessive Heat,Z,562,0.00K,,
26958,983436,2020,Excessive Heat,Z,565,0.00K,,


In [20]:
heat_events.DAMAGE_CROPS.unique()

array([nan, '1K', '0', '224.8M', '106.6M', '159M', '2M', '0.00K'],
      dtype=object)

In [21]:
# function to convert 'M' and 'K' suffixes while properly adjusting value
def convert_to_numeric(value):
    if pd.isna(value):
        return np.nan
    value = value.strip()
    if value.endswith('M'):
        return float(value[:-1]) * 1e6
    elif value.endswith('K'):
        return float(value[:-1]) * 1e3
    elif value.endswith('B'):
        return float(value[:-1]) * 1e9
    else:
        return float(value)
    
# Apply the function to the DAMAGE_CROPS column
heat_events['DAMAGE_CROPS'] = heat_events['DAMAGE_CROPS'].apply(convert_to_numeric)

# Display the DataFrame to verify the conversion
heat_events

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  heat_events['DAMAGE_CROPS'] = heat_events['DAMAGE_CROPS'].apply(convert_to_numeric)


Unnamed: 0,EVENT_ID,YEAR,EVENT_TYPE,CZ_TYPE,CZ_FIPS,DAMAGE_CROPS,BEGIN_LAT,BEGIN_LON
15,5609937,1997,Heat,Z,46,,,
24,5615459,1997,Heat,Z,48,,,
25,5615460,1997,Heat,Z,61,,,
26,5615461,1997,Heat,Z,62,,,
27,5615462,1997,Heat,Z,57,,,
...,...,...,...,...,...,...,...,...
26955,983439,2020,Excessive Heat,Z,569,0.0,,
26956,983440,2020,Excessive Heat,Z,570,0.0,,
26957,983441,2020,Excessive Heat,Z,562,0.0,,
26958,983436,2020,Excessive Heat,Z,565,0.0,,


In [22]:
columns_to_sum = ['DAMAGE_CROPS']
heat_loss = heat_events.groupby('CZ_FIPS')[columns_to_sum].sum().reset_index()
heat_loss = heat_loss.rename(columns={'CZ_FIPS': 'countyfp', 'DAMAGE_CROPS':'estimated_crop_loss_cost'})
heat_loss['countyfp'] = heat_loss['countyfp'].astype(str).str.zfill(3)
heat_loss

Unnamed: 0,countyfp,estimated_crop_loss_cost
0,003,0.0
1,004,0.0
2,005,0.0
3,006,0.0
4,007,0.0
...,...,...
115,566,0.0
116,567,0.0
117,568,0.0
118,569,0.0


In [23]:
# merge heat loss data with California census tract data
heat_loss_merge = pd.merge(ca_tract_county, heat_loss, on='countyfp', how='left')
heat_loss_merge

Unnamed: 0,tract,countyfp,county,estimated_crop_loss_cost
0,06085504321,085,santa clara,0.0
1,06085504410,085,santa clara,0.0
2,06085507003,085,santa clara,0.0
3,06085507004,085,santa clara,0.0
4,06085502204,085,santa clara,0.0
...,...,...,...,...
9124,06059001303,059,orange,
9125,06059001304,059,orange,
9126,06059001401,059,orange,
9127,06013367200,013,contra costa,0.0


In [24]:
# save metric calculated data as separate csvs for upload
heat_crop_loss_metric = heat_loss_merge[['tract', 'county', 'estimated_crop_loss_cost']]
heat_crop_loss_metric.to_csv('climate_heat_crop_loss_metric.csv', index=False)

### Drought metrics

In [33]:
# select for relevent flood related events
drought_types = ['Drought']
cz_type_county = ['C']

# filter for heat events based on event type
drought_events = noaa_storm_columns[noaa_storm_columns['EVENT_TYPE'].isin(drought_types)]
drought_events

Unnamed: 0,EVENT_ID,YEAR,EVENT_TYPE,CZ_TYPE,CZ_FIPS,DAMAGE_CROPS,BEGIN_LAT,BEGIN_LON,BEGIN_RANGE,END_RANGE
2176,5721830,1999,Drought,Z,21,,,,,
2328,5721829,1999,Drought,Z,20,,,,,
3089,5160649,2000,Drought,Z,92,,,,,
3090,5160650,2000,Drought,Z,96,,,,,
3121,5160648,2000,Drought,Z,90,,,,,
...,...,...,...,...,...,...,...,...,...,...
26910,934554,2020,Drought,Z,17,,,,,
26911,934555,2020,Drought,Z,63,,,,,
26912,934560,2020,Drought,Z,16,,,,,
26913,934564,2020,Drought,Z,15,,,,,


In [26]:
drought_events.DAMAGE_CROPS.unique()

array([nan, '0.00K', '2.00M', '2.00K', '1.50B'], dtype=object)

In [27]:
# Apply the function to the DAMAGE_CROPS column
drought_events['DAMAGE_CROPS'] = drought_events['DAMAGE_CROPS'].apply(convert_to_numeric)

# Display the DataFrame to verify the conversion
drought_events

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  drought_events['DAMAGE_CROPS'] = drought_events['DAMAGE_CROPS'].apply(convert_to_numeric)


Unnamed: 0,EVENT_ID,YEAR,EVENT_TYPE,CZ_TYPE,CZ_FIPS,DAMAGE_CROPS,BEGIN_LAT,BEGIN_LON
2176,5721830,1999,Drought,Z,21,,,
2328,5721829,1999,Drought,Z,20,,,
3089,5160649,2000,Drought,Z,92,,,
3090,5160650,2000,Drought,Z,96,,,
3121,5160648,2000,Drought,Z,90,,,
...,...,...,...,...,...,...,...,...
26910,934554,2020,Drought,Z,17,,,
26911,934555,2020,Drought,Z,63,,,
26912,934560,2020,Drought,Z,16,,,
26913,934564,2020,Drought,Z,15,,,


In [28]:
columns_to_sum = ['DAMAGE_CROPS']
drought_loss = drought_events.groupby('CZ_FIPS')[columns_to_sum].sum().reset_index()
drought_loss = drought_loss.rename(columns={'CZ_FIPS': 'countyfp', 'DAMAGE_CROPS':'estimated_crop_loss_cost'})
drought_loss['countyfp'] = drought_loss['countyfp'].astype(str).str.zfill(3)
drought_loss

Unnamed: 0,countyfp,estimated_crop_loss_cost
0,13,0.0
1,15,0.0
2,16,0.0
3,17,0.0
4,18,0.0
5,19,1500000000.0
6,20,0.0
7,21,0.0
8,42,0.0
9,43,0.0


In [29]:
# merge drought loss data with California census tract data
drought_loss_merge = pd.merge(ca_tract_county, drought_loss, on='countyfp', how='left')
drought_loss_merge

Unnamed: 0,tract,countyfp,county,estimated_crop_loss_cost
0,06085504321,085,santa clara,
1,06085504410,085,santa clara,
2,06085507003,085,santa clara,
3,06085507004,085,santa clara,
4,06085502204,085,santa clara,
...,...,...,...,...
9124,06059001303,059,orange,
9125,06059001304,059,orange,
9126,06059001401,059,orange,
9127,06013367200,013,contra costa,0.0


In [30]:
# save metric calculated data as separate csvs for upload
drought_crop_loss_metric = drought_loss_merge[['tract', 'county', 'estimated_crop_loss_cost']]
drought_crop_loss_metric.to_csv('climate_drought_crop_loss_metric.csv', index=False)

In [None]:
@append_metadata
def crop_loss_upload(input_csv, export=False, varname=''):
    '''
    Uploads the crop loss metrics to S3 bucket. The metrics are:
    * estimated monetary crop damage due to extreme heat and drought (respectively)
    
    Data for this metric was sourced from NOAA's - Storm event database:
    https://www.ncdc.noaa.gov/stormevents/

    Methods
    -------
    Relevant columns to our data metrics were isolated, various spatial options were considered, but county fips had the fewest NaN data.
    Data was isolated to include relevant storm events.
    Duplicates were removed based on storm event ID.
    Data was isolated to included populated county entries.
    Data were grouped by county and summed to calculate final metrics.
    
    Parameters
    ----------
    input_csv: string
        csv crop loss data 
    export: True/False boolean
        False = will not upload resulting df containing CAL CRAI crop loss metrics to AWS
        True = will upload resulting df containing CAL CRAI crop loss metrics to AWS

    Script
    ------
    climate_flood_loss.ipynb

    Note:
    This function assumes users have configured the AWS CLI such that their access key / secret key pair are stored in ~/.aws/credentials.
    See https://docs.aws.amazon.com/cli/latest/userguide/getting-started-install.html for guidance.
    '''
    print('Data transformation: relevant columns were isolated and renamed')
    print('Data transformation: duplicate entries by event ID were dropped.')
    print('Data transformation: data was grouped by county and summed.')
    print('Data transformation: data was merged with California census tracts.') 
 
    if export == True:
        bucket_name = 'ca-climate-index'
        directory = '3_fair_data/index_data'
        export_filename = [input_csv]
        upload_csv_aws(export_filename, bucket_name, directory)

    if export == False:
        print(f'{input_csv} uploaded to AWS.')
 
    if os.path.exists(input_csv):
        os.remove(input_csv)

In [None]:
input_csvs = ['climate_heat_crop_loss_metric.csv',
               'climate_drought_crop_loss_metric.csv']

varnames = ['climate_noaa_heat_crop_cost',
            'climate_noaa_drought_crop_cost',
            ]

# Process the data and export
for input_csv, varname in zip(input_csvs, varnames):
    print(f'Processing {input_csv} with varname {varname}')
    crop_loss_upload(input_csv, export=True, varname='test')
    print(f'Completed uploading {input_csv} with varname {varname}')