### Cal-CRAI Metric Calculation for: Natural Systems / Soil Health Metrics
This notebook calculates 3 metrics, all sourced from the United States Department of Agriculture web soil survey.
* % of soil cover rated fragile
* % of soil rated moderately or severely drought vulnerable
* % of soil moderately or severely susceptible to fire damage

In [30]:
import pandas as pd
import os
import sys
import geopandas as gpd

# suppress pandas purely educational warnings
from warnings import simplefilter
simplefilter(action="ignore", category=pd.errors.PerformanceWarning)

sys.path.append(os.path.expanduser('../../'))
from scripts.utils.file_helpers import pull_csv_from_directory, upload_csv_aws, filter_counties
from scripts.utils.write_metadata import append_metadata

In [31]:
# pull csv from aws
# The USDA soil survey extracts live under this prefix of the project bucket.
bucket_name = 'ca-climate-index'
aws_dir = '1_pull_data/natural_systems/ecosystem_condition/usda/'

# Per the recorded output, this saves four CSVs under the bare filenames that the next cell reads.
# NOTE(review): search_zipped=False presumably skips zip archives in the prefix -- confirm in file_helpers.
pull_csv_from_directory(bucket_name, aws_dir, search_zipped=False)

Saved DataFrame as 'usda_web_soil_survey_drought_vulnerable.csv'
Saved DataFrame as 'usda_web_soil_survey_fire_damage_susceptibility.csv'
Saved DataFrame as 'usda_web_soil_survey_fragile_soils.csv'
Saved DataFrame as 'usda_web_soil_survey_notes.csv'


In [32]:
# Load the three soil-rating tables pulled from AWS (drought, fire, fragile -- in that order)
soil_csv_names = [
    'usda_web_soil_survey_drought_vulnerable.csv',
    'usda_web_soil_survey_fire_damage_susceptibility.csv',
    'usda_web_soil_survey_fragile_soils.csv',
]
drought_vulnerable_data, fire_susceptibility_data, fragile_soils_data = map(pd.read_csv, soil_csv_names)

In [33]:
# Compare the column sets of the three datasets (drought, fire, fragile)
for soil_frame in (drought_vulnerable_data, fire_susceptibility_data, fragile_soils_data):
    print(soil_frame.columns)

Index(['County', 'Rating', 'Acres in AOI', 'Percent of AOI'], dtype='object')
Index(['County', 'Rating', 'Acres in AOI', 'Percent of AOI'], dtype='object')
Index(['County', 'Rating', 'Acres in AOI', 'Percent of AOI'], dtype='object')


In [34]:
# Preview the first five rows of the drought-vulnerability ratings
drought_vulnerable_data.head(5)

Unnamed: 0,County,Rating,Acres in AOI,Percent of AOI
0,Alameda,Drought vulnerable,162384.4,49.10%
1,Alameda,Severely drought vulnerable,143901.5,43.60%
2,Alameda,Moderately drought vulnerable,4812.1,1.50%
3,Alameda,Somewhat drought vulnerable,2560.2,0.80%
4,Alameda,Slightly drought vulnerable,626.0,0.20%


In [35]:
# Preview the first rows of the fire-damage susceptibility ratings
fire_susceptibility_data.head()

Unnamed: 0,County,Rating,Acres in AOI,Percent of AOI
0,Alameda,Highly susceptible,187902.8,56.90%
1,Alameda,Moderately susceptible,96526.0,29.20%
2,Alameda,Slightly susceptible,28841.7,8.70%
3,Alameda,Null or Not Rated,17130.7,5.20%
4,Amador,Moderately susceptible,163784.7,54.90%


In [36]:
# Preview the first 50 rows of the fragile-soil ratings to see how ratings vary between counties
fragile_soils_data.head(50)

Unnamed: 0,County,Rating,Acres in AOI,Percent of AOI
0,Alameda,Fragile,126906.5,38.40%
1,Alameda,Highly fragile,88751.5,26.90%
2,Alameda,Moderately fragile,67440.1,20.40%
3,Alameda,Slightly fragile,5251.6,1.60%
4,Alameda,Null or Not Rated,42051.5,12.70%
5,Amador,Moderately fragile,6749.8,2.30%
6,Amador,Slightly fragile,2369.5,0.80%
7,Amador,Fragile,16.1,0.00%
8,Amador,Null or Not Rated,289307.3,96.90%
9,Butte,Slightly fragile,433375.2,46.20%


## Function to clean all three datasets and calculate soil metrics
* removes the % sign from the percentage column and converts it to numeric so calculations can be performed
* while all three datasets share the same columns, the entries within them vary,
so the desired entries in the 'Rating' column are listed together and used to filter each dataset
* the percentage column is summed within each county across the applicable ratings
* some counties have multiple entries (usually indicating a split within the county, east and west for example),
so the percentage sums are then averaged across those sub-county entries to estimate soil vulnerability for the whole county

In [37]:
# List of datasets (order matters: drought, fire, fragile -- results are unpacked by position later)
all_data = [drought_vulnerable_data, fire_susceptibility_data, fragile_soils_data]

# Remove '%' symbol and convert to numeric for each DataFrame in all_data.
# The frames are updated in place, so cast to str first: on a second run of this cell the
# column is already float, and calling .str on a float column raises AttributeError.
for soil_df in all_data:
    soil_df['Percent of AOI'] = (
        soil_df['Percent of AOI'].astype(str).str.rstrip('%').astype(float)
    )

# List of ratings to filter.
# One combined list is used for all three datasets; each dataset only contains its own subset.
ratings = [
    'Fragile', 'Highly fragile', 'Extremely fragile', 'Moderately fragile',
    'Moderately susceptible', 'Highly susceptible',
    'Moderately drought vulnerable', 'Severely drought vulnerable',
    'Drought vulnerable'
]

# Initialize an empty list to store results
result_list = []

# Function to clean and average county data
def clean_and_average_counties(df):
    """Collapse sub-county rows into one row per county by averaging their percentages.

    Sub-county entries are assumed to be written as '<County>, <part>'; everything before
    the first comma is treated as the county name. The input frame is not modified.

    Parameters
    ----------
    df : pd.DataFrame
        Must contain a string 'County' column and a numeric 'Percent of AOI' column.

    Returns
    -------
    pd.DataFrame
        Columns 'county' (name before the first comma) and 'percent_vulnerable'
        (unweighted mean of 'Percent of AOI' over the rows sharing that county).
    """
    # assign() works on a copy, so the caller's frame does not gain a 'Main County' column
    with_main_county = df.assign(**{'Main County': df['County'].str.split(',').str[0]})
    df_cleaned = (
        with_main_county
        .groupby('Main County')['Percent of AOI']
        .mean()
        .reset_index()
        .rename(columns={'Main County': 'county', 'Percent of AOI': 'percent_vulnerable'})
    )
    return df_cleaned

# Loop through each dataset
for data in all_data:
    # Keep only the rows whose 'Rating' is one of the flagged ratings
    filtered_data = data[data['Rating'].isin(ratings)]

    # Group by 'County' and sum 'Percent of AOI' across the flagged ratings
    grouped_data = filtered_data.groupby('County')['Percent of AOI'].sum().reset_index()

    # Clean and average counties
    cleaned_data = clean_and_average_counties(grouped_data)

    # Lower case all counties. 'county' is the only string column, so the vectorised
    # string method replaces the deprecated DataFrame.applymap (FutureWarning).
    cleaned_data['county'] = cleaned_data['county'].str.lower()

    # Append the result to the list
    result_list.append(cleaned_data)

# Each element in result_list is a DataFrame with cleaned and averaged 'percent_vulnerable' per county for each dataset
# (same order as all_data: drought, fire, fragile)
drought_vulnerable_result = result_list[0]
fire_susceptibility_result = result_list[1]
fragile_soils_result = result_list[2]

# Display the results
print("Drought Vulnerable Data Summed and Cleaned:")
print(drought_vulnerable_result)

print("\nFire Susceptibility Data Summed and Cleaned:")
print(fire_susceptibility_result)

print("\nFragile Soils Data Summed and Cleaned:")
print(fragile_soils_result)


Drought Vulnerable Data Summed and Cleaned:
             county  percent_vulnerable
0           alameda           94.200000
1            amador           93.800000
2             butte           47.600000
3            colusa           67.500000
4      contra costa           72.300000
5         el dorado           79.500000
6            fresno           97.600000
7             glenn           79.700000
8          humboldt            2.050000
9          imperial           98.300000
10             kern           94.700000
11            kings           80.700000
12             lake           87.600000
13      los angeles           77.150000
14           madera           95.800000
15            marin           72.100000
16         mariposa           93.200000
17        mendocino           42.400000
18           merced           86.400000
19            modoc           83.700000
20         monterey           89.900000
21             napa           83.700000
22           nevada           54.800

  cleaned_data = cleaned_data.applymap(lambda s: s.lower() if type(s) == str else s)
  cleaned_data = cleaned_data.applymap(lambda s: s.lower() if type(s) == str else s)
  cleaned_data = cleaned_data.applymap(lambda s: s.lower() if type(s) == str else s)


In [38]:
# Ensure there aren't any non-applicable county entries, using our filter_counties function.
# This was run on all three resulting dfs and none had non-applicable entries;
# only the fragile soils check is kept in this cell.
filtered, omitted = filter_counties(fragile_soils_result, 'county')
omitted

Unnamed: 0,county,percent_vulnerable


In [39]:
# read in CA census tiger file (tract -> county lookup)
# Keep the path under its own name so `ca_tract_county` only ever refers to the table.
ca_tract_county_path = "s3://ca-climate-index/0_map_data/ca_tracts_county.csv"
ca_tract_county = gpd.read_file(ca_tract_county_path)
ca_tract_county = ca_tract_county.drop(columns=['field_1', 'geometry', 'COUNTYFP'])
ca_tract_county.columns = ca_tract_county.columns.str.lower()
# Lower-case every string value so county names match the soil results.
# A column-wise Series.map replaces the deprecated DataFrame.applymap (FutureWarning).
ca_tract_county = ca_tract_county.apply(
    lambda column: column.map(lambda s: s.lower() if isinstance(s, str) else s)
)

ca_tract_county

  ca_tract_county = ca_tract_county.applymap(lambda s: s.lower() if type(s) == str else s)


Unnamed: 0,tract,county
0,06085504321,santa clara
1,06085504410,santa clara
2,06085507003,santa clara
3,06085507004,santa clara
4,06085502204,santa clara
...,...,...
9124,06059001303,orange
9125,06059001304,orange
9126,06059001401,orange
9127,06013367200,contra costa


## For each resulting df:
* rename the percent column to indicate the metric
* merge df with CA tract data based on shared county

In [40]:
def _attach_metric_to_tracts(county_result, metric_column):
    """Rename 'percent_vulnerable' to `metric_column` and left-join the result onto the tract table.

    Prints the merged row count and its first rows, then returns (renamed county frame, merged tract frame).
    """
    renamed_result = county_result.rename(columns={'percent_vulnerable': metric_column})
    tract_metric = pd.merge(ca_tract_county, renamed_result, on='county', how='left')
    print(len(tract_metric))
    print(tract_metric.head())
    return renamed_result, tract_metric


drought_vulnerable_result, drought_metric = _attach_metric_to_tracts(
    drought_vulnerable_result, 'percent_vulnerable_drought'
)

fire_susceptibility_result, fire_soil_metric = _attach_metric_to_tracts(
    fire_susceptibility_result, 'percent_vulnerable_fire'
)

fragile_soils_result, fragile_soil_metric = _attach_metric_to_tracts(
    fragile_soils_result, 'percent_vulnerable_soils'
)

9129
         tract       county  percent_vulnerable_drought
0  06085504321  santa clara                        71.6
1  06085504410  santa clara                        71.6
2  06085507003  santa clara                        71.6
3  06085507004  santa clara                        71.6
4  06085502204  santa clara                        71.6
9129
         tract       county  percent_vulnerable_fire
0  06085504321  santa clara                     59.7
1  06085504410  santa clara                     59.7
2  06085507003  santa clara                     59.7
3  06085507004  santa clara                     59.7
4  06085502204  santa clara                     59.7
9129
         tract       county  percent_vulnerable_soils
0  06085504321  santa clara                     56.85
1  06085504410  santa clara                     56.85
2  06085507003  santa clara                     56.85
3  06085507004  santa clara                     56.85
4  06085502204  santa clara                     56.85


## Save each resulting df as a CSV to upload to S3 bucket

In [42]:
# Write each tract-level metric table to its own CSV in the working directory
metric_exports = {
    'natural_soils_vulnerable_drought_metric.csv': drought_metric,
    'natural_soils_vulnerable_fire_metric.csv': fire_soil_metric,
    'natural_fragile_soils_metric.csv': fragile_soil_metric,
}
for export_name, metric_frame in metric_exports.items():
    metric_frame.to_csv(export_name)

## Function Call

In [43]:
@append_metadata
def web_soil_survey_metric_upload(input_csv, export=False, varname=''):
    '''
    Uploads a csv file that contains a metric calculation for soil health within Cal-CRAI's Natural Systems Domain.
    The function is called once for each of the three soil metric csv files.
    Data was sourced from the USDA from: https://websoilsurvey.sc.egov.usda.gov/app/WebSoilSurvey.aspx

    Methods
    -------
    Each of the three datasets had the same columns, including the soil 'Rating'.
    Desired entries within the 'Rating' column for each dataset were listed and isolated for each dataset.
    Flagged ratings include: Fragile, Highly fragile, Extremely fragile, Moderately fragile, Moderately susceptible,
    Highly susceptible, Moderately drought vulnerable, Severely drought vulnerable, Drought vulnerable.
    Counties were grouped up, with the percentage column being summed to estimate total percentage vulnerability for each dataset.
    Some counties were separated into sub-categories like 'Southern Humboldt' and 'Central Humboldt'. For counties with these multiple
    entries, their summed vulnerable percentages were averaged, and a single county entry was maintained.
    Data were then merged to California census tract data to attribute county level soil vulnerabilities to tracts residing within that county

    Parameters
    ----------
    input_csv: string
        name of the csv file to be uploaded to AWS
    export: True/False boolean
        False = will not upload resulting df containing CAL CRAI soil metric to AWS
        True = will upload resulting df containing CAL CRAI soil metric to AWS
    varname: string
        not used in the function body
        NOTE(review): presumably consumed by the append_metadata decorator -- confirm in write_metadata

    Script
    ------
    natural_web_soil_survey.ipynb

    Note:
    This function assumes users have configured the AWS CLI such that their access key / secret key pair are
    stored in ~/.aws/credentials.
    See https://docs.aws.amazon.com/cli/latest/userguide/getting-started-install.html for guidance.
    '''
    print('Data transformation: data filtered for severity ratings.')
    print('Data transformation: average percentage values for multi-county entries.')
    print('Data transformation: merge data to California tracts.')

    bucket_name = 'ca-climate-index'
    directory = '3_fair_data/index_data'
    # upload_csv_aws is called with a list of file names, so wrap the single csv
    export_filename = [input_csv]

    if export:
        upload_csv_aws(export_filename, bucket_name, directory)
    else:
        # Previously this branch reported the file as uploaded even though nothing was sent
        print(f'{export_filename} not uploaded to AWS (export=False).')

    #if os.path.exists(input_csv):
    #   os.remove(input_csv)

In [45]:
# Metric csv files, in the order of the metrics listed at the top of the notebook
# (fragile soils, drought vulnerability, fire damage susceptibility)
input_csv = [
            'natural_fragile_soils_metric.csv',
            'natural_soils_vulnerable_drought_metric.csv',
            'natural_soils_vulnerable_fire_metric.csv',
            ]

# Variable name paired with each csv, position by position
varnames = [
    'natural_usda_soil_condition_1',
    'natural_usda_soil_condition_2',
    'natural_usda_soil_condition_3'
    ]

# zip() silently truncates to the shorter list, so guard against a mismatch
assert len(input_csv) == len(varnames), 'each csv needs exactly one varname'

# Pass the paired varname through; it was previously hard-coded to 'test', leaving `varnames` unused.
# NOTE(review): confirm the csv/varname pairing against the metadata table before exporting.
for csv_file, var in zip(input_csv, varnames):
    web_soil_survey_metric_upload(csv_file, export=True, varname=var)