## Cal-CRAI Metric Calculation
Domain: Governance \
Indicator: Natural Resources Conservation

This notebook calculates one metric, sourced from the California State Water Resources Control Board:
* Metric 1: Percent of domestic wells considered “high risk” for any of the following contaminants: Nitrate, Arsenic, 1,2,3-Trichloropropane, Hexavalent Chromium, Uranium


In [1]:
import pandas as pd
import os
import sys
import boto3
import io
import geopandas as gpd
import dask_geopandas 
import pyogrio 

# suppress pandas purely educational warnings
from warnings import simplefilter
simplefilter(action="ignore", category=pd.errors.PerformanceWarning)

sys.path.append(os.path.expanduser('../../'))
from scripts.utils.file_helpers import pull_gpkg_from_directory, upload_csv_aws
from scripts.utils.write_metadata import append_metadata

In [None]:
# pull the reprojected input data for this metric from the AWS S3 directory below
# NOTE(review): presumably this downloads the .gpkg file(s) into the working
# directory -- the next cell reads one by its local filename; confirm in file_helpers
bucket_name = 'ca-climate-index'
aws_dir = '2b_reproject/governance/natural_resource_conservation/ca_state_water_resources_board/'

pull_gpkg_from_directory(bucket_name, aws_dir)

In [None]:
# large file, so we read it with dask_geopandas and break it up into 5 partitions
water_well_data = dask_geopandas.read_file('GAMA_division_drinking_water_reproject.gpkg',npartitions=5)
# let's load in one partition so we can look at the data (up to about the first 1/5 of the rows)
# only partition 0 is computed here; `water_well_data` itself stays a dask object
df = water_well_data.partitions[0].compute()
df

In [None]:
# lots of columns! List them so we can decide which ones to drop to save memory
df.columns

In [5]:
# columns we need to calculate our metrics
# the selection is made on the partitioned `water_well_data` object (not on the
# single-partition preview `df`), so it covers the whole file; it is only
# materialised when `.compute()` is called in the next cell
subset = water_well_data[[
    'gm_well_id',
    'gm_chemical_name',
    'gm_result_modifier',
    'gm_result',
    'gm_chemical_units', 
    'gm_reporting_limit',
    'GEOID'
]]

In [None]:
# now that the dataframe is much smaller (7 columns), we can load all partitions into memory
subset_df = subset.compute()
subset_df

In [None]:
# list the distinct chemical names in the data so we can find the exact
# spellings of the contaminants this metric needs
unique_chemical_names = subset_df['gm_chemical_name'].unique()
unique_chemical_names

In [None]:
# looking at how many distinct census tracts (GEOID values) there are in the data
unique_tracts = subset_df['GEOID'].unique()
print(len(unique_tracts))

In [None]:
# looking at how many distinct wells (gm_well_id values) there are in the data
unique_wells = subset_df['gm_well_id'].unique()
print(len(unique_wells))

In [None]:
# Step 1: collapse the sample rows to one row per (tract, well) pair.
# The count column at this stage is the number of sample rows for that pair.
count_entries_geoid_wellid = (
    subset_df.groupby(['GEOID', 'gm_well_id'])
    .size()
    .rename('num_sampled_wells')
    .reset_index()
)

# Step 2: count the (tract, well) rows per tract, i.e. the number of distinct
# wells sampled in each tract.
final_count_entries_geoid = (
    count_entries_geoid_wellid.groupby('GEOID')
    .size()
    .rename('num_sampled_wells')
    .reset_index()
)

final_count_entries_geoid

In [None]:
# Spot check: this Orange County tract had 561 rows in the data; confirm that
# they all belong to a single well.
orange_county = subset_df.loc[subset_df['GEOID'] == '06059001303']
print(orange_county.iloc[:5])

# distinct well ids found in that tract
unique_wells_orange_county = orange_county['gm_well_id'].unique()
unique_wells_orange_county

In [None]:
# summing up the per-tract well counts to make sure the total matches the
# number of unique wells printed above
total_well_count = final_count_entries_geoid['num_sampled_wells'].sum()

print("Total count of entries:", total_well_count)

In [None]:
# The five contaminants this metric covers, spelled exactly as they are
# matched against the 'gm_chemical_name' column.
list_of_contaminants = [
    'Nitrate as N',
    'Arsenic',
    '1,2,3-Trichloropropane (1,2,3 TCP)',
    'Chromium, Hexavalent (Cr6)',
    'Uranium',
]

# keep only the sample rows for those contaminants
contaminant_data = subset_df.loc[
    subset_df['gm_chemical_name'].isin(list_of_contaminants)
]
contaminant_data

In [None]:
# Keep a sample row when EITHER of these holds:
#   condition1: the result modifier is '>'
#   condition2: the numeric result is above the reporting limit
condition1 = contaminant_data['gm_result_modifier'].isin(['>'])
condition2 = contaminant_data['gm_result'].gt(contaminant_data['gm_reporting_limit'])

# rows that exceed the reporting limit by either test
filtered_df = contaminant_data.loc[condition1 | condition2]

filtered_df

In [None]:
# Step 1: one row per (tract, well) pair that has at least one exceeding sample.
# The count column at this stage is the number of exceeding sample rows.
count_contaminate_geoid = (
    filtered_df.groupby(['GEOID', 'gm_well_id'])
    .size()
    .rename('num_contaminated_wells')
    .reset_index()
)

# Step 2: count those pairs per tract, i.e. the number of distinct wells in
# each tract with at least one exceeding sample.
final_count_contaminant_geoid = (
    count_contaminate_geoid.groupby('GEOID')
    .size()
    .rename('num_contaminated_wells')
    .reset_index()
)

final_count_contaminant_geoid


In [None]:
# Attach the contaminated-well count to the sampled-well count for each tract.
# Left join: every sampled tract is kept, even with no contaminated wells.
well_merge = pd.merge(
    left=final_count_entries_geoid,
    right=final_count_contaminant_geoid,
    how='left',
    on='GEOID',
)

# percentage of sampled wells that are contaminated, per tract
well_merge['percent_sampled_wells_contaminated'] = (
    well_merge['num_contaminated_wells']
    .div(well_merge['num_sampled_wells'])
    .mul(100)
)

# Every tract in this table was sampled, so a missing contaminated count (and
# the resulting missing percentage) really means zero.
well_merge = well_merge.fillna(0)

well_merge

In [None]:
# Sanity check: no tract should have more contaminated wells than sampled
# wells, so the table displayed below is expected to be empty.
check = well_merge['num_contaminated_wells'].gt(well_merge['num_sampled_wells'])
true_rows = well_merge.loc[check]
true_rows

In [32]:
# Load the California tract/county lookup table from S3, drop the columns we
# do not need, and rename TRACT to GEOID so it can be joined to the metric
# table on that key.
ca_tract_county = gpd.read_file("s3://ca-climate-index/0_map_data/ca_tracts_county.csv")
ca_tract_county = (
    ca_tract_county
    .drop(columns=['field_1', 'geometry'])
    .rename(columns={'TRACT': 'GEOID'})
)

In [None]:
# preview the tract/county lookup table
ca_tract_county

In [None]:
# Join the metric onto the full list of California census tracts.
# Left join: tracts with no well data are kept and stay NaN, since there was
# no data for them.
well_merge_ca_tracts = pd.merge(
    left=ca_tract_county,
    right=well_merge,
    how='left',
    on='GEOID',
)
well_merge_ca_tracts

In [82]:
# save the tract-level metric to a local CSV (row index omitted); this is the
# file passed to the upload function below
well_merge_ca_tracts.to_csv('governance_well_quality_metric.csv', index=False)

## Function Call

In [83]:
@append_metadata
def gama_well_water_quality_upload(input_csv, export=False, varname=''):
    '''
    Uploads the water well quality metric to S3 bucket. The metric is:
    
    * Percentage of domestic wells considered “high risk” for any of the following contaminants:
      Nitrate, Arsenic, 1,2,3-Trichloropropane, Hexavalent Chromium, Uranium

    Data for this metric was sourced from the California State Water Resources Control Board: GAMA - Division of Drinking Water at
    https://data.ca.gov/dataset/ground-water-water-quality-results/resource/d2e74ace-2cf4-4baf-aadd-406280bf1c1c?inner_span=True

    Methods
    -------
    Relevant data columns were isolated, some were renamed for later merging with California tract data.
    Total number of wells sampled per census tract were retained for the percentage calculation.
    Specific contaminants (per the metric) were isolated.
    Sample levels exceeding the 'gm_reporting_limit' column were further retained and counted per California tract.
    Total wells sampled and total entries exceeding desired contaminants were merged together per census tract.
    Number of samples exceeding the threshold were divided by number of wells sampled, then multiplied by 100 to calculate
    percentage.
    Tracts missing data were left as nan, as missing tracts were either not sampled from, or had missing data
    
    Parameters
    ----------
    input_csv: string
        csv water well quality metric data 
    export: True/False boolean
        False = will not upload resulting df containing CAL CRAI water well quality metric to AWS
        True = will upload resulting df containing CAL CRAI water well quality metric to AWS
    varname: string
        Not used inside this function body.
        NOTE(review): presumably read by the append_metadata decorator to label the metadata entry -- confirm.

    Script
    ------
    governance_gama_wells.ipynb

    Note:
    This function assumes users have configured the AWS CLI such that their access key / secret key pair are stored in ~/.aws/credentials.
    See https://docs.aws.amazon.com/cli/latest/userguide/getting-started-install.html for guidance.
    '''
    print('Data transformation: relevant columns and contaminants were isolated and renamed.')
    print('Data transformation: data was isolated when above its predefined threshold.')
    print('Data transformation: data were then merged to California census tracts.')

    if export:
        bucket_name = 'ca-climate-index'
        directory = '3_fair_data/index_data'
        # upload_csv_aws is called with a list of filenames
        export_filename = [input_csv]
        upload_csv_aws(export_filename, bucket_name, directory)
    else:
        # Nothing is uploaded on this branch, so say so. The previous message
        # claimed the file had been uploaded.
        print(f'{input_csv} was not uploaded to AWS.')

    # NOTE: the local CSV is left on disk; the removal step that used to
    # follow here was already disabled.

In [84]:
# local CSV written above, and the variable name this metric is recorded under
input_csv = 'governance_well_quality_metric.csv'
variable = 'governance_swcrb_groundwater_quality'

# Pass the real variable name rather than the placeholder 'test': `variable`
# was defined but never used, so an export would have been labelled 'test'.
gama_well_water_quality_upload(input_csv=input_csv, export=True, varname=variable)