## Cal-CRAI Metric Calculation
Domain: Built Environment \
Indicator: Utilities

This notebook calculates one metric, sourced from the California State Water Resources Control Board:
* Metric 1: Number of operating wastewater treatment facilities per California county

In [28]:
import os
import sys
import pandas as pd
import io
import numpy as np
import geopandas as gpd

sys.path.append(os.path.expanduser('../../'))
from scripts.utils.write_metadata import (
    append_metadata
)
from scripts.utils.file_helpers import (
    pull_csv_from_directory, upload_csv_aws, filter_counties
)

In [None]:
# Fetch the raw SWRCB wastewater facility csv from the project S3 bucket.
# NOTE(review): presumably the file lands in the working directory, since the
# next cell reads it by bare filename -- confirm against pull_csv_from_directory.
bucket_name, aws_dir = (
    'ca-climate-index',
    '1_pull_data/built_environment/utilities/ca_wrcb/',
)

pull_csv_from_directory(bucket_name, aws_dir, search_zipped=False)


### Per the data source in reference to the 'Total' column:
 
 *Some facilities may be related to multiple programs. This field shows a unique count of facilities so the total across a row may not sum.

 I think this means the total value contains exclusively wastewater treatment facilities


In [None]:
# Load the raw facility counts (one row per county, one column per program)
# and preview the first rows.
raw_csv_name = 'swcrb_wastewater_treatment_facilities.csv'
wastewater_facilities_data = pd.read_csv(raw_csv_name)
wastewater_facilities_data.head()

In [None]:
# Program columns added together into an all-program facility count;
# LNDISP is intentionally left out of the sum and dropped afterwards.
sum_columns = ['CER', 'DOD', 'IRRI', 'NPDES', 'SSO', 'TANKS', 'WDR']
column_renames = {'County': 'county', 'Total': 'num_exclusive_wastewater_facilities'}

wastewater_facilities_data = (
    wastewater_facilities_data
    # row-wise sum across the selected program columns
    .assign(total_all_facilities=lambda frame: frame[sum_columns].sum(axis=1))
    # the source 'Total' column is the unique facility count used for the metric
    .rename(columns=column_renames)
    .drop(columns='LNDISP')
)
wastewater_facilities_data.head()

In [None]:
# Keep only the county name and the two facility-count columns.
metric_columns = ['county', 'num_exclusive_wastewater_facilities', 'total_all_facilities']
filtered_wastewater_facilities = wastewater_facilities_data[metric_columns]

# Lowercase every string value (in practice the county names) and leave all
# other values untouched. A per-column Series.map replaces DataFrame.applymap,
# which is deprecated since pandas 2.1 and emits a FutureWarning.
filtered_wastewater_facilities = filtered_wastewater_facilities.apply(
    lambda column: column.map(
        lambda value: value.lower() if isinstance(value, str) else value
    )
)

filtered_wastewater_facilities.head()

In [41]:
# Run the project's county filter on the 'county' column; it hands back two
# objects, which are unpacked into the kept rows and the omitted rows.
county_split = filter_counties(filtered_wastewater_facilities, 'county')
further_filtered_wastewater_facilities, omitted_rows = county_split

### All entries are counties except the total values at the end of the data, so we can continue to use filtered_wastewater_facilities when merging to California tract data

In [None]:
# Display the second output of filter_counties (presumably the rows it did not
# treat as counties) so we can confirm nothing other than totals was set aside.
omitted_rows

### Call in California tract and county data and adjust the columns and entries to match our wastewater facility data for merging

In [None]:
# Read in the CA census tract / county lookup table.
# The path gets its own name so `ca_tract_county` only ever refers to the table.
ca_tract_county_path = "s3://ca-climate-index/0_map_data/ca_tracts_county.csv"
ca_tract_county = gpd.read_file(ca_tract_county_path)

# Lowercase the column names and every string value so the table can be joined
# to the lowercased wastewater data on 'county'. A per-column Series.map replaces
# DataFrame.applymap, which is deprecated since pandas 2.1.
ca_tract_county.columns = ca_tract_county.columns.str.lower()
ca_tract_county = ca_tract_county.apply(
    lambda column: column.map(
        lambda value: value.lower() if isinstance(value, str) else value
    )
)

# Drop columns that are not needed for the metric.
ca_tract_county = ca_tract_county.drop(columns=['field_1', 'geometry', 'countyfp'])

ca_tract_county

### Merge CA tract and county data with cleaned wastewater treatment facility data
* we have decided to use the original total column (now called num_exclusive_wastewater_facilities) for our metric

In [None]:
# Left-join on 'county' so every row of the tract table is kept, then discard
# the all-program total, which is not part of the final metric.
merged_wastewater_facilities = pd.merge(
    left=ca_tract_county,
    right=filtered_wastewater_facilities,
    how='left',
    on='county',
).drop(columns='total_all_facilities')
merged_wastewater_facilities

In [44]:
# Write the finished metric table to a local csv (without the index) for the upload step.
metric_csv_path = 'built_wastewater_treatment_facilities_metric.csv'
merged_wastewater_facilities.to_csv(metric_csv_path, index=False)

## Function call

In [45]:
@append_metadata
def wastewater_treatment_upload(input_csv, export=False, varname=''):
    '''
    Uploads prepared wastewater treatment facilities metric csv to S3 bucket. The metric is:
    # of operating wastewater treatment facilities per California county.

    Data for this metric was sourced from California State Water Resources Control Board at:
    https://www.waterboards.ca.gov/water_issues/programs/npdes/permit_search.html within the Interactive
    Regulated Facilities Report.

    Methods
    -------
    The original data file was a .xls file, which was converted to a .csv for easier processing.
    Total facility columns and the county column were retained and merged with California 2021 tiger census tract and county data.

    Parameters
    ----------
    input_csv: string
        csv wastewater facility data
    export: True/False boolean
        False = will not upload resulting df containing CAL CRAI wastewater treatment facility metric to AWS
        True = will upload resulting df containing CAL CRAI wastewater treatment facility metric to AWS
    varname: string
        Name of the metric variable. Not used in the function body; presumably consumed by the
        append_metadata decorator.

    Script
    ------
    built_wastewater_treatment.ipynb

    Note:
    This function assumes users have configured the AWS CLI such that their access key / secret key pair are stored in ~/.aws/credentials.
    See https://docs.aws.amazon.com/cli/latest/userguide/getting-started-install.html for guidance.
    '''
    print('Data transformation: data cleaned by isolating and renaming relevant columns.')
    print('Data transformation: data was merged to California census tract and county data.')

    if export:
        bucket_name = 'ca-climate-index'
        directory = '3_fair_data/index_data'
        # upload_csv_aws is given a list of file names
        export_filename = [input_csv]
        upload_csv_aws(export_filename, bucket_name, directory)
        # Report success only after an upload was actually attempted; the original
        # printed this message in the export == False branch instead.
        print(f'{input_csv} uploaded to AWS.')
    else:
        print(f'{input_csv} not uploaded to AWS (export=False).')

    # The local csv is removed whether or not it was uploaded (unchanged behaviour).
    if os.path.exists(input_csv):
        os.remove(input_csv)

In [46]:
built_wastewater_csv = 'built_wastewater_treatment_facilities_metric.csv'
var = 'built_swcrb_wastewater_facilities'

# Pass the metric's variable name instead of the 'test' placeholder; `var` was
# previously assigned but never used.
# NOTE(review): confirm `var` is the name append_metadata expects for this metric.
wastewater_treatment_upload(built_wastewater_csv, export=True, varname=var)