## Built Domain: Wastewater Treatment Facility Metric Calculation
* number of operating wastewater treatment facilities per California county

The data has totals from facilities that are exclusively for wastewater and from facilities that have wastewater as part of their overall facility (I think), so we just need to decide which total to use.


In [56]:
import os
import sys
import pandas as pd
import io
import numpy as np
import geopandas as gpd

sys.path.append(os.path.expanduser('../../'))
from scripts.utils.write_metadata import (
    append_metadata
)
from scripts.utils.file_helpers import (
    pull_csv_from_directory, upload_csv_aws, filter_counties
)

In [28]:
# pull csv from aws
# bucket and folder holding the raw facility data (ca_wrcb = CA water resources control board)
bucket_name = 'ca-climate-index'
aws_dir = '1_pull_data/built_environment/utilities/ca_wrcb/'

# download the csv found under aws_dir into the working directory;
# search_zipped=False presumably skips zip archives -- confirm in scripts/utils/file_helpers
pull_csv_from_directory(bucket_name, aws_dir, search_zipped=False)


Saved DataFrame as 'swcrb_wastewater_treatment_facilities.csv'


### Per the data source in reference to the 'Total' column:
 
 *Some facilities may be related to multiple programs. This field shows a unique count of facilities so the total across a row may not sum.

 I think this means the total value contains exclusively wastewater treatment facilities


In [30]:
# load the csv that the pull step saved into the working directory
wastewater_facilities_data = pd.read_csv('swcrb_wastewater_treatment_facilities.csv')
# preview the per-program count columns and the source's 'Total' column
wastewater_facilities_data.head()

Unnamed: 0,County,CER,DOD,IRRI,LNDISP,NPDES,SSO,TANKS,WDR,Total
0,Alameda,0,0,0,0,10,0,0,7,16
1,Alpine,0,0,0,0,1,0,0,5,5
2,Amador,0,0,0,0,1,0,0,14,15
3,Butte,0,0,0,0,2,0,2,13,15
4,Calaveras,0,0,0,0,4,0,1,23,26


In [44]:
# per-program facility count columns provided by the data source
sum_columns = ['CER','DOD','IRRI','LNDISP','NPDES','SSO','TANKS','WDR']

# add a row-wise sum across all program columns, then rename the county column
# and the source's own 'Total' column to the names used in the rest of the notebook
wastewater_facilities_data = (
    wastewater_facilities_data
    .assign(total_all_facilities=lambda df: df[sum_columns].sum(axis=1))
    .rename(columns={'County':'county', 'Total':'exclusive_wastewater_facilities'})
)
wastewater_facilities_data.head()

Unnamed: 0,county,CER,DOD,IRRI,LNDISP,NPDES,SSO,TANKS,WDR,exclusive_wastewater_facilities,total_all_facilities
0,Alameda,0,0,0,0,10,0,0,7,16,17
1,Alpine,0,0,0,0,1,0,0,5,5,6
2,Amador,0,0,0,0,1,0,0,14,15,15
3,Butte,0,0,0,0,2,0,2,13,15,17
4,Calaveras,0,0,0,0,4,0,1,23,26,28


In [54]:
# keep only the county name and the two candidate totals
metric_columns = ['county', 'exclusive_wastewater_facilities', 'total_all_facilities']
filtered_wastewater_facilities = wastewater_facilities_data[metric_columns]

# lower-case every string cell (in practice the county names) so the values
# match the lower-cased tract/county table they are merged with later
# NOTE(review): DataFrame.applymap is deprecated as of pandas 2.1 in favor of
# DataFrame.map -- switch once the project's minimum pandas version allows it
filtered_wastewater_facilities = filtered_wastewater_facilities.applymap(
    lambda s: s.lower() if isinstance(s, str) else s
)

filtered_wastewater_facilities.head()

Unnamed: 0,county,exclusive_wastewater_facilities,total_all_facilities
0,alameda,16,17
1,alpine,5,6
2,amador,15,15
3,butte,15,17
4,calaveras,26,28


In [57]:
# split the rows using the 'county' column: the first result holds the rows kept by
# filter_counties, the second the rows it set aside (presumably non-county entries --
# confirm in scripts/utils/file_helpers)
further_filtered_wastewater_facilities, omitted_rows = filter_counties(filtered_wastewater_facilities, 'county')

## All entries are counties except the total values at the end of the data, so we can continue to use filtered_wastewater_facilities when merging to California tract data

In [59]:
# inspect the rows that filter_counties set aside
omitted_rows

Unnamed: 0,county,exclusive_wastewater_facilities,total_all_facilities
58,total,1119,1435


## Call in California tract and county data and adjust the columns and entries to match our wastewater facility data for merging

In [51]:
# read in the CA tract/county lookup csv; the path gets its own name so that
# ca_tract_county only ever refers to the loaded table
ca_tract_county_path = "s3://ca-climate-index/0_map_data/ca_tracts_county.csv"
ca_tract_county = gpd.read_file(ca_tract_county_path)

# lower-case the column names and every string value so they match the
# lower-cased county names in the wastewater facility data
ca_tract_county.columns = ca_tract_county.columns.str.lower()
ca_tract_county = ca_tract_county.applymap(lambda s: s.lower() if isinstance(s, str) else s)

# drop the columns that are not needed for the county merge
ca_tract_county = ca_tract_county.drop(columns=['field_1', 'geometry', 'countyfp'])

ca_tract_county

Unnamed: 0,tract,county
0,06085504321,santa clara
1,06085504410,santa clara
2,06085507003,santa clara
3,06085507004,santa clara
4,06085502204,santa clara
...,...,...
9124,06059001303,orange
9125,06059001304,orange
9126,06059001401,orange
9127,06013367200,contra costa


## Merge CA tract and county data with cleaned wastewater treatment facility data

In [61]:
# attach each county's facility totals to every census tract in that county;
# the left join keeps every tract row even if its county has no facility record
merged_wastewater_facilities = pd.merge(
    left=ca_tract_county,
    right=filtered_wastewater_facilities,
    how='left',
    on='county',
)
merged_wastewater_facilities

Unnamed: 0,tract,county,exclusive_wastewater_facilities,total_all_facilities
0,06085504321,santa clara,13,15
1,06085504410,santa clara,13,15
2,06085507003,santa clara,13,15
3,06085507004,santa clara,13,15
4,06085502204,santa clara,13,15
...,...,...,...,...
9124,06059001303,orange,23,46
9125,06059001304,orange,23,46
9126,06059001401,orange,23,46
9127,06013367200,contra costa,21,27


In [62]:
# save final df to csv for upload
# index=False keeps the row index out of the file, so it holds only the data columns
merged_wastewater_facilities.to_csv('built_wastewater_treatment_facilities_metric.csv', index=False)

## Function call

In [63]:
@append_metadata
def wastewater_treatment_upload(input_csv, export=False, varname=''):
    '''
    Uploads prepared wastewater treatment facilities metric csv to S3 bucket. The metric is:
    # of operating wastewater treatment facilities per California county.

    Data for this metric was sourced from California State Water Resources Control Board at:
    https://www.waterboards.ca.gov/water_issues/programs/npdes/permit_search.html within the Interactive Regulated Facilities Report.

    Methods
    -------
    Columns were adjusted and summed to calculate total facilities that include wastewater treatment.
    Total facility columns and the county column were retained and merged with California 2021 tiger census tract and county data.
    
    Parameters
    ----------
    input_csv: string
        csv wastewater facility data 
    export: True/False boolean
        False = will not upload resulting df containing CAL CRAI wastewater treatment facility metric to AWS
        True = will upload resulting df containing CAL CRAI wastewater treatment facility metric to AWS
    varname: string
        Not read inside this function body; presumably consumed by the
        append_metadata decorator to label the metadata entry (confirm in
        scripts/utils/write_metadata).

    Script
    ------
    built_wastewater_treatment.ipynb

    Note:
    This function assumes users have configured the AWS CLI such that their access key / secret key pair are stored in ~/.aws/credentials.
    See https://docs.aws.amazon.com/cli/latest/userguide/getting-started-install.html for guidance.
    The local input_csv file is deleted at the end of the call, whether or not it was uploaded.
    '''
    print('Data transformation: data cleaned by isolating and renaming relevant columns.')
    print('Data transformation: an additional column was calculated by summing all columns.')
    print('Data transformation: data was merged to California census tract and county data.')

    # upload only when explicitly requested
    if export:
        bucket_name = 'ca-climate-index'
        directory = '3_fair_data/index_data'
        # upload_csv_aws is handed a list of file names, so wrap the single csv
        export_filename = [input_csv]
        upload_csv_aws(export_filename, bucket_name, directory)

    # clean up the local copy regardless of whether it was uploaded
    if os.path.exists(input_csv):
        os.remove(input_csv)

In [64]:
# metric csv written earlier in the notebook; the upload function removes the local copy when it finishes
built_wastewater_csv='built_wastewater_treatment_facilities_metric.csv'
var = 'built_swcrb_wastewater_facilities'

# NOTE(review): `var` is defined above but never used -- the call passes varname='test';
# confirm which value is intended before running with export=True
wastewater_treatment_upload(built_wastewater_csv, export=True, varname='test')