In [1]:
import pandas as pd
import os
import sys
import numpy as np
import boto3
import geopandas as gpd

# suppress pandas purely educational warnings
from warnings import simplefilter
simplefilter(action="ignore", category=pd.errors.PerformanceWarning)

sys.path.append(os.path.expanduser('../../'))
from scripts.utils.file_helpers import pull_csv_from_directory, upload_csv_aws
from scripts.utils.write_metadata import append_metadata

In [2]:
# Download the PSPS csv from the project S3 bucket into the working directory.
# The next cell reads it back as 'public_safety_power_shutoff_frequency.csv'.
bucket_name = 'ca-climate-index'
aws_dir = '1_pull_data/built_environment/utilities/pse_health_energy/'

# NOTE(review): search_zipped=False presumably tells the helper not to look
# inside zip archives -- confirm against scripts/utils/file_helpers.py
pull_csv_from_directory(bucket_name, aws_dir, search_zipped=False)

Saved DataFrame as 'public_safety_power_shutoff_frequency.csv'


In [3]:
# read in the public safety power shutoff (PSPS) frequency data pulled from AWS
power_shutoff_data = pd.read_csv('public_safety_power_shutoff_frequency.csv')
print(len(power_shutoff_data))
# rename the tract identifier so it can be merged on 'GEOID' with the census tract file
power_shutoff_data = power_shutoff_data.rename(columns={'Fips':'GEOID'})
# local cleanup is currently disabled, so the downloaded csv stays in the working directory
#os.remove('public_safety_power_shutoff_frequency.csv')

8033


In [4]:
# preview the raw PSPS table; rows from non-reporting utilities hold the string
# 'Unknown' in the outage columns (e.g. out_freq_s) instead of a number
power_shutoff_data

Unnamed: 0,apprx_loc,Avg Dur St,avg_cst_st,CES str,Duration buckets,GEOID,out_freq_s,Utility,Geometry,Latitude (generated),Longitude (generated),Max. extrm_heat,Max. med_income,Max. pct_2xFPL
0,Unincorporated Sonoma County area,109,2338,13,96+ hrs (4+ days),6097154302,0.4,PG&E,MultiPolygon,38.374417,-122.995254,12,"$80,341",20%
1,Unincorporated Sonoma County area,98,1760,19,96+ hrs (4+ days),6097153706,0.4,PG&E,Polygon,38.474223,-122.913838,12,"$64,746",27%
2,Unincorporated Sonoma County area,119,1650,17,96+ hrs (4+ days),6097153600,0.4,PG&E,Polygon,38.419728,-122.890941,11,"$102,647",21%
3,Unincorporated Sonoma County area,113,2165,7,96+ hrs (4+ days),6097153502,0.4,PG&E,Polygon,38.344901,-122.879409,11,"$94,023",24%
4,Unincorporated Sonoma County area,109,3371,8,96+ hrs (4+ days),6097153501,0.2,PG&E,Polygon,38.340157,-122.800565,13,"$98,598",11%
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8028,Alameda,Unknown,Unknown,39,Does not report,6001427700,Unknown,Non-reporting utility,MultiPolygon,37.769823,-122.285105,7,"$101,678",15%
8029,Alameda,Unknown,Unknown,69,Does not report,6001427600,Unknown,Non-reporting utility,Polygon,37.777607,-122.284672,7,"$70,650",33%
8030,Alameda,Unknown,Unknown,62,Does not report,6001427300,Unknown,Non-reporting utility,Polygon,37.781250,-122.265201,7,"$94,939",20%
8031,Alameda,Unknown,Unknown,66,Does not report,6001427200,Unknown,Non-reporting utility,MultiPolygon,37.775939,-122.248954,7,"$77,375",27%


## The source data uses older census tract boundaries, so we first join it to the 2017 tract geometries

In [5]:
# Load the 2017 California TIGER tract shapefile, keeping only the tract id and
# its geometry. GEOID is converted to a number so it matches the numeric GEOID
# column of the PSPS table.
old_census_path = "s3://ca-climate-index/0_map_data/tl_2017_06_tract/"
ca_old = (
    gpd.read_file(old_census_path)[["GEOID", "geometry"]]
    .assign(GEOID=lambda tracts: pd.to_numeric(tracts["GEOID"]))
)

In [6]:
# Inner-join the 2017 tract geometries to the PSPS records on the tract id, then
# make sure the result is a GeoDataFrame whose active geometry is the tract polygon.
merged_on_geoid = pd.merge(
    left=ca_old,
    right=power_shutoff_data,
    how="inner",
    on="GEOID",
)
old_tract_power_shutoff_data = gpd.GeoDataFrame(merged_on_geoid, geometry="geometry")

In [7]:
# read in the 2021 CA census tract TIGER shapefile
census_shp_dir = "s3://ca-climate-index/0_map_data/2021_tiger_census_tract/2021_ca_tract/"

ca_boundaries = gpd.read_file(census_shp_dir)
# Prefix every attribute column with "USCB_" so we don't have any duplicates in
# the final geodatabase. The geometry column is excluded from BOTH lists before
# they are zipped; zipping the unfiltered column list against the filtered one
# only lines up when geometry happens to be the last column.
column_names = [column for column in ca_boundaries.columns if column != "geometry"]
new_column_names = ["USCB_" + column for column in column_names]
ca_boundaries = ca_boundaries.rename(columns=dict(zip(column_names, new_column_names)))
# drop unnecessary columns
ca_boundaries = ca_boundaries[["geometry","USCB_GEOID"]]
ca_boundaries

Unnamed: 0,geometry,USCB_GEOID
0,"POLYGON ((-121.87556 37.39924, -121.87535 37.3...",06085504321
1,"POLYGON ((-121.88886 37.40758, -121.88576 37.4...",06085504410
2,"POLYGON ((-122.02489 37.21683, -122.02459 37.2...",06085507003
3,"POLYGON ((-121.99304 37.22562, -121.99249 37.2...",06085507004
4,"POLYGON ((-121.93167 37.29803, -121.92801 37.3...",06085502204
...,...,...
9124,"POLYGON ((-117.95917 33.92458, -117.95888 33.9...",06059001303
9125,"POLYGON ((-117.95918 33.92820, -117.95831 33.9...",06059001304
9126,"POLYGON ((-117.95056 33.94503, -117.95055 33.9...",06059001401
9127,"POLYGON ((-122.34551 37.96355, -122.34550 37.9...",06013367200


In [8]:
# put both layers in the same coordinate reference system (EPSG:4269) before the spatial join
old_tract_power_shutoff_data = old_tract_power_shutoff_data.to_crs(epsg=4269)
ca_boundaries = ca_boundaries.to_crs(epsg=4269)

In [21]:
# Attach to each 2021 tract the nearest 2017-tract PSPS record(s). With
# how="inner", 2021 tracts with no match within max_distance are dropped; the
# separation of each matched pair is stored in the 'distances' column.
# NOTE(review): both layers were reprojected to EPSG:4269 in the cell above, which
# is a geographic CRS, so max_distance=5000 and 'distances' are presumably
# measured in degrees rather than metres -- confirm whether a projected CRS was
# intended for this step.
joined_df = gpd.sjoin_nearest(
    ca_boundaries, old_tract_power_shutoff_data, 
    how="inner", distance_col="distances", 
    max_distance=5000
)
joined_df




In [10]:
# Columns that must be numeric before they can be averaged per tract.
data_vars = ['out_freq_s']

# This step has to run as real code: the source stores 'Unknown' as a string for
# non-reporting utilities, so the column is not numeric until it is coerced, and
# the per-tract mean computed later needs numbers. Unparseable values become NaN.
for col in data_vars:
    coerced = pd.to_numeric(joined_df[col], errors='coerce')
    # show the rows that could not be parsed so the NaNs introduced here are visible
    non_numeric = joined_df[coerced.isna()]
    if not non_numeric.empty:
        print(f"Non-numeric values found in column '{col}':")
        display(non_numeric)
    joined_df[col] = coerced

Non-numeric values found in column 'out_freq_s':


Unnamed: 0,geometry,USCB_GEOID,index_right,GEOID,apprx_loc,Avg Dur St,avg_cst_st,CES str,Duration buckets,out_freq_s,Utility,Geometry,Latitude (generated),Longitude (generated),Max. extrm_heat,Max. med_income,Max. pct_2xFPL,distances
11,"POLYGON ((-121.96008 37.40986, -121.95941 37.4...",06085505012,4258,6085504901,Santa Clara,Unknown,Unknown,25,Does not report,Unknown,Non-reporting utility,Polygon,37.397201,-121.950094,13,"$161,505",12%,0.0
13,"POLYGON ((-121.98711 37.40294, -121.98706 37.4...",06085505010,4258,6085504901,Santa Clara,Unknown,Unknown,25,Does not report,Unknown,Non-reporting utility,Polygon,37.397201,-121.950094,13,"$161,505",12%,0.0
16,"POLYGON ((-121.95267 37.39652, -121.95239 37.3...",06085504903,4258,6085504901,Santa Clara,Unknown,Unknown,25,Does not report,Unknown,Non-reporting utility,Polygon,37.397201,-121.950094,13,"$161,505",12%,0.0
1891,"POLYGON ((-121.96404 37.38192, -121.96395 37.3...",06085505007,4258,6085504901,Santa Clara,Unknown,Unknown,25,Does not report,Unknown,Non-reporting utility,Polygon,37.397201,-121.950094,13,"$161,505",12%,0.0
6575,"POLYGON ((-121.96641 37.40639, -121.96632 37.4...",06085505011,4258,6085504901,Santa Clara,Unknown,Unknown,25,Does not report,Unknown,Non-reporting utility,Polygon,37.397201,-121.950094,13,"$161,505",12%,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9002,"POLYGON ((-120.49124 37.29082, -120.49102 37.2...",06047001601,4034,6047001401,Merced,Unknown,Unknown,92,Does not report,Unknown,Non-reporting utility,Polygon,37.296592,-120.461766,26,"$42,193",61%,0.0
9027,"POLYGON ((-120.48697 37.29827, -120.48682 37.2...",06047001302,4034,6047001401,Merced,Unknown,Unknown,92,Does not report,Unknown,Non-reporting utility,Polygon,37.296592,-120.461766,26,"$42,193",61%,0.0
9029,"POLYGON ((-120.46801 37.29491, -120.46798 37.2...",06047001700,4034,6047001401,Merced,Unknown,Unknown,92,Does not report,Unknown,Non-reporting utility,Polygon,37.296592,-120.461766,26,"$42,193",61%,0.0
9049,"MULTIPOLYGON (((-119.05291 33.47917, -119.0518...",06083980100,4103,6083980100,Unincorporated Santa Barbara County area,Unknown,Unknown,No data,Does not report,Unknown,Non-reporting utility,MultiPolygon,33.982349,-119.908616,12,,100%,0.0


In [17]:
data_vars = ['out_freq_s']
# Average the old-tract values that were joined to each 2021 tract. USCB_GEOID
# alone identifies a 2021 tract, so the geometry is not needed as a grouping key
# (it was only being dropped again afterwards); the result is a plain table of
# tract id and mean PSPS frequency.
power_shutoff_new_tracts = (
    joined_df
    .groupby('USCB_GEOID')[data_vars]
    .mean()
    .reset_index()
)
power_shutoff_new_tracts

Unnamed: 0,USCB_GEOID,out_freq_s
0,06001400100,0.620000
1,06001400200,0.466667
2,06001400300,0.377778
3,06001400400,0.400000
4,06001400500,0.333333
...,...,...
9124,06115040902,1.133333
9125,06115041001,0.925000
9126,06115041002,2.000000
9127,06115041101,2.285714


In [18]:
# boolean mask: True for tracts whose averaged PSPS frequency is missing
nan_checking = power_shutoff_new_tracts['out_freq_s'].isna()


In [19]:
# list the tracts left without a PSPS frequency value
nan_rows = power_shutoff_new_tracts.loc[nan_checking]
print(nan_rows)


       USCB_GEOID  out_freq_s
163   06001427600         NaN
164   06001427700         NaN
165   06001427800         NaN
166   06001427900         NaN
167   06001428000         NaN
...           ...         ...
8702  06099003906         NaN
8703  06099003907         NaN
8704  06099003908         NaN
8705  06099003909         NaN
9055  06111980000         NaN

[1480 rows x 2 columns]


In [20]:
# write the per-tract metric to a local csv (row index omitted) for upload
power_shutoff_new_tracts.to_csv(
    path_or_buf='built_power_shutoffs_metric.csv',
    index=False,
)

In [15]:
#@append_metadata
def power_shutoff_upload(input_csv, export=False, varname=''):
    '''
    Uploads the calculated Public Safety Power Shutoff (PSPS) metric to the S3 bucket. The metric is:
    Frequency of PSPS events per California census tract.

    Data for this metric was sourced from PSE Healthy Energy at:
    https://www.psehealthyenergy.org/work/california-public-safety-power-shutoff-interactive-map/ from the 
    PSPS Duration by Census Tract section

    Methods
    -------
    The data was from older census tracts, so we merged it with 2017 California Tiger shape files first.
    The data was then set to Cal-CRAI standardized coordinate reference system (CRS) 4269.
    Data was then spatially joined to the nearest 2021 census tract data.
    Data were then grouped spatially and had the PSPS frequency data averaged per census tract.
    
    Parameters
    ----------
    input_csv: string
        path to the local csv file holding the PSPS metric
    export: True/False boolean
        False = will not upload resulting df containing CAL CRAI PSPS metric to AWS
        True = will upload resulting df containing CAL CRAI PSPS metric to AWS
    varname: string
        not used inside the function body; presumably consumed by the
        (currently disabled) append_metadata decorator -- confirm before removing

    Script
    ------
    built_power_shutoff.ipynb

    Note:
    The local input_csv file is deleted at the end of the call whether or not it was uploaded.
    This function assumes users have configured the AWS CLI such that their access key / secret key pair are stored in ~/.aws/credentials.
    See https://docs.aws.amazon.com/cli/latest/userguide/getting-started-install.html for guidance.
    '''
    print('Data transformation: merged to 2017 census tracts.')
    print('Data transformation: reprojected to CRS 4269.')
    print('Data transformation: spatially joined to 2021 census tracts.')

    if export:
        bucket_name = 'ca-climate-index'
        directory = '3_fair_data/index_data'
        # the upload helper is given a list of file names
        export_filename = [input_csv]
        upload_csv_aws(export_filename, bucket_name, directory)
    else:
        # nothing was sent, so do not claim an upload happened
        print(f'{input_csv} not uploaded to AWS (export=False).')

    # clean up the local copy in either case
    if os.path.exists(input_csv):
        os.remove(input_csv)

In [16]:
input_csv = 'built_power_shutoffs_metric.csv'
varname = 'built_pse_power_shutoff'

# pass the metric's variable name defined above rather than a placeholder value
power_shutoff_upload(input_csv, export=True, varname=varname)

Data transformation: merged to 2017 census tracts.
Data transformation: reprojected to CRS 4269.
Data transformation: spatially joined to 2021 census tracts.
built_power_shutoffs_metric.csv uploaded to AWS
