## Cal-CRAI Metric Calculation for: Built Environment / PSPS event frequency
* average Public Safety Power Shutoff (PSPS) events 

In [1]:
import pandas as pd
import os
import sys
import numpy as np
import boto3
import geopandas as gpd

# suppress pandas purely educational warnings
from warnings import simplefilter
simplefilter(action="ignore", category=pd.errors.PerformanceWarning)

sys.path.append(os.path.expanduser('../../'))
from scripts.utils.file_helpers import pull_csv_from_directory, upload_csv_aws
from scripts.utils.write_metadata import append_metadata

In [2]:
# pull csv from aws
bucket_name = 'ca-climate-index'
aws_dir = '1_pull_data/built_environment/utilities/pse_health_energy/'

pull_csv_from_directory(bucket_name, aws_dir, search_zipped=False)

Saved DataFrame as 'public_safety_power_shutoff_frequency.csv'


In [3]:
# read in food access data (already subsetted for CA)
power_shutoff_data = pd.read_csv('public_safety_power_shutoff_frequency.csv')
print(len(power_shutoff_data))
power_shutoff_data = power_shutoff_data.rename(columns={'Fips':'GEOID'})
# os.remove('public_safety_power_shutoff_frequency.csv')

8033


In [4]:
power_shutoff_data

Unnamed: 0,apprx_loc,Avg Dur St,avg_cst_st,CES str,Duration buckets,GEOID,out_freq_s,Utility,Geometry,Latitude (generated),Longitude (generated),Max. extrm_heat,Max. med_income,Max. pct_2xFPL
0,Unincorporated Sonoma County area,109,2338,13,96+ hrs (4+ days),6097154302,0.4,PG&E,MultiPolygon,38.374417,-122.995254,12,"$80,341",20%
1,Unincorporated Sonoma County area,98,1760,19,96+ hrs (4+ days),6097153706,0.4,PG&E,Polygon,38.474223,-122.913838,12,"$64,746",27%
2,Unincorporated Sonoma County area,119,1650,17,96+ hrs (4+ days),6097153600,0.4,PG&E,Polygon,38.419728,-122.890941,11,"$102,647",21%
3,Unincorporated Sonoma County area,113,2165,7,96+ hrs (4+ days),6097153502,0.4,PG&E,Polygon,38.344901,-122.879409,11,"$94,023",24%
4,Unincorporated Sonoma County area,109,3371,8,96+ hrs (4+ days),6097153501,0.2,PG&E,Polygon,38.340157,-122.800565,13,"$98,598",11%
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8028,Alameda,Unknown,Unknown,39,Does not report,6001427700,Unknown,Non-reporting utility,MultiPolygon,37.769823,-122.285105,7,"$101,678",15%
8029,Alameda,Unknown,Unknown,69,Does not report,6001427600,Unknown,Non-reporting utility,Polygon,37.777607,-122.284672,7,"$70,650",33%
8030,Alameda,Unknown,Unknown,62,Does not report,6001427300,Unknown,Non-reporting utility,Polygon,37.781250,-122.265201,7,"$94,939",20%
8031,Alameda,Unknown,Unknown,66,Does not report,6001427200,Unknown,Non-reporting utility,MultiPolygon,37.775939,-122.248954,7,"$77,375",27%


### It is using older tract data, so we will join it with 2017 Tract data first

In [5]:
# read in CA census tiger file
old_census_path = "s3://ca-climate-index/0_map_data/tl_2017_06_tract/"
ca_old = gpd.read_file(old_census_path)
ca_old['GEOID'] = pd.to_numeric(ca_old.GEOID)
ca_old = ca_old[["GEOID","geometry"]]

In [6]:
old_tract_power_shutoff_data = pd.merge(ca_old, power_shutoff_data, on="GEOID")
old_tract_power_shutoff_data = gpd.GeoDataFrame(old_tract_power_shutoff_data, geometry="geometry")

In [7]:
# read in CA census tiger file
census_shp_dir = "s3://ca-climate-index/0_map_data/2021_tiger_census_tract/2021_ca_tract/"

ca_boundaries = gpd.read_file(census_shp_dir)
# need to rename columns so we don't have any duplicates in the final geodatabase
column_names = ca_boundaries.columns
new_column_names = ["USCB_"+column for column in column_names if column != "geometry"]
ca_boundaries = ca_boundaries.rename(columns=dict(zip(column_names, new_column_names)))
# drop unnecessary columns
ca_boundaries = ca_boundaries[["geometry","USCB_GEOID"]]
ca_boundaries

Unnamed: 0,geometry,USCB_GEOID
0,"POLYGON ((-121.87556 37.39924, -121.87535 37.3...",06085504321
1,"POLYGON ((-121.88886 37.40758, -121.88576 37.4...",06085504410
2,"POLYGON ((-122.02489 37.21683, -122.02459 37.2...",06085507003
3,"POLYGON ((-121.99304 37.22562, -121.99249 37.2...",06085507004
4,"POLYGON ((-121.93167 37.29803, -121.92801 37.3...",06085502204
...,...,...
9124,"POLYGON ((-117.95917 33.92458, -117.95888 33.9...",06059001303
9125,"POLYGON ((-117.95918 33.92820, -117.95831 33.9...",06059001304
9126,"POLYGON ((-117.95056 33.94503, -117.95055 33.9...",06059001401
9127,"POLYGON ((-122.34551 37.96355, -122.34550 37.9...",06013367200


In [8]:
# need to convert to an area-preserving CRS for distance calculations
old_tract_power_shutoff_data = old_tract_power_shutoff_data.to_crs(crs=3857) 
ca_boundaries = ca_boundaries.to_crs(crs=3857) 

In [9]:
# first find the tracts which have not changed from 2010 to 2017
# find the indices which correspond to the new boundaries
unchanged_tracts_ca = pd.to_numeric(ca_boundaries['USCB_GEOID']).isin(pd.to_numeric(old_tract_power_shutoff_data['GEOID']))
ca_boundaries[unchanged_tracts_ca]

Unnamed: 0,geometry,USCB_GEOID
0,"POLYGON ((-13567125.366 4494902.743, -13567102...",06085504321
1,"POLYGON ((-13568606.361 4496070.766, -13568261...",06085504410
80,"POLYGON ((-13610032.137 4542456.650, -13609960...",06001428301
81,"POLYGON ((-13610111.953 4542843.479, -13610095...",06001428302
82,"POLYGON ((-13134046.655 4012084.036, -13133879...",06059001801
...,...,...
9123,"POLYGON ((-13620661.625 4574732.401, -13620650...",06013366002
9124,"POLYGON ((-13131155.246 4018679.418, -13131122...",06059001303
9125,"POLYGON ((-13131155.692 4019165.343, -13131058...",06059001304
9126,"POLYGON ((-13130196.230 4021423.785, -13130195...",06059001401


In [10]:
# now find the indices which correspond to the original data
unchanged_tracts_old = pd.to_numeric(old_tract_power_shutoff_data['GEOID']).isin(pd.to_numeric(ca_boundaries['USCB_GEOID']))
original_df = old_tract_power_shutoff_data[unchanged_tracts_old]
original_df["USCB_GEOID"] = original_df["GEOID"].apply(lambda x: '{0:0>11}'.format(x))
original_df

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  super().__setitem__(key, value)


Unnamed: 0,GEOID,geometry,apprx_loc,Avg Dur St,avg_cst_st,CES str,Duration buckets,out_freq_s,Utility,Geometry,Latitude (generated),Longitude (generated),Max. extrm_heat,Max. med_income,Max. pct_2xFPL,USCB_GEOID
0,6001442700,"POLYGON ((-13582893.983 4514550.527, -13582891...",Fremont,0,0,20,0 hrs,0,PG&E,Polygon,37.537088,-122.007720,14,"$146,480",5%,06001442700
1,6001442800,"POLYGON ((-13581234.219 4513218.736, -13581226...",Fremont,0,0,42,0 hrs,0,PG&E,Polygon,37.528257,-121.993324,14,"$122,609",10%,06001442800
2,6037204920,"POLYGON ((-13158279.595 4031441.433, -13158278...",Los Angeles,Unknown,Unknown,100,Does not report,Unknown,Non-reporting utility,Polygon,34.016993,-118.198616,11,"$39,904",63%,06037204920
3,6037205110,"POLYGON ((-13160149.535 4032331.636, -13160128...",Los Angeles,Unknown,Unknown,99,Does not report,Unknown,Non-reporting utility,Polygon,34.024170,-118.213735,11,"$47,219",61%,06037205110
4,6037205120,"POLYGON ((-13160215.432 4031563.223, -13160210...",Los Angeles,Unknown,Unknown,100,Does not report,Unknown,Non-reporting utility,Polygon,34.017619,-118.210036,11,"$26,844",82%,06037205120
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8027,6013366002,"POLYGON ((-13620661.625 4574732.401, -13620650...",San Pablo,0,0,88,0 hrs,0,PG&E,Polygon,37.968909,-122.350191,9,"$58,869",41%,06013366002
8028,6059001303,"POLYGON ((-13131155.246 4018679.418, -13131122...",La Habra,0,0,73,0 hrs,0,SCE,Polygon,33.921470,-117.948702,14,"$85,343",28%,06059001303
8029,6059001304,"POLYGON ((-13131155.692 4019165.343, -13131058...",La Habra,0,0,92,0 hrs,0,SCE,Polygon,33.923312,-117.946225,14,"$68,601",47%,06059001304
8030,6059001401,"POLYGON ((-13130196.230 4021423.785, -13130195...",La Habra,0,0,75,0 hrs,0,SCE,Polygon,33.941269,-117.941078,18,"$74,688",38%,06059001401


In [11]:
# now we only have to join the remaining tracts
mapped_df = gpd.sjoin_nearest(
    ca_boundaries[~unchanged_tracts_ca], 
    old_tract_power_shutoff_data[~unchanged_tracts_old], 
    how="inner", distance_col="distances", 
    max_distance=5000
)
mapped_df

Unnamed: 0,geometry,USCB_GEOID,index_right,GEOID,apprx_loc,Avg Dur St,avg_cst_st,CES str,Duration buckets,out_freq_s,Utility,Geometry,Latitude (generated),Longitude (generated),Max. extrm_heat,Max. med_income,Max. pct_2xFPL,distances
2,"POLYGON ((-13583749.029 4469373.052, -13583714...",06085507003,3797,6085507001,Unincorporated Santa Clara County area,46,752,4,36-48 hrs,0.8,PG&E,Polygon,37.219862,-121.987481,12,"$190,313",7%,0.0
3,"POLYGON ((-13580203.383 4470602.095, -13580142...",06085507004,3797,6085507001,Unincorporated Santa Clara County area,46,752,4,36-48 hrs,0.8,PG&E,Polygon,37.219862,-121.987481,12,"$190,313",7%,0.0
4,"POLYGON ((-13573372.106 4480730.095, -13572964...",06085502204,3829,6085502201,San Jose,0,0,26,0 hrs,0,PG&E,Polygon,37.304231,-121.919424,13,"$70,398",32%,0.0
4,"POLYGON ((-13573372.106 4480730.095, -13572964...",06085502204,3828,6085502102,San Jose,0,0,46,0 hrs,0,PG&E,Polygon,37.312281,-121.925111,13,"$75,515",30%,0.0
5,"POLYGON ((-13572402.399 4481399.439, -13572401...",06085502203,3829,6085502201,San Jose,0,0,26,0 hrs,0,PG&E,Polygon,37.304231,-121.919424,13,"$70,398",32%,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9091,"POLYGON ((-13630134.800 4575526.449, -13630134...",06013990000,2304,6075017902,San Francisco,0,,88,0 hrs,0,PG&E,MultiPolygon,37.799135,-122.384789,8,"$54,375",67%,0.0
9091,"POLYGON ((-13630134.800 4575526.449, -13630134...",06013990000,7393,6013380000,Richmond,0,0,75,0 hrs,0,PG&E,Polygon,37.910149,-122.336490,7,"$80,110",28%,0.0
9091,"POLYGON ((-13630134.800 4575526.449, -13630134...",06013990000,2627,6013392200,Richmond,0,0,85,0 hrs,0,PG&E,Polygon,37.991742,-122.351910,12,"$84,604",21%,0.0
9128,"POLYGON ((-13149442.843 4000257.235, -13149307...",06037578100,1873,6037574700,Long Beach,0,0,No data,0 hrs,0,SCE,Polygon,33.778790,-118.119108,9,,0%,0.0


In [12]:
# then concatenate the sjoined tracts with the unchanged ones
joined_df = pd.concat([original_df,mapped_df])
joined_df

Unnamed: 0,GEOID,geometry,apprx_loc,Avg Dur St,avg_cst_st,CES str,Duration buckets,out_freq_s,Utility,Geometry,Latitude (generated),Longitude (generated),Max. extrm_heat,Max. med_income,Max. pct_2xFPL,USCB_GEOID,index_right,distances
0,6001442700,"POLYGON ((-13582893.983 4514550.527, -13582891...",Fremont,0,0,20,0 hrs,0,PG&E,Polygon,37.537088,-122.007720,14,"$146,480",5%,06001442700,,
1,6001442800,"POLYGON ((-13581234.219 4513218.736, -13581226...",Fremont,0,0,42,0 hrs,0,PG&E,Polygon,37.528257,-121.993324,14,"$122,609",10%,06001442800,,
2,6037204920,"POLYGON ((-13158279.595 4031441.433, -13158278...",Los Angeles,Unknown,Unknown,100,Does not report,Unknown,Non-reporting utility,Polygon,34.016993,-118.198616,11,"$39,904",63%,06037204920,,
3,6037205110,"POLYGON ((-13160149.535 4032331.636, -13160128...",Los Angeles,Unknown,Unknown,99,Does not report,Unknown,Non-reporting utility,Polygon,34.024170,-118.213735,11,"$47,219",61%,06037205110,,
4,6037205120,"POLYGON ((-13160215.432 4031563.223, -13160210...",Los Angeles,Unknown,Unknown,100,Does not report,Unknown,Non-reporting utility,Polygon,34.017619,-118.210036,11,"$26,844",82%,06037205120,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9091,6075017902,"POLYGON ((-13630134.800 4575526.449, -13630134...",San Francisco,0,,88,0 hrs,0,PG&E,MultiPolygon,37.799135,-122.384789,8,"$54,375",67%,06013990000,2304.0,0.0
9091,6013380000,"POLYGON ((-13630134.800 4575526.449, -13630134...",Richmond,0,0,75,0 hrs,0,PG&E,Polygon,37.910149,-122.336490,7,"$80,110",28%,06013990000,7393.0,0.0
9091,6013392200,"POLYGON ((-13630134.800 4575526.449, -13630134...",Richmond,0,0,85,0 hrs,0,PG&E,Polygon,37.991742,-122.351910,12,"$84,604",21%,06013990000,2627.0,0.0
9128,6037574700,"POLYGON ((-13149442.843 4000257.235, -13149307...",Long Beach,0,0,No data,0 hrs,0,SCE,Polygon,33.778790,-118.119108,9,,0%,06037578100,1873.0,0.0


In [13]:
data_vars = ['out_freq_s']
for col in data_vars:
    non_numeric = joined_df[~joined_df[col].apply(lambda x: pd.to_numeric(x, errors='coerce')).notnull()]
    if not non_numeric.empty:
        print(f"Non-numeric values found in column '{col}':")
        display(non_numeric)
for col in data_vars:
    joined_df[col] = pd.to_numeric(joined_df[col], errors='coerce')

Non-numeric values found in column 'out_freq_s':


Unnamed: 0,GEOID,geometry,apprx_loc,Avg Dur St,avg_cst_st,CES str,Duration buckets,out_freq_s,Utility,Geometry,Latitude (generated),Longitude (generated),Max. extrm_heat,Max. med_income,Max. pct_2xFPL,USCB_GEOID,index_right,distances
2,6037204920,"POLYGON ((-13158279.595 4031441.433, -13158278...",Los Angeles,Unknown,Unknown,100,Does not report,Unknown,Non-reporting utility,Polygon,34.016993,-118.198616,11,"$39,904",63%,06037204920,,
3,6037205110,"POLYGON ((-13160149.535 4032331.636, -13160128...",Los Angeles,Unknown,Unknown,99,Does not report,Unknown,Non-reporting utility,Polygon,34.024170,-118.213735,11,"$47,219",61%,06037205110,,
4,6037205120,"POLYGON ((-13160215.432 4031563.223, -13160210...",Los Angeles,Unknown,Unknown,100,Does not report,Unknown,Non-reporting utility,Polygon,34.017619,-118.210036,11,"$26,844",82%,06037205120,,
5,6037206010,"POLYGON ((-13162331.530 4037836.246, -13162193...",Los Angeles,Unknown,Unknown,97,Does not report,Unknown,Non-reporting utility,Polygon,34.069886,-118.233296,14,"$42,063",58%,06037206010,,
6,6037206020,"POLYGON ((-13162179.013 4036091.274, -13162176...",Los Angeles,Unknown,Unknown,55,Does not report,Unknown,Non-reporting utility,Polygon,34.056630,-118.232945,11,"$100,809",19%,06037206020,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7959,6025012100,"POLYGON ((-12860034.369 3852807.138, -12860032...",Calexico,Unknown,Unknown,96,Does not report,Unknown,Non-reporting utility,Polygon,32.676122,-115.497183,36,"$22,041",57%,06025012201,593.0,0.0
7959,6025011900,"POLYGON ((-12860034.369 3852807.138, -12860032...",Unincorporated Imperial County area,Unknown,Unknown,84,Does not report,Unknown,Non-reporting utility,Polygon,32.704605,-115.503238,31,"$54,309",42%,06025012201,5352.0,0.0
8894,6037702901,"POLYGON ((-13187282.547 4026266.750, -13187218...",Marina del Rey,Unknown,Unknown,38,Does not report,Unknown,Non-reporting utility,Polygon,33.976831,-118.449626,10,"$118,162",19%,06037702900,3386.0,0.0
8894,6037275602,"POLYGON ((-13187282.547 4026266.750, -13187218...",Los Angeles,Unknown,Unknown,36,Does not report,Unknown,Non-reporting utility,Polygon,33.979471,-118.413498,6,"$148,965",13%,06037702900,1255.0,0.0


In [14]:
data_vars = ['out_freq_s']
# now take the average of the tracts which now exist in the new tract
joined_avg_df = joined_df.groupby(['USCB_GEOID','geometry'])[data_vars].mean().reset_index()
power_shutoff_new_tracts = gpd.GeoDataFrame(joined_avg_df, geometry='geometry')
power_shutoff_new_tracts = power_shutoff_new_tracts.drop(columns={'geometry'})
power_shutoff_new_tracts

Unnamed: 0,USCB_GEOID,out_freq_s
0,06001400100,0.800000
1,06001400200,0.400000
2,06001400300,0.400000
3,06001400400,0.400000
4,06001400500,0.400000
...,...,...
9123,06115040902,1.000000
9124,06115041001,1.500000
9125,06115041002,2.066667
9126,06115041101,2.733333


In [15]:
# one of the newer tracts (GEOID 06075980401; Farallon Islands) is too far away
# from any older ones to reliably interpolate the original data onto, so we fill it with nans
df_fill = pd.DataFrame([['06075980401',np.nan]], columns=['USCB_GEOID','out_freq_s'])
power_shutoff_new_tracts = pd.concat([power_shutoff_new_tracts,df_fill]).sort_values(by="USCB_GEOID")
power_shutoff_new_tracts = power_shutoff_new_tracts.rename(columns={'USCB_GEOID': 'census_tract', 'out_freq_s':'avg_num_psps_event'})
power_shutoff_new_tracts

Unnamed: 0,census_tract,avg_num_psps_event
0,06001400100,0.800000
1,06001400200,0.400000
2,06001400300,0.400000
3,06001400400,0.400000
4,06001400500,0.400000
...,...,...
9123,06115040902,1.000000
9124,06115041001,1.500000
9125,06115041002,2.066667
9126,06115041101,2.733333


In [16]:
nan_checking = pd.isna(power_shutoff_new_tracts['avg_num_psps_event'])

In [17]:
nan_rows = power_shutoff_new_tracts[nan_checking]
print(nan_rows)

     census_tract  avg_num_psps_event
160   06001427100                 NaN
161   06001427200                 NaN
162   06001427300                 NaN
163   06001427600                 NaN
164   06001427700                 NaN
...           ...                 ...
8702  06099003907                 NaN
8703  06099003908                 NaN
8704  06099003909                 NaN
8705  06099004000                 NaN
9054  06111980000                 NaN

[2021 rows x 2 columns]


In [18]:
power_shutoff_new_tracts.to_csv('built_power_shutoffs_metric.csv', index=False)

In [19]:
@append_metadata
def power_shutoff_upload(input_csv, export=False, varname=''):
    '''
    Uploads the calculated Public Safety Power Shutoff (PSPS) metric to S3 bucket. The metric is:
    * average Public Safety Power Shutoff (PSPS) events

    Data for this metric was sourced from PSE Healthy Energy at:
    https://www.psehealthyenergy.org/work/california-public-safety-power-shutoff-interactive-map/ from the 
    PSPS Duration by Census Tract section

    Methods
    -------
    The data was from older census tracts, so we merged it with 2017 California Tiger shape files first.
    The data was then set to Cal-CRAI standardized coordinate reference system (CRS) 4269.
    Data was then spatially joined to the nearest 2021 census tract data.
    Data were then grouped spatially and had the PSPS frequency data averaged per census tracts.
    
    Parameters
    ----------
    input_csv: string
        csv PSPS data 
    export: True/False boolean
        False = will not upload resulting df containing CAL CRAI PSPS metric to AWS
        True = will upload resulting df containing CAL CRAI PSPS metric to AWS

    Script
    ------
    built_power_shutoff.ipynb

    Note:
    This function assumes users have configured the AWS CLI such that their access key / secret key pair are stored in ~/.aws/credentials.
    See https://docs.aws.amazon.com/cli/latest/userguide/getting-started-install.html for guidance.
    '''
    print('Data transformation: source data and destination tracts both reprojected to CRS 3857.')
    print('Data transformation: unchanged tracts isolated to preserve original data.')
    print('Data transformation: new tracts filled by averaging the adjacent original tracts.')
    print('Data transformation: GEOID 06075980401 (Farallon Islands, San Francisco County) filled with nan.') 
    print('Data transformation: original data merged with spatially averaged ("new") data.')
 
    if export == True:
        bucket_name = 'ca-climate-index'
        directory = '3_fair_data/index_data'
        export_filename = [input_csv]
        upload_csv_aws(export_filename, bucket_name, directory)

    if export == False:
        print(f'{input_csv} uploaded to AWS.')
 
    if os.path.exists(input_csv):
        os.remove(input_csv)

In [20]:
input_csv = 'built_power_shutoffs_metric.csv'
varname = 'built_pse_power_shutoff'

power_shutoff_upload(input_csv, export=True, varname='test')