## Calculating the number of power plants metric
* spatially join power plant data with California census tract geometries
* count power plants per county and per tract

In [1]:
import pandas as pd
import os
import sys
import boto3
import io
import geopandas as gpd

# suppress pandas purely educational warnings
from warnings import simplefilter
simplefilter(action="ignore", category=pd.errors.PerformanceWarning)

sys.path.append(os.path.expanduser('../../'))
from scripts.utils.file_helpers import pull_gpkg_from_directory, upload_csv_aws
from scripts.utils.write_metadata import append_metadata

#### New function to pull .gpkg files
* if good, will move to utils or can update existing pull function to handle different file types

In [2]:
# pull the GeoPackage (.gpkg) files stored under the AWS directory; they are saved locally
bucket_name = 'ca-climate-index'
aws_dir = '2b_reproject/built_environment/utilities/ca_energy_commission/'

pull_gpkg_from_directory(bucket_name, aws_dir)

Saved GeoPackage as 'built_cec_power_plants.gpkg' locally
Saved GeoPackage as 'built_cec_transmission_lines.gpkg' locally


In [3]:
# load the locally saved power plant GeoPackage
power_plants = gpd.read_file('built_cec_power_plants.gpkg')

In [4]:
# preview the raw power plant records
power_plants

Unnamed: 0,OBJECTID,CECPlantID,PlantName,Retired_Pl,OperatorCo,County,Capacity_L,Units,PriEnergyS,StartDate,...,USCB_GEOID,USCB_NAME,USCB_NAMELSAD,USCB_MTFCC,USCB_FUNCSTAT,USCB_ALAND,USCB_AWATER,USCB_INTPTLAT,USCB_INTPTLON,geometry
0,1865,S0335,Corcoran 2 Solar LLC CED,0.0,CED California Holdings LLC,Kings,19.8,1,SUN,2015-06-10,...,06031001300,13,Census Tract 13,G5020,S,45574877,0,+36.1197390,-119.5542173,POINT (-119.56789 36.13717)
1,3666,S0332,Cottonwood City of Corcoran LLC,0.0,Onward Energy,Kings,11.0,1,SUN,2015-04-23,...,06031001300,13,Census Tract 13,G5020,S,45574877,0,+36.1197390,-119.5542173,POINT (-119.55303 36.13702)
2,1866,S0520,Corcoran 3 Solar,0.0,CED California Holdings LLC,Kings,20.0,Unit 1,SUN,2016-02-11,...,06031001200,12,Census Tract 12,G5020,S,397721536,189281,+36.2324904,-119.6239493,POINT (-119.57971 36.14432)
3,1867,C0007,Hanford - Retired October 2011,1.0,Hanford LP,Kings,24.0,GEN 1,PC,1990-09-01,...,06031001200,12,Census Tract 12,G5020,S,397721536,189281,+36.2324904,-119.6239493,POINT (-119.64839 36.26964)
4,1868,G0832,Hanford Energy Park Peaker,0.0,"MRP San Joaquin Energy, LLC",Kings,92.0,"1, 2",NG,2001-09-01,...,06031001200,12,Census Tract 12,G5020,S,397721536,189281,+36.2324904,-119.6239493,POINT (-119.64744 36.27031)
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1776,3645,S0223,West Gates Solar Station,0.0,Pacific Gas & Electric (PG&E),Fresno,10.0,1,SUN,2013-06-24,...,06019007801,78.01,Census Tract 78.01,G5020,S,293191128,0,+36.1460004,-120.0943186,POINT (-120.13288 36.14337)
1777,3646,S0224,Gates Solar Station,0.0,Pacific Gas & Electric (PG&E),Fresno,20.0,1,SUN,2013-06-24,...,06019007801,78.01,Census Tract 78.01,G5020,S,293191128,0,+36.1460004,-120.0943186,POINT (-120.11174 36.17892)
1778,3648,S9192,Westlands Solar Farms,0.0,Clenera - Renewable Energy,Fresno,18.0,W3963,SUN,2014-02-14,...,06019007801,78.01,Census Tract 78.01,G5020,S,293191128,0,+36.1460004,-120.0943186,POINT (-120.07530 36.13900)
1779,3649,S0307,Orion 2 Solar,0.0,Longroad Energy,Kings,8.0,1,SUN,2014-06-01,...,06031980000,9800,Census Tract 9800,G5020,S,70171765,189884,+36.3051812,-119.9245603,POINT (-119.90414 36.25611)


### Filtering to relevant columns

In [5]:
# Keep only the identifying columns and the point geometry
filtered_power_plants = power_plants[['PlantName', 'CECPlantID', 'County','geometry']]

# A plant is identified by its CEC plant ID
columns_to_check = ['CECPlantID']

# Keep the first record for each plant ID and drop any later repeats
cleaned_power_plants = filtered_power_plants.drop_duplicates(subset=columns_to_check, keep='first')

cleaned_power_plants

Unnamed: 0,PlantName,CECPlantID,County,geometry
0,Corcoran 2 Solar LLC CED,S0335,Kings,POINT (-119.56789 36.13717)
1,Cottonwood City of Corcoran LLC,S0332,Kings,POINT (-119.55303 36.13702)
2,Corcoran 3 Solar,S0520,Kings,POINT (-119.57971 36.14432)
3,Hanford - Retired October 2011,C0007,Kings,POINT (-119.64839 36.26964)
4,Hanford Energy Park Peaker,G0832,Kings,POINT (-119.64744 36.27031)
...,...,...,...,...
1776,West Gates Solar Station,S0223,Fresno,POINT (-120.13288 36.14337)
1777,Gates Solar Station,S0224,Fresno,POINT (-120.11174 36.17892)
1778,Westlands Solar Farms,S9192,Fresno,POINT (-120.07530 36.13900)
1779,Orion 2 Solar,S0307,Kings,POINT (-119.90414 36.25611)


The cell below counts the number of entries for a selected county. This is used to check that the county has the same number of power plants after the spatial join performed next. The numbers should match because all duplicate cleaning has been done above.

In [6]:
# Count the de-duplicated plants that have a CECPlantID and are attributed to the chosen county
checkin = cleaned_power_plants.dropna(subset=['CECPlantID'])
county_count = checkin['County'].eq('Kern')
print('Number of entries of designated county:', int(county_count.sum()))

Number of entries of designated county: 220


### Pull tract data and isolate relevant columns

In [7]:
# read in the 2021 California TIGER census tract file from S3
census_shp_dir = "s3://ca-climate-index/0_map_data/2021_tiger_census_tract/2021_ca_tract/"
ca_boundaries = gpd.read_file(census_shp_dir)

In [8]:
# Copy the tract identifier and geometry, renaming 'GEOID' to 'tract'.
# rename() returns a new frame, so re-running this cell is safe (no in-place mutation).
filtered_ca_boundaries = ca_boundaries[['GEOID', 'geometry']].copy()
filtered_ca_boundaries = filtered_ca_boundaries.rename(columns={'GEOID': 'tract'})

# Remove the first character from the 'tract' column
# (turns the 11-character GEOID, e.g. '06085504321', into the 10-character '6085504321')
filtered_ca_boundaries['tract'] = filtered_ca_boundaries['tract'].str[1:]

# Display the resulting DataFrame
filtered_ca_boundaries


Unnamed: 0,tract,geometry
0,6085504321,"POLYGON ((-121.87556 37.39924, -121.87535 37.3..."
1,6085504410,"POLYGON ((-121.88886 37.40758, -121.88576 37.4..."
2,6085507003,"POLYGON ((-122.02489 37.21683, -122.02459 37.2..."
3,6085507004,"POLYGON ((-121.99304 37.22562, -121.99249 37.2..."
4,6085502204,"POLYGON ((-121.93167 37.29803, -121.92801 37.3..."
...,...,...
9124,6059001303,"POLYGON ((-117.95917 33.92458, -117.95888 33.9..."
9125,6059001304,"POLYGON ((-117.95918 33.92820, -117.95831 33.9..."
9126,6059001401,"POLYGON ((-117.95056 33.94503, -117.95055 33.9..."
9127,6013367200,"POLYGON ((-122.34551 37.96355, -122.34550 37.9..."


### Spatial join the power plant data with the ca boundaries so we can attribute counties and tracts to power plants
* the result has roughly a thousand extra rows because a tract containing several power plants appears once per plant (duplicate tract/geometries); this is addressed when grouping by tract

In [9]:
# how='right' keeps every census tract; tracts with no power plant inside them get NaN plant columns
ca_power_plants = gpd.sjoin(cleaned_power_plants, filtered_ca_boundaries, how='right', predicate='within')
ca_power_plants

Unnamed: 0,index_left,PlantName,CECPlantID,County,tract,geometry
0,,,,,6085504321,"POLYGON ((-121.87556 37.39924, -121.87535 37.3..."
1,,,,,6085504410,"POLYGON ((-121.88886 37.40758, -121.88576 37.4..."
2,,,,,6085507003,"POLYGON ((-122.02489 37.21683, -122.02459 37.2..."
3,,,,,6085507004,"POLYGON ((-121.99304 37.22562, -121.99249 37.2..."
4,,,,,6085502204,"POLYGON ((-121.93167 37.29803, -121.92801 37.3..."
...,...,...,...,...,...,...
9124,,,,,6059001303,"POLYGON ((-117.95917 33.92458, -117.95888 33.9..."
9125,1105.0,Coyote Creek,H0119,Orange,6059001304,"POLYGON ((-117.95918 33.92820, -117.95831 33.9..."
9126,,,,,6059001401,"POLYGON ((-117.95056 33.94503, -117.95055 33.9..."
9127,,,,,6013367200,"POLYGON ((-122.34551 37.96355, -122.34550 37.9..."


Another check: count how many rows for a given tract (that also have a CECPlantID) appear in the joined dataset before we group them and get the final counts

In [10]:
# Count the joined rows that carry a CECPlantID and fall in the chosen tract
filtered_df = ca_power_plants.dropna(subset=['CECPlantID'])
tract_check = filtered_df['tract'].eq('6085505010')
print('Number of entries of designated tract:', int(tract_check.sum()))

Number of entries of designated tract: 3


Grouping by tract as long as CECPlantID is not NaN and summing those so we have number of plants per tract

In [11]:
# Group by 'tract' and count the non-null 'CECPlantID' entries.
# count() skips NaN, so tracts without a plant get 0.
tract_power_counts = (
    ca_power_plants.groupby('tract')['CECPlantID']
    .count()
    .reset_index(name='Power_Plant_Count')
)

# Output the result
tract_power_counts

Unnamed: 0,tract,Power_Plant_Count
0,6001400100,0
1,6001400200,0
2,6001400300,0
3,6001400400,0
4,6001400500,0
...,...,...
9124,6115040902,0
9125,6115041001,0
9126,6115041002,1
9127,6115041101,2


In [16]:
# sanity check: smallest and largest power plant count across tracts
tract_power_counts['Power_Plant_Count'].min(), tract_power_counts['Power_Plant_Count'].max()

(0, 51)

### Now that we have merged, we can check the df's counts per given tract and see if it matches with our checker above

In [13]:
# Look up the aggregated count for the tract that was inspected before grouping
tract_value = '6085505010'
filtered_entries = tract_power_counts.loc[tract_power_counts['tract'].eq(tract_value)]
print(filtered_entries)

           tract  Power_Plant_Count
8004  6085505010                  3


In [14]:
# Group by the plant's 'County' attribute and count the non-null 'CECPlantID' entries
county_power_counts = (
    ca_power_plants.groupby('County')['CECPlantID']
    .count()
    .reset_index(name='Power_Plant_Count')
)
# NOTE(review): drops the first group by position; presumably a blank/placeholder
# county value that sorts ahead of 'Alameda' -- confirm, and prefer filtering by value.
county_power_counts = county_power_counts.iloc[1:]

# Output the result
county_power_counts

Unnamed: 0,County,Power_Plant_Count
1,Alameda,45
2,Amador,10
3,Butte,28
4,Calaveras,6
5,Colusa,2
6,Contra Costa,39
7,El Dorado,12
8,Fresno,83
9,Glenn,3
10,Humboldt,9


### Function definition and call

In [12]:
@append_metadata
def calc_power_plant(df, export=False, export_filename=None, varname = ''):
    '''
    Calculates the number of power plants per California census tract.
    Data is sourced from the California Energy Commission (CEC): 
    https://cecgis-caenergy.opendata.arcgis.com/datasets/CAEnergy::california-power-plants/about
    
    Methods
    -------
    Geometry columns were merged between California 2021 tiger census tract data 
    and CEC power plant data to attribute power plants to census tracts. 
    Duplicate entries were removed based on matching CECPlantID. 

    Script
    ------
    built_power_plant.ipynb
    
    Parameters
    ----------
    df: GeoDataFrame
        the dataframe containing the power plant data; must include the columns
        'PlantName', 'CECPlantID', 'County' and 'geometry'
    export: True/False boolean
        False = will not upload resulting df containing the power plant metric to AWS
        True = will upload resulting df containing the power plant metric to AWS
    export_filename: string
        name of the csv file to be uploaded to AWS (required when export is True)
    varname: string
        not used inside the function body; presumably consumed by the
        append_metadata decorator -- confirm against scripts/utils/write_metadata.py

    Returns
    -------
    DataFrame with one row per census tract: 'tract' and 'power_plant_count'
    (0 for tracts that contain no power plant).

    Raises
    ------
    ValueError
        if export is True but no export_filename was given

    Note:
    This function assumes users have configured the AWS CLI such that their access key / secret key pair are stored in ~/.aws/credentials.
    See https://docs.aws.amazon.com/cli/latest/userguide/getting-started-install.html for guidance.
    When exporting, the local files 'built_cec_power_plants.gpkg' and export_filename are deleted afterwards if present.
    '''   
    # Fail early, before any download or join, instead of erroring obscurely during upload/cleanup
    if export and not export_filename:
        raise ValueError('export_filename must be provided when export=True')

    print('Data transformation: isolate relevant columns for metric calculation.')
    print('Data transformation: check for duplicate plant IDs using "CECPlantID" and retain single count for identical plants.')
    filtered_power_plants = df[['PlantName', 'CECPlantID', 'County','geometry']]

    # Keep the first record for each CEC plant ID and drop any later repeats
    cleaned_power_plants = filtered_power_plants.drop_duplicates(subset=['CECPlantID'], keep='first')
    
    # read in CA census tiger file
    census_shp_dir = "s3://ca-climate-index/0_map_data/2021_tiger_census_tract/2021_ca_tract/"
    ca_boundaries = gpd.read_file(census_shp_dir)

    # Copy the tract identifier and geometry, renaming 'GEOID' to 'tract'
    filtered_ca_boundaries = ca_boundaries[['GEOID', 'geometry']].copy()
    filtered_ca_boundaries = filtered_ca_boundaries.rename(columns={'GEOID': 'tract'})

    # Remove the first character from the 'tract' column
    filtered_ca_boundaries['tract'] = filtered_ca_boundaries['tract'].str[1:]
    print('Data transformation: merge geometries with California tiger census tract data (2021).')

    # how='right' keeps every tract, so tracts without any plant survive the join with NaN plant columns
    ca_power_plants = gpd.sjoin(cleaned_power_plants, filtered_ca_boundaries, how='right', predicate='within')

    print('Data transformation: make new dataframe by grouping data by census tracts and sum multiple entries')
    # Group by 'tract' and count the non-null 'CECPlantID' entries (count() skips NaN)
    tract_power_counts = (
        ca_power_plants.groupby('tract')['CECPlantID']
        .count()
        .reset_index(name='power_plant_count')
    )

    # export to csv and upload to AWS
    if export:
        tract_power_counts.to_csv(export_filename)
        bucket_name = 'ca-climate-index'
        directory = '3_fair_data/index_data'
        upload_csv_aws([export_filename], bucket_name, directory)

        # remove from local to clear up directory; skip files that are not there
        # (e.g. df was loaded from a different path) so a finished upload does not end in an error
        for local_file in ('built_cec_power_plants.gpkg', export_filename):
            if os.path.exists(local_file):
                os.remove(local_file)

    return tract_power_counts # returns df

In [13]:
# pull the GeoPackage (.gpkg) files from AWS again before running the metric function
bucket_name = 'ca-climate-index'
aws_dir = '2b_reproject/built_environment/utilities/ca_energy_commission/'
pull_gpkg_from_directory(bucket_name, aws_dir)

power_plants = gpd.read_file('built_cec_power_plants.gpkg')

# compute the per-tract power plant counts and, because export=True, upload the csv to AWS
calc_power_plant(power_plants, export=True, export_filename = 'built_power_plant_metric.csv',
                 varname = 'built_cec_power_plants')

Saved GeoPackage as 'built_cec_power_plants.gpkg' locally
Saved GeoPackage as 'built_cec_transmission_lines.gpkg' locally


Unnamed: 0,tract,power_plant_count
0,6001400100,0
1,6001400200,0
2,6001400300,0
3,6001400400,0
4,6001400500,0
...,...,...
9124,6115040902,0
9125,6115041001,0
9126,6115041002,1
9127,6115041101,2
