## To calculate the number of power plants metric
* was able to merge power plant data and california census data geometries
* got counts of power plants per county and per tract

In [7]:
import pandas as pd
import os
import sys
import boto3
import io
import geopandas as gpd

# suppress pandas purely educational warnings
from warnings import simplefilter
simplefilter(action="ignore", category=pd.errors.PerformanceWarning)

sys.path.append(os.path.expanduser('../../'))
from scripts.utils.file_helpers import pull_csv_from_directory, upload_csv_aws
from scripts.utils.write_metadata import append_metadata

## New function to pull .gpkg files
* if good, will move to utils or can update existing pull function to handle different file types

In [4]:
def pull_gpkg_from_directory(bucket_name, directory):
    """
    Pulls GeoPackage files from a specified directory in an S3 bucket.
    
    Parameters:
    - bucket_name (str): The name of the S3 bucket.
    - directory (str): The directory within the bucket to search for GeoPackage files.
    """
    # Create an S3 client
    s3 = boto3.client('s3')

    # List objects in the specified directory
    response = s3.list_objects_v2(Bucket=bucket_name, Prefix=directory)

    # Check if objects were found
    if 'Contents' in response:
        # Iterate through each object found
        for obj in response['Contents']:
            # Get the key (filename) of the object
            key = obj['Key']
            
            # Check if the object is a .gpkg file
            if key.endswith('.gpkg'):
                # Download the GeoPackage file into memory
                gpkg_object = s3.get_object(Bucket=bucket_name, Key=key)
                gpkg_data = io.BytesIO(gpkg_object['Body'].read())
                
                # Save the GeoPackage file locally
                gpkg_filename = os.path.basename(key)
                with open(gpkg_filename, 'wb') as gpkg_file:
                    gpkg_file.write(gpkg_data.getvalue())
                
                print(f"Saved GeoPackage as '{gpkg_filename}' locally")
                # You can now use the saved file for further processing
    else:
        print("No objects found in the specified directory.")


In [23]:
# pull csv from aws
bucket_name = 'ca-climate-index'
aws_dir = '2b_reproject/built_environment/utilities/ca_energy_commission/'

pull_gpkg_from_directory(bucket_name, aws_dir)

Saved GeoPackage as 'built_cec_power_plants.gpkg' locally
Saved GeoPackage as 'built_cec_transmission_lines.gpkg' locally


In [24]:
power_plants = gpd.read_file('built_cec_power_plants.gpkg')


In [25]:
power_plants

Unnamed: 0,OBJECTID,CECPlantID,PlantName,Retired_Pl,OperatorCo,County,Capacity_L,Units,PriEnergyS,StartDate,...,USCB_GEOID,USCB_NAME,USCB_NAMELSAD,USCB_MTFCC,USCB_FUNCSTAT,USCB_ALAND,USCB_AWATER,USCB_INTPTLAT,USCB_INTPTLON,geometry
0,1865,S0335,Corcoran 2 Solar LLC CED,0.0,CED California Holdings LLC,Kings,19.8,1,SUN,2015-06-10,...,06031001300,13,Census Tract 13,G5020,S,45574877,0,+36.1197390,-119.5542173,POINT (-119.56789 36.13717)
1,3666,S0332,Cottonwood City of Corcoran LLC,0.0,Onward Energy,Kings,11.0,1,SUN,2015-04-23,...,06031001300,13,Census Tract 13,G5020,S,45574877,0,+36.1197390,-119.5542173,POINT (-119.55303 36.13702)
2,1866,S0520,Corcoran 3 Solar,0.0,CED California Holdings LLC,Kings,20.0,Unit 1,SUN,2016-02-11,...,06031001200,12,Census Tract 12,G5020,S,397721536,189281,+36.2324904,-119.6239493,POINT (-119.57971 36.14432)
3,1867,C0007,Hanford - Retired October 2011,1.0,Hanford LP,Kings,24.0,GEN 1,PC,1990-09-01,...,06031001200,12,Census Tract 12,G5020,S,397721536,189281,+36.2324904,-119.6239493,POINT (-119.64839 36.26964)
4,1868,G0832,Hanford Energy Park Peaker,0.0,"MRP San Joaquin Energy, LLC",Kings,92.0,"1, 2",NG,2001-09-01,...,06031001200,12,Census Tract 12,G5020,S,397721536,189281,+36.2324904,-119.6239493,POINT (-119.64744 36.27031)
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1776,3645,S0223,West Gates Solar Station,0.0,Pacific Gas & Electric (PG&E),Fresno,10.0,1,SUN,2013-06-24,...,06019007801,78.01,Census Tract 78.01,G5020,S,293191128,0,+36.1460004,-120.0943186,POINT (-120.13288 36.14337)
1777,3646,S0224,Gates Solar Station,0.0,Pacific Gas & Electric (PG&E),Fresno,20.0,1,SUN,2013-06-24,...,06019007801,78.01,Census Tract 78.01,G5020,S,293191128,0,+36.1460004,-120.0943186,POINT (-120.11174 36.17892)
1778,3648,S9192,Westlands Solar Farms,0.0,Clenera - Renewable Energy,Fresno,18.0,W3963,SUN,2014-02-14,...,06019007801,78.01,Census Tract 78.01,G5020,S,293191128,0,+36.1460004,-120.0943186,POINT (-120.07530 36.13900)
1779,3649,S0307,Orion 2 Solar,0.0,Longroad Energy,Kings,8.0,1,SUN,2014-06-01,...,06031980000,9800,Census Tract 9800,G5020,S,70171765,189884,+36.3051812,-119.9245603,POINT (-119.90414 36.25611)


## Filtering out retired power plants and isolating to necessary columns

In [28]:
filtered_power_plants = power_plants[power_plants['Retired_Pl'] == 0.0]
filtered_power_plants = filtered_power_plants[['PlantName', 'County', 'Capacity_L','geometry']]
filtered_power_plants

Unnamed: 0,PlantName,County,Capacity_L,geometry
0,Corcoran 2 Solar LLC CED,Kings,19.8,POINT (-119.56789 36.13717)
1,Cottonwood City of Corcoran LLC,Kings,11.0,POINT (-119.55303 36.13702)
2,Corcoran 3 Solar,Kings,20.0,POINT (-119.57971 36.14432)
4,Hanford Energy Park Peaker,Kings,92.0,POINT (-119.64744 36.27031)
5,Guernsey Solar Station,Kings,20.0,POINT (-119.65047 36.16428)
...,...,...,...,...
1776,West Gates Solar Station,Fresno,10.0,POINT (-120.13288 36.14337)
1777,Gates Solar Station,Fresno,20.0,POINT (-120.11174 36.17892)
1778,Westlands Solar Farms,Fresno,18.0,POINT (-120.07530 36.13900)
1779,Orion 2 Solar,Kings,8.0,POINT (-119.90414 36.25611)


## Pull tract data and isolate relevant columns

In [10]:
# read in CA census tiger file
census_shp_dir = "s3://ca-climate-index/0_map_data/2021_tiger_census_tract/2021_ca_tract/"
ca_boundaries = gpd.read_file(census_shp_dir)

In [41]:
filtered_ca_boundaries = ca_boundaries[['GEOID', 'geometry']]
filtered_ca_boundaries.rename(columns ={'GEOID':'tract'}, inplace=True)
filtered_ca_boundaries['tract'] = filtered_ca_boundaries['tract'].str[1:]
filtered_ca_boundaries

Unnamed: 0,tract,geometry
0,6085504321,"POLYGON ((-121.87556 37.39924, -121.87535 37.3..."
1,6085504410,"POLYGON ((-121.88886 37.40758, -121.88576 37.4..."
2,6085507003,"POLYGON ((-122.02489 37.21683, -122.02459 37.2..."
3,6085507004,"POLYGON ((-121.99304 37.22562, -121.99249 37.2..."
4,6085502204,"POLYGON ((-121.93167 37.29803, -121.92801 37.3..."
...,...,...
9124,6059001303,"POLYGON ((-117.95917 33.92458, -117.95888 33.9..."
9125,6059001304,"POLYGON ((-117.95918 33.92820, -117.95831 33.9..."
9126,6059001401,"POLYGON ((-117.95056 33.94503, -117.95055 33.9..."
9127,6013367200,"POLYGON ((-122.34551 37.96355, -122.34550 37.9..."


# Spatial join the power plant data with the ca boundaries so we can attribute counties and tracts to power plants

In [45]:
ca_power_plants = gpd.sjoin(filtered_power_plants, filtered_ca_boundaries, how='inner', predicate='within')
ca_power_plants

Unnamed: 0,PlantName,County,Capacity_L,geometry,index_right,tract
0,Corcoran 2 Solar LLC CED,Kings,19.8,POINT (-119.56789 36.13717),3789,6031001300
1,Cottonwood City of Corcoran LLC,Kings,11.0,POINT (-119.55303 36.13702),3789,6031001300
2,Corcoran 3 Solar,Kings,20.0,POINT (-119.57971 36.14432),3793,6031001200
4,Hanford Energy Park Peaker,Kings,92.0,POINT (-119.64744 36.27031),3793,6031001200
5,Guernsey Solar Station,Kings,20.0,POINT (-119.65047 36.16428),3793,6031001200
...,...,...,...,...,...,...
1776,West Gates Solar Station,Fresno,10.0,POINT (-120.13288 36.14337),2140,6019007801
1777,Gates Solar Station,Fresno,20.0,POINT (-120.11174 36.17892),2140,6019007801
1778,Westlands Solar Farms,Fresno,18.0,POINT (-120.07530 36.13900),2140,6019007801
1779,Orion 2 Solar,Kings,8.0,POINT (-119.90414 36.25611),6760,6031980000


## Noticed a few duplicate rows, selecting a few columns to check for duplicates

In [65]:
# Establish columns to check for duplicates
columns_to_check = ['County', 'PlantName']

# If rows are identical in county and plant name, one last check with the capacity
# if within 3L, the duplicate will be dropped
capacity_threshold = 3.0

# Find duplicate rows based on specified columns
duplicate_mask = ca_power_plants.duplicated(subset=columns_to_check, keep='first')

# Filter rows based on condition on the numeric column
cleaned_power_plants = ca_power_plants[~(duplicate_mask & (ca_power_plants['Capacity_L'].abs() <= capacity_threshold))]

cleaned_power_plants

Unnamed: 0,PlantName,County,Capacity_L,geometry,index_right,tract
0,Corcoran 2 Solar LLC CED,Kings,19.8,POINT (-119.56789 36.13717),3789,6031001300
1,Cottonwood City of Corcoran LLC,Kings,11.0,POINT (-119.55303 36.13702),3789,6031001300
2,Corcoran 3 Solar,Kings,20.0,POINT (-119.57971 36.14432),3793,6031001200
4,Hanford Energy Park Peaker,Kings,92.0,POINT (-119.64744 36.27031),3793,6031001200
5,Guernsey Solar Station,Kings,20.0,POINT (-119.65047 36.16428),3793,6031001200
...,...,...,...,...,...,...
1776,West Gates Solar Station,Fresno,10.0,POINT (-120.13288 36.14337),2140,6019007801
1777,Gates Solar Station,Fresno,20.0,POINT (-120.11174 36.17892),2140,6019007801
1778,Westlands Solar Farms,Fresno,18.0,POINT (-120.07530 36.13900),2140,6019007801
1779,Orion 2 Solar,Kings,8.0,POINT (-119.90414 36.25611),6760,6031980000


## Count number of power plants per county and tract

In [92]:
# Count the number of entries per county
county_power_counts = cleaned_power_plants.groupby('County').size().reset_index(name='Power_Plant_Count')

# Count the number of entries per tract
tract_power_counts = cleaned_power_plants.groupby('tract').size().reset_index(name='Power_Plant_Count')

In [101]:
# Eliminate non-CA county entry 'ALL'
filtered_county_power_counts = county_power_counts[1:]
print(len(filtered_county_power_counts))
filtered_county_power_counts.head()

55


Unnamed: 0,County,Power_Plant_Count
1,Alameda,29
2,Amador,7
3,Butte,26
4,Calaveras,6
5,Colusa,2


In [77]:
print(len(tract_power_counts))
tract_power_counts.head()

695


Unnamed: 0,tract,Power_Plant_Count
0,6001401700,2
1,6001402200,1
2,6001408100,1
3,6001428700,1
4,6001433104,1


## Checking results for Santa Clara

In [78]:
santa_clara_power = cleaned_power_plants[cleaned_power_plants['County'] == 'Santa Clara']
print(len(santa_clara_power))
santa_clara_power

32


Unnamed: 0,PlantName,County,Capacity_L,geometry,index_right,tract
140,Gilroy Cogen Plant,Santa Clara,123.4,POINT (-121.53667 37.00017),7687,6085512602
141,Gilroy Peaker,Santa Clara,141.9,POINT (-121.53618 36.99915),7687,6085512602
172,Metcalf Energy Center,Santa Clara,565.8,POINT (-121.74599 37.22076),7745,6085512100
173,Unified School District Morgan Hill,Santa Clara,1.0,POINT (-121.63538 37.14907),7691,6085512308
192,Foothill - De Anza Community College District,Santa Clara,1.6,POINT (-122.12474 37.36314),5334,6085511704
193,Foothill College - Pv Capstone,Santa Clara,1.7,POINT (-122.13029 37.36369),5334,6085511704
195,San Jose-Santa Clara Regional Wastewater Facility,Santa Clara,25.8,POINT (-121.94301 37.43408),7624,6085504602
196,Network Appliance/Chevron Energy Solutions,Santa Clara,1.0,POINT (-122.02611 37.41514),7624,6085504602
197,Network Appliance,Santa Clara,1.4,POINT (-122.02616 37.41595),7624,6085504602
198,City of Sunnyvale Wastewater Treatment Plant,Santa Clara,1.6,POINT (-122.03492 37.42261),7624,6085504602


## Drafting function call while we discuss methods

In [None]:
#@append_metadata
def calc_power_plant(df, export=False, export_filename=None, varname = ''):
    '''
    Calculates the number of power plants per California tract and county. Data is sourced from the California Energy Commission (CEC): https://cecgis-caenergy.opendata.arcgis.com/datasets/CAEnergy::california-power-plants/about
    
    Methods
    -------
    Geometry columns were merged between California 2021 tiger census tract data and CEC power plant data to attribute power plants to census tracts. Duplicate entries were removed (duplicates established on matching county, plant name, and similar capacities (within 3 units)). 

    Script
    ------
    built_power_plant.ipynb
    
    Parameters
    ----------
    df: string
        the dataframe containing the power plant data
    export: True/False boolian
        False = will not upload resulting df containing the power plant metric to AWS
        True = will upload resulting df containing the power plant metric to AWS
    export_filename: string
        name of the csv file to be uploaded to AWS

    Note:
    This function assumes users have configured the AWS CLI such that their access key / secret key pair are stored in ~/.aws/credentials.
    See https://docs.aws.amazon.com/cli/latest/userguide/getting-started-install.html for guidance.
    '''   
    print('Data transformation: eliminate retired power plants.')
    filtered_power_plants = df[df['Retired_Pl'] == 0.0]

    print('Data transformation: isolate relevant columns for metric calculation')
    filtered_power_plants = filtered_power_plants[['PlantName', 'County', 'Capacity_L','geometry']]
    
    # read in CA census tiger file
    census_shp_dir = "s3://ca-climate-index/0_map_data/2021_tiger_census_tract/2021_ca_tract/"
    ca_boundaries = gpd.read_file(census_shp_dir)

    filtered_ca_boundaries = ca_boundaries[['GEOID', 'geometry']]
    filtered_ca_boundaries.rename(columns ={'GEOID':'tract'}, inplace=True)
    filtered_ca_boundaries['tract'] = filtered_ca_boundaries['tract'].str[1:]

    print('Data transformation: merge geometries with California tiger census tract data (2021)')
    ca_power_plants = gpd.sjoin(filtered_power_plants, filtered_ca_boundaries, how='inner', predicate='within')

    # Establish columns to check for duplicates
    columns_to_check = ['County', 'PlantName']

    # If rows are identical in county and plant name, one last check with the capacity
    # if within 3L, the duplicate will be dropped
    capacity_threshold = 3.0

    print('Data transformation: mask duplicate rows based on county, plant name, and capacity')
    # Find duplicate rows based on specified columns
    duplicate_mask = ca_power_plants.duplicated(subset=columns_to_check, keep='first')

    # Filter rows based on condition on the numeric column
    cleaned_power_plants = ca_power_plants[~(duplicate_mask & (ca_power_plants['Capacity_L'].abs() <= capacity_threshold))]

    # Count the number of entries per county
    county_power_counts = cleaned_power_plants.groupby('County').size().reset_index(name='Power_Plant_Count')

    # Count the number of entries per tract
    tract_power_counts = cleaned_power_plants.groupby('tract').size().reset_index(name='Power_Plant_Count')
    
    print('Data transformation: eliminate any non-CA county entries')

    # Eliminate non-CA county entry 'ALL'
    filtered_county_power_counts = county_power_counts[1:]

    # export to csv and upload to AWS
    if export == True:
        tract_power_counts.to_csv(export_filename)
        bucket_name = 'ca-climate-index'
        directory = '3_fair_data/index_data'
        export_filename = [export_filename]
        upload_csv_aws(export_filename, bucket_name, directory)

        os.remove('built_cec_power_plants.gpkg') # remove from local to clear up directory
        os.remove(export_filename[0])

    return tract_power_counts # returns df
    #return filtered_county_power_counts

In [None]:
# pull csv from aws
bucket_name = 'ca-climate-index'
aws_dir = '2b_reproject/built_environment/utilities/ca_energy_commission/'
pull_gpkg_from_directory(bucket_name, aws_dir)

power_plants = gpd.read_file('built_cec_power_plants.gpkg')

calc_power_plant(power_plants, export=True, export_filename = 'built_power_plant_metric.csv',
                    varname = 'built_cec_power_plants')