## Calculating the number-of-power-plants metric
* merges the power plant data with the California census tract geometries
* counts the power plants per county and per tract

In [4]:
import pandas as pd
import os
import sys
import boto3
import io
import geopandas as gpd

# suppress pandas purely educational warnings
from warnings import simplefilter
simplefilter(action="ignore", category=pd.errors.PerformanceWarning)

sys.path.append(os.path.expanduser('../../'))
from scripts.utils.file_helpers import pull_gpkg_from_directory, upload_csv_aws
from scripts.utils.write_metadata import append_metadata

## New function to pull .gpkg files
* if good, will move to utils or can update existing pull function to handle different file types

In [2]:
# pull the GeoPackage (.gpkg) files from the AWS directory; they are saved locally
bucket_name = 'ca-climate-index'
aws_dir = '2b_reproject/built_environment/utilities/ca_energy_commission/'

pull_gpkg_from_directory(bucket_name, aws_dir)

Saved GeoPackage as 'built_cec_power_plants.gpkg' locally
Saved GeoPackage as 'built_cec_transmission_lines.gpkg' locally


In [None]:
# load the locally saved power plant GeoPackage
power_plants = gpd.read_file('built_cec_power_plants.gpkg')

In [None]:
# display the loaded power plant table for a quick visual check
power_plants

## Filtering to relevant columns

In [None]:
# Keep only the identifying columns plus the geometry used by the spatial join
plant_columns = ['PlantName', 'CECPlantID', 'County', 'geometry']
filtered_power_plants = power_plants[plant_columns]

# Retain the first row seen for each CECPlantID and discard any repeats
cleaned_power_plants = filtered_power_plants.drop_duplicates(subset=['CECPlantID'], keep='first')

cleaned_power_plants

### The cell below counts the number of entries for a selected county. This is used to check that counties have the same number of power plants after the spatial join done next. The numbers should match, as all duplicate cleaning has been done above.

In [None]:
# Only rows that carry a CECPlantID are considered
rows_with_plant_id = cleaned_power_plants.dropna(subset=['CECPlantID'])

# Boolean flag per row: True where the plant is in the county being checked
in_county = rows_with_plant_id['County'] == 'Kern'
print('Number of entries of designated county:', int(in_county.sum()))

## Pull tract data and isolate relevant columns

In [None]:
# read in the 2021 CA census tract TIGER file from the S3 bucket
census_shp_dir = "s3://ca-climate-index/0_map_data/2021_tiger_census_tract/2021_ca_tract/"
ca_boundaries = gpd.read_file(census_shp_dir)

In [None]:
# Keep the tract identifier and geometry only, exposing the identifier as 'tract'
filtered_ca_boundaries = (
    ca_boundaries[['GEOID', 'geometry']]
    .rename(columns={'GEOID': 'tract'})
)

# Drop the leading character of every tract identifier
filtered_ca_boundaries['tract'] = filtered_ca_boundaries['tract'].str.slice(1)

# Show the prepared boundaries
filtered_ca_boundaries


# Spatial join the power plant data with the ca boundaries so we can attribute counties and tracts to power plants
* the result has roughly a thousand extra rows because a tract is repeated once for every power plant that falls within it; this is addressed when grouping by tract

In [None]:
# how='right' keeps every tract row; tracts with no plant inside them get NaN in the plant columns
ca_power_plants = gpd.sjoin(cleaned_power_plants, filtered_ca_boundaries, how='right', predicate='within')
ca_power_plants

### Another checker to count how many of a given tract show up in the dataset (that also have a CECPlantID) before we merge them and get the final counts

In [None]:
# Only joined rows that actually carry a power plant are considered
rows_with_plant = ca_power_plants.dropna(subset=['CECPlantID'])

# Boolean flag per row: True where the row belongs to the tract being checked
in_target_tract = rows_with_plant['tract'] == '6085505010'
print('Number of entries of designated tract:', int(in_target_tract.sum()))

### Grouping by tract as long as CECPlantID is not NaN and summing those so we have number of plants per tract

In [None]:
# For every tract, count the joined rows that have a plant ID (NaN IDs are not counted)
plant_ids_by_tract = ca_power_plants.groupby('tract')['CECPlantID']
tract_power_counts = plant_ids_by_tract.count().reset_index(name='Power_Plant_Count')

# Show the per-tract counts
tract_power_counts

### Now that we have grouped by tract, we can look up the count for a given tract and confirm that it matches the checker above

In [None]:
# Look up one tract in the grouped counts
tract_value = '6085505010'
is_requested_tract = tract_power_counts['tract'].eq(tract_value)
filtered_entries = tract_power_counts.loc[is_requested_tract]
print(filtered_entries)

In [None]:
# Group by 'County' and count the number of entries where 'CECPlantID' exists
county_power_counts = ca_power_plants.groupby('County')['CECPlantID'].apply(lambda x: x.notnull().sum()).reset_index(name='Power_Plant_Count')
# NOTE(review): this drops the first row of the grouped result by position; presumably it
# removes a blank/placeholder county value -- confirm the first row is not a real county
county_power_counts = county_power_counts[1:]

# Output the result
county_power_counts

## Drafting function call while we discuss methods

In [12]:
@append_metadata
def calc_power_plant(df, export=False, export_filename=None, varname = ''):
    '''
    Calculates the number of power plants per California census tract. Data is sourced from the California Energy Commission (CEC): https://cecgis-caenergy.opendata.arcgis.com/datasets/CAEnergy::california-power-plants/about
    
    Methods
    -------
    Geometry columns were merged between California 2021 tiger census tract data and CEC power plant data to attribute power plants to census tracts. Duplicate entries were removed based on matching CECPlantID. 

    Script
    ------
    built_power_plant.ipynb
    
    Parameters
    ----------
    df: geopandas.GeoDataFrame
        the dataframe containing the power plant data; the columns
        PlantName, CECPlantID, County and geometry are read from it
    export: True/False boolean
        False = will not upload resulting df containing the power plant metric to AWS
        True = will upload resulting df containing the power plant metric to AWS
    export_filename: string
        name of the csv file to be uploaded to AWS (required when export is True)
    varname: string
        not used inside this function; presumably consumed by the
        append_metadata decorator -- confirm against scripts.utils.write_metadata

    Returns
    -------
    DataFrame with one row per census tract and the columns 'tract' and
    'power_plant_count' (tracts without any power plant have a count of 0)

    Raises
    ------
    ValueError
        if export is True but no export_filename was given

    Note:
    This function assumes users have configured the AWS CLI such that their access key / secret key pair are stored in ~/.aws/credentials.
    See https://docs.aws.amazon.com/cli/latest/userguide/getting-started-install.html for guidance.
    '''
    # fail early: without a filename, to_csv would return a string instead of
    # writing a file and the upload step would be handed [None]
    if export and not export_filename:
        raise ValueError('export_filename must be provided when export=True')

    print('Data transformation: isolate relevant columns for metric calculation')
    print('Data transformation: check for duplicate plant IDs and retain just one')

    filtered_power_plants = df[['PlantName', 'CECPlantID', 'County','geometry']]

    # keep the first row seen for each CECPlantID
    cleaned_power_plants = filtered_power_plants.drop_duplicates(subset=['CECPlantID'], keep='first')

    # read in CA census tiger file
    census_shp_dir = "s3://ca-climate-index/0_map_data/2021_tiger_census_tract/2021_ca_tract/"
    ca_boundaries = gpd.read_file(census_shp_dir)

    # keep the tract identifier and geometry, exposing the identifier as 'tract'
    filtered_ca_boundaries = ca_boundaries[['GEOID', 'geometry']].rename(columns={'GEOID': 'tract'})

    # remove the first character from the 'tract' column
    filtered_ca_boundaries['tract'] = filtered_ca_boundaries['tract'].str[1:]

    print('Data transformation: merge geometries with California tiger census tract data (2021)')

    # how='right' keeps every tract; tracts with no plant inside get NaN plant columns
    ca_power_plants = gpd.sjoin(cleaned_power_plants, filtered_ca_boundaries, how='right', predicate='within')

    print('Data transformation: make new dataframe by grouping data by census tracts and sum multiple entries')
    # count() only counts non-null CECPlantID values, so tracts without a plant get 0
    tract_power_counts = (
        ca_power_plants.groupby('tract')['CECPlantID']
        .count()
        .reset_index(name='power_plant_count')
    )

    # export to csv and upload to AWS
    if export:
        # default to_csv settings: the DataFrame index is written as the first column
        tract_power_counts.to_csv(export_filename)
        bucket_name = 'ca-climate-index'
        directory = '3_fair_data/index_data'
        upload_csv_aws([export_filename], bucket_name, directory)

        # remove local files to clear up the directory; the GeoPackage is only
        # present if it was pulled into the working directory beforehand
        for local_file in ('built_cec_power_plants.gpkg', export_filename):
            if os.path.exists(local_file):
                os.remove(local_file)

    return tract_power_counts # returns df

In [13]:
# pull the GeoPackage (.gpkg) files from aws
bucket_name = 'ca-climate-index'
aws_dir = '2b_reproject/built_environment/utilities/ca_energy_commission/'
pull_gpkg_from_directory(bucket_name, aws_dir)

power_plants = gpd.read_file('built_cec_power_plants.gpkg')

# calculate the per-tract metric and, because export=True, upload the resulting csv to AWS
calc_power_plant(power_plants, export=True, export_filename = 'built_power_plant_metric.csv',
                 varname = 'built_cec_power_plants')

Saved GeoPackage as 'built_cec_power_plants.gpkg' locally
Saved GeoPackage as 'built_cec_transmission_lines.gpkg' locally


Unnamed: 0,tract,power_plant_count
0,6001400100,0
1,6001400200,0
2,6001400300,0
3,6001400400,0
4,6001400500,0
...,...,...
9124,6115040902,0
9125,6115041001,0
9126,6115041002,1
9127,6115041101,2
