In [1]:
import pandas as pd
import os
import sys
import boto3
import io
import geopandas as gpd

# suppress pandas purely educational warnings
from warnings import simplefilter
simplefilter(action="ignore", category=pd.errors.PerformanceWarning)

sys.path.append(os.path.expanduser('../../'))
from scripts.utils.file_helpers import pull_csv_from_directory, upload_csv_aws
from scripts.utils.write_metadata import append_metadata

# Once either this notebook or the power plant notebook is merged to main, this function can be called

In [2]:
def pull_gpkg_from_directory(bucket_name, directory):
    """
    Download every GeoPackage (.gpkg) found under an S3 prefix into the current
    working directory.

    Each matching object is saved under the base name of its key, so a local file
    with the same name is overwritten. A message is printed for every file saved;
    if the prefix contains no objects at all, a single notice is printed instead.

    Parameters:
    - bucket_name (str): The name of the S3 bucket.
    - directory (str): The directory (key prefix) within the bucket to search for GeoPackage files.

    Returns:
    - None
    """
    # Create an S3 client
    s3 = boto3.client('s3')

    # A single list_objects_v2 call returns at most 1,000 keys, so walk every
    # page of results rather than only the first response.
    paginator = s3.get_paginator('list_objects_v2')
    found_any_object = False

    for page in paginator.paginate(Bucket=bucket_name, Prefix=directory):
        # Pages for an empty prefix carry no 'Contents' entry
        for obj in page.get('Contents', []):
            found_any_object = True

            # Get the key (full path within the bucket) of the object
            key = obj['Key']

            # Skip anything that is not a .gpkg file
            if not key.endswith('.gpkg'):
                continue

            # Write the object straight to disk instead of buffering the whole
            # file in memory first
            gpkg_filename = os.path.basename(key)
            s3.download_file(bucket_name, key, gpkg_filename)

            print(f"Saved GeoPackage as '{gpkg_filename}' locally")

    if not found_any_object:
        print("No objects found in the specified directory.")

In [3]:
# Pull the reprojected HIFLD communication infrastructure GeoPackages from AWS
# into the current working directory
bucket_name = 'ca-climate-index'
aws_dir = '2b_reproject/built_environment/communication_infrastructure/homeland_infrastructure_foundation_level_data/'

pull_gpkg_from_directory(bucket_name, aws_dir)

Saved GeoPackage as 'built_hifld_cellular_towers.gpkg' locally
Saved GeoPackage as 'built_hifld_microwave_towers.gpkg' locally
Saved GeoPackage as 'built_hifld_mobile_towers.gpkg' locally
Saved GeoPackage as 'built_hifld_paging_towers.gpkg' locally
Saved GeoPackage as 'built_hifld_radio_towers.gpkg' locally
Saved GeoPackage as 'built_hifld_tv_contour.gpkg' locally


In [5]:
def _read_tower_layer(gpkg_path):
    """Read one locally saved GeoPackage and print 'complete' once it has loaded."""
    layer = gpd.read_file(gpkg_path)
    print('complete')
    return layer


# Load the five tower layers one after another
cellular_towers_data = _read_tower_layer('built_hifld_cellular_towers.gpkg')
microwave_towers_data = _read_tower_layer('built_hifld_microwave_towers.gpkg')
mobile_towers_data = _read_tower_layer('built_hifld_mobile_towers.gpkg')
paging_towers_data = _read_tower_layer('built_hifld_paging_towers.gpkg')
radio_towers_data = _read_tower_layer('built_hifld_radio_towers.gpkg')

complete
complete
complete
complete
complete


In [127]:
# Preview the mobile towers layer as loaded from its GeoPackage
mobile_towers_data

Unnamed: 0,OBJECTID,UniqSysID,Licensee,Callsign,LocNum,LatDeg,LatMin,LatSec,LatDir,LonDeg,...,USCB_GEOID,USCB_NAME,USCB_NAMELSAD,USCB_MTFCC,USCB_FUNCSTAT,USCB_ALAND,USCB_AWATER,USCB_INTPTLAT,USCB_INTPTLON,geometry
0,9,1100969,SCRIPPS BROADCASTING HOLDINGS LLC,BLP00316,1,32.0,43.0,11.2,N,117.0,...,06073003401,34.01,Census Tract 34.01,G5020,S,2805035,0,+32.7250255,-117.0950446,POINT (-117.09836 32.71978)
1,10,1100969,"Scripps Media, Inc.",BLP00316,1,32.0,43.0,11.2,N,117.0,...,06073003401,34.01,Census Tract 34.01,G5020,S,2805035,0,+32.7250255,-117.0950446,POINT (-117.09836 32.71978)
2,627,1101893,SCRIPPS BROADCASTING HOLDINGS LLC,BLP01481,1,32.0,43.0,11.2,N,117.0,...,06073003401,34.01,Census Tract 34.01,G5020,S,2805035,0,+32.7250255,-117.0950446,POINT (-117.09836 32.71978)
3,628,1101893,"Scripps Media, Inc.",BLP01481,1,32.0,43.0,11.2,N,117.0,...,06073003401,34.01,Census Tract 34.01,G5020,S,2805035,0,+32.7250255,-117.0950446,POINT (-117.09836 32.71978)
4,5694,1112464,SCRIPPS BROADCASTING HOLDINGS LLC,KMM789,3,32.0,43.0,11.2,N,117.0,...,06073003401,34.01,Census Tract 34.01,G5020,S,2805035,0,+32.7250255,-117.0950446,POINT (-117.09836 32.71978)
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1615,15784,4269322,,WRFP220,28,33.0,51.0,51.7,N,118.0,...,06037543321,5433.21,Census Tract 5433.21,G5020,S,4253794,0,+33.8648872,-118.2521104,POINT (-118.26114 33.86436)
1616,15755,4269322,"Stuckey, Cameron",WRFP220,32,33.0,57.0,12.4,N,118.0,...,06037600702,6007.02,Census Tract 6007.02,G5020,S,2387148,43367,+33.9497147,-118.3335012,POINT (-118.33919 33.95345)
1617,15788,4269322,,WRFP220,32,33.0,57.0,12.4,N,118.0,...,06037600702,6007.02,Census Tract 6007.02,G5020,S,2387148,43367,+33.9497147,-118.3335012,POINT (-118.33919 33.95345)
1618,15762,4269322,"Stuckey, Cameron",WRFP220,24,37.0,45.0,5.7,N,122.0,...,06001409000,4090,Census Tract 4090,G5020,S,16913353,4389442,+37.7299594,-122.2164979,POINT (-122.20053 37.75158)


In [124]:
# List the columns available in the cellular towers layer
cellular_towers_data.columns

Index(['OBJECTID', 'UniqSysID', 'Licensee', 'Callsign', 'LocNum', 'LatDeg',
       'LatMin', 'LatSec', 'LatDir', 'LonDeg', 'LonMin', 'LonSec', 'LonDir',
       'LocAdd', 'LocCity', 'LocCounty', 'LocState', 'Nepa', 'QZone', 'TowReg',
       'SupStruc', 'AllStruc', 'StrucType', 'LicStatus', 'latdec', 'londec',
       'url', 'USCB_STATEFP', 'USCB_COUNTYFP', 'USCB_TRACTCE', 'USCB_GEOID',
       'USCB_NAME', 'USCB_NAMELSAD', 'USCB_MTFCC', 'USCB_FUNCSTAT',
       'USCB_ALAND', 'USCB_AWATER', 'USCB_INTPTLAT', 'USCB_INTPTLON',
       'geometry'],
      dtype='object')

In [133]:
# List the columns available in the radio towers layer; its schema differs from the
# cellular layer (upper-case column names, no UniqSysID column)
radio_towers_data.columns

Index(['OBJECTID', 'CALLSIGN', 'FREQUENCY', 'SERVICE', 'CLASS', 'STATUS',
       'CITY', 'STATE', 'COUNTRY', 'FILENUM', 'FACID', 'LAT', 'LON',
       'LICENSEE', 'LATDD', 'LONDD', 'USCB_STATEFP', 'USCB_COUNTYFP',
       'USCB_TRACTCE', 'USCB_GEOID', 'USCB_NAME', 'USCB_NAMELSAD',
       'USCB_MTFCC', 'USCB_FUNCSTAT', 'USCB_ALAND', 'USCB_AWATER',
       'USCB_INTPTLAT', 'USCB_INTPTLON', 'geometry'],
      dtype='object')

The TV contour layer is a bigger file, so it will be run later.

In [None]:
# Load the TV contour layer from its locally saved GeoPackage
# NOTE(review): this cell has no execution count, so it was not run in the saved notebook
tv_contour_data = gpd.read_file('built_hifld_tv_contour.gpkg')

In [134]:
# Load the 2021 California census tract boundaries (TIGER) from S3
census_shp_dir = "s3://ca-climate-index/0_map_data/2021_tiger_census_tract/2021_ca_tract/"
ca_boundaries = gpd.read_file(census_shp_dir)

# Keep only the tract identifier and geometry, expose GEOID as 'tract', and drop
# the first character of each identifier
filtered_ca_boundaries = (
    ca_boundaries[['GEOID', 'geometry']]
    .rename(columns={'GEOID': 'tract'})
    .assign(tract=lambda boundaries: boundaries['tract'].str[1:])
)

# Show the resulting tract lookup
filtered_ca_boundaries

Unnamed: 0,tract,geometry
0,6085504321,"POLYGON ((-121.87556 37.39924, -121.87535 37.3..."
1,6085504410,"POLYGON ((-121.88886 37.40758, -121.88576 37.4..."
2,6085507003,"POLYGON ((-122.02489 37.21683, -122.02459 37.2..."
3,6085507004,"POLYGON ((-121.99304 37.22562, -121.99249 37.2..."
4,6085502204,"POLYGON ((-121.93167 37.29803, -121.92801 37.3..."
...,...,...
9124,6059001303,"POLYGON ((-117.95917 33.92458, -117.95888 33.9..."
9125,6059001304,"POLYGON ((-117.95918 33.92820, -117.95831 33.9..."
9126,6059001401,"POLYGON ((-117.95056 33.94503, -117.95055 33.9..."
9127,6013367200,"POLYGON ((-122.34551 37.96355, -122.34550 37.9..."


In [143]:
# Tower layers to process; their position in this list numbers the joined results
communication_infrastructure_data = [
    cellular_towers_data,
    microwave_towers_data,
    mobile_towers_data,
    paging_towers_data,
    radio_towers_data,
]

# Case-sensitive substrings; a column is kept when its name contains any of them
words_to_search = [
    'OBJECTID',
    'UniqSysID',
    'Licensee',
    'CALLSIGN',
    'LocCounty',
    'CITY',
    'AllStruc',
    'StrucType',
    'LicStatus',
    'STATUS',
    'FREQUENCY',
    'geometry',
]

def filter_and_spatial_join(communication_infrastructure_data, filtered_ca_boundaries, words_to_search):
    """
    Trim each layer to the columns of interest and attach census tracts to its features.

    Parameters:
    - communication_infrastructure_data (list): GeoDataFrames to process, in order.
    - filtered_ca_boundaries (GeoDataFrame): Tract geometries to join against.
    - words_to_search (list of str): A column is kept when its name contains any of
      these strings (case-sensitive substring match).

    Returns:
    - dict: Maps 'joined_filtered_<position>' to the inner spatial join of the trimmed
      layer with the boundaries, where <position> is the layer's index in the input list.

    Note: a later cell in this notebook redefines this function with an extra
    `columns_to_check` parameter.
    """
    joined_layers = {}

    for position, layer in enumerate(communication_infrastructure_data):
        # Column selection keeps the layer's original index
        kept_columns = [
            column for column in layer.columns
            if any(word in column for word in words_to_search)
        ]
        trimmed_layer = layer[kept_columns].copy()
        trimmed_label = f"filtered_{position}"

        # Keep only features that intersect at least one boundary geometry
        joined_layers[f"joined_{trimmed_label}"] = gpd.sjoin(
            trimmed_layer,
            filtered_ca_boundaries,
            how='inner',
            predicate='intersects',
        )

    return joined_layers

# Reload the tract boundaries and rebuild the 'tract' lookup (GEOID with its first
# character dropped)
ca_boundaries = gpd.read_file(census_shp_dir)
filtered_ca_boundaries = (
    ca_boundaries[['GEOID', 'geometry']]
    .rename(columns={'GEOID': 'tract'})
    .assign(tract=lambda boundaries: boundaries['tract'].str[1:])
)

# Trim every tower layer and attach the tract it falls in
filtered_and_joined_dfs = filter_and_spatial_join(
    communication_infrastructure_data,
    filtered_ca_boundaries,
    words_to_search,
)


In [152]:
def filter_and_spatial_join(communication_infrastructure_data, filtered_ca_boundaries, words_to_search, columns_to_check):
    """
    Trim each layer to the columns of interest, attach census tracts, and drop duplicates.

    Parameters:
    - communication_infrastructure_data (list): GeoDataFrames to process, in order.
    - filtered_ca_boundaries (GeoDataFrame): Tract geometries to join against.
    - words_to_search (list of str): A column is kept when its name contains any of
      these strings (case-sensitive substring match).
    - columns_to_check (list of str): Columns that together identify a duplicate row.
      De-duplication is applied only to layers that contain every one of them.

    Returns:
    - dict: Maps 'joined_filtered_<position>' to the processed GeoDataFrame, where
      <position> is the layer's index in the input list.
    """
    joined_layers = {}

    for position, layer in enumerate(communication_infrastructure_data):
        # Column selection keeps the layer's original index
        kept_columns = [
            column for column in layer.columns
            if any(word in column for word in words_to_search)
        ]
        trimmed_layer = layer[kept_columns].copy()
        trimmed_label = f"filtered_{position}"

        # Keep only features that intersect at least one boundary geometry
        joined_layer = gpd.sjoin(
            trimmed_layer,
            filtered_ca_boundaries,
            how='inner',
            predicate='intersects',
        )

        # Layers missing any of the key columns are passed through untouched
        has_every_key = all(col in joined_layer.columns for col in columns_to_check)
        if has_every_key:
            # Keep the first row for each combination of key values
            repeated_rows = joined_layer.duplicated(subset=columns_to_check, keep='first')
            joined_layer = joined_layer[~repeated_rows]

        joined_layers[f"joined_{trimmed_label}"] = joined_layer

    return joined_layers

# Usage example: reload the tract boundaries and rebuild the 'tract' lookup
# (GEOID with its first character dropped)
ca_boundaries = gpd.read_file(census_shp_dir)
filtered_ca_boundaries = (
    ca_boundaries[['GEOID', 'geometry']]
    .rename(columns={'GEOID': 'tract'})
    .assign(tract=lambda boundaries: boundaries['tract'].str[1:])
)

# Column(s) whose repeated values mark a row as a duplicate
# NOTE(review): keying on UniqSysID alone keeps only the first row per system ID, so rows
# that share an ID but sit at different point geometries (e.g. 4269322 in the raw mobile
# towers preview) are collapsed into one — confirm this is intended.
columns_to_check = ['UniqSysID']

filtered_and_joined_dfs = filter_and_spatial_join(
    communication_infrastructure_data,
    filtered_ca_boundaries,
    words_to_search,
    columns_to_check,
)


In [155]:
# Give each processed layer a descriptive name; the numeric suffix in each key is the
# layer's position in communication_infrastructure_data
(
    cellular_tower_filtered,
    microwave_tower_filtered,
    mobile_tower_filtered,
    paging_tower_filtered,
    radio_tower_filtered,
) = (
    filtered_and_joined_dfs[result_key]
    for result_key in (
        'joined_filtered_0',
        'joined_filtered_1',
        'joined_filtered_2',
        'joined_filtered_3',
        'joined_filtered_4',
    )
)

# Same layers, collected in their original order
filtered_communication_infrastructure_data = [
    cellular_tower_filtered,
    microwave_tower_filtered,
    mobile_tower_filtered,
    paging_tower_filtered,
    radio_tower_filtered,
]


In [156]:
# Inspect the mobile towers after column filtering, the tract join, and UniqSysID de-duplication
mobile_tower_filtered

Unnamed: 0,OBJECTID,UniqSysID,Licensee,LocCounty,AllStruc,StrucType,LicStatus,geometry,index_right,tract
0,9,1100969,SCRIPPS BROADCASTING HOLDINGS LLC,SAN DIEGO,,,A,POINT (-117.09836 32.71978),2892,6073003401
2,627,1101893,SCRIPPS BROADCASTING HOLDINGS LLC,SAN DIEGO,,,A,POINT (-117.09836 32.71978),2892,6073003401
4,5694,1112464,SCRIPPS BROADCASTING HOLDINGS LLC,SAN DIEGO,,,A,POINT (-117.09836 32.71978),2892,6073003401
8,6390,1118850,SCRIPPS BROADCASTING HOLDINGS LLC,SAN DIEGO,21.0,NNTANN,A,POINT (-117.09836 32.71978),2892,6073003401
16,13807,2322197,SCRIPPS BROADCASTING HOLDINGS LLC,SAN DIEGO,,,A,POINT (-117.09836 32.71978),2892,6073003401
...,...,...,...,...,...,...,...,...,...,...
1595,15267,3311789,"KFTV LICENSE PARTNERSHIP, G.P.",FRESNO,,,A,POINT (-119.80375 36.83617),3999,6019004503
1597,15277,3311794,"KDTV LICENSE PARTNERSHIP, G.P.",SAN FRANCISCO,,,A,POINT (-122.39722 37.79053),3238,6075061501
1604,15696,3956006,"CAPSTAR TX, LLC",LOS ANGELES,,,A,POINT (-118.19450 33.95031),759,6037536000
1606,15698,3978631,"Television City Productions, LLC",LOS ANGELES,,,A,POINT (-118.35889 34.07500),4353,6037214501


In [147]:
# Inspect the mobile towers after column filtering and the tract join
# NOTE(review): this cell's execution count (147) predates the de-duplicating cell (152),
# so its saved output still shows repeated UniqSysID values — re-run or remove.
mobile_tower_filtered

Unnamed: 0,OBJECTID,UniqSysID,Licensee,LocCounty,AllStruc,StrucType,LicStatus,geometry,index_right,tract
0,9,1100969,SCRIPPS BROADCASTING HOLDINGS LLC,SAN DIEGO,,,A,POINT (-117.09836 32.71978),2892,6073003401
1,10,1100969,"Scripps Media, Inc.",SAN DIEGO,,,A,POINT (-117.09836 32.71978),2892,6073003401
2,627,1101893,SCRIPPS BROADCASTING HOLDINGS LLC,SAN DIEGO,,,A,POINT (-117.09836 32.71978),2892,6073003401
3,628,1101893,"Scripps Media, Inc.",SAN DIEGO,,,A,POINT (-117.09836 32.71978),2892,6073003401
4,5694,1112464,SCRIPPS BROADCASTING HOLDINGS LLC,SAN DIEGO,,,A,POINT (-117.09836 32.71978),2892,6073003401
...,...,...,...,...,...,...,...,...,...,...
1615,15784,4269322,,LOS ANGELES,,,A,POINT (-118.26114 33.86436),7217,6037543321
1616,15755,4269322,"Stuckey, Cameron",LOS ANGELES,,,A,POINT (-118.33919 33.95345),731,6037600702
1617,15788,4269322,,LOS ANGELES,,,A,POINT (-118.33919 33.95345),731,6037600702
1618,15762,4269322,"Stuckey, Cameron",ALAMEDA,,,A,POINT (-122.20053 37.75158),2008,6001409000


In [146]:
# Inspect the radio towers after column filtering and the tract join
radio_tower_filtered

Unnamed: 0,OBJECTID,CALLSIGN,FREQUENCY,STATUS,CITY,geometry,index_right,tract
0,3,DK201FB,88.1 MHz,LIC,TULARE,POINT (-119.25176 36.22562),5573,6107002402
1,4435,KCRZ,104.9 MHz,LIC,TIPTON,POINT (-119.25205 36.16862),5573,6107002402
2,7,DK203CP,88.5 MHz,LIC,PENRYN,POINT (-121.11184 38.87025),4949,6061020502
3,6035,KKFS,103.9 MHz,LIC,LINCOLN,POINT (-121.12604 38.87575),4949,6061020502
4,13,DK208EI,89.5 MHz,LIC,PORTERVILLE,POINT (-119.06405 36.09392),5870,6107003503
...,...,...,...,...,...,...,...,...
1361,9699,KYRR-LP,93.3 MHz,LIC,NEVADA CITY,POINT (-120.94244 39.31574),9085,6057000801
1362,9789,KZED-LP,107.5 MHz,LIC,LA GRANGE,POINT (-120.35660 37.66248),1898,6043000200
1363,9822,KZIQ-FM,92.7 MHz,LIC,RIDGECREST,POINT (-117.64401 35.61615),8458,6029005410
1364,9947,KZSU,90.1 MHz,LIC,STANFORD,POINT (-122.17914 37.41167),5341,6085511705


In [None]:
# NOTE(review): `ca_power_plants` is not defined anywhere in the visible notebook and this
# cell has no execution count — it looks like a template carried over from the power plant
# notebook; confirm before running. It also reassigns `columns_to_check`, which an
# earlier cell sets to ['UniqSysID'].

# Establish columns to check for duplicates
columns_to_check = ['County', 'PlantName']

# If rows are identical in county and plant name, one last check with the capacity
# if within 3L, the duplicate will be dropped
# NOTE(review): the filter below tests each row's own |Capacity_L| against the threshold,
# not the difference in capacity between the duplicated rows — confirm which was intended.
capacity_threshold = 3.0

# Find duplicate rows based on specified columns
# (the first occurrence of each County/PlantName pair is not marked as a duplicate)
duplicate_mask = ca_power_plants.duplicated(subset=columns_to_check, keep='first')

# Filter rows based on condition on the numeric column
# (a row is dropped only when it is a repeat AND its |Capacity_L| is at most the threshold)
cleaned_power_plants = ca_power_plants[~(duplicate_mask & (ca_power_plants['Capacity_L'].abs() <= capacity_threshold))]

cleaned_power_plants