In [1]:
import pandas as pd
import os
import sys
import boto3
import io
import geopandas as gpd

# suppress pandas purely educational warnings
from warnings import simplefilter
simplefilter(action="ignore", category=pd.errors.PerformanceWarning)

sys.path.append(os.path.expanduser('../../'))
from scripts.utils.file_helpers import pull_csv_from_directory, upload_csv_aws#, pull_gpkg_from_directory
from scripts.utils.write_metadata import append_metadata

In [2]:
def pull_gpkg_from_directory(bucket_name, directory):
    """
    Download every GeoPackage (.gpkg) object under an S3 prefix into the
    current working directory.

    Each matching object is saved under its base filename (the part of the key
    after the last '/'), and one confirmation line is printed per file.

    Parameters:
    - bucket_name (str): The name of the S3 bucket.
    - directory (str): The key prefix within the bucket to search for GeoPackage files.

    Returns:
    - None. If the prefix contains no objects at all, a notice is printed instead.
    """
    # Create an S3 client
    s3 = boto3.client('s3')

    # A single list_objects_v2 call returns at most 1000 keys; use the paginator
    # so prefixes holding more objects are not silently truncated.
    paginator = s3.get_paginator('list_objects_v2')

    found_any_object = False
    for page in paginator.paginate(Bucket=bucket_name, Prefix=directory):
        # 'Contents' is absent from a page when nothing matches the prefix
        for obj in page.get('Contents', []):
            found_any_object = True
            key = obj['Key']

            # Only GeoPackage files are of interest
            if not key.endswith('.gpkg'):
                continue

            # Write straight to disk instead of buffering the whole file in memory
            gpkg_filename = os.path.basename(key)
            s3.download_file(bucket_name, key, gpkg_filename)

            print(f"Saved GeoPackage as '{gpkg_filename}' locally")

    if not found_any_object:
        print("No objects found in the specified directory.")


In [4]:
# Pull the GeoPackage (.gpkg) files stored under the S3 prefix below into the
# current working directory
bucket_name = 'ca-climate-index'
aws_dir = '2b_reproject/built_environment/communication_infrastructure/homeland_infrastructure_foundation_level_data/'

pull_gpkg_from_directory(bucket_name, aws_dir)

Saved GeoPackage as 'built_hifld_cellular_towers.gpkg' locally
Saved GeoPackage as 'built_hifld_microwave_towers.gpkg' locally
Saved GeoPackage as 'built_hifld_mobile_towers.gpkg' locally
Saved GeoPackage as 'built_hifld_paging_towers.gpkg' locally
Saved GeoPackage as 'built_hifld_radio_towers.gpkg' locally


In [34]:
# Load each locally saved GeoPackage into its own GeoDataFrame; 'complete' is
# printed after every successful read as a simple progress marker.
cellular_towers_data = gpd.read_file('built_hifld_cellular_towers.gpkg')
print('complete')
microwave_towers_data = gpd.read_file('built_hifld_microwave_towers.gpkg')
print('complete')
mobile_towers_data = gpd.read_file('built_hifld_mobile_towers.gpkg')
print('complete')
paging_towers_data = gpd.read_file('built_hifld_paging_towers.gpkg')
print('complete')
radio_towers_data = gpd.read_file('built_hifld_radio_towers.gpkg')
print('complete')
# NOTE(review): 'built_hifld_tv_contour.gpkg' is not among the files reported as
# saved by the download cell's output; confirm where this file comes from so the
# notebook works on a fresh checkout.
tv_contour_data = gpd.read_file('built_hifld_tv_contour.gpkg')
print('complete')

complete
complete
complete
complete
complete
complete


In [36]:
# Display the TV contour layer (the notebook truncates the rendered frame)
tv_contour_data

Unnamed: 0,OBJECTID,CALL,PREFIX,ARN,SERVICE,ID,ID0,PURPOSE,APP_STATUS,DATE,...,USCB_GEOID,USCB_NAME,USCB_NAMELSAD,USCB_MTFCC,USCB_FUNCSTAT,USCB_ALAND,USCB_AWATER,USCB_INTPTLAT,USCB_INTPTLON,geometry
0,15,KVBC,BDSTA,20021022ABM,DS,616756.0,69677.0,,,,...,06071010300,103,Census Tract 103,G5020,S,18005085395,6351750,+35.1478612,-115.8783789,"MULTIPOLYGON (((-114.88092 35.03325, -114.9433..."
1,4618,KINC,BDSTA,20030131AJV,DS,626402.0,67089.0,,,,...,06071010300,103,Census Tract 103,G5020,S,18005085395,6351750,+35.1478612,-115.8783789,"POLYGON ((-115.41001 35.61202, -115.44051 35.6..."
2,5360,KMOH-TV,BDSTA,20030212ABJ,DS,628243.0,24753.0,,,,...,06071010300,103,Census Tract 103,G5020,S,18005085395,6351750,+35.1478612,-115.8783789,"POLYGON ((-115.33194 34.67359, -115.35362 34.7..."
3,7171,KVCR-TV,BDSTA,20040624AEX,DS,1001706.0,58795.0,,,,...,06071010300,103,Census Tract 103,G5020,S,18005085395,6351750,+35.1478612,-115.8783789,"POLYGON ((-116.80197 34.86047, -116.74780 34.8..."
4,8778,KMCC,BDSTA,20040318ACX,DS,986655.0,41237.0,,,,...,06071010300,103,Census Tract 103,G5020,S,18005085395,6351750,+35.1478612,-115.8783789,"POLYGON ((-114.85579 34.94596, -114.86286 34.9..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
142143,30581,KNXT,BDSTA,20031210AHO,DS,738318.0,16950.0,,,,...,06107001701,17.01,Census Tract 17.01,G5020,S,2368718,0,+36.3199536,-119.2871729,"POLYGON ((-119.29338 36.32690, -119.29226 36.3..."
142144,13746,KNXT,BDSTA,20031210AHO,DS,738318.0,16950.0,,,,...,06107001703,17.03,Census Tract 17.03,G5020,S,7716437,0,+36.3197803,-119.2516331,"POLYGON ((-119.27877 36.32659, -119.27877 36.3..."
142145,30581,KNXT,BDSTA,20031210AHO,DS,738318.0,16950.0,,,,...,06107001703,17.03,Census Tract 17.03,G5020,S,7716437,0,+36.3197803,-119.2516331,"POLYGON ((-119.27877 36.32659, -119.27877 36.3..."
142146,13746,KNXT,BDSTA,20031210AHO,DS,738318.0,16950.0,,,,...,06107001800,18,Census Tract 18,G5020,S,2554534,0,+36.3198504,-119.3047728,"POLYGON ((-119.31299 36.32699, -119.30995 36.3..."


In [35]:
# List the TV contour column names to find usable identifier / county fields
tv_contour_data.columns

Index(['OBJECTID', 'CALL', 'PREFIX', 'ARN', 'SERVICE', 'ID', 'ID0', 'PURPOSE',
       'APP_STATUS', 'DATE', 'FAC_STATUS', 'DATE0', 'LICENSEE', 'STATUS',
       'STATE', 'CITY', 'CHANNEL', 'LON', 'LAT', 'CONTOUR', 'GlobalID',
       'SHAPE_Leng', 'SHAPE_Area', 'USCB_STATEFP', 'USCB_COUNTYFP',
       'USCB_TRACTCE', 'USCB_GEOID', 'USCB_NAME', 'USCB_NAMELSAD',
       'USCB_MTFCC', 'USCB_FUNCSTAT', 'USCB_ALAND', 'USCB_AWATER',
       'USCB_INTPTLAT', 'USCB_INTPTLON', 'geometry'],
      dtype='object')

In [5]:
def county_count(df, county_col, county, counter):
    """
    Print how many rows a county has before and after de-duplication.

    Parameters:
    - df: frame to inspect.
    - county_col (str): name of the column holding the county value.
    - county: county value to select (compared with ==, so case must match the data).
    - counter (str): column that, together with `county_col`, defines a duplicate.

    Returns:
    - None. The two row counts are printed.
    """
    in_county = df.loc[df[county_col] == county]
    rows_before = len(in_county)
    rows_after = len(in_county.drop_duplicates(subset=[county_col, counter]))

    print(f'Length of df for {county} county without dropping duplicates:  {rows_before}')
    print(f'Length of df for {county} county after dropping duplicates: {rows_after}')

# Compare raw vs. de-duplicated microwave-tower row counts for 'SANTA CLARA',
# treating rows with the same (LocCounty, Callsign) pair as duplicates
county_count(microwave_towers_data, 'LocCounty', 'SANTA CLARA', 'Callsign')

Length of df for SANTA CLARA county without dropping duplicates:  1227
Length of df for SANTA CLARA county after dropping duplicates: 1048


In [7]:
# Isolate the paging-tower rows whose LocCounty is 'FRESNO'
Fresno = paging_towers_data[paging_towers_data['LocCounty']=='FRESNO']
# Disabled exploration steps (an extra not-null filter and a show-all-rows display option):
#Fresno = Fresno[Fresno['cellular_towers_data_count'].notna()]
#pd.set_option('display.max_rows', None)  # None means display all rows
# Keep one row per (LocCounty, UniqSysID) pair
iso_fresno = Fresno.drop_duplicates(subset=['LocCounty', 'UniqSysID'])

iso_fresno.head()

Unnamed: 0,UniqSysID,Licensee,Callsign,LocNum,LatDeg,LatMin,LatSec,LatDir,LonDeg,LonMin,...,USCB_GEOID,USCB_NAME,USCB_NAMELSAD,USCB_MTFCC,USCB_FUNCSTAT,USCB_ALAND,USCB_AWATER,USCB_INTPTLAT,USCB_INTPTLON,geometry
31,2305,"Spok, Inc.",KNKG834,29,37.0,4.0,25.8,N,119.0,25.0,...,6019006409,64.09,Census Tract 64.09,G5020,S,560488497,15925621,37.2058395,-119.2805522,POINT (-119.43039 37.07383)
179,1173,Fresno Mobile Radio Co.,KMA830,1,36.0,18.0,18.8,N,120.0,24.0,...,6019007903,79.03,Census Tract 79.03,G5020,S,1823994219,976972,36.204251,-120.4089333,POINT (-120.40347 36.30522)
270,1180,"AMS Spectrum Holdings, LLC",KMB305,42,37.0,4.0,10.8,N,119.0,25.0,...,6019006409,64.09,Census Tract 64.09,G5020,S,560488497,15925621,37.2058395,-119.2805522,POINT (-119.42761 37.06967)
298,2832,"Vincent Communications, Inc",KNKK227,1,36.0,55.0,48.8,N,119.0,38.0,...,6019006405,64.05,Census Tract 64.05,G5020,S,353743597,11890100,37.0042938,-119.5635349,POINT (-119.63875 36.93022)
300,2834,"Vincent Communications, Inc",KNKK231,1,36.0,44.0,6.8,N,119.0,47.0,...,6019000100,1.0,Census Tract 1,G5020,S,843445,0,36.7369244,-119.7927732,POINT (-119.78708 36.73522)


In [8]:
# Load the 2021 California census-tract boundaries from S3
census_shp_dir = "s3://ca-climate-index/0_map_data/2021_tiger_census_tract/2021_ca_tract/"
ca_boundaries = gpd.read_file(census_shp_dir)

# Keep only the tract identifier and geometry, rename GEOID -> tract, drop the
# first character of each identifier, then reproject with crs=4269
filtered_ca_boundaries = (
    ca_boundaries[['GEOID', 'geometry']]
    .rename(columns={'GEOID': 'tract'})
    .assign(tract=lambda frame: frame['tract'].str[1:])
    .to_crs(crs=4269)
)

# Show the resulting GeoDataFrame
filtered_ca_boundaries

Unnamed: 0,tract,geometry
0,6085504321,"POLYGON ((-121.87556 37.39924, -121.87535 37.3..."
1,6085504410,"POLYGON ((-121.88886 37.40758, -121.88576 37.4..."
2,6085507003,"POLYGON ((-122.02489 37.21683, -122.02459 37.2..."
3,6085507004,"POLYGON ((-121.99304 37.22562, -121.99249 37.2..."
4,6085502204,"POLYGON ((-121.93167 37.29803, -121.92801 37.3..."
...,...,...
9124,6059001303,"POLYGON ((-117.95917 33.92458, -117.95888 33.9..."
9125,6059001304,"POLYGON ((-117.95918 33.92820, -117.95831 33.9..."
9126,6059001401,"POLYGON ((-117.95056 33.94503, -117.95055 33.9..."
9127,6013367200,"POLYGON ((-122.34551 37.96355, -122.34550 37.9..."


In [10]:
# Read the tract-to-county lookup table (CSV) from S3.
# The path gets its own name so `ca_tract_county` only ever refers to the table.
ca_tract_county_path = "s3://ca-climate-index/0_map_data/ca_tracts_county.csv"
ca_tract_county = gpd.read_file(ca_tract_county_path)

# Drop the two columns that are not part of the lookup
ca_tract_county = ca_tract_county.drop(columns=['field_1', 'geometry'])

# Normalize column names and all string values to lowercase so they can be
# matched against lower-cased county names elsewhere in the notebook.
# Series.map is applied per column because DataFrame.applymap is deprecated in
# recent pandas; non-string values pass through unchanged.
ca_tract_county.columns = ca_tract_county.columns.str.lower()
ca_tract_county = ca_tract_county.apply(
    lambda column: column.map(lambda value: value.lower() if isinstance(value, str) else value)
)

ca_tract_county

Unnamed: 0,tract,countyfp,county
0,06085504321,085,santa clara
1,06085504410,085,santa clara
2,06085507003,085,santa clara
3,06085507004,085,santa clara
4,06085502204,085,santa clara
...,...,...,...
9124,06059001303,059,orange
9125,06059001304,059,orange
9126,06059001401,059,orange
9127,06013367200,013,contra costa


In [19]:
# Inspect the cellular-tower columns; identifier column noted for this layer: UniqSysID
cellular_towers_data.columns

Index(['OBJECTID', 'UniqSysID', 'Licensee', 'Callsign', 'LocNum', 'LatDeg',
       'LatMin', 'LatSec', 'LatDir', 'LonDeg', 'LonMin', 'LonSec', 'LonDir',
       'LocAdd', 'LocCity', 'LocCounty', 'LocState', 'Nepa', 'QZone', 'TowReg',
       'SupStruc', 'AllStruc', 'StrucType', 'LicStatus', 'latdec', 'londec',
       'url', 'USCB_STATEFP', 'USCB_COUNTYFP', 'USCB_TRACTCE', 'USCB_GEOID',
       'USCB_NAME', 'USCB_NAMELSAD', 'USCB_MTFCC', 'USCB_FUNCSTAT',
       'USCB_ALAND', 'USCB_AWATER', 'USCB_INTPTLAT', 'USCB_INTPTLON',
       'geometry'],
      dtype='object')

In [18]:
# Inspect the microwave-tower columns; identifier column noted for this layer: Callsign
microwave_towers_data.columns

Index(['OBJECTID', 'LicID', 'Licensee', 'Callsign', 'LocNum', 'LatDeg',
       'LatMin', 'LatSec', 'LatDir', 'LonDeg', 'LonMin', 'LonSec', 'LonDir',
       'LocAdd', 'LocCity', 'LocCounty', 'LocState', 'Nepa', 'QZone', 'TowReg',
       'SupStruc', 'AllStruc', 'StrucType', 'LicStatus', 'latdec', 'londec',
       'url', 'GlobalID', 'USCB_STATEFP', 'USCB_COUNTYFP', 'USCB_TRACTCE',
       'USCB_GEOID', 'USCB_NAME', 'USCB_NAMELSAD', 'USCB_MTFCC',
       'USCB_FUNCSTAT', 'USCB_ALAND', 'USCB_AWATER', 'USCB_INTPTLAT',
       'USCB_INTPTLON', 'geometry'],
      dtype='object')

In [17]:
# Inspect the mobile-tower columns; identifier column noted for this layer: UniqSysID
mobile_towers_data.columns

Index(['OBJECTID', 'UniqSysID', 'Licensee', 'Callsign', 'LocNum', 'LatDeg',
       'LatMin', 'LatSec', 'LatDir', 'LonDeg', 'LonMin', 'LonSec', 'LonDir',
       'LocAdd', 'LocCity', 'LocCounty', 'LocState', 'Nepa', 'QZone', 'TowReg',
       'SupStruc', 'AllStruc', 'StrucType', 'LicStatus', 'latdec', 'londec',
       'url', 'USCB_STATEFP', 'USCB_COUNTYFP', 'USCB_TRACTCE', 'USCB_GEOID',
       'USCB_NAME', 'USCB_NAMELSAD', 'USCB_MTFCC', 'USCB_FUNCSTAT',
       'USCB_ALAND', 'USCB_AWATER', 'USCB_INTPTLAT', 'USCB_INTPTLON',
       'geometry'],
      dtype='object')

In [16]:
# Inspect the paging-tower columns; identifier column noted for this layer: UniqSysID
paging_towers_data.columns

Index(['UniqSysID', 'Licensee', 'Callsign', 'LocNum', 'LatDeg', 'LatMin',
       'LatSec', 'LatDir', 'LonDeg', 'LonMin', 'LonSec', 'LonDir', 'LocAdd',
       'LocCity', 'LocCounty', 'LocState', 'Nepa', 'QZone', 'TowReg',
       'SupStruc', 'AllStruc', 'StrucType', 'LicStatus', 'RSC', 'latdec',
       'londec', 'url', 'USCB_STATEFP', 'USCB_COUNTYFP', 'USCB_TRACTCE',
       'USCB_GEOID', 'USCB_NAME', 'USCB_NAMELSAD', 'USCB_MTFCC',
       'USCB_FUNCSTAT', 'USCB_ALAND', 'USCB_AWATER', 'USCB_INTPTLAT',
       'USCB_INTPTLON', 'geometry'],
      dtype='object')

In [15]:
# Inspect the radio-tower columns; identifier column noted for this layer: CALLSIGN
radio_towers_data.columns

Index(['OBJECTID', 'CALLSIGN', 'FREQUENCY', 'SERVICE', 'CLASS', 'STATUS',
       'CITY', 'STATE', 'COUNTRY', 'FILENUM', 'FACID', 'LAT', 'LON',
       'LICENSEE', 'LATDD', 'LONDD', 'USCB_STATEFP', 'USCB_COUNTYFP',
       'USCB_TRACTCE', 'USCB_GEOID', 'USCB_NAME', 'USCB_NAMELSAD',
       'USCB_MTFCC', 'USCB_FUNCSTAT', 'USCB_ALAND', 'USCB_AWATER',
       'USCB_INTPTLAT', 'USCB_INTPTLON', 'geometry'],
      dtype='object')

In [40]:
# Layers to process, in the order that filter_and_spatial_join pairs with its layer names
communication_infrastructure_data = [cellular_towers_data, microwave_towers_data, mobile_towers_data, paging_towers_data, radio_towers_data, tv_contour_data]
# Column-name fragments to keep. Matching is by substring, so 'ID' also selects
# columns such as 'OBJECTID' or 'USCB_GEOID'.
words_to_search = ['OBJECTID','UniqSysID', 'USCB_COUNTYFP', 'ID', 'Licensee', 'CALLSIGN', 'Callsign','LocCounty', 'geometry']

def filter_and_spatial_join(data_list, filtered_ca_boundaries, words_to_search, ca_tract_county, df_names=None):
    """
    Count unique communication structures per county for each layer and attach
    those counts to the tract/county lookup table.

    For every GeoDataFrame in `data_list` the function:
      1. keeps the columns whose name contains any entry of `words_to_search`
         (substring match),
      2. reprojects with crs=4269 and lower-cases all string (object) columns,
      3. right-joins the layer onto `filtered_ca_boundaries` with the 'within'
         predicate,
      4. counts the distinct non-null identifiers per county,
      5. left-merges the counts onto `ca_tract_county`.

    Parameters:
    - data_list (list): GeoDataFrames to process.
    - filtered_ca_boundaries (GeoDataFrame): tract polygons used as the right
      side of the spatial join.
    - words_to_search (list of str): column-name fragments to keep; must select
      the geometry column for the reprojection and join to work.
    - ca_tract_county (DataFrame): lookup table; must contain the county key
      used for the merge ('county' or 'countyfp').
    - df_names (list of str, optional): one label per entry of `data_list`,
      used to name the count column and the result. Defaults to the six HIFLD
      layer names, paired positionally with `data_list`.

    Returns:
    - dict mapping 'county_count_<name>' to the merged DataFrame for that layer.

    Raises:
    - ValueError: if `df_names` is given with a different length than
      `data_list`, or if a layer has no usable identifier or county column.

    Side effects:
    - Prints each result name and also binds each result as a global variable
      of the same name (later notebook cells rely on those globals; prefer the
      returned dict in new code).
    """
    if df_names is None:
        df_names = ['cellular_towers_data', 'microwave_towers_data', 'mobile_towers_data', 'paging_towers_data', 'radio_towers_data', 'tv_contour_data']
    elif len(df_names) != len(data_list):
        raise ValueError(
            f"df_names has {len(df_names)} entries but data_list has {len(data_list)}"
        )

    # Identifier columns in order of preference
    id_candidates = ('UniqSysID', 'Callsign', 'CALLSIGN', 'ID')
    # (source column, merge key in ca_tract_county) in order of preference
    county_candidates = (('LocCounty', 'county'), ('USCB_COUNTYFP', 'countyfp'))
    # Columns retained after the spatial join
    necessary_columns = ['Callsign', 'CALLSIGN', 'ID', 'UniqSysID', 'LocCounty', 'USCB_COUNTYFP']

    county_count_dfs = {}

    for df, df_name in zip(data_list, df_names):
        # Filter columns based on words_to_search (substring match)
        kept_columns = [col for col in df.columns if any(word in col for word in words_to_search)]
        filtered_df = df[kept_columns].copy()
        filtered_df = filtered_df.to_crs(crs=4269)

        # Convert all string columns to lowercase
        for col in filtered_df.select_dtypes(include=['object']).columns:
            filtered_df[col] = filtered_df[col].str.lower()

        # Perform the spatial join; how='right' keeps every boundary row
        joined_df = gpd.sjoin(filtered_df, filtered_ca_boundaries, how='right', predicate='within')

        # Keep only the identifier / county columns that exist for this layer
        joined_df = joined_df[[col for col in necessary_columns if col in joined_df.columns]].copy()

        # Pick the first identifier column available
        id_column = next((col for col in id_candidates if col in joined_df.columns), None)
        if id_column is None:
            raise ValueError(
                f"None of 'UniqSysID', 'Callsign', 'CALLSIGN', 'ID' found in the DataFrame for {df_name}"
            )

        # Pick the county column and rename it to the matching merge key
        for source_col, county_id in county_candidates:
            if source_col in joined_df.columns:
                joined_df = joined_df.rename(columns={source_col: county_id})
                break
        else:
            raise ValueError(f"Neither 'LocCounty' nor 'USCB_COUNTYFP' found in the DataFrame for {df_name}")

        # Distinct non-null identifiers per county. This equals dropping
        # duplicate (county, id) rows and counting the non-null ids.
        county_power_counts = (
            joined_df.groupby(county_id)[id_column]
            .nunique()
            .reset_index(name=f"{df_name}_count")
        )

        # Merge with ca_tract_county so every tract row carries its county's count
        merged_df = pd.merge(ca_tract_county, county_power_counts, on=county_id, how='left')

        county_df_name = f"county_count_{df_name}"
        county_count_dfs[county_df_name] = merged_df

        # Also expose the result as a global variable (used by later cells)
        globals()[county_df_name] = merged_df

        print(county_df_name)

    return county_count_dfs

# Build the per-county counts for every layer; results are returned as a dict
# and also bound as globals named 'county_count_<layer>'
county_count_dfs = filter_and_spatial_join(communication_infrastructure_data, filtered_ca_boundaries, words_to_search, ca_tract_county)


county_count_cellular_towers_data
county_count_microwave_towers_data
county_count_mobile_towers_data
county_count_paging_towers_data
county_count_radio_towers_data
county_count_tv_contour_data


In [42]:
# Global created by filter_and_spatial_join; the same frame is available as
# county_count_dfs['county_count_cellular_towers_data']
county_count_cellular_towers_data



Unnamed: 0,tract,countyfp,county,cellular_towers_data_count
0,06085504321,085,santa clara,8
1,06085504410,085,santa clara,8
2,06085507003,085,santa clara,8
3,06085507004,085,santa clara,8
4,06085502204,085,santa clara,8
...,...,...,...,...
9124,06059001303,059,orange,2
9125,06059001304,059,orange,2
9126,06059001401,059,orange,2
9127,06013367200,013,contra costa,4


In [32]:
# Global created by filter_and_spatial_join; the same frame is available as
# county_count_dfs['county_count_paging_towers_data']
county_count_paging_towers_data.head()

Unnamed: 0,tract,countyfp,county,paging_towers_data_count
0,6085504321,85,santa clara,11.0
1,6085504410,85,santa clara,11.0
2,6085507003,85,santa clara,11.0
3,6085507004,85,santa clara,11.0
4,6085502204,85,santa clara,11.0
