## Cal-CRAI metric calculation: built environment communication infrastructure
* number of cell towers
* number of radio towers
* number of microwave towers
* number of paging towers
* number of broadcast towers
* number of broadcast providers

In [None]:
import pandas as pd
import os
import sys
import boto3
import io
import geopandas as gpd

# suppress pandas purely educational warnings
from warnings import simplefilter
simplefilter(action="ignore", category=pd.errors.PerformanceWarning)

sys.path.append(os.path.expanduser('../../'))
from scripts.utils.file_helpers import pull_csv_from_directory, upload_csv_aws#, pull_gpkg_from_directory
from scripts.utils.write_metadata import append_metadata

In [None]:
def pull_gpkg_from_directory(bucket_name, directory):
    """
    Downloads every GeoPackage (.gpkg) file found under a directory in an S3 bucket
    and saves it to the current working directory under its base filename.

    An existing local file with the same name is overwritten.

    Parameters:
    - bucket_name (str): The name of the S3 bucket.
    - directory (str): The directory (key prefix) within the bucket to search for GeoPackage files.

    Returns:
    - None. Progress is reported with print statements.
    """
    # Create an S3 client
    s3 = boto3.client('s3')

    # A single list_objects_v2 call returns at most 1,000 keys, so paginate to
    # avoid silently missing files in large directories
    paginator = s3.get_paginator('list_objects_v2')

    found_objects = False
    for page in paginator.paginate(Bucket=bucket_name, Prefix=directory):
        # Pages with no matching keys have no 'Contents' entry
        for obj in page.get('Contents', []):
            found_objects = True

            # Get the key (full path within the bucket) of the object
            key = obj['Key']

            # Skip anything that is not a .gpkg file
            if not key.endswith('.gpkg'):
                continue

            # Stream the file straight to disk instead of buffering it in memory
            gpkg_filename = os.path.basename(key)
            s3.download_file(bucket_name, key, gpkg_filename)

            print(f"Saved GeoPackage as '{gpkg_filename}' locally")
            # You can now use the saved file for further processing

    if not found_objects:
        print("No objects found in the specified directory.")


In [None]:
# pull the communication infrastructure GeoPackage (.gpkg) files from AWS S3
# into the current working directory
bucket_name = 'ca-climate-index'
aws_dir = '2b_reproject/built_environment/communication_infrastructure/homeland_infrastructure_foundation_level_data/'

pull_gpkg_from_directory(bucket_name, aws_dir)

In [None]:
def _load_tower_layer(gpkg_path):
    """Read one local GeoPackage into a GeoDataFrame and print 'complete' once it is loaded."""
    layer = gpd.read_file(gpkg_path)
    print('complete')
    return layer


# Load each tower dataset from the GeoPackage files in the working directory
cellular_towers_data = _load_tower_layer('built_hifld_cellular_towers.gpkg')
microwave_towers_data = _load_tower_layer('built_hifld_microwave_towers.gpkg')
mobile_towers_data = _load_tower_layer('built_hifld_mobile_towers.gpkg')
paging_towers_data = _load_tower_layer('built_hifld_paging_towers.gpkg')
radio_towers_data = _load_tower_layer('built_hifld_radio_towers.gpkg')

In [None]:
# List the columns available in the microwave towers dataset
microwave_towers_data.columns

In [None]:
def county_count(df, county_col, county, counter):
    """
    Prints the number of rows for one county before and after dropping duplicates.

    Parameters:
    - df (DataFrame): The data to inspect.
    - county_col (str): Name of the column holding the county value.
    - county (str): The county value to isolate (matched exactly).
    - counter (str): Name of the column that, together with county_col,
      identifies a duplicate row.
    """
    # Rows belonging to the requested county
    in_county = df.loc[df[county_col] == county]

    # Same rows with repeated (county, counter) pairs collapsed to one
    deduplicated = in_county.drop_duplicates(subset=[county_col, counter])

    n_all = len(in_county)
    n_unique = len(deduplicated)
    print(f'Length of df for {county} county without dropping duplicates:  {n_all}')
    print(f'Length of df for {county} county after dropping duplicates: {n_unique}')

# Compare microwave tower row counts for Santa Clara county before and after
# de-duplicating on the 'Callsign' column
county_count(microwave_towers_data, 'LocCounty', 'SANTA CLARA', 'Callsign')

In [None]:
# Isolate the paging tower records whose county is Fresno
fresno_mask = paging_towers_data['LocCounty'].eq('FRESNO')
Fresno = paging_towers_data.loc[fresno_mask]
#Fresno = Fresno[Fresno['cellular_towers_data_count'].notna()]
#pd.set_option('display.max_rows', None)  # None means display all rows

# Keep one row per (county, system ID) pair
iso_fresno = Fresno.drop_duplicates(subset=['LocCounty', 'UniqSysID'])

iso_fresno

The TV contour dataset is a bigger file, so it will be read in later.

In [None]:
# tv_contour_data = gpd.read_file('built_hifld_tv_contour.gpkg')

In [None]:
# Read in the CA census TIGER tract file
census_shp_dir = "s3://ca-climate-index/0_map_data/2021_tiger_census_tract/2021_ca_tract/"
ca_boundaries = gpd.read_file(census_shp_dir)

# Keep the tract identifier and geometry, rename GEOID to 'tract', drop the
# first character of each tract ID, and reproject to CRS 4269
filtered_ca_boundaries = (
    ca_boundaries[['GEOID', 'geometry']]
    .rename(columns={'GEOID': 'tract'})
    .assign(tract=lambda gdf: gdf['tract'].str[1:])
    .to_crs(crs=4269)
)

# Output the modified GeoDataFrame
filtered_ca_boundaries

In [None]:
# Tower datasets to process, in the order expected by filter_and_spatial_join
communication_infrastructure_data = [
    cellular_towers_data,
    microwave_towers_data,
    mobile_towers_data,
    paging_towers_data,
    radio_towers_data,
]

# Column-name fragments used to decide which columns to keep from each dataset
words_to_search = [
    'OBJECTID',
    'UniqSysID',
    'USCB_COUNTYFP',
    'Licensee',
    'CALLSIGN',
    'LocCounty',
    'CITY',
    'AllStruc',
    'StrucType',
    'LicStatus',
    'STATUS',
    'FREQUENCY',
    'geometry',
]

def filter_and_spatial_join(data_list, filtered_ca_boundaries, words_to_search, ca_tract_county, df_names=None):
    """
    Counts unique communication structures per county for each tower dataset.

    For every GeoDataFrame in data_list the function:
    1. keeps only the columns whose name contains one of words_to_search,
    2. reprojects to CRS 4269 and lowercases every string (object) column,
    3. spatially joins the records to filtered_ca_boundaries
       (how='right', predicate='within'),
    4. drops duplicate (county, ID) pairs and counts the non-null IDs per county,
    5. left-merges those counts onto ca_tract_county.

    Parameters:
    - data_list (iterable of GeoDataFrame): The tower datasets to process.
    - filtered_ca_boundaries (GeoDataFrame): Boundaries used as the right-hand
      side of the spatial join.
    - words_to_search (list of str): Substrings used to select columns to keep.
    - ca_tract_county (DataFrame): Table the counts are merged onto; it must
      contain the county column chosen for the dataset ('county' when the
      dataset has 'LocCounty', otherwise 'countyfp').
    - df_names (list of str, optional): Names paired positionally with
      data_list. Defaults to the five tower dataset names used in this notebook.

    Returns:
    - dict: Maps 'county_count_<name>' to the merged DataFrame for that dataset.

    Raises:
    - ValueError: If more datasets than names are supplied, or a dataset has
      neither an ID column ('UniqSysID'/'OBJECTID') nor a county column
      ('LocCounty'/'USCB_COUNTYFP').

    Side effects:
    - Each merged DataFrame is also assigned to a global variable named
      'county_count_<name>', and that name is printed.
    """
    if df_names is None:
        df_names = ['cellular_towers_data', 'microwave_towers_data', 'mobile_towers_data',
                    'paging_towers_data', 'radio_towers_data']

    data_list = list(data_list)
    df_names = list(df_names)

    # zip() stops at the shorter input, so extra datasets would otherwise be
    # dropped without any warning
    if len(data_list) > len(df_names):
        raise ValueError(
            f"Received {len(data_list)} datasets but only {len(df_names)} names ({df_names}); "
            "pass `df_names` with one name per dataset"
        )

    necessary_columns = ['tract', 'UniqSysID', 'OBJECTID', 'LocCounty', 'USCB_COUNTYFP']
    county_count_dfs = {}

    for df, df_name in zip(data_list, df_names):
        # Filter columns based on words_to_search
        keep_cols = [col for col in df.columns if any(word in col for word in words_to_search)]
        filtered_df = df[keep_cols].copy()
        filtered_df = filtered_df.to_crs(crs=4269)

        # Convert all string columns to lowercase
        for col in filtered_df.select_dtypes(include=['object']).columns:
            filtered_df[col] = filtered_df[col].str.lower()

        # Perform the spatial join; how='right' keeps every boundary row
        joined_df = gpd.sjoin(filtered_df, filtered_ca_boundaries, how='right', predicate='within')

        # Ensure necessary columns are retained
        joined_df = joined_df[[col for col in necessary_columns if col in joined_df.columns]].copy()

        # Use 'UniqSysID' if it exists, otherwise use 'OBJECTID'
        if 'UniqSysID' in joined_df.columns:
            id_column = 'UniqSysID'
        elif 'OBJECTID' in joined_df.columns:
            id_column = 'OBJECTID'
        else:
            raise ValueError(f"Neither 'UniqSysID' nor 'OBJECTID' found in the DataFrame for {df_name}")

        # Determine county_id
        if 'LocCounty' in joined_df.columns:
            joined_df = joined_df.rename(columns={'LocCounty': 'county'})
            county_id = 'county'
        elif 'USCB_COUNTYFP' in joined_df.columns:
            joined_df = joined_df.rename(columns={'USCB_COUNTYFP': 'countyfp'})
            county_id = 'countyfp'
        else:
            raise ValueError(f"Neither 'LocCounty' nor 'USCB_COUNTYFP' found in the DataFrame for {df_name}")

        # Remove duplicates based on county and the chosen ID column
        unique_communication_structures_county = joined_df.drop_duplicates(subset=[county_id, id_column])

        # count() tallies non-null IDs per county
        county_counts = (
            unique_communication_structures_county
            .groupby(county_id)[id_column]
            .count()
            .reset_index(name=f"{df_name}_count")
        )

        # Merge with ca_tract_county
        merged_df = pd.merge(ca_tract_county, county_counts, on=county_id, how='left')

        county_df_name = f"county_count_{df_name}"
        county_count_dfs[county_df_name] = merged_df

        # Dynamically create global variables; later cells refer to these names
        globals()[county_df_name] = merged_df

        print(county_df_name)

    return county_count_dfs

# Build the per-county count table for every tower dataset
# NOTE(review): `ca_tract_county` is not defined in any of the cells shown here —
# confirm it is loaded before this call, otherwise this raises a NameError
county_count_dfs = filter_and_spatial_join(communication_infrastructure_data, filtered_ca_boundaries, words_to_search, ca_tract_county)


In [None]:
# Preview the first rows of the microwave tower county count table
county_count_microwave_towers_data.head()

In [None]:
# County count tables for each tower dataset.
# Note: this rebinds `communication_infrastructure_data`, which previously held
# the raw tower datasets.
communication_infrastructure_data = [
    county_count_cellular_towers_data,
    county_count_microwave_towers_data,
    county_count_mobile_towers_data,
    county_count_paging_towers_data,
    county_count_radio_towers_data,
]

def merge_dfs(data_list, filtered_ca_boundaries, words_to_search):
    """
    Filters the columns of each entry in data_list and reprojects it to CRS 4269.

    NOTE(review): this function looks unfinished — `county_count_merged_dfs` is
    created but never filled, `filtered_ca_boundaries` and `df_name` are unused,
    and nothing is returned, so the filtered frames are discarded.

    Parameters:
    - data_list (list): Frames to process; each must support `.to_crs`
      (presumably GeoDataFrames — confirm against the caller).
    - filtered_ca_boundaries: Currently unused.
    - words_to_search (list of str): Substrings used to select columns to keep.
    """
    county_count_merged_dfs = {}
    
    for df, df_name in zip(data_list, ['cellular_towers_data', 'microwave_towers_data', 'mobile_towers_data', 'paging_towers_data', 'radio_towers_data']):        
        # Filter columns based on words_to_search
        filtered_df = df[[col for col in df.columns if any(word in col for word in words_to_search)]].copy()
        # Re-assigning the index is a no-op here: the column selection above keeps df's index
        filtered_df.index = df.index
        filtered_df = filtered_df.to_crs(crs=4269)

In [None]:
# Left-join the paging tower county counts onto ca_tract_county, then report
# the row count and preview the result
merge = pd.merge(
    left=ca_tract_county,
    right=county_count_paging_towers_data,
    how='left',
    on='county',
)
print(merge.shape[0])
merge.head()

In [None]:
# Display the paging tower county count table
county_count_paging_towers_data