## Cal-CRAI Metric Calculation
Domain: Built Environment \
Indicator: Communication Infrastructure

This notebook calculates 7 metrics, the first six sourced from Homeland Infrastructure Foundation-Level Data:
* Metric 1: Number of cell towers per county
* Metric 2: Number of radio towers per county
* Metric 3: Number of microwave towers per county
* Metric 4: Number of paging towers per county
* Metric 5: Number of mobile towers per county
* Metric 6: Number of tv broadcast providers per county

The last metric is sourced from the California Public Utilities Commission:
* Metric 7: whether a census tract has low internet download speeds (<25 Mbps)

In [1]:
import pandas as pd
import os
import sys
import boto3
import io
import geopandas as gpd
import numpy as np

# suppress pandas purely educational warnings
from warnings import simplefilter
simplefilter(action="ignore", category=pd.errors.PerformanceWarning)

sys.path.append(os.path.expanduser('../../'))
from scripts.utils.file_helpers import pull_csv_from_directory, upload_csv_aws, data_stats_check, pull_gpkg_from_directory,county_count
from scripts.utils.write_metadata import append_metadata

## Pull metric data from the Homeland Infrastructure Foundation-Level Data

In [None]:
# pull the reprojected HIFLD GeoPackage (.gpkg) files from AWS S3
bucket_name = 'ca-climate-index'
aws_dir = '2b_reproject/built_environment/communication_infrastructure/homeland_infrastructure_foundation_level_data/'

# NOTE(review): presumably downloads every .gpkg under aws_dir into the working directory
# (the next cell reads them by bare filename) -- confirm in scripts/utils/file_helpers
pull_gpkg_from_directory(bucket_name, aws_dir)

In [None]:
# Load each HIFLD layer from the local .gpkg files into its own GeoDataFrame.
# 'complete' is printed after every read as a simple progress marker.
cellular_towers_data = gpd.read_file('built_hifld_cellular_towers.gpkg')
print('complete')
microwave_towers_data = gpd.read_file('built_hifld_microwave_towers.gpkg')
print('complete')
mobile_towers_data = gpd.read_file('built_hifld_mobile_towers.gpkg')
print('complete')
paging_towers_data = gpd.read_file('built_hifld_paging_towers.gpkg')
print('complete')
radio_towers_data = gpd.read_file('built_hifld_radio_towers.gpkg')
print('complete')
tv_contour_data = gpd.read_file('built_hifld_tv_contour.gpkg')
print('complete')

In [None]:
# preview the microwave towers layer
microwave_towers_data

In [None]:
# list the columns available in the TV contour layer
tv_contour_data.columns

### Call function to take a look at the dataframes and check whether they have duplicates

In [None]:
# spot-check one county: 'Callsign' entries where 'LocCounty' is SANTA CLARA
# NOTE(review): county_count is a project helper; presumably it reports counts/duplicates -- confirm in file_helpers
county_count(microwave_towers_data, 'LocCounty', 'SANTA CLARA', 'Callsign')

In [None]:
# Load the 2021 TIGER census tract boundaries for California
census_shp_dir = "s3://ca-climate-index/0_map_data/2021_tiger_census_tract/2021_ca_tract/"
ca_boundaries = gpd.read_file(census_shp_dir)

# Keep just the tract identifier and its geometry; GEOID becomes 'tract'
filtered_ca_boundaries = ca_boundaries[['GEOID', 'geometry']].rename(columns={'GEOID': 'tract'})

# Drop the first character of every GEOID, then reproject to CRS 4269
filtered_ca_boundaries['tract'] = filtered_ca_boundaries['tract'].str[1:]
filtered_ca_boundaries = filtered_ca_boundaries.to_crs(crs=4269)

# Display the prepared tract boundaries
filtered_ca_boundaries

In [None]:
# Load the tract/county table (a CSV read through geopandas, which adds an empty geometry column)
tract_county_path = "s3://ca-climate-index/0_map_data/ca_tracts_county.csv"
ca_tract_county = gpd.read_file(tract_county_path).drop(columns=['field_1', 'geometry'])

# Lowercase the headers and every string value so later merges are case-insensitive
ca_tract_county.columns = ca_tract_county.columns.str.lower()
ca_tract_county = ca_tract_county.applymap(lambda s: s.lower() if type(s) == str else s)

ca_tract_county

In the following cells, look at columns from each dataset and identify which column is the identifier (noted by comment value)

In [None]:
cellular_towers_data.columns  # identifier column: UniqSysID

In [None]:
microwave_towers_data.columns  # identifier column: Callsign

In [None]:
mobile_towers_data.columns  # identifier column: UniqSysID

In [None]:
paging_towers_data.columns  # identifier column: UniqSysID

In [None]:
radio_towers_data.columns  # identifier column: CALLSIGN

### Function to process all dataframes from HIFLD
* specific keywords were identified for each dataset and the function iterates to find the correct column per dataframe
* dataframes are then spatially joined to CA tract boundary data
* duplicate rows are dropped per county and metric identifier column
* columns are renamed for consistency
* resulting dataframes are saved per their initial dataframe name

In [None]:
# HIFLD layers to process; the order must match the layer names used inside filter_and_spatial_join
communication_infrastructure_data = [cellular_towers_data, microwave_towers_data, mobile_towers_data, paging_towers_data, radio_towers_data, tv_contour_data]
# Keywords matched as case-sensitive substrings of column names (so 'ID' also matches e.g. 'OBJECTID' and 'UniqSysID')
words_to_search = ['OBJECTID','UniqSysID', 'USCB_COUNTYFP', 'ID', 'Licensee', 'CALLSIGN', 'Callsign','LocCounty', 'geometry']

def filter_and_spatial_join(data_list, filtered_ca_boundaries, words_to_search, ca_tract_county, df_names=None):
    """
    Count unique HIFLD communication structures per county and attach the counts to every tract.

    For each layer in `data_list` the function:
      1. keeps only the columns whose name contains one of `words_to_search`,
      2. reprojects to CRS 4269 and lowercases every string (object) column,
      3. spatially joins the features to the tract polygons (how='right', predicate='within'),
      4. drops duplicate (county, identifier) rows and counts non-null identifiers per county,
      5. left-merges those per-county counts onto `ca_tract_county`.

    Parameters
    ----------
    data_list : list of GeoDataFrame
        HIFLD layers to process.
    filtered_ca_boundaries : GeoDataFrame
        Census tract polygons used as the right-hand side of the spatial join.
    words_to_search : list of str
        Substrings used to select columns from each layer (case-sensitive).
    ca_tract_county : DataFrame
        Table the per-county counts are merged onto; it must contain the county key
        chosen for the layer ('county' or 'countyfp').
    df_names : list of str, optional
        One name per entry of `data_list`, used to label the count column
        (f"{name}_count") and the result (f"county_count_{name}"). Defaults to the
        six HIFLD layer names this notebook uses, in their original order.

    Returns
    -------
    dict
        Maps f"county_count_{name}" to the merged DataFrame for that layer. Counties
        with no matching features get NaN in the count column (left merge).

    Raises
    ------
    ValueError
        If `df_names` is given with a different length than `data_list`, or if a layer
        has no recognised identifier column or no recognised county column.

    Notes
    -----
    Each result is also written to a global variable named f"county_count_{name}",
    because later notebook cells reference those names directly.
    """
    if df_names is None:
        # Original hard-coded layer names, kept as the default for backward compatibility
        df_names = ['cellular_towers_data', 'microwave_towers_data', 'mobile_towers_data',
                    'paging_towers_data', 'radio_towers_data', 'tv_contour_data']
    elif len(df_names) != len(data_list):
        raise ValueError(f"df_names has {len(df_names)} entries but data_list has {len(data_list)}")

    # Identifier columns in priority order: the first one present in a layer is used
    id_candidates = ['UniqSysID', 'Callsign', 'CALLSIGN', 'ID']
    necessary_columns = ['Callsign', 'CALLSIGN', 'ID', 'UniqSysID', 'LocCounty', 'USCB_COUNTYFP']

    county_count_dfs = {}

    for df, df_name in zip(data_list, df_names):
        # Filter columns based on words_to_search (substring match), then reproject
        keep_cols = [col for col in df.columns if any(word in col for word in words_to_search)]
        filtered_df = df[keep_cols].to_crs(crs=4269)

        # Convert all string columns to lowercase so county names match ca_tract_county
        for col in filtered_df.select_dtypes(include=['object']).columns:
            filtered_df[col] = filtered_df[col].str.lower()

        # Right join keeps every tract, including those with no feature inside them
        joined_df = gpd.sjoin(filtered_df, filtered_ca_boundaries, how='right', predicate='within')

        # Ensure necessary columns are retained
        joined_df = joined_df[[col for col in necessary_columns if col in joined_df.columns]].copy()

        # Pick the identifier column by priority
        id_column = next((col for col in id_candidates if col in joined_df.columns), None)
        if id_column is None:
            raise ValueError(f"None of {id_candidates} found in the DataFrame for {df_name}")

        # Determine county_id
        if 'LocCounty' in joined_df.columns:
            joined_df = joined_df.rename(columns={'LocCounty': 'county'})
            county_id = 'county'
        elif 'USCB_COUNTYFP' in joined_df.columns:
            joined_df = joined_df.rename(columns={'USCB_COUNTYFP': 'countyfp'})
            county_id = 'countyfp'
        else:
            raise ValueError(f"Neither 'LocCounty' nor 'USCB_COUNTYFP' found in the DataFrame for {df_name}")

        # Remove duplicates based on county and the chosen ID column
        unique_communication_structures_county = joined_df.drop_duplicates(subset=[county_id, id_column])

        # count() tallies non-null identifiers per county
        county_power_counts = (
            unique_communication_structures_county
            .groupby(county_id)[id_column]
            .count()
            .reset_index(name=f"{df_name}_count")
        )

        # Merge with ca_tract_county so every tract carries its county's count
        merged_df = pd.merge(ca_tract_county, county_power_counts, on=county_id, how='left')

        county_df_name = f"county_count_{df_name}"
        county_count_dfs[county_df_name] = merged_df

        # Later cells use these names directly, so the global binding is kept;
        # prefer the returned dict in new code.
        globals()[county_df_name] = merged_df

        print(county_df_name)

    return county_count_dfs

# Run the pipeline for all six layers; besides returning the dict, the function also
# creates a county_count_<layer name> global for each layer (used by the cells below)
county_count_dfs = filter_and_spatial_join(communication_infrastructure_data, filtered_ca_boundaries, words_to_search, ca_tract_county)

### Call function to identify counts per metric per county

In [None]:
# Summarise each per-county count column; column names follow the "<layer name>_count"
# pattern set in filter_and_spatial_join
data_stats_check(county_count_cellular_towers_data, 'cellular_towers_data_count')
data_stats_check(county_count_microwave_towers_data, 'microwave_towers_data_count')
data_stats_check(county_count_mobile_towers_data, 'mobile_towers_data_count')
data_stats_check(county_count_paging_towers_data, 'paging_towers_data_count')
data_stats_check(county_count_radio_towers_data, 'radio_towers_data_count')
data_stats_check(county_count_tv_contour_data, 'tv_contour_data_count')

### Take a look at LA and its microwave counts to ensure all tracts within a county have the same values

In [None]:
# county names were lowercased earlier, hence 'los angeles'
county_count_microwave_towers_data.loc[county_count_microwave_towers_data.county=='los angeles']

### Save df's as csv's for function call below

In [17]:
# Output filename for each HIFLD layer (note the TV layer is written as 'tv_contours')
metric_csv_by_layer = {
    'cellular_towers_data': 'built_cellular_towers_metric.csv',
    'microwave_towers_data': 'built_microwave_towers_metric.csv',
    'mobile_towers_data': 'built_mobile_towers_metric.csv',
    'paging_towers_data': 'built_paging_towers_metric.csv',
    'radio_towers_data': 'built_radio_towers_metric.csv',
    'tv_contour_data': 'built_tv_contours_metric.csv',
}

# county_count_dfs holds the same frames as the county_count_<layer> globals
for layer, csv_name in metric_csv_by_layer.items():
    county_count_dfs[f'county_count_{layer}'].to_csv(csv_name, index=False)

### Metric calculation for broadband internet metric

In [None]:
# pull the reprojected CPUC broadband GeoPackage (.gpkg) file(s) from AWS S3
bucket_name = 'ca-climate-index'
aws_dir = '2b_reproject/built_environment/communication_infrastructure/ca_public_utilities_commission/'

# NOTE(review): presumably downloads every .gpkg under aws_dir into the working directory -- confirm in file_helpers
pull_gpkg_from_directory(bucket_name, aws_dir)

In [19]:
# Load the CPUC broadband layer and reproject it to CRS 4269 in one step
broadband_internet_data = gpd.read_file('built_cpuc_internet.gpkg').to_crs(crs=4269)

In [None]:
# list the available columns before choosing which to keep
broadband_internet_data.columns

In [None]:
# quick visual check of the broadband geometries
broadband_internet_data.plot()

### Select desired columns, rename and adjust them

In [None]:
# Tract id, download speed, and geometry are the only fields the metric needs
columns_keep = ['USCB_GEOID', 'MaxAdDn', 'geometry']

# Select, de-duplicate on (tract id, speed), and rename the id column in one chain
filtered_broadband_internet = (
    broadband_internet_data[columns_keep]
    .drop_duplicates(subset=['USCB_GEOID', 'MaxAdDn'])
    .rename(columns={'USCB_GEOID': 'tract'})
)

# Strip leading zeros from the tract ids
filtered_broadband_internet['tract'] = filtered_broadband_internet['tract'].str.lstrip('0')

filtered_broadband_internet

### Take a look at a specified tract and see its broadband download entries

In [None]:
# all broadband entries recorded for one example tract
filtered_rows = filtered_broadband_internet[filtered_broadband_internet['tract'] == '6059001304']
filtered_rows

### Filter for download speeds below 25 Mbps, per the Cal-CRAI metric definition

In [None]:
# keep only entries whose MaxAdDn is below the metric's threshold of 25
low_broadband_download = filtered_broadband_internet[filtered_broadband_internet['MaxAdDn'] < 25]
low_broadband_download

In [25]:
#low_broadband_download.plot()

### Spatially join the low broadband internet data with our California census tract data
* add a flag for any tract that contains <25 download speeds

In [None]:
# Drop the broadband layer's own 'tract' column so the join result has a single 'tract'
# column, the one from the census boundaries. The result goes into a NEW variable:
# overwriting low_broadband_download made this cell fail with a KeyError on re-run.
low_broadband_geometries = low_broadband_download.drop(columns='tract')

# Right join keeps every census tract; tracts with no low-speed area inside them get NaN MaxAdDn
joined_df = gpd.sjoin(low_broadband_geometries, filtered_ca_boundaries, how='right', predicate='within')

# One row per tract (the first match is kept)
joined_df = joined_df.drop_duplicates(subset='tract')

# Flag = 1 when the tract matched at least one low-speed entry, else 0
joined_df['low_internet_download_flag'] = np.where(joined_df['MaxAdDn'].notna(), 1, 0)
joined_df = joined_df.drop(columns=['index_left', 'geometry'])

joined_df

### Cleanup the final dataframe to hold only relevant columns

In [None]:
# Columns to keep in the final table, in output order
new_column_order = ['tract', 'MaxAdDn', 'low_internet_download_flag']

# Select those columns (all rows) into the final metric frame
low_internet_merged = joined_df.loc[:, new_column_order]
low_internet_merged

### Take a look at specified tracts to cross reference with the original broadband data

In [None]:
# same example tract as inspected in the original broadband data above
filtered_rows = joined_df[joined_df['tract'] == '6059001304']
filtered_rows

In [29]:
# write the broadband metric to a local CSV for the upload function below
low_internet_merged.to_csv('built_broadband_internet_metric.csv', index=False)

### Function Call

In [30]:
@append_metadata
def communication_infrastructure_upload(input_csv, export=False, varname=''):
    '''
    Uploads prepared metric calculations within the communication infrastructure to the S3 bucket. The metrics are:
    - low broadband speeds of <25 Mbps download
    - # of tv contour towers per county
    - # of microwave towers per county
    - # of radio towers per county
    - # of cellular towers per county
    - # of paging towers per county
    - # of mobile towers per county

    Data for this metric was sourced from Homeland Infrastructure Foundation-Level Data at:
    https://hifld-geoplatform.opendata.arcgis.com/ 
    
    and from the California Public Utilities Commission at https://www.cpuc.ca.gov/industries-and-topics/internet-and-phone/california-advanced-services-fund/project-development-resources---data-and-maps.

    Methods
    -------
    All data were reprojected to Cal-CRAI's standardized coordinate reference system (CRS) 4269.
    Data was then spatially joined to California 2021 Tiger census tract data.
    Columns relevant per metric were maintained and summed to calculate total towers per county.
    For the broadband speed metric, a flag was used to indicate tracts with low internet speeds.
    Data was then merged back to 2021 census tract data so each CA tract has a metric value.
    
    Parameters
    ----------
    input_csv: string
        csv communication metric data 
    export: True/False boolean
        False = will not upload resulting df containing CAL CRAI communication metric to AWS
        True = will upload resulting df containing CAL CRAI communication facility metric to AWS
    varname: string
        Not used in the function body; presumably consumed by the append_metadata
        decorator to identify the variable being documented (TODO confirm).

    Script
    ------
    built_communication_infrastructure.ipynb

    Note:
    This function assumes users have configured the AWS CLI such that their access key / secret key pair are stored in ~/.aws/credentials.
    See https://docs.aws.amazon.com/cli/latest/userguide/getting-started-install.html for guidance.
    '''
    print('Data transformation: data cleaned by isolating and renaming relevant columns.')
    print('Data transformation: data was spatially joined to California census tract and county data.')
    print('Data transformation: an additional column was calculated by summing or flagging a specified column per county.')
 
    if export:
        bucket_name = 'ca-climate-index'
        directory = '3_fair_data/index_data'
        export_filename = [input_csv]
        upload_csv_aws(export_filename, bucket_name, directory)
    else:
        # Previously this branch reported the file as uploaded even though nothing was sent
        print(f'{input_csv} not uploaded to AWS (export=False).')
 
    # The local CSV is removed in both cases
    if os.path.exists(input_csv):
        os.remove(input_csv)

In [31]:
# Metric CSVs written earlier in this notebook
input_csv = [
            'built_cellular_towers_metric.csv',
            'built_radio_towers_metric.csv',
            'built_mobile_towers_metric.csv',
            'built_paging_towers_metric.csv',
            'built_tv_contours_metric.csv',
            'built_microwave_towers_metric.csv',
            'built_broadband_internet_metric.csv'
]

# Variable name for each CSV, listed in the same order as input_csv
varnames = [
            'built_hifld_cellular_towers',
            'built_hifld_radio_towers',
            'built_hifld_mobile_towers',
            'built_hifld_paging_towers',
            'built_hifld_tv_contour',
            'built_hifld_microwave_towers',
            'built_cpuc_internet'
]

# zip() would silently drop entries if the two lists ever got out of sync
assert len(input_csv) == len(varnames), 'input_csv and varnames must be the same length'

# Pass each file's own variable name; the loop previously zipped varnames but always
# sent the placeholder 'test'.
# NOTE(review): if 'test' was a deliberate dry-run value for the metadata decorator, revert this.
for csv_file, var in zip(input_csv, varnames):
    communication_infrastructure_upload(csv_file, export=True, varname=var)