## Cal-CRAI Metric Calculation
Domain: Built Environment \
Indicator: Transportation Infrastructure
* Metric 1: number of airports per county
* Metric 2: number of bridges per county
* Metric 3: number of road bottlenecks per county
* Metric 4: number of miles of highway per county
* Metric 5: number of miles of freight rails per county

In [1]:
import pandas as pd
import os
import sys
import boto3
import io
import geopandas as gpd

# suppress pandas purely educational warnings
from warnings import simplefilter
simplefilter(action="ignore", category=pd.errors.PerformanceWarning)

sys.path.append(os.path.expanduser('../../'))
from scripts.utils.file_helpers import pull_gpkg_from_directory, upload_csv_aws, filter_counties
from scripts.utils.write_metadata import append_metadata

In [None]:
# Load the 2021 California census tract TIGER shapefile from S3
census_shp_dir = "s3://ca-climate-index/0_map_data/2021_tiger_census_tract/2021_ca_tract/"
ca_boundaries = gpd.read_file(census_shp_dir)

# Keep only the tract identifier and geometry, exposing GEOID under the name 'tract'
filtered_ca_boundaries = ca_boundaries[['GEOID', 'geometry']].rename(columns={'GEOID': 'tract'})

# Strip the first character of each tract id
filtered_ca_boundaries = filtered_ca_boundaries.assign(
    tract=filtered_ca_boundaries['tract'].str[1:]
)

# Show the trimmed boundary table
filtered_ca_boundaries

In [None]:
# read in the CA tract-to-county crosswalk csv (this is not the TIGER shapefile)
ca_tract_county_path = "s3://ca-climate-index/0_map_data/ca_tracts_county.csv"
ca_tract_county = gpd.read_file(ca_tract_county_path)

# drop the leftover index column and the geometry column, which are not needed for the merges
ca_tract_county = ca_tract_county.drop(columns=['field_1', 'geometry'])
ca_tract_county.columns = ca_tract_county.columns.str.lower()

# lowercase every string value so county names can be matched against the lowercased metric data;
# Series.map is applied per column because DataFrame.applymap is deprecated in recent pandas
ca_tract_county = ca_tract_county.apply(
    lambda column: column.map(lambda value: value.lower() if isinstance(value, str) else value)
)

ca_tract_county

### Function to help fact check results for the first three metrics

In [4]:
def county_count(df, county_col, county, counter_list):
    """
    Print row counts for a single county before and after de-duplication.

    Parameters
    ----------
    df: DataFrame
        table holding the metric data
    county_col: string
        name of the column in df that identifies the county
    county: string
        value of county_col to isolate
    counter_list: list
        additional column names that, together with county_col, define a duplicate row
    """
    rows_for_county = df[df[county_col] == county]
    dedup_keys = [county_col] + counter_list
    total_rows = len(rows_for_county)
    unique_rows = len(rows_for_county.drop_duplicates(subset=dedup_keys))

    print(f'Length of df for {county} county without dropping duplicates: {total_rows}')
    print(f'Length of df for {county} county after dropping duplicates: {unique_rows}')

In [None]:
# pull the reprojected Caltrans transportation GeoPackage (.gpkg) files from AWS
# (presumably downloaded into the working directory, where the next cell reads them -- confirm in file_helpers)
bucket_name = 'ca-climate-index'
aws_dir = '2b_reproject/built_environment/transportation/cdot/'

pull_gpkg_from_directory(bucket_name, aws_dir)

In [6]:
# Read each Caltrans layer into its own GeoDataFrame (files are read in the order listed)
(
    airport_data,
    bridge_data,
    highway_data,
    bottleneck_data,
    freight_rail_data,
) = (
    gpd.read_file(gpkg_name)
    for gpkg_name in (
        'built_caltrans_airports.gpkg',
        'built_caltrans_bridges.gpkg',
        'built_caltrans_highways.gpkg',
        'built_caltrans_road_bottlenecks.gpkg',
        'built_caltrans_rails.gpkg',
    )
)

### Metric 1: Number of Airports per CA County

In [None]:
airport_data.columns

In [None]:
airport_data.head(5)

The airport data has a county column, so we run our `filter_counties` function on it.
* One county name is misspelled; it is fixed in the same cell.

In [None]:
# Run the county filter to find rows whose county name is not recognised
# (presumably filter_counties returns the kept rows and the omitted rows -- confirm in file_helpers)
print('length of airport data before county filter:', len(airport_data))
airport_counties, omitted_airports = filter_counties(airport_data, 'COUNTY', county_list=None)
print('length of airport data after county filter:', len(airport_counties))
print('')

print('Omitted rows:')
omitted_airports = omitted_airports['COUNTY']
display(omitted_airports)
print('')

print('fixing spelling issue so airport data is ready for future steps:')
# NOTE: this is deliberately an alias of airport_data (not a copy); the lowercase step in the
# next cell and the later fact-check call on airport_data depend on both names sharing one object
cleaned_airport_data = airport_data
# Correct the spelling on the full, unfiltered county column. Replacing on the filtered
# `airport_counties` column would leave the misspelled row out of the index-aligned assignment,
# turning its county into NaN instead of 'Humboldt' and dropping that airport from the count.
cleaned_airport_data['COUNTY'] = airport_data['COUNTY'].replace('Humbolt', 'Humboldt')
print('length of cleaned airport data:', len(cleaned_airport_data))

In [None]:
# Convert all string columns to lowercase
# NOTE: cleaned_airport_data was bound to the same object as airport_data in the previous cell,
# so this loop lowercases airport_data as well
str_columns = cleaned_airport_data.select_dtypes(include=['object']).columns
for col in str_columns:
    cleaned_airport_data[col] = cleaned_airport_data[col].str.lower()

# Isolate to relevant columns and drop duplicate rows with same airport and county
# (from here on cleaned_airport_data is a new two-column table, no longer an alias of airport_data)
columns_to_keep = ['AIRPORTID','COUNTY']
cleaned_airport_data = cleaned_airport_data[columns_to_keep]
unique_airports = cleaned_airport_data.drop_duplicates(subset=['COUNTY', 'AIRPORTID'])

# NOTE(review): this displays the table before de-duplication; unique_airports holds the de-duplicated rows
cleaned_airport_data

In [None]:
# Count non-null airport ids per county from the de-duplicated table (unique_airports), so that
# repeated airport/county rows are not counted twice -- consistent with the bridge and bottleneck metrics
county_count_airports = unique_airports.groupby('COUNTY')['AIRPORTID'].count().reset_index()
county_count_airports = county_count_airports.rename(columns={'AIRPORTID':'number_of_airports', 'COUNTY':'county'})
county_count_airports.head()

Merge California tract/county data with airport county counts

In [None]:
airport_count_tract = pd.merge(ca_tract_county, county_count_airports, on='county', how='left')
airport_count_tract

Can fact check the results with our county count function
* use original airport dataframe and airportid column

In [None]:
county_count(airport_data, 'COUNTY', 'los angeles', ['AIRPORTID'])

Save as a csv for future upload to S3

In [14]:
airport_count_tract.to_csv('built_transportation_airports_metric.csv', index=False)

### Metric Number 2: Number of bridges per CA county

In [None]:
bridge_data.columns

In [None]:
# Lowercase every text (object-dtype) column of the bridge layer in place
str_columns = bridge_data.select_dtypes(include=['object']).columns
for col in str_columns:
    lowered = bridge_data[col].str.lower()
    bridge_data[col] = lowered

# Keep the bridge identifier and county FIPS code, then remove repeated bridge/county pairs.
# Both kept columns take part in the comparison, so no explicit subset is required.
columns_to_keep = ['BRIDGE', 'USCB_COUNTYFP']
cleaned_bridge_data = bridge_data[columns_to_keep]
unique_bridges = cleaned_bridge_data.drop_duplicates()

unique_bridges

In [None]:
# Number of non-null bridge ids in each county FIPS code
bridges_per_county = unique_bridges.groupby('USCB_COUNTYFP')['BRIDGE'].count()
county_count_bridges = bridges_per_county.reset_index()

# NOTE(review): the output column is 'numbers_of_bridges' while the airport metric uses
# 'number_of_airports'; kept as-is because downstream files may depend on this name
county_count_bridges = county_count_bridges.rename(
    columns={'BRIDGE': 'numbers_of_bridges', 'USCB_COUNTYFP': 'countyfp'}
)
county_count_bridges.head()

In [None]:
bridge_count_tracts = pd.merge(ca_tract_county, county_count_bridges, on='countyfp', how='left')
bridge_count_tracts

In [None]:
county_count(bridge_data, 'USCB_COUNTYFP', '037', ['BRIDGE'])

In [20]:
bridge_count_tracts.to_csv('built_transportation_bridge_metric.csv', index=False)

### Metric number 3: Number of bottleneck areas per county

In [None]:
bottleneck_data.columns

In [None]:
bottleneck_data.head(5)

Because bottlenecks have no single unique identifier, a few columns are used together when removing duplicates:
* If county, direction of traffic, bottleneck rank, and time of day (shift) are all the same between rows,
only one of those rows is kept.

I also checked the number of duplicates across the columns being kept, and confirmed that those columns contain no missing data.

In [None]:
# Lowercase every text (object-dtype) column of the bottleneck layer in place
str_columns = bottleneck_data.select_dtypes(include=['object']).columns
for col in str_columns:
    lowered = bottleneck_data[col].str.lower()
    bottleneck_data[col] = lowered

# Keep the county code plus the fields describing a bottleneck; 'Name' is retained for counting
# but is not part of the duplicate check
columns_to_keep = ['USCB_COUNTYFP', 'Rank', 'Direction', 'Shift', 'Name']
duplicate_keys = ['USCB_COUNTYFP', 'Rank', 'Direction', 'Shift']
cleaned_bottleneck_data = bottleneck_data[columns_to_keep]
unique_bottlenecks = cleaned_bottleneck_data.drop_duplicates(subset=duplicate_keys)
unique_bottlenecks

In [None]:
# Number of non-null bottleneck names in each county FIPS code
bottlenecks_per_county = unique_bottlenecks.groupby('USCB_COUNTYFP')['Name'].count()
county_count_bottlenecks = bottlenecks_per_county.reset_index()
county_count_bottlenecks = county_count_bottlenecks.rename(
    columns={'Name': 'number_of_bottlenecks', 'USCB_COUNTYFP': 'countyfp'}
)
county_count_bottlenecks

In [None]:
bottleneck_count_tracts = pd.merge(ca_tract_county, county_count_bottlenecks, on='countyfp', how='left')
bottleneck_count_tracts

In [None]:
counter = ['Rank', 'Direction', 'Shift']
county_count(bottleneck_data, 'USCB_COUNTYFP', '013', counter_list=counter)

In [27]:
bottleneck_count_tracts.to_csv('built_transportation_bottleneck_metric.csv', index=False)

### Metric number 4: number of miles of highway per county

In [None]:
highway_data.columns

In [None]:
# Lowercase every text (object-dtype) column of the highway layer in place
str_columns = highway_data.select_dtypes(include=['object']).columns
for col in str_columns:
    lowered = highway_data[col].str.lower()
    highway_data[col] = lowered

# Keep county code, route id and geometry, then drop rows identical on all three.
# Every kept column takes part in the comparison, so no explicit subset is required.
columns_to_keep = ['USCB_COUNTYFP', 'RouteID', 'geometry']
cleaned_highway_data = highway_data[columns_to_keep]
unique_highway = cleaned_highway_data.drop_duplicates()
unique_highway

In [None]:
# reproject to a projected CRS with metre units so geometry length can be measured in metres
# NOTE(review): EPSG:3857 is Web Mercator, which is NOT an equal-area projection and stretches
# distances increasingly away from the equator, so lengths measured here are likely overestimated
# at California latitudes. Consider a California-specific projected CRS such as EPSG:3310
# (NAD83 / California Albers); if changed, also update the rail metric and the metadata text
# in the upload function, which both reference EPSG:3857.
unique_highway = unique_highway.to_crs("EPSG:3857")
unique_highway

In [None]:
unique_highway["length_in_meters"] = unique_highway.geometry.length
unique_highway

In [None]:
meters_to_miles = 0.000621371 #conversion scalar
highway_miles = unique_highway
highway_miles["highway_length_in_miles"] = highway_miles["length_in_meters"]*meters_to_miles
highway_miles.columns = highway_miles.columns.str.lower()
highway_miles=highway_miles.rename(columns={'uscb_countyfp':'countyfp'})
highway_miles

### To check that the math is correct, I use county FIPS code 017 (El Dorado) to inspect the estimated lengths

In [None]:
el_dorado_before_summing = highway_miles[highway_miles['countyfp'] == '017']
el_dorado_before_summing

### Sum the estimated length of highways by county

In [None]:
# Total highway miles within each county FIPS code
miles_by_county = highway_miles.groupby('countyfp')['highway_length_in_miles']
sum_county_milage = miles_by_county.sum().reset_index()

# Attach the county totals to every tract in that county (tracts without a match get NaN)
highway_milage_tracts = ca_tract_county.merge(sum_county_milage, on='countyfp', how='left')

# Show the tract-level table
highway_milage_tracts

In [None]:
el_dorado_after_summing = highway_milage_tracts[highway_milage_tracts['countyfp'] == '017']
el_dorado_after_summing.head()

In [36]:
highway_milage_tracts.to_csv('built_transportation_highway_metric.csv', index=False)

### Metric number 5: Number of miles of rail tracks per county 
Metadata reference for variable / unit conventions: https://map.dfg.ca.gov/metadata/ds1337.html

In [None]:
freight_rail_data.columns

In [None]:
freight_rail_data.head(5)

In [None]:
# Lowercase every text (object-dtype) column of the freight rail layer in place
str_columns = freight_rail_data.select_dtypes(include=['object']).columns
for col in str_columns:
    lowered = freight_rail_data[col].str.lower()
    freight_rail_data[col] = lowered

# Keep object id, county code and geometry, then drop rows identical on all three.
# Every kept column takes part in the comparison, so no explicit subset is required.
columns_to_keep = ['OBJECTID', 'USCB_COUNTYFP', 'geometry']
cleaned_rail_data = freight_rail_data[columns_to_keep]
unique_rail = cleaned_rail_data.drop_duplicates()

unique_rail

In [None]:
# reproject to a projected CRS with metre units so geometry length can be measured in metres
# NOTE(review): EPSG:3857 is Web Mercator, which is NOT an equal-area projection and stretches
# distances increasingly away from the equator, so lengths measured here are likely overestimated
# at California latitudes. Consider a California-specific projected CRS such as EPSG:3310
# (NAD83 / California Albers); if changed, also update the highway metric and the metadata text
# in the upload function, which both reference EPSG:3857.
unique_rail = unique_rail.to_crs("EPSG:3857")
unique_rail

In [None]:
unique_rail["length_in_meters"] = unique_rail.geometry.length
unique_rail

In [None]:
meters_to_miles = 0.000621371 #conversion scalar
rail_miles = unique_rail
rail_miles["railway_length_in_miles"] = rail_miles["length_in_meters"]*meters_to_miles
rail_miles.columns = rail_miles.columns.str.lower()
rail_miles=rail_miles.rename(columns={'uscb_countyfp':'countyfp'})
rail_miles

Look at Los Angeles rows that are to be summed

In [None]:
la_before_summing = rail_miles[rail_miles['countyfp'] == '037']

la_before_summing

In [None]:
# Total freight rail miles within each county FIPS code
rail_miles_by_county = rail_miles.groupby('countyfp')['railway_length_in_miles']
sum_county_rail_miles = rail_miles_by_county.sum().reset_index()

# Attach the county totals to every tract in that county (tracts without a match get NaN)
rail_milage_tracts = ca_tract_county.merge(sum_county_rail_miles, on='countyfp', how='left')

# Preview the tract-level table
rail_milage_tracts.head()

Looking at Los Angeles sums

In [None]:
# Tract rows for Los Angeles county (FIPS 037) after the county totals were merged in.
# Stored under its own name so the pre-sum rows in la_before_summing are not overwritten.
la_after_summing = rail_milage_tracts[rail_milage_tracts['countyfp'] == '037']
la_after_summing.head()

In [None]:
rail_milage_tracts

In [47]:
rail_milage_tracts.to_csv('built_transportation_rail_metric.csv', index=False)

### Function call to upload to AWS

In [48]:
@append_metadata
def transportation_upload(input_csv, export=False, varname=''):
    '''
    Uploads prepared transportation metric csvs to S3 bucket. Uploaded files are from the following metrics:
    * number of airports per county
    * number of bridges per county
    * number of road bottlenecks per county
    * number of miles of highway per county
    * number of miles of freight rails per county

    Data for all transportation metrics was sourced from California Department of Transportation at:
    https://gisdata-caltrans.opendata.arcgis.com/

    Methods
    -------
    Duplicate data were removed based on the metric data's location and metric identifier(s).
    Relevant metric columns were isolated.
    Metrics that calculated mileage estimated the length of that metric by measuring the length of each entry's
    geometry column, converting the length to estimated miles, and summing all entries within the same county.
    Metrics that calculated number of features per county grouped the data by county columns, and counted occurrences.
    Data was then merged to California 2021 census tracts, with counts by county being retained for each tract.

    Parameters
    ----------
    input_csv: string
        file name of the prepared transportation metric csv
    export: True/False boolean
        False = will exclusively generate the resulting metadata files for each transportation metric
        True = will upload resulting df containing CAL CRAI transportation metrics to AWS
    varname: string
        not used inside this function body; presumably consumed by the append_metadata
        decorator to identify the metadata entry (confirm in scripts/utils/write_metadata)

    Script
    ------
    built_transportation.ipynb

    Note:
    This function assumes users have configured the AWS CLI such that their access key / secret key pair are stored in ~/.aws/credentials.
    See https://docs.aws.amazon.com/cli/latest/userguide/getting-started-install.html for guidance.
    The local csv is deleted at the end of the call whether or not it was uploaded.
    '''
    print('Data transformation: data cleaned by removing duplicate rows and isolating relevant columns.')
    print('Data transformation: entries were summed per county for total number metrics.')
    print('Data transformation: estimated milage metrics were reprojected to EPSG:3857.')
    print('Data transformation: new columns calculated estimated milage per county metrics.')
    print('Data transformation: data was merged to California census tracts.')

    if export:
        bucket_name = 'ca-climate-index'
        directory = '3_fair_data/index_data'
        export_filename = [input_csv]
        upload_csv_aws(export_filename, bucket_name, directory)
    else:
        # previously this branch claimed the file was uploaded even though no upload happens here
        print(f'{input_csv} not uploaded to AWS (export=False).')

    # clean up the local copy of the csv (unchanged behaviour: runs for both export settings)
    if os.path.exists(input_csv):
        os.remove(input_csv)

In [49]:
# Metric csv files written by the cells above
input_csv = [
    'built_transportation_airports_metric.csv',
    'built_transportation_bottleneck_metric.csv',
    'built_transportation_bridge_metric.csv',
    'built_transportation_highway_metric.csv',
    'built_transportation_rail_metric.csv',
]

# Variable names for each csv, listed in the same order as input_csv
varnames = [
    'built_caltrans_airports',
    'built_caltrans_road_bottlenecks',
    'built_caltrans_bridges',
    'built_caltrans_highways',
    'built_caltrans_rails',
]

# Pass each file's own variable name instead of the hard-coded 'test' placeholder, which
# left the paired name unused; the loop variable also no longer shadows the stdlib `csv` module
for csv_name, var in zip(input_csv, varnames):
    transportation_upload(csv_name, export=True, varname=var)