## Built Transportation Metric Calculation
Each metric is calculated separately. Resulting csvs are simultaneously uploaded to S3 in a function call at the bottom of this notebook
* Metric 1: number of airports per county
* Metric 2: number of bridges per county
* Metric 3: number of road bottlenecks per county
* Metric 4: number of miles of highway per county
* Metric 5:  number of miles of freight rails per county

METRICS 4 & 5 need some more work, their values appear to be too high, the columns I used to calculate milage dont have clear units or what they are doing

In [1]:
import pandas as pd
import os
import sys
import boto3
import io
import geopandas as gpd

# suppress pandas purely educational warnings
from warnings import simplefilter
simplefilter(action="ignore", category=pd.errors.PerformanceWarning)

sys.path.append(os.path.expanduser('../../'))
from scripts.utils.file_helpers import pull_gpkg_from_directory, upload_csv_aws, filter_counties
from scripts.utils.write_metadata import append_metadata

In [2]:
# read in CA census tiger file
census_shp_dir = "s3://ca-climate-index/0_map_data/2021_tiger_census_tract/2021_ca_tract/"
ca_boundaries = gpd.read_file(census_shp_dir)

# Create a copy of the relevant columns from ca_boundaries
filtered_ca_boundaries = ca_boundaries[['GEOID', 'geometry']].copy()

# Rename the 'GEOID' column to 'tract'
filtered_ca_boundaries.rename(columns={'GEOID': 'tract'}, inplace=True)

# Remove the first character from the 'tract' column
filtered_ca_boundaries['tract'] = filtered_ca_boundaries['tract'].str[1:]

# Display the resulting DataFrame
filtered_ca_boundaries

Unnamed: 0,tract,geometry
0,6085504321,"POLYGON ((-121.87556 37.39924, -121.87535 37.3..."
1,6085504410,"POLYGON ((-121.88886 37.40758, -121.88576 37.4..."
2,6085507003,"POLYGON ((-122.02489 37.21683, -122.02459 37.2..."
3,6085507004,"POLYGON ((-121.99304 37.22562, -121.99249 37.2..."
4,6085502204,"POLYGON ((-121.93167 37.29803, -121.92801 37.3..."
...,...,...
9124,6059001303,"POLYGON ((-117.95917 33.92458, -117.95888 33.9..."
9125,6059001304,"POLYGON ((-117.95918 33.92820, -117.95831 33.9..."
9126,6059001401,"POLYGON ((-117.95056 33.94503, -117.95055 33.9..."
9127,6013367200,"POLYGON ((-122.34551 37.96355, -122.34550 37.9..."


In [3]:
# read in CA census tiger file
ca_tract_county = "s3://ca-climate-index/0_map_data/ca_tracts_county.csv"
ca_tract_county = gpd.read_file(ca_tract_county)
ca_tract_county = ca_tract_county.drop(columns={'field_1', 'geometry'})
ca_tract_county.columns = ca_tract_county.columns.str.lower()
ca_tract_county = ca_tract_county.applymap(lambda s: s.lower() if type(s) == str else s)

ca_tract_county

  ca_tract_county = ca_tract_county.applymap(lambda s: s.lower() if type(s) == str else s)


Unnamed: 0,tract,countyfp,county
0,06085504321,085,santa clara
1,06085504410,085,santa clara
2,06085507003,085,santa clara
3,06085507004,085,santa clara
4,06085502204,085,santa clara
...,...,...,...
9124,06059001303,059,orange
9125,06059001304,059,orange
9126,06059001401,059,orange
9127,06013367200,013,contra costa


### Function to help fact check results for the first three metrics

In [4]:
def county_count(df, county_col, county, counter_list):
    county_isolate = df[df[county_col] == county]
    county_isolate_drop_duplicates = county_isolate.drop_duplicates(subset=[county_col] + counter_list)
    print(f'Length of df for {county} county without dropping duplicates: {len(county_isolate)}')
    print(f'Length of df for {county} county after dropping duplicates: {len(county_isolate_drop_duplicates)}')

In [5]:
# pull csv from aws
bucket_name = 'ca-climate-index'
aws_dir = '2b_reproject/built_environment/transportation/cdot/'

pull_gpkg_from_directory(bucket_name, aws_dir)

Saved GeoPackage as 'built_caltrans_airports.gpkg' locally
Saved GeoPackage as 'built_caltrans_bridges.gpkg' locally
Saved GeoPackage as 'built_caltrans_highways.gpkg' locally
Saved GeoPackage as 'built_caltrans_rails.gpkg' locally
Saved GeoPackage as 'built_caltrans_road_bottlenecks.gpkg' locally


In [6]:
airport_data = gpd.read_file('built_caltrans_airports.gpkg')
bridge_data = gpd.read_file('built_caltrans_bridges.gpkg')
highway_data = gpd.read_file('built_caltrans_highways.gpkg')
bottleneck_data = gpd.read_file('built_caltrans_road_bottlenecks.gpkg')
freight_rail_data = gpd.read_file('built_caltrans_rails.gpkg')

### Metric 1: Number of Airports per CA County

In [7]:
airport_data.columns

Index(['OBJECTID', 'FACILITY', 'MAPLABEL', 'AIRPORTID', 'CITY', 'COUNTY',
       'FNCTNLCLSS', 'STATECLASS', 'FAASRVCLVL', 'FAASITENO', 'DISTRICT',
       'MANAGER', 'MNGREMAIL', 'PHONE', 'F5010URL', 'LATDD', 'LONGDD',
       'LATDMS', 'PMTLAT', 'LONGDMS', 'ARPLATDMS', 'ARPLONGDMS', 'PMTLONG',
       'USCB_STATEFP', 'USCB_COUNTYFP', 'USCB_TRACTCE', 'USCB_GEOID',
       'USCB_NAME', 'USCB_NAMELSAD', 'USCB_MTFCC', 'USCB_FUNCSTAT',
       'USCB_ALAND', 'USCB_AWATER', 'USCB_INTPTLAT', 'USCB_INTPTLON',
       'geometry'],
      dtype='object')

In [8]:
airport_data.head(5)

Unnamed: 0,OBJECTID,FACILITY,MAPLABEL,AIRPORTID,CITY,COUNTY,FNCTNLCLSS,STATECLASS,FAASRVCLVL,FAASITENO,...,USCB_GEOID,USCB_NAME,USCB_NAMELSAD,USCB_MTFCC,USCB_FUNCSTAT,USCB_ALAND,USCB_AWATER,USCB_INTPTLAT,USCB_INTPTLON,geometry
0,1,ADIN AIRPORT,ADIN,A26,Adin,Modoc,Limited Use,LIMITED USE,Not a NPIAS Facility,01217.*A,...,6049000200,2.0,Census Tract 2,G5020,S,2409364166,66821707,41.5108055,-121.2486404,POINT (-120.95439 41.18650)
1,221,TULELAKE AIRPORT,TULELAKE,O81,Tulelake,Modoc,Community,COMMUNITY-Agriculture,General Aviation,02375.*A,...,6049000200,2.0,Census Tract 2,G5020,S,2409364166,66821707,41.5108055,-121.2486404,POINT (-121.36067 41.89064)
2,2,AGUA CALIENTE SPRINGS AIRPORT,AGUA CALIENTE SPRINGS,L54,Agua Caliente Springs,San Diego,Limited Use,LIMITED USE,Not a NPIAS Facility,01218.*A,...,6073021001,210.01,Census Tract 210.01,G5020,S,1635671050,7637,32.9674126,-116.2802017,POINT (-116.29472 32.95570)
3,3,AGUA DULCE AIRPARK,AGUA DULCE,L70,Agua Dulce,Los Angeles,Limited Use,LIMITED USE,Not a NPIAS Facility,01219.*A,...,6037910814,9108.14,Census Tract 9108.14,G5020,S,80230866,27686,34.515942,-118.3096339,POINT (-118.31464 34.50257)
4,4,ALPINE COUNTY AIRPORT,ALPINE COUNTY,M45,Markleeville,Alpine,Limited Use,LIMITED USE,Not a NPIAS Facility,01866.4*A,...,6003010000,100.0,Census Tract 100,G5020,S,1912292607,12557304,38.6217831,-119.7983522,POINT (-119.76705 38.73474)


Has county column, so running our filter county function
* One misspelled county, fix in the same cell

In [9]:
print('length of airport data before county filter:', len(airport_data))
airport_counties, omitted_airports = filter_counties(airport_data, 'COUNTY', county_list=None)
print('length of aiprort data after county filter:', len(airport_counties))
print('')

print('Omitted rows:')
omitted_airports = omitted_airports['COUNTY']
display(omitted_airports)
print('')

print('fixing spelling issue so airport data is ready for future steps:')
cleaned_airport_data = airport_data
cleaned_airport_data['COUNTY'] = airport_counties["COUNTY"].replace('Humbolt', 'Humboldt')
print('length of cleaned airport data:', len(cleaned_airport_data))

length of airport data before county filter: 242
length of aiprort data after county filter: 241

Omitted rows:


189    Humbolt
Name: COUNTY, dtype: object


fixing spelling issue so airport data is ready for future steps:
length of cleaned airport data: 242


In [10]:
# Convert all string columns to lowercase
str_columns = cleaned_airport_data.select_dtypes(include=['object']).columns
for col in str_columns:
    cleaned_airport_data[col] = cleaned_airport_data[col].str.lower()

# Isolate to relevant columns and drop duplicate rows with same airport and county
columns_to_keep = ['AIRPORTID','COUNTY']
cleaned_airport_data = cleaned_airport_data[columns_to_keep]
unique_airports = cleaned_airport_data.drop_duplicates(subset=['COUNTY', 'AIRPORTID'])

cleaned_airport_data

Unnamed: 0,AIRPORTID,COUNTY
0,a26,modoc
1,o81,modoc
2,l54,san diego
3,l70,los angeles
4,m45,alpine
...,...,...
237,o42,tulare
238,dwa,yolo
239,myv,yuba
240,l22,san bernardino


In [11]:
county_count_airports = cleaned_airport_data.groupby('COUNTY')['AIRPORTID'].apply(lambda x: x.notnull().sum()).reset_index()
county_count_airports = county_count_airports.rename(columns={'AIRPORTID':'number_of_airports', 'COUNTY':'county'})
county_count_airports.head()

Unnamed: 0,county,number_of_airports
0,alameda,3
1,alpine,1
2,amador,1
3,butte,2
4,calaveras,1


Merge California tract/county data with airport county counts

In [12]:
airport_count_tract = pd.merge(ca_tract_county, county_count_airports, on='county', how='left')
airport_count_tract

Unnamed: 0,tract,countyfp,county,number_of_airports
0,06085504321,085,santa clara,4.0
1,06085504410,085,santa clara,4.0
2,06085507003,085,santa clara,4.0
3,06085507004,085,santa clara,4.0
4,06085502204,085,santa clara,4.0
...,...,...,...,...
9124,06059001303,059,orange,2.0
9125,06059001304,059,orange,2.0
9126,06059001401,059,orange,2.0
9127,06013367200,013,contra costa,2.0


Can fact check the results with our county count function
* use original airport dataframe and airportid column

In [13]:
county_count(airport_data, 'COUNTY', 'los angeles', ['AIRPORTID'])

Length of df for los angeles county without dropping duplicates: 14
Length of df for los angeles county after dropping duplicates: 14


Save as a csv for future upload to S3

In [14]:
airport_count_tract.to_csv('built_transportation_airports_metric.csv', index=False)

### Metric Number 2: Number of bridges per CA county

In [15]:
bridge_data.columns

Index(['OBJECTID', 'DIST', 'CO', 'RTE', 'PM', 'BRIDGE', 'BRIDGE_X', 'BRIDGE_Y',
       'CITY', 'LAT', 'LON', 'NAME', 'LOC', 'YRBLT', 'HST', 'FAC', 'APWID',
       'LENG', 'DK_AREA', 'LSW', 'RSW', 'RDW', 'REFVCU', 'VCU', 'MAINSPANS',
       'DIR', 'PRINC', 'INTERSEC', 'AADT', 'PCTTRK', 'DEF', 'NHS',
       'FUNCTIONAL', 'DATA_EXTRA', 'USCB_STATEFP', 'USCB_COUNTYFP',
       'USCB_TRACTCE', 'USCB_GEOID', 'USCB_NAME', 'USCB_NAMELSAD',
       'USCB_MTFCC', 'USCB_FUNCSTAT', 'USCB_ALAND', 'USCB_AWATER',
       'USCB_INTPTLAT', 'USCB_INTPTLON', 'geometry'],
      dtype='object')

In [16]:
# Convert all string columns to lowercase
str_columns = bridge_data.select_dtypes(include=['object']).columns
for col in str_columns:
    bridge_data[col] = bridge_data[col].str.lower()

columns_to_keep = ['BRIDGE','USCB_COUNTYFP']
cleaned_bridge_data = bridge_data[columns_to_keep]
unique_bridges = cleaned_bridge_data.drop_duplicates(subset=['USCB_COUNTYFP', 'BRIDGE'])

unique_bridges

Unnamed: 0,BRIDGE,USCB_COUNTYFP
0,01c0001,015
1,01c0002,015
2,01c0011,015
3,01c0012,015
4,01c0020,015
...,...,...
13327,58c0150,025
13328,58c0161,025
13329,58c0211,025
13330,58c0215,025


In [17]:
county_count_bridges = unique_bridges.groupby('USCB_COUNTYFP')['BRIDGE'].apply(lambda x: x.notnull().sum()).reset_index()
county_count_bridges = county_count_bridges.rename(columns={'BRIDGE':'numbers_of_bridges', 'USCB_COUNTYFP':'countyfp'})
county_count_bridges.head()

Unnamed: 0,countyfp,numbers_of_bridges
0,1,302
1,3,12
2,5,42
3,7,306
4,9,69


In [18]:
bridge_count_tracts = pd.merge(ca_tract_county, county_count_bridges, on='countyfp', how='left')
bridge_count_tracts

Unnamed: 0,tract,countyfp,county,numbers_of_bridges
0,06085504321,085,santa clara,508
1,06085504410,085,santa clara,508
2,06085507003,085,santa clara,508
3,06085507004,085,santa clara,508
4,06085502204,085,santa clara,508
...,...,...,...,...
9124,06059001303,059,orange,586
9125,06059001304,059,orange,586
9126,06059001401,059,orange,586
9127,06013367200,013,contra costa,362


In [19]:
county_count(bridge_data, 'USCB_COUNTYFP', '037', ['BRIDGE'])

Length of df for 037 county without dropping duplicates: 1764
Length of df for 037 county after dropping duplicates: 1764


In [20]:
bridge_count_tracts.to_csv('built_transportation_bridge_metric.csv', index=False)

### Metric number 3: Number of bottleneck areas per county

In [21]:
bottleneck_data.columns

Index(['OBJECTID', 'District', 'Rank', 'County', 'Name', 'Type', 'Shift',
       'Fwy', 'Abs_PM', 'CA_PM', 'Number_Day', 'Avg_Extent', 'Total_Dela',
       'Direction', 'Abs_PM_ups', 'Shape_Leng', 'USCB_STATEFP',
       'USCB_COUNTYFP', 'USCB_TRACTCE', 'USCB_GEOID', 'USCB_NAME',
       'USCB_NAMELSAD', 'USCB_MTFCC', 'USCB_FUNCSTAT', 'USCB_ALAND',
       'USCB_AWATER', 'USCB_INTPTLAT', 'USCB_INTPTLON', 'geometry'],
      dtype='object')

In [22]:
bottleneck_data.head(5)

Unnamed: 0,OBJECTID,District,Rank,County,Name,Type,Shift,Fwy,Abs_PM,CA_PM,...,USCB_GEOID,USCB_NAME,USCB_NAMELSAD,USCB_MTFCC,USCB_FUNCSTAT,USCB_ALAND,USCB_AWATER,USCB_INTPTLAT,USCB_INTPTLON,geometry
0,1,3,1,PLA,EB Douglas Blvd,ML,PM,I80-W,103.38,1.876,...,6061021003,210.03,Census Tract 210.03,G5020,S,7618633,0,38.7645705,-121.2757092,"LINESTRING (-121.25474 38.76627, -121.25586 38..."
1,1,3,1,PLA,EB Douglas Blvd,ML,PM,I80-W,103.38,1.876,...,6061020806,208.06,Census Tract 208.06,G5020,S,2091983,0,38.7453927,-121.2762314,"LINESTRING (-121.26306 38.75607, -121.26327 38..."
2,2,3,2,SAC,EB Exposition Blvd,ML,PM,SR51-S,3.33,3.326,...,6067005402,54.02,Census Tract 54.02,G5020,S,5481123,238328,38.5897818,-121.432067,"MULTILINESTRING ((-121.44381 38.59654, -121.44..."
3,2,3,2,SAC,EB Exposition Blvd,ML,PM,SR51-S,3.33,3.326,...,6067005502,55.02,Census Tract 55.02,G5020,S,2868054,0,38.6046274,-121.4270026,"MULTILINESTRING ((-121.43873 38.60075, -121.43..."
4,2,3,2,SAC,EB Exposition Blvd,ML,PM,SR51-S,3.33,3.326,...,6067006202,62.02,Census Tract 62.02,G5020,S,1627293,0,38.6163928,-121.4222802,"LINESTRING (-121.42688 38.61119, -121.42688 38..."


As bottlenecks are a bit unique, I selected a few columns when removing duplicates
* If county, direction of traffic, rank of bottleneck, and time of day are all the same between rows,
one will be removed

I also checked number of duplicates for all of the columns to keep and there are no missing data

In [23]:
# Convert all string columns to lowercase
str_columns = bottleneck_data.select_dtypes(include=['object']).columns
for col in str_columns:
    bottleneck_data[col] = bottleneck_data[col].str.lower()

columns_to_keep = ['USCB_COUNTYFP','Rank', 'Direction', 'Shift', 'Name']
cleaned_bottleneck_data = bottleneck_data[columns_to_keep]
unique_bottlenecks = cleaned_bottleneck_data.drop_duplicates(subset=['USCB_COUNTYFP', 'Rank', 'Direction', 'Shift'])
unique_bottlenecks

Unnamed: 0,USCB_COUNTYFP,Rank,Direction,Shift,Name
0,061,1,w,pm,eb douglas blvd
2,067,2,s,pm,eb exposition blvd
5,067,3,s,pm,99sb at cosumnes (calvine rd)
8,067,4,e,pm,16th street
10,115,5,e,pm,70eb yuba river br
...,...,...,...,...,...
384,059,6,s,am,brookhurst2
392,059,7,s,am,main 1
394,059,8,n,pm,taft
399,059,9,e,pm,west of gypsum


In [24]:
county_count_bottlenecks = unique_bottlenecks.groupby('USCB_COUNTYFP')['Name'].apply(lambda x: x.notnull().sum()).reset_index()
county_count_bottlenecks = county_count_bottlenecks.rename(columns={'Name':'number_of_bottlenecks', 'USCB_COUNTYFP':'countyfp'})
county_count_bottlenecks

Unnamed: 0,countyfp,number_of_bottlenecks
0,1,5
1,13,2
2,17,1
3,19,7
4,29,1
5,37,9
6,39,2
7,53,4
8,59,10
9,61,2


In [25]:
bottleneck_count_tracts = pd.merge(ca_tract_county, county_count_bottlenecks, on='countyfp', how='left')
bottleneck_count_tracts

Unnamed: 0,tract,countyfp,county,number_of_bottlenecks
0,06085504321,085,santa clara,3.0
1,06085504410,085,santa clara,3.0
2,06085507003,085,santa clara,3.0
3,06085507004,085,santa clara,3.0
4,06085502204,085,santa clara,3.0
...,...,...,...,...
9124,06059001303,059,orange,10.0
9125,06059001304,059,orange,10.0
9126,06059001401,059,orange,10.0
9127,06013367200,013,contra costa,2.0


In [26]:
counter = ['Rank', 'Direction', 'Shift']
county_count(bottleneck_data, 'USCB_COUNTYFP', '013', counter_list=counter)

Length of df for 013 county without dropping duplicates: 7
Length of df for 013 county after dropping duplicates: 2


In [27]:
bottleneck_count_tracts.to_csv('built_transportation_bottleneck_metric.csv', index=False)

### Metric number 4: number of miles of highway per county

In [28]:
highway_data.columns

Index(['OBJECTID', 'RouteID', 'FromARMeas', 'ToARMeasur', 'NHS_TYPE',
       'GlobalID', 'Shape_Leng', 'USCB_STATEFP', 'USCB_COUNTYFP',
       'USCB_TRACTCE', 'USCB_GEOID', 'USCB_NAME', 'USCB_NAMELSAD',
       'USCB_MTFCC', 'USCB_FUNCSTAT', 'USCB_ALAND', 'USCB_AWATER',
       'USCB_INTPTLAT', 'USCB_INTPTLON', 'geometry'],
      dtype='object')

In [29]:
# Convert all string columns to lowercase
str_columns = highway_data.select_dtypes(include=['object']).columns
for col in str_columns:
    highway_data[col] = highway_data[col].str.lower()

columns_to_keep = ['USCB_COUNTYFP','RouteID', 'geometry']
cleaned_highway_data = highway_data[columns_to_keep]
unique_highway = cleaned_highway_data.drop_duplicates(subset=['USCB_COUNTYFP', 'RouteID', 'geometry'])
unique_highway

Unnamed: 0,USCB_COUNTYFP,RouteID,geometry
0,013,shs_004._p,"LINESTRING Z (-121.65454 37.89649 0.00000, -12..."
1,013,cc_co_byron hwy_p,"LINESTRING Z (-121.64132 37.90124 0.00000, -12..."
2,013,cc_co_brentwood blvd_p,MULTILINESTRING Z ((-121.64177 37.90981 0.0000...
3,013,cc_co_brentwood blvd_s,"LINESTRING Z (-121.69610 37.95002 0.00000, -12..."
4,013,cc_co_brentwood blvd_s,"LINESTRING Z (-121.69614 37.95524 0.00000, -12..."
...,...,...,...
25595,059,shs_073._s,"LINESTRING Z (-117.72719 33.58459 0.00000, -11..."
25596,071,shs_210._s,"LINESTRING Z (-117.26116 34.14408 0.00000, -11..."
25597,071,shs_210._s,"LINESTRING Z (-117.53699 34.13663 0.00000, -11..."
25598,009,cal_co_winton rd_p,"LINESTRING Z (-120.50390 38.40943 0.00000, -12..."


In [30]:
# have to go back to an equal-area projection to reliably measure geometry length
unique_highway = unique_highway.to_crs("EPSG:3857")
unique_highway

Unnamed: 0,USCB_COUNTYFP,RouteID,geometry
0,013,shs_004._p,"LINESTRING Z (-13542521.473 4564813.516 0.000,..."
1,013,cc_co_byron hwy_p,"LINESTRING Z (-13541049.296 4565483.188 0.000,..."
2,013,cc_co_brentwood blvd_p,MULTILINESTRING Z ((-13541099.797 4566692.956 ...
3,013,cc_co_brentwood blvd_s,"LINESTRING Z (-13547148.219 4572367.095 0.000,..."
4,013,cc_co_brentwood blvd_s,"LINESTRING Z (-13547151.967 4573104.898 0.000,..."
...,...,...,...
25595,059,shs_073._s,"LINESTRING Z (-13105330.358 3973158.613 0.000,..."
25596,071,shs_210._s,"LINESTRING Z (-13053452.733 4048164.936 0.000,..."
25597,071,shs_210._s,"LINESTRING Z (-13084157.548 4047162.377 0.000,..."
25598,009,cal_co_winton rd_p,"LINESTRING Z (-13414433.231 4637428.395 0.000,..."


In [31]:
unique_highway["length_in_meters"] = unique_highway.geometry.length
unique_highway

Unnamed: 0,USCB_COUNTYFP,RouteID,geometry,length_in_meters
0,013,shs_004._p,"LINESTRING Z (-13542521.473 4564813.516 0.000,...",556.751164
1,013,cc_co_byron hwy_p,"LINESTRING Z (-13541049.296 4565483.188 0.000,...",1219.171396
2,013,cc_co_brentwood blvd_p,MULTILINESTRING Z ((-13541099.797 4566692.956 ...,5814.301713
3,013,cc_co_brentwood blvd_s,"LINESTRING Z (-13547148.219 4572367.095 0.000,...",4.440995
4,013,cc_co_brentwood blvd_s,"LINESTRING Z (-13547151.967 4573104.898 0.000,...",2.235343
...,...,...,...,...
25595,059,shs_073._s,"LINESTRING Z (-13105330.358 3973158.613 0.000,...",765.802662
25596,071,shs_210._s,"LINESTRING Z (-13053452.733 4048164.936 0.000,...",2076.195990
25597,071,shs_210._s,"LINESTRING Z (-13084157.548 4047162.377 0.000,...",34.031269
25598,009,cal_co_winton rd_p,"LINESTRING Z (-13414433.231 4637428.395 0.000,...",3557.431882


In [32]:
meters_to_miles = 0.000621371 #conversion scalar
highway_miles = unique_highway
highway_miles["highway_length_in_miles"] = highway_miles["length_in_meters"]*meters_to_miles
highway_miles.columns = highway_miles.columns.str.lower()
highway_miles=highway_miles.rename(columns={'uscb_countyfp':'countyfp'})
highway_miles

Unnamed: 0,countyfp,routeid,geometry,length_in_meters,highway_length_in_miles
0,013,shs_004._p,"LINESTRING Z (-13542521.473 4564813.516 0.000,...",556.751164,0.345949
1,013,cc_co_byron hwy_p,"LINESTRING Z (-13541049.296 4565483.188 0.000,...",1219.171396,0.757558
2,013,cc_co_brentwood blvd_p,MULTILINESTRING Z ((-13541099.797 4566692.956 ...,5814.301713,3.612838
3,013,cc_co_brentwood blvd_s,"LINESTRING Z (-13547148.219 4572367.095 0.000,...",4.440995,0.002760
4,013,cc_co_brentwood blvd_s,"LINESTRING Z (-13547151.967 4573104.898 0.000,...",2.235343,0.001389
...,...,...,...,...,...
25595,059,shs_073._s,"LINESTRING Z (-13105330.358 3973158.613 0.000,...",765.802662,0.475848
25596,071,shs_210._s,"LINESTRING Z (-13053452.733 4048164.936 0.000,...",2076.195990,1.290088
25597,071,shs_210._s,"LINESTRING Z (-13084157.548 4047162.377 0.000,...",34.031269,0.021146
25598,009,cal_co_winton rd_p,"LINESTRING Z (-13414433.231 4637428.395 0.000,...",3557.431882,2.210485


### To check if the math is correct, I use county fip 17 (el dorado) to visualize the estimated length

In [33]:
el_dorado_before_summing = highway_miles[highway_miles['countyfp'] == '017']
el_dorado_before_summing

Unnamed: 0,countyfp,routeid,geometry,length_in_meters,highway_length_in_miles
8824,017,shs_088._p,"LINESTRING Z (-13371916.883 4674334.020 0.000,...",145.489419,0.090403
8825,017,shs_088._s,"LINESTRING Z (-13371916.883 4674334.020 0.000,...",145.489419,0.090403
8826,017,shs_050._p,MULTILINESTRING Z ((-13413247.364 4688295.231 ...,27680.992227,17.200166
8827,017,shs_050._s,MULTILINESTRING Z ((-13408204.409 4688732.295 ...,24942.429367,15.498502
8828,017,shs_088._p,MULTILINESTRING Z ((-13407334.995 4654400.211 ...,8438.416855,5.243388
...,...,...,...,...,...
25474,017,shs_050._s,"LINESTRING Z (-13357310.536 4710561.671 0.000,...",3516.185558,2.184856
25475,017,shs_050._s,"LINESTRING Z (-13357907.508 4710053.287 0.000,...",784.114022,0.487226
25476,017,shs_050._s,"LINESTRING Z (-13358695.389 4709384.664 0.000,...",1033.591085,0.642244
25556,017,shs_049._p,"LINESTRING Z (-13474104.980 4709513.886 0.000,...",91.644083,0.056945


### Sum the estimated length of highways by county

In [34]:
# Sum the mileage by county
sum_county_milage = highway_miles.groupby('countyfp')['highway_length_in_miles'].sum().reset_index()

# Merge with ca_tract_county DataFrame
highway_milage_tracts = pd.merge(ca_tract_county, sum_county_milage, on='countyfp', how='left')

# Display the result
highway_milage_tracts

Unnamed: 0,tract,countyfp,county,highway_length_in_miles
0,06085504321,085,santa clara,1141.553276
1,06085504410,085,santa clara,1141.553276
2,06085507003,085,santa clara,1141.553276
3,06085507004,085,santa clara,1141.553276
4,06085502204,085,santa clara,1141.553276
...,...,...,...,...
9124,06059001303,059,orange,1873.600476
9125,06059001304,059,orange,1873.600476
9126,06059001401,059,orange,1873.600476
9127,06013367200,013,contra costa,649.631654


In [35]:
el_dorado_after_summing = highway_milage_tracts[highway_milage_tracts['countyfp'] == '017']
el_dorado_after_summing.head()

Unnamed: 0,tract,countyfp,county,highway_length_in_miles
163,6017030810,17,el dorado,251.973715
164,6017030301,17,el dorado,251.973715
165,6017030302,17,el dorado,251.973715
166,6017030808,17,el dorado,251.973715
167,6017030809,17,el dorado,251.973715


In [36]:
highway_milage_tracts.to_csv('built_transportation_highway_metric.csv', index=False)

### Metric number 5: Number of miles of rail tracks per county 
Metadata reference for variable / unit conventions: https://map.dfg.ca.gov/metadata/ds1337.html

In [37]:
freight_rail_data.columns

Index(['OBJECTID', 'BEGIN_MP', 'END_MP', 'ROW_OWNER', 'FREIGHT_OP',
       'SUBDIVISIO', 'SHRTLN_NAM', 'SHRTLN_COD', 'PASS_OP', 'PASS_NETWO',
       'COMM_OP', 'COMM_NETWO', 'RECR_OP', 'RECR_NETWO', 'STATUS', 'RR_CLSS',
       'TRK_CLSS', 'PASS_SPEED', 'FRT_SPEED', 'FRT_DNS', 'NUM_TRACK',
       'STRACNET', 'CR63', 'SHAPE_LENG', 'Shape_Le_1', 'USCB_STATEFP',
       'USCB_COUNTYFP', 'USCB_TRACTCE', 'USCB_GEOID', 'USCB_NAME',
       'USCB_NAMELSAD', 'USCB_MTFCC', 'USCB_FUNCSTAT', 'USCB_ALAND',
       'USCB_AWATER', 'USCB_INTPTLAT', 'USCB_INTPTLON', 'geometry'],
      dtype='object')

In [38]:
freight_rail_data.head(5)

Unnamed: 0,OBJECTID,BEGIN_MP,END_MP,ROW_OWNER,FREIGHT_OP,SUBDIVISIO,SHRTLN_NAM,SHRTLN_COD,PASS_OP,PASS_NETWO,...,USCB_GEOID,USCB_NAME,USCB_NAMELSAD,USCB_MTFCC,USCB_FUNCSTAT,USCB_ALAND,USCB_AWATER,USCB_INTPTLAT,USCB_INTPTLON,geometry
0,1,4.1,7.9,ACTA,"UP,BNSF",Alameda Corridor,,,,,...,6037535300,5353,Census Tract 5353,G5020,S,1080457,0,33.9615844,-118.2361363,"LINESTRING (-118.23308 33.96746, -118.23291 33..."
1,11,0.4,4.1,ACTA,"UP,BNSF",Alameda Corridor,,,,,...,6037535300,5353,Census Tract 5353,G5020,S,1080457,0,33.9615844,-118.2361363,"LINESTRING (-118.23308 33.96747, -118.23308 33..."
2,1842,15.9,17.1,UP,UP,Patata Industrial Lead,,,,,...,6037535300,5353,Census Tract 5353,G5020,S,1080457,0,33.9615844,-118.2361363,"LINESTRING (-118.23073 33.95777, -118.23071 33..."
3,1844,15.5,15.9,UP,UP,Patata Industrial Lead,,,,,...,6037535300,5353,Census Tract 5353,G5020,S,1080457,0,33.9615844,-118.2361363,"LINESTRING (-118.23088 33.95868, -118.23089 33..."
4,1,4.1,7.9,ACTA,"UP,BNSF",Alameda Corridor,,,,,...,6037535400,5354,Census Tract 5354,G5020,S,771064,0,33.953754,-118.2348542,"LINESTRING (-118.23091 33.95776, -118.22957 33..."


In [39]:
# Convert all string columns to lowercase
str_columns = freight_rail_data.select_dtypes(include=['object']).columns
for col in str_columns:
    freight_rail_data[col] = freight_rail_data[col].str.lower()

columns_to_keep = ['OBJECTID','USCB_COUNTYFP', 'geometry']
cleaned_rail_data = freight_rail_data[columns_to_keep]
unique_rail = cleaned_rail_data.drop_duplicates(subset=['OBJECTID', 'USCB_COUNTYFP', 'geometry'])

unique_rail

Unnamed: 0,OBJECTID,USCB_COUNTYFP,geometry
0,1,037,"LINESTRING (-118.23308 33.96746, -118.23291 33..."
1,11,037,"LINESTRING (-118.23308 33.96747, -118.23308 33..."
2,1842,037,"LINESTRING (-118.23073 33.95777, -118.23071 33..."
3,1844,037,"LINESTRING (-118.23088 33.95868, -118.23089 33..."
4,1,037,"LINESTRING (-118.23091 33.95776, -118.22957 33..."
...,...,...,...
5373,2368,037,"LINESTRING (-118.17218 33.98028, -118.17211 33..."
5374,2372,067,"LINESTRING (-121.40933 38.63491, -121.40917 38..."
5375,2379,073,"LINESTRING (-117.09272 32.60116, -117.09264 32..."
5376,2404,023,"MULTILINESTRING ((-124.09100 40.87620, -124.09..."


In [40]:
# have to go back to an equal-area projection to reliably measure geometry length
unique_rail = unique_rail.to_crs("EPSG:3857")
unique_rail

Unnamed: 0,OBJECTID,USCB_COUNTYFP,geometry
0,1,037,"LINESTRING (-13161646.100 4024432.980, -131616..."
1,11,037,"LINESTRING (-13161646.278 4024433.978, -131616..."
2,1842,037,"LINESTRING (-13161384.395 4023132.599, -131613..."
3,1844,037,"LINESTRING (-13161401.790 4023254.582, -131614..."
4,1,037,"LINESTRING (-13161404.154 4023130.614, -131612..."
...,...,...,...
5373,2368,037,"LINESTRING (-13154867.442 4026154.428, -131548..."
5374,2372,067,"LINESTRING (-13515225.038 4669509.879, -135152..."
5375,2379,073,"LINESTRING (-13034701.639 3842484.098, -130346..."
5376,2404,023,"MULTILINESTRING ((-13813746.437 4994099.374, -..."


In [41]:
unique_rail["length_in_meters"] = unique_rail.geometry.length
unique_rail

Unnamed: 0,OBJECTID,USCB_COUNTYFP,geometry,length_in_meters
0,1,037,"LINESTRING (-13161646.100 4024432.980, -131616...",1324.679871
1,11,037,"LINESTRING (-13161646.278 4024433.978, -131616...",1.013509
2,1842,037,"LINESTRING (-13161384.395 4023132.599, -131613...",9.924508
3,1844,037,"LINESTRING (-13161401.790 4023254.582, -131614...",123.558518
4,1,037,"LINESTRING (-13161404.154 4023130.614, -131612...",1289.190452
...,...,...,...,...
5373,2368,037,"LINESTRING (-13154867.442 4026154.428, -131548...",919.563095
5374,2372,067,"LINESTRING (-13515225.038 4669509.879, -135152...",727.361209
5375,2379,073,"LINESTRING (-13034701.639 3842484.098, -130346...",454.623650
5376,2404,023,"MULTILINESTRING ((-13813746.437 4994099.374, -...",2597.663004


In [42]:
meters_to_miles = 0.000621371 #conversion scalar
rail_miles = unique_rail
rail_miles["railway_length_in_miles"] = rail_miles["length_in_meters"]*meters_to_miles
rail_miles.columns = rail_miles.columns.str.lower()
rail_miles=rail_miles.rename(columns={'uscb_countyfp':'countyfp'})
rail_miles

Unnamed: 0,objectid,countyfp,geometry,length_in_meters,railway_length_in_miles
0,1,037,"LINESTRING (-13161646.100 4024432.980, -131616...",1324.679871,0.823118
1,11,037,"LINESTRING (-13161646.278 4024433.978, -131616...",1.013509,0.000630
2,1842,037,"LINESTRING (-13161384.395 4023132.599, -131613...",9.924508,0.006167
3,1844,037,"LINESTRING (-13161401.790 4023254.582, -131614...",123.558518,0.076776
4,1,037,"LINESTRING (-13161404.154 4023130.614, -131612...",1289.190452,0.801066
...,...,...,...,...,...
5373,2368,037,"LINESTRING (-13154867.442 4026154.428, -131548...",919.563095,0.571390
5374,2372,067,"LINESTRING (-13515225.038 4669509.879, -135152...",727.361209,0.451961
5375,2379,073,"LINESTRING (-13034701.639 3842484.098, -130346...",454.623650,0.282490
5376,2404,023,"MULTILINESTRING ((-13813746.437 4994099.374, -...",2597.663004,1.614112


Look at Los Angeles rows that are to be summed

In [43]:
la_before_summing = rail_miles[rail_miles['countyfp'] == '037']

la_before_summing

Unnamed: 0,objectid,countyfp,geometry,length_in_meters,railway_length_in_miles
0,1,037,"LINESTRING (-13161646.100 4024432.980, -131616...",1324.679871,0.823118
1,11,037,"LINESTRING (-13161646.278 4024433.978, -131616...",1.013509,0.000630
2,1842,037,"LINESTRING (-13161384.395 4023132.599, -131613...",9.924508,0.006167
3,1844,037,"LINESTRING (-13161401.790 4023254.582, -131614...",123.558518,0.076776
4,1,037,"LINESTRING (-13161404.154 4023130.614, -131612...",1289.190452,0.801066
...,...,...,...,...,...
5140,2222,037,"LINESTRING (-13136971.951 4021044.068, -131368...",2349.712814,1.460043
5144,2224,037,"LINESTRING (-13141600.341 4022990.745, -131415...",444.493662,0.276195
5145,2342,037,"MULTILINESTRING ((-13141164.632 4022902.921, -...",538.893521,0.334853
5364,2342,037,"LINESTRING (-13140299.234 4022725.053, -131398...",1979.724965,1.230144


In [44]:
# Sum the mileage by county
sum_county_rail_miles = rail_miles.groupby('countyfp')['railway_length_in_miles'].sum().reset_index()

# Merge with ca_tract_county DataFrame
rail_milage_tracts = pd.merge(ca_tract_county, sum_county_rail_miles, on='countyfp', how='left')

# Display the result
rail_milage_tracts.head()

Unnamed: 0,tract,countyfp,county,railway_length_in_miles
0,6085504321,85,santa clara,130.876711
1,6085504410,85,santa clara,130.876711
2,6085507003,85,santa clara,130.876711
3,6085507004,85,santa clara,130.876711
4,6085502204,85,santa clara,130.876711


Looking at Los Angeles sums

In [45]:
la_before_summing = rail_milage_tracts[rail_milage_tracts['countyfp'] == '037']
la_before_summing.head()

Unnamed: 0,tract,countyfp,county,railway_length_in_miles
223,6037137000,37,los angeles,576.110727
415,6037541605,37,los angeles,576.110727
416,6037541801,37,los angeles,576.110727
427,6037541802,37,los angeles,576.110727
428,6037542000,37,los angeles,576.110727


In [46]:
rail_milage_tracts

Unnamed: 0,tract,countyfp,county,railway_length_in_miles
0,06085504321,085,santa clara,130.876711
1,06085504410,085,santa clara,130.876711
2,06085507003,085,santa clara,130.876711
3,06085507004,085,santa clara,130.876711
4,06085502204,085,santa clara,130.876711
...,...,...,...,...
9124,06059001303,059,orange,130.171910
9125,06059001304,059,orange,130.171910
9126,06059001401,059,orange,130.171910
9127,06013367200,013,contra costa,165.551765


In [47]:
rail_milage_tracts.to_csv('built_transportation_rail_metric.csv', index=False)

### Function call to upload to AWS

In [48]:
@append_metadata
def transportation_upload(input_csv, export=False, varname=''):
    '''
    Uploads prepared transportation metric csvs to S3 bucket. Uploaded files are from the following metrics:
    * number of airports per county
    * number of bridges per county
    * number of road bottlenecks per county
    * number of miles of highway per county
    * number of miles of freight rails per county

    Data for all transportation metrics was sourced from California Department of Transportation at:
    https://gisdata-caltrans.opendata.arcgis.com/

    Methods
    -------
    Duplicate data were removed based on the metric data's location and metric identifier(s).
    Relevant metric columns were isolated.
    Metrics that calculated milage estimated length of that metric by identifying the length of an entries
    geometry columm, converting the length to estimated miles, and summing all entries within the same county.
    Metrics that calculated number of metrics per county grouped the data by county columns, and counted occurences.
    Data was then merged to California 2021 census tracts, with counts by county being retained for each tract.
    
    Parameters
    ----------
    input_csv: string
        csv economic data 
    export: True/False boolean
        False = will exclusively generate the resulting metadata files for each transportation metric
        True = will upload resulting df containing CAL CRAI transportation metrics to AWS

    Script
    ------
    built_transportation.ipynb

    Note:
    This function assumes users have configured the AWS CLI such that their access key / secret key pair are stored in ~/.aws/credentials.
    See https://docs.aws.amazon.com/cli/latest/userguide/getting-started-install.html for guidance.
    '''
    print('Data transformation: data cleaned by removing duplicate rows and isolating relevant columns.')
    print('Data transformation: entries were summed per county for total number metrics.')
    print('Data transformation: estimated milage metrics were reprojected to EPSG:3857.')
    print('Data transformation: new columns calculated estimated milage per county metrics.')
    print('Data transformation: data was merged to California census tracts.')

    if export == True:
        bucket_name = 'ca-climate-index'
        directory = '3_fair_data/index_data'
        export_filename = [input_csv]
        upload_csv_aws(export_filename, bucket_name, directory)

    if export == False:
        print(f'{input_csv} uploaded to AWS.')

    if os.path.exists(input_csv):
        os.remove(input_csv)

In [49]:
input_csv = ['built_transportation_airports_metric.csv',
            'built_transportation_bottleneck_metric.csv',
            'built_transportation_bridge_metric.csv',
            'built_transportation_highway_metric.csv',
            'built_transportation_rail_metric.csv'
            ]

varnames = [
    'built_caltrans_airports',
    'built_caltrans_road_bottlenecks',
    'built_caltrans_bridges',
    'built_caltrans_highways',
    'built_caltrans_rails'
]

for csv, var in zip(input_csv, varnames):
    transportation_upload(csv, export=True, varname='test')