## Built Transportation Metric Calculation
Each metric is calculated separately. Resulting csvs are simultaneously uploaded to S3 in a function call at the bottom of this notebook
* Metric 1: number of airports per county
* Metric 2: number of bridges per county
* Metric 3: number of road bottlenecks per county
* Metric 4: number of miles of highway per county
* Metric 5:  number of miles of freight rails per county

METRICS 4 & 5 need some more work, their values appear to be too high, the columns I used to calculate milage dont have clear units or what they are doing

In [3]:
import pandas as pd
import os
import sys
import boto3
import io
import geopandas as gpd

# suppress pandas purely educational warnings
from warnings import simplefilter
simplefilter(action="ignore", category=pd.errors.PerformanceWarning)

sys.path.append(os.path.expanduser('../../'))
from scripts.utils.file_helpers import pull_gpkg_from_directory, upload_csv_aws, filter_counties
from scripts.utils.write_metadata import append_metadata

In [28]:
# read in CA census tiger file
census_shp_dir = "s3://ca-climate-index/0_map_data/2021_tiger_census_tract/2021_ca_tract/"
ca_boundaries = gpd.read_file(census_shp_dir)

# Create a copy of the relevant columns from ca_boundaries
filtered_ca_boundaries = ca_boundaries[['GEOID', 'geometry']].copy()

# Rename the 'GEOID' column to 'tract'
filtered_ca_boundaries.rename(columns={'GEOID': 'tract'}, inplace=True)

# Remove the first character from the 'tract' column
filtered_ca_boundaries['tract'] = filtered_ca_boundaries['tract'].str[1:]

# Display the resulting DataFrame
filtered_ca_boundaries

Unnamed: 0,tract,geometry
0,6085504321,"POLYGON ((-121.87556 37.39924, -121.87535 37.3..."
1,6085504410,"POLYGON ((-121.88886 37.40758, -121.88576 37.4..."
2,6085507003,"POLYGON ((-122.02489 37.21683, -122.02459 37.2..."
3,6085507004,"POLYGON ((-121.99304 37.22562, -121.99249 37.2..."
4,6085502204,"POLYGON ((-121.93167 37.29803, -121.92801 37.3..."
...,...,...
9124,6059001303,"POLYGON ((-117.95917 33.92458, -117.95888 33.9..."
9125,6059001304,"POLYGON ((-117.95918 33.92820, -117.95831 33.9..."
9126,6059001401,"POLYGON ((-117.95056 33.94503, -117.95055 33.9..."
9127,6013367200,"POLYGON ((-122.34551 37.96355, -122.34550 37.9..."


In [124]:
# read in CA census tiger file
ca_tract_county = "s3://ca-climate-index/0_map_data/ca_tracts_county.csv"
ca_tract_county = gpd.read_file(ca_tract_county)
ca_tract_county = ca_tract_county.drop(columns={'field_1', 'geometry'})
ca_tract_county.columns = ca_tract_county.columns.str.lower()
ca_tract_county = ca_tract_county.applymap(lambda s: s.lower() if type(s) == str else s)

ca_tract_county

Unnamed: 0,tract,countyfp,county
0,06085504321,085,santa clara
1,06085504410,085,santa clara
2,06085507003,085,santa clara
3,06085507004,085,santa clara
4,06085502204,085,santa clara
...,...,...,...
9124,06059001303,059,orange
9125,06059001304,059,orange
9126,06059001401,059,orange
9127,06013367200,013,contra costa


# Function to help fact check results for the first three metrics

In [99]:
def county_count(df, county_col, county, counter_list):
    county_isolate = df[df[county_col] == county]
    county_isolate_drop_duplicates = county_isolate.drop_duplicates(subset=[county_col] + counter_list)
    print(f'Length of df for {county} county without dropping duplicates: {len(county_isolate)}')
    print(f'Length of df for {county} county after dropping duplicates: {len(county_isolate_drop_duplicates)}')


In [81]:
# pull csv from aws
bucket_name = 'ca-climate-index'
aws_dir = '2b_reproject/built_environment/transportation/cdot/'

pull_gpkg_from_directory(bucket_name, aws_dir)

Saved GeoPackage as 'built_caltrans_airports.gpkg' locally
Saved GeoPackage as 'built_caltrans_bridges.gpkg' locally
Saved GeoPackage as 'built_caltrans_highways.gpkg' locally
Saved GeoPackage as 'built_caltrans_rails.gpkg' locally
Saved GeoPackage as 'built_caltrans_road_bottlenecks.gpkg' locally


In [9]:
airport_data = gpd.read_file('built_caltrans_airports.gpkg')
bridge_data = gpd.read_file('built_caltrans_bridges.gpkg')
highway_data = gpd.read_file('built_caltrans_highways.gpkg')
bottleneck_data = gpd.read_file('built_caltrans_road_bottlenecks.gpkg')
freight_rail_data = gpd.read_file('built_caltrans_rails.gpkg')

## Metric 1: Number of Airports per CA County

In [19]:
airport_data.columns

Index(['OBJECTID', 'District', 'Rank', 'County', 'Name', 'Type', 'Shift',
       'Fwy', 'Abs_PM', 'CA_PM', 'Number_Day', 'Avg_Extent', 'Total_Dela',
       'Direction', 'Abs_PM_ups', 'Shape_Leng', 'USCB_STATEFP',
       'USCB_COUNTYFP', 'USCB_TRACTCE', 'USCB_GEOID', 'USCB_NAME',
       'USCB_NAMELSAD', 'USCB_MTFCC', 'USCB_FUNCSTAT', 'USCB_ALAND',
       'USCB_AWATER', 'USCB_INTPTLAT', 'USCB_INTPTLON', 'geometry'],
      dtype='object')

In [51]:
airport_data

Unnamed: 0,OBJECTID,FACILITY,MAPLABEL,AIRPORTID,CITY,COUNTY,FNCTNLCLSS,STATECLASS,FAASRVCLVL,FAASITENO,...,USCB_GEOID,USCB_NAME,USCB_NAMELSAD,USCB_MTFCC,USCB_FUNCSTAT,USCB_ALAND,USCB_AWATER,USCB_INTPTLAT,USCB_INTPTLON,geometry
0,1,adin airport,adin,a26,adin,modoc,limited use,limited use,not a npias facility,01217.*a,...,06049000200,2,census tract 2,g5020,s,2409364166,66821707,+41.5108055,-121.2486404,POINT (-120.95439 41.18650)
1,221,tulelake airport,tulelake,o81,tulelake,modoc,community,community-agriculture,general aviation,02375.*a,...,06049000200,2,census tract 2,g5020,s,2409364166,66821707,+41.5108055,-121.2486404,POINT (-121.36067 41.89064)
2,2,agua caliente springs airport,agua caliente springs,l54,agua caliente springs,san diego,limited use,limited use,not a npias facility,01218.*a,...,06073021001,210.01,census tract 210.01,g5020,s,1635671050,7637,+32.9674126,-116.2802017,POINT (-116.29472 32.95570)
3,3,agua dulce airpark,agua dulce,l70,agua dulce,los angeles,limited use,limited use,not a npias facility,01219.*a,...,06037910814,9108.14,census tract 9108.14,g5020,s,80230866,27686,+34.5159420,-118.3096339,POINT (-118.31464 34.50257)
4,4,alpine county airport,alpine county,m45,markleeville,alpine,limited use,limited use,not a npias facility,01866.4*a,...,06003010000,100,census tract 100,g5020,s,1912292607,12557304,+38.6217831,-119.7983522,POINT (-119.76705 38.73474)
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
237,238,woodlake airport,woodlake,o42,woodlake,tulare,community,community,general aviation,02463.51*a,...,06107000701,7.01,census tract 7.01,g5020,s,40266317,1219358,+36.3946939,-119.0868717,POINT (-119.10680 36.39877)
238,239,yolo county-davis woodland winters airport,yolo county,dwa,davis/woodland/winters,yolo,community,community-agriculture,general aviation,01488.*a,...,06113010505,105.05,census tract 105.05,g5020,s,136298933,1005494,+38.6029781,-121.7398412,POINT (-121.85695 38.57939)
239,240,yuba county airport,yuba county,myv,marysville,yuba,regional,regional-business/corporate,general aviation,01873.*a,...,06115040400,404,census tract 404,g5020,s,18228054,362202,+39.0999148,-121.5826165,POINT (-121.56983 39.09777)
240,241,yucca valley airport,yucca valley,l22,yucca valley,san bernardino,community,community,not a npias facility,02496.1*a,...,06071010428,104.28,census tract 104.28,g5020,s,49110952,0,+34.1547747,-116.4297791,POINT (-116.40689 34.12983)


Has county column, so running our filter county function
* One misspelled county, fix in the same cell

In [24]:
print('length of airport data before county filter:', len(airport_data))
airport_counties, omitted_airports = filter_counties(airport_data, 'COUNTY', county_list=None)
print('length of aiprort data after county filter:', len(airport_counties))
print('')

print('Omitted rows:')
omitted_airports = omitted_airports['COUNTY']
display(omitted_airports)
print('')

print('fixing spelling issue so airport data is ready for future steps:')
cleaned_airport_data = airport_data
cleaned_airport_data['COUNTY'] = airport_counties["COUNTY"].replace('Humbolt', 'Humboldt')
print('length of cleaned airport data:', len(cleaned_airport_data))

length of airport data before county filter: 242
length of aiprort data after county filter: 241

Omitted rows:


189    Humbolt
Name: COUNTY, dtype: object


fixing spelling issue so airport data is ready for future steps:
length of cleaned airport data: 242


In [36]:
# Convert all string columns to lowercase
str_columns = cleaned_airport_data.select_dtypes(include=['object']).columns
for col in str_columns:
    cleaned_airport_data[col] = cleaned_airport_data[col].str.lower()

# Isolate to relevant columns and drop duplicate rows with same airport and county
columns_to_keep = ['AIRPORTID','COUNTY']
cleaned_airport_data = cleaned_airport_data[columns_to_keep]
unique_airports = cleaned_airport_data.drop_duplicates(subset=['COUNTY', 'AIRPORTID'])

cleaned_airport_data

Unnamed: 0,AIRPORTID,COUNTY
0,a26,modoc
1,o81,modoc
2,l54,san diego
3,l70,los angeles
4,m45,alpine
...,...,...
237,o42,tulare
238,dwa,yolo
239,myv,yuba
240,l22,san bernardino


In [46]:
county_count_airports = cleaned_airport_data.groupby('COUNTY')['AIRPORTID'].apply(lambda x: x.notnull().sum()).reset_index()
county_count_airports = county_count_airports.rename(columns={'AIRPORTID':'number_of_airports', 'COUNTY':'county'})
county_count_airports.head()

Unnamed: 0,county,number_of_airports
0,alameda,3
1,alpine,1
2,amador,1
3,butte,2
4,calaveras,1


Merge California tract/county data with airport county counts

In [47]:
airport_count_tract = pd.merge(ca_tract_county, county_count_airports, on='county', how='left')
airport_count_tract

Unnamed: 0,tract,countyfp,county,number_of_airports
0,06085504321,085,santa clara,4.0
1,06085504410,085,santa clara,4.0
2,06085507003,085,santa clara,4.0
3,06085507004,085,santa clara,4.0
4,06085502204,085,santa clara,4.0
...,...,...,...,...
9124,06059001303,059,orange,2.0
9125,06059001304,059,orange,2.0
9126,06059001401,059,orange,2.0
9127,06013367200,013,contra costa,2.0


Can fact check the results with our county count function
* use original airport dataframe and airportid column

In [103]:
county_count(airport_data, 'COUNTY', 'los angeles', ['AIRPORTID'])

Length of df for los angeles county without dropping duplicates: 14
Length of df for los angeles county after dropping duplicates: 14


Save as a csv for future upload to S3

In [55]:
airport_count_tract.to_csv('built_transportation_airports_metric.csv', index=False)

## Metric Number 2: Number of bridges per CA county

In [58]:
bridge_data.columns

Index(['OBJECTID', 'DIST', 'CO', 'RTE', 'PM', 'BRIDGE', 'BRIDGE_X', 'BRIDGE_Y',
       'CITY', 'LAT', 'LON', 'NAME', 'LOC', 'YRBLT', 'HST', 'FAC', 'APWID',
       'LENG', 'DK_AREA', 'LSW', 'RSW', 'RDW', 'REFVCU', 'VCU', 'MAINSPANS',
       'DIR', 'PRINC', 'INTERSEC', 'AADT', 'PCTTRK', 'DEF', 'NHS',
       'FUNCTIONAL', 'DATA_EXTRA', 'USCB_STATEFP', 'USCB_COUNTYFP',
       'USCB_TRACTCE', 'USCB_GEOID', 'USCB_NAME', 'USCB_NAMELSAD',
       'USCB_MTFCC', 'USCB_FUNCSTAT', 'USCB_ALAND', 'USCB_AWATER',
       'USCB_INTPTLAT', 'USCB_INTPTLON', 'geometry'],
      dtype='object')

In [60]:
# Convert all string columns to lowercase
str_columns = bridge_data.select_dtypes(include=['object']).columns
for col in str_columns:
    bridge_data[col] = bridge_data[col].str.lower()

columns_to_keep = ['BRIDGE','USCB_COUNTYFP']
cleaned_bridge_data = bridge_data[columns_to_keep]
unique_bridges = cleaned_bridge_data.drop_duplicates(subset=['USCB_COUNTYFP', 'BRIDGE'])

unique_bridges

Unnamed: 0,BRIDGE,USCB_COUNTYFP
0,01c0001,015
1,01c0002,015
2,01c0011,015
3,01c0012,015
4,01c0020,015
...,...,...
13327,58c0150,025
13328,58c0161,025
13329,58c0211,025
13330,58c0215,025


In [64]:
county_count_bridges = unique_bridges.groupby('USCB_COUNTYFP')['BRIDGE'].apply(lambda x: x.notnull().sum()).reset_index()
county_count_bridges = county_count_bridges.rename(columns={'BRIDGE':'numbers_of_bridges', 'USCB_COUNTYFP':'countyfp'})
county_count_bridges.head()

Unnamed: 0,countyfp,numbers_of_bridges
0,1,302
1,3,12
2,5,42
3,7,306
4,9,69


In [65]:
bridge_count_tracts = pd.merge(ca_tract_county, county_count_bridges, on='countyfp', how='left')
bridge_count_tracts

Unnamed: 0,tract,countyfp,county,numbers_of_bridges
0,06085504321,085,santa clara,508
1,06085504410,085,santa clara,508
2,06085507003,085,santa clara,508
3,06085507004,085,santa clara,508
4,06085502204,085,santa clara,508
...,...,...,...,...
9124,06059001303,059,orange,586
9125,06059001304,059,orange,586
9126,06059001401,059,orange,586
9127,06013367200,013,contra costa,362


In [102]:
county_count(bridge_data, 'USCB_COUNTYFP', '037', ['BRIDGE'])

Length of df for 037 county without dropping duplicates: 1764
Length of df for 037 county after dropping duplicates: 1764


In [70]:
bridge_count_tracts.to_csv('built_transportation_bridge_metric.csv', index=False)

## Metric number 3: Number of bottleneck areas per county

In [71]:
bottleneck_data.columns

Index(['OBJECTID', 'District', 'Rank', 'County', 'Name', 'Type', 'Shift',
       'Fwy', 'Abs_PM', 'CA_PM', 'Number_Day', 'Avg_Extent', 'Total_Dela',
       'Direction', 'Abs_PM_ups', 'Shape_Leng', 'USCB_STATEFP',
       'USCB_COUNTYFP', 'USCB_TRACTCE', 'USCB_GEOID', 'USCB_NAME',
       'USCB_NAMELSAD', 'USCB_MTFCC', 'USCB_FUNCSTAT', 'USCB_ALAND',
       'USCB_AWATER', 'USCB_INTPTLAT', 'USCB_INTPTLON', 'geometry'],
      dtype='object')

In [72]:
bottleneck_data

Unnamed: 0,OBJECTID,District,Rank,County,Name,Type,Shift,Fwy,Abs_PM,CA_PM,...,USCB_GEOID,USCB_NAME,USCB_NAMELSAD,USCB_MTFCC,USCB_FUNCSTAT,USCB_ALAND,USCB_AWATER,USCB_INTPTLAT,USCB_INTPTLON,geometry
0,1,3,1,PLA,EB Douglas Blvd,ML,PM,I80-W,103.38,1.876,...,06061021003,210.03,Census Tract 210.03,G5020,S,7618633,0,+38.7645705,-121.2757092,"LINESTRING (-121.25474 38.76627, -121.25586 38..."
1,1,3,1,PLA,EB Douglas Blvd,ML,PM,I80-W,103.38,1.876,...,06061020806,208.06,Census Tract 208.06,G5020,S,2091983,0,+38.7453927,-121.2762314,"LINESTRING (-121.26306 38.75607, -121.26327 38..."
2,2,3,2,SAC,EB Exposition Blvd,ML,PM,SR51-S,3.33,3.326,...,06067005402,54.02,Census Tract 54.02,G5020,S,5481123,238328,+38.5897818,-121.4320670,"MULTILINESTRING ((-121.44381 38.59654, -121.44..."
3,2,3,2,SAC,EB Exposition Blvd,ML,PM,SR51-S,3.33,3.326,...,06067005502,55.02,Census Tract 55.02,G5020,S,2868054,0,+38.6046274,-121.4270026,"MULTILINESTRING ((-121.43873 38.60075, -121.43..."
4,2,3,2,SAC,EB Exposition Blvd,ML,PM,SR51-S,3.33,3.326,...,06067006202,62.02,Census Tract 62.02,G5020,S,1627293,0,+38.6163928,-121.4222802,"LINESTRING (-121.42688 38.61119, -121.42688 38..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
399,88,12,9,ORA,West OF GYPSUM,ML,PM,SR91,34.14,R15.793,...,06059021905,219.05,Census Tract 219.05,G5020,S,3537491,0,+33.8584329,-117.7734031,"LINESTRING (-117.75684 33.86878, -117.75437 33..."
400,88,12,9,ORA,West OF GYPSUM,ML,PM,SR91,34.14,R15.793,...,06059021924,219.24,Census Tract 219.24,G5020,S,77509066,1874512,+33.8110260,-117.6969462,"MULTILINESTRING ((-117.74458 33.87027, -117.74..."
401,89,12,10,ORA,RED ROBIN,ML,PM,I5,91.53,19.33,...,06059062653,626.53,Census Tract 626.53,G5020,S,7046994,0,+33.6364763,-117.7465014,"LINESTRING (-117.72897 33.63658, -117.72877 33..."
402,89,12,10,ORA,RED ROBIN,ML,PM,I5,91.53,19.33,...,06059062622,626.22,Census Tract 626.22,G5020,S,2904077,26675,+33.6155024,-117.7214001,"LINESTRING (-117.72037 33.62781, -117.72036 33..."


As bottlenecks are a bit unique, I selected a few columns when removing duplicates
* If county, direction of traffic, rank of bottleneck, and time of day are all the same between rows,
one will be removed

I also checked number of duplicates for all of the columns to keep and there are no missing data

In [83]:
# Convert all string columns to lowercase
str_columns = bottleneck_data.select_dtypes(include=['object']).columns
for col in str_columns:
    bottleneck_data[col] = bottleneck_data[col].str.lower()

columns_to_keep = ['USCB_COUNTYFP','Rank', 'Direction', 'Shift', 'Name']
cleaned_bottleneck_data = bottleneck_data[columns_to_keep]
unique_bottlenecks = cleaned_bottleneck_data.drop_duplicates(subset=['USCB_COUNTYFP', 'Rank', 'Direction', 'Shift'])


unique_bottlenecks

Unnamed: 0,USCB_COUNTYFP,Rank,Direction,Shift,Name
0,061,1,w,pm,eb douglas blvd
2,067,2,s,pm,eb exposition blvd
5,067,3,s,pm,99sb at cosumnes (calvine rd)
8,067,4,e,pm,16th street
10,115,5,e,pm,70eb yuba river br
...,...,...,...,...,...
384,059,6,s,am,brookhurst2
392,059,7,s,am,main 1
394,059,8,n,pm,taft
399,059,9,e,pm,west of gypsum


In [115]:
county_count_bottlenecks = unique_bottlenecks.groupby('USCB_COUNTYFP')['Name'].apply(lambda x: x.notnull().sum()).reset_index()
county_count_bottlenecks = county_count_bottlenecks.rename(columns={'Name':'number_of_bottlenecks', 'USCB_COUNTYFP':'countyfp'})
county_count_bottlenecks

Unnamed: 0,countyfp,number_of_bottlenecks
0,1,5
1,13,2
2,17,1
3,19,7
4,29,1
5,37,9
6,39,2
7,53,4
8,59,10
9,61,2


In [116]:
bottleneck_count_tracts = pd.merge(ca_tract_county, county_count_bottlenecks, on='countyfp', how='left')
bottleneck_count_tracts

Unnamed: 0,tract,countyfp,county,number_of_bottlenecks
0,06085504321,085,santa clara,3.0
1,06085504410,085,santa clara,3.0
2,06085507003,085,santa clara,3.0
3,06085507004,085,santa clara,3.0
4,06085502204,085,santa clara,3.0
...,...,...,...,...
9124,06059001303,059,orange,10.0
9125,06059001304,059,orange,10.0
9126,06059001401,059,orange,10.0
9127,06013367200,013,contra costa,2.0


In [117]:
counter = ['Rank', 'Direction', 'Shift']
county_count(bottleneck_data, 'USCB_COUNTYFP', '013', counter_list=counter)

Length of df for 013 county without dropping duplicates: 7
Length of df for 013 county after dropping duplicates: 2


In [118]:
bottleneck_count_tracts.to_csv('built_transportation_bottleneck_metric.csv', index=False)

## Metric number 4: number of miles of highway per county
* This one is tricky, the data lacks information on the units for each column
    * I am guessing columns 'FromARMeas' and 'ToARMeasur' are likely start to end distances, I tested a few from
    the source's maps and drawing a similar path in maps to think it is in mile units
    * So the difference between the two should be the estimated mile length of the highway
    * I then sum the estimated length per county
    * BUUUT the resulting values are pretty high, even if assuming theyre in km, the values are still a bit high


In [120]:
highway_data.columns

Index(['OBJECTID', 'RouteID', 'FromARMeas', 'ToARMeasur', 'NHS_TYPE',
       'GlobalID', 'Shape_Leng', 'USCB_STATEFP', 'USCB_COUNTYFP',
       'USCB_TRACTCE', 'USCB_GEOID', 'USCB_NAME', 'USCB_NAMELSAD',
       'USCB_MTFCC', 'USCB_FUNCSTAT', 'USCB_ALAND', 'USCB_AWATER',
       'USCB_INTPTLAT', 'USCB_INTPTLON', 'geometry'],
      dtype='object')

In [257]:
# Convert all string columns to lowercase
str_columns = highway_data.select_dtypes(include=['object']).columns
for col in str_columns:
    highway_data[col] = highway_data[col].str.lower()

columns_to_keep = ['USCB_COUNTYFP','RouteID', 'FromARMeas', 'ToARMeasur']
cleaned_highway_data = highway_data[columns_to_keep]
unique_highway = cleaned_highway_data.drop_duplicates(subset=['USCB_COUNTYFP', 'RouteID', 'FromARMeas', 'ToARMeasur'])

unique_highway

Unnamed: 0,USCB_COUNTYFP,RouteID,FromARMeas,ToARMeasur
0,013,shs_004._p,35.883060,61.945581
1,013,cc_co_byron hwy_p,6.688033,7.617485
2,013,cc_co_brentwood blvd_p,0.000000,6.396113
3,013,cc_co_brentwood blvd_s,0.000000,0.450931
4,013,cc_co_brentwood blvd_s,0.451931,0.641661
...,...,...,...,...
25537,063,shs_036._s,184.851659,247.500977
25538,063,shs_036._p,184.836363,247.477704
25556,017,shs_049._p,170.709764,173.590016
25557,017,shs_049._s,170.717930,173.598039


In [258]:
highway_miles = unique_highway
highway_miles['est_len_miles?'] = (highway_miles['ToARMeasur'] - highway_miles['FromARMeas'])#.abs()

highway_miles.columns = highway_miles.columns.str.lower()

highway_miles=highway_miles.rename(columns={'uscb_countyfp':'countyfp'})

highway_miles

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  highway_miles['est_len_miles?'] = (highway_miles['ToARMeasur'] - highway_miles['FromARMeas'])#.abs()


Unnamed: 0,countyfp,routeid,fromarmeas,toarmeasur,est_len_miles?
0,013,shs_004._p,35.883060,61.945581,26.062520
1,013,cc_co_byron hwy_p,6.688033,7.617485,0.929452
2,013,cc_co_brentwood blvd_p,0.000000,6.396113,6.396113
3,013,cc_co_brentwood blvd_s,0.000000,0.450931,0.450931
4,013,cc_co_brentwood blvd_s,0.451931,0.641661,0.189730
...,...,...,...,...,...
25537,063,shs_036._s,184.851659,247.500977,62.649318
25538,063,shs_036._p,184.836363,247.477704,62.641341
25556,017,shs_049._p,170.709764,173.590016,2.880252
25557,017,shs_049._s,170.717930,173.598039,2.880109


### to check if the math is correct, I use county fip 17 (el dorado) to visualize the estimated length
* the est_len_miles column is then summed by county and displayed below

In [254]:
el_dorado_before_summing = highway_miles[highway_miles['countyfp'] == '017']
el_dorado_before_summing

Unnamed: 0,countyfp,routeid,fromarmeas,toarmeasur,est_len_miles?
8824,17,shs_088._p,39.63695,121.651442,82.014492
8825,17,shs_088._s,39.638456,121.642485,82.004028
8826,17,shs_050._p,5.679311,108.475857,102.796546
8827,17,shs_050._s,5.472215,108.243377,102.771162
8844,17,shs_089._s,28.83048,39.976988,11.146508
8845,17,shs_089._p,28.845839,39.992347,11.146508
14319,17,ed_co_lake tahoe blvd_p,1.586408,3.822258,2.23585
14324,17,ed_co_lake tahoe blvd_s,0.292112,1.239495,0.947383
14327,17,ed_co_silva valley pkwy_p,0.0,0.274521,0.274521
14331,17,sac_rcdv_white rock rd_p,1.093347,13.353136,12.259789


## Sum the estimated length of highways by county

In [259]:
# Sum the mileage by county
sum_county_milage = highway_miles.groupby('countyfp')['est_len_miles?'].sum().reset_index()

# Merge with ca_tract_county DataFrame
highway_milage_tracts = pd.merge(ca_tract_county, sum_county_milage, on='countyfp', how='left')

# Display the result
highway_milage_tracts


Unnamed: 0,tract,countyfp,county,est_len_miles?
0,06085504321,085,santa clara,2173.940189
1,06085504410,085,santa clara,2173.940189
2,06085507003,085,santa clara,2173.940189
3,06085507004,085,santa clara,2173.940189
4,06085502204,085,santa clara,2173.940189
...,...,...,...,...
9124,06059001303,059,orange,2104.263677
9125,06059001304,059,orange,2104.263677
9126,06059001401,059,orange,2104.263677
9127,06013367200,013,contra costa,1046.399880


In [250]:
el_dorado_after_summing = highway_milage_tracts[highway_milage_tracts['countyfp'] == '017']
el_dorado_after_summing.head()

Unnamed: 0,tract,countyfp,county,est_len_miles?
163,6017030810,17,el dorado,421.445757
164,6017030301,17,el dorado,421.445757
165,6017030302,17,el dorado,421.445757
166,6017030808,17,el dorado,421.445757
167,6017030809,17,el dorado,421.445757


In [170]:
highway_milage_tracts.to_csv('built_transportation_highway_metric.csv', index=False)

## Metric number 5: Number of miles of rail tracks per county
* This one is also tricky, the data lacks information on the units for each column
    * I am guessing columns 'BEGIN_MP' and 'END_MP' are similar to highway distances (likely standing for mile posts)
    * So the difference between the two should be the estimated mile length of the rail
    * I then sum the estimated length per county
    * LA has a pretty high number, so this likely needs work

In [171]:
freight_rail_data.columns

Index(['OBJECTID', 'BEGIN_MP', 'END_MP', 'ROW_OWNER', 'FREIGHT_OP',
       'SUBDIVISIO', 'SHRTLN_NAM', 'SHRTLN_COD', 'PASS_OP', 'PASS_NETWO',
       'COMM_OP', 'COMM_NETWO', 'RECR_OP', 'RECR_NETWO', 'STATUS', 'RR_CLSS',
       'TRK_CLSS', 'PASS_SPEED', 'FRT_SPEED', 'FRT_DNS', 'NUM_TRACK',
       'STRACNET', 'CR63', 'SHAPE_LENG', 'Shape_Le_1', 'USCB_STATEFP',
       'USCB_COUNTYFP', 'USCB_TRACTCE', 'USCB_GEOID', 'USCB_NAME',
       'USCB_NAMELSAD', 'USCB_MTFCC', 'USCB_FUNCSTAT', 'USCB_ALAND',
       'USCB_AWATER', 'USCB_INTPTLAT', 'USCB_INTPTLON', 'geometry'],
      dtype='object')

In [172]:
freight_rail_data

Unnamed: 0,OBJECTID,BEGIN_MP,END_MP,ROW_OWNER,FREIGHT_OP,SUBDIVISIO,SHRTLN_NAM,SHRTLN_COD,PASS_OP,PASS_NETWO,...,USCB_GEOID,USCB_NAME,USCB_NAMELSAD,USCB_MTFCC,USCB_FUNCSTAT,USCB_ALAND,USCB_AWATER,USCB_INTPTLAT,USCB_INTPTLON,geometry
0,1,4.1,7.9,ACTA,"UP,BNSF",Alameda Corridor,,,,,...,06037535300,5353,Census Tract 5353,G5020,S,1080457,0,+33.9615844,-118.2361363,"LINESTRING (-118.23308 33.96746, -118.23291 33..."
1,11,0.4,4.1,ACTA,"UP,BNSF",Alameda Corridor,,,,,...,06037535300,5353,Census Tract 5353,G5020,S,1080457,0,+33.9615844,-118.2361363,"LINESTRING (-118.23308 33.96747, -118.23308 33..."
2,1842,15.9,17.1,UP,UP,Patata Industrial Lead,,,,,...,06037535300,5353,Census Tract 5353,G5020,S,1080457,0,+33.9615844,-118.2361363,"LINESTRING (-118.23073 33.95777, -118.23071 33..."
3,1844,15.5,15.9,UP,UP,Patata Industrial Lead,,,,,...,06037535300,5353,Census Tract 5353,G5020,S,1080457,0,+33.9615844,-118.2361363,"LINESTRING (-118.23088 33.95868, -118.23089 33..."
4,1,4.1,7.9,ACTA,"UP,BNSF",Alameda Corridor,,,,,...,06037535400,5354,Census Tract 5354,G5020,S,771064,0,+33.9537540,-118.2348542,"LINESTRING (-118.23091 33.95776, -118.22957 33..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5373,2368,0.0,0.0,BNSF,BNSF,Los Angeles Junction,Los Angeles Junction Railway,LAJ,,,...,06037533703,5337.03,Census Tract 5337.03,G5020,S,563320,0,+33.9837380,-118.1752709,"LINESTRING (-118.17218 33.98028, -118.17211 33..."
5374,2372,0.0,96.2,PRC,PRC,Sacramento Valley Railroad (SAV),Sacramento Valley Railroad (SAV),SAV,,,...,06067006400,64,Census Tract 64,G5020,S,3449978,0,+38.6417287,-121.4202442,"LINESTRING (-121.40933 38.63491, -121.40917 38..."
5375,2379,8.5,10.0,SDMTS,SDIY,Coronado Branch,San Diego & Imperial Valley,SDIY,,,...,06073010103,101.03,Census Tract 101.03,G5020,S,3340456,3447455,+32.5996071,-117.0958443,"LINESTRING (-117.09272 32.60116, -117.09264 32..."
5376,2404,292.5,295.6,NCRA,,Korblex Branch,Northwestern Pacific,NWP,,,...,06023001102,11.02,Census Tract 11.02,G5020,S,7339191,0,+40.8925053,-124.0969908,"MULTILINESTRING ((-124.09100 40.87620, -124.09..."


In [194]:
# Convert all string columns to lowercase
str_columns = freight_rail_data.select_dtypes(include=['object']).columns
for col in str_columns:
    freight_rail_data[col] = freight_rail_data[col].str.lower()

columns_to_keep = ['OBJECTID','USCB_COUNTYFP', 'BEGIN_MP', 'END_MP']
cleaned_rail_data = freight_rail_data[columns_to_keep]
unique_rail = cleaned_rail_data.drop_duplicates(subset=['OBJECTID', 'USCB_COUNTYFP', 'BEGIN_MP', 'END_MP'])

unique_rail

Unnamed: 0,OBJECTID,USCB_COUNTYFP,BEGIN_MP,END_MP
0,1,037,4.1,7.9
1,11,037,0.4,4.1
2,1842,037,15.9,17.1
3,1844,037,15.5,15.9
8,1843,037,17.1,21.7
...,...,...,...,...
5351,2337,025,719.9,721.6
5352,2340,025,725.8,729.1
5353,2335,071,543.1,546.5
5360,2338,025,729.1,732.1


## Some values were negative after subtracting begin milepoint from end milepoint, so I got the absolute value of the difference between the two

In [217]:
rail_miles = unique_rail
rail_miles['est_len_miles?'] = (unique_rail['END_MP'] - unique_rail['BEGIN_MP']).abs()  # Use .abs() for absolute difference
rail_miles=rail_miles.rename(columns={'USCB_COUNTYFP':'countyfp'})
rail_miles.columns = rail_miles.columns.str.lower()

rail_miles.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  rail_miles['est_len_miles?'] = (unique_rail['END_MP'] - unique_rail['BEGIN_MP']).abs()  # Use .abs() for absolute difference


Unnamed: 0,objectid,countyfp,begin_mp,end_mp,est_len_miles?
0,1,37,4.1,7.9,3.8
1,11,37,0.4,4.1,3.7
2,1842,37,15.9,17.1,1.2
3,1844,37,15.5,15.9,0.4
8,1843,37,17.1,21.7,4.6


Look at Los Angeles rows that are to be summed

In [230]:
la_before_summing = rail_miles[rail_miles['countyfp'] == '037']

la_before_summing

Unnamed: 0,objectid,countyfp,begin_mp,end_mp,est_len_miles?
0,1,037,4.1,7.9,3.8
1,11,037,0.4,4.1,3.7
2,1842,037,15.9,17.1,1.2
3,1844,037,15.5,15.9,0.4
8,1843,037,17.1,21.7,4.6
...,...,...,...,...,...
4910,2071,037,496.8,497.3,0.5
4913,2072,037,500.7,501.0,0.3
4918,2070,037,495.6,496.8,1.2
5136,2222,037,500.8,503.7,2.9


In [221]:
# Sum the mileage by county
sum_county_rail_miles = rail_miles.groupby('countyfp')['est_len_miles?'].sum().reset_index()

# Merge with ca_tract_county DataFrame
rail_milage_tracts = pd.merge(ca_tract_county, sum_county_rail_miles, on='countyfp', how='left')

# Display the result
rail_milage_tracts.head()


Unnamed: 0,tract,countyfp,county,est_len_miles?
0,6085504321,85,santa clara,109.5
1,6085504410,85,santa clara,109.5
2,6085507003,85,santa clara,109.5
3,6085507004,85,santa clara,109.5
4,6085502204,85,santa clara,109.5


Looking at Los Angeles sums

In [222]:
la_before_summing = rail_milage_tracts[rail_milage_tracts['countyfp'] == '037']

la_before_summing.head()

Unnamed: 0,tract,countyfp,county,est_len_miles?
223,6037137000,37,los angeles,2889.1
415,6037541605,37,los angeles,2889.1
416,6037541801,37,los angeles,2889.1
427,6037541802,37,los angeles,2889.1
428,6037542000,37,los angeles,2889.1


In [225]:
rail_milage_tracts

Unnamed: 0,tract,countyfp,county,est_len_miles?
0,06085504321,085,santa clara,109.5
1,06085504410,085,santa clara,109.5
2,06085507003,085,santa clara,109.5
3,06085507004,085,santa clara,109.5
4,06085502204,085,santa clara,109.5
...,...,...,...,...
9124,06059001303,059,orange,117.1
9125,06059001304,059,orange,117.1
9126,06059001401,059,orange,117.1
9127,06013367200,013,contra costa,135.9


In [226]:
rail_milage_tracts.to_csv('built_transportation_rail_metric.csv', index=False)

## Function call to upload to AWS

In [228]:
@append_metadata
def transportation_upload(input_csv, export=False, varname=''):
    '''
    Uploads prepared transportation metric csvs to S3 bucket. Uploaded files are from the following metrics:
    * number of airports per county
    * number of bridges per county
    * number of road bottlenecks per county
    * number of miles of highway per county
    * number of miles of freight rails per county

    Data for all transportation metrics was sourced from California Department of Transportation at:
    https://gisdata-caltrans.opendata.arcgis.com/

    Methods
    -------
    Duplicate data were removed based on the metric data's location and metric identifier(s).
    Relevant metric columns were isolated.
    Metrics that calculated milage estimated length of that metric by substracting 'end mile points' from 'start mile points'.
    Metrics that calculated number of metrics per county grouped the data by county columns, and counted occurences.
    Data was then merged to California 2021 census tracts, with counts by county being retained for each tract.
    
    Parameters
    ----------
    input_csv: string
        csv economic data 
    export: True/False boolean
        False = will not upload resulting df containing CAL CRAI transportation metrics to AWS
        True = will upload resulting df containing CAL CRAI transportation metrics to AWS

    Script
    ------
    built_transportation.ipynb

    Note:
    This function assumes users have configured the AWS CLI such that their access key / secret key pair are stored in ~/.aws/credentials.
    See https://docs.aws.amazon.com/cli/latest/userguide/getting-started-install.html for guidance.
    '''
    print('Data transformation: data cleaned by removing duplicate rows and isolating relevant columns.')
    print('Data transformation: entries were summed per county for total number metrics.')
    print('Data transformation: new columns calculated estimated milage for distance per county metrics.')
    print('Data transformation: data was merged to California census tracts.')

    if export == True:
        bucket_name = 'ca-climate-index'
        directory = '3_fair_data/index_data'
        export_filename = [input_csv]
        upload_csv_aws(export_filename, bucket_name, directory)

    if os.path.exists(input_csv):
        os.remove(input_csv)

In [229]:
input_csv = ['built_transportation_airports_metric.csv',
            'built_transportation_bottleneck_metric.csv',
            'built_transportation_bridge_metric',
            'built_transportation_highway_metric',
            'built_transportation_rail_metric'
            ]

varnames = [
    'built_caltrans_airports',
    'built_caltrans_road_bottlenecks',
    'built_caltrans_bridges',
    'built_caltrans_highways',
    'built_caltrans_rails'
    
]

for csv, var in zip(input_csv, varnames):
    transportation_upload(csv, export=False, varname=var)