In [1]:
import geopandas as gpd
import pandas as pd
import numpy as np
import json
import fiona
import glob
from shapely.geometry import Point

from methods import read_shst_extract

In [2]:
data_interim_dir = "../../data/interim/"

In [3]:
link_file = data_interim_dir + "step3_join_shst_extraction_with_osm/" + "link.json"
with open(link_file) as f:
    link_json = json.load(f)
link_df = pd.DataFrame(link_json)

shape_gdf = gpd.read_file(data_interim_dir + "step3_join_shst_extraction_with_osm/" 
                          + "shape.geojson")

link_df = pd.merge(link_df,
                   shape_gdf[["id", "geometry"]],
                   how = "left",
                   left_on = "shstGeometryId",
                   right_on = "id")

link_df = gpd.GeoDataFrame(link_df, geometry = link_df["geometry"],
                            crs={"init" : "epsg:4326"})

In [4]:
conflation_df = pd.read_csv("../../data/interim/conflation_result.csv")

  interactivity=interactivity, compiler=compiler, result=result)


In [5]:
conflation_df.columns

Index(['shstReferenceId', 'roadway', 'lanes_osm', 'drive_access',
       'bike_access', 'walk_access', 'tomtom_FRC', 'tomtom_FRC_def',
       'tomtom_lanes', 'tomtom_unique_id', 'F_JNCTID', 'T_JNCTID',
       'tomtom_name', 'tomtom_shieldnum', 'tomtom_rtedir', 'TM2Marin_A',
       'TM2Marin_B', 'TM2Marin_FT', 'TM2Marin_LANES', 'TM2Marin_ASSIGNABLE',
       'TM2_A', 'TM2_B', 'TM2_FT', 'TM2_FT_def', 'TM2_LANES', 'TM2_ASSIGNABLE',
       'sfcta_A', 'sfcta_B', 'sfcta_STREETNAME', 'sfcta_FT', 'sfcta_LANE_AM',
       'sfcta_LANE_OP', 'sfcta_LANE_PM', 'PEMSID', 'pems_lanes_FF',
       'pems_lanes_FR', 'pems_lanes_HV', 'pems_lanes_ML', 'pems_lanes_OR'],
      dtype='object')

In [6]:
link_df = pd.merge(link_df,
                  conflation_df[["shstReferenceId", 'tomtom_shieldnum', 'tomtom_rtedir']],
                  how = "left",
                  on = 'shstReferenceId')

In [7]:
link_df.info()

<class 'geopandas.geodataframe.GeoDataFrame'>
Int64Index: 1705772 entries, 0 to 1705771
Data columns (total 35 columns):
access                object
area                  object
bike_access           int64
bridge                object
drive_access          int64
est_width             object
fromIntersectionId    object
highway               object
id_x                  object
junction              object
key                   object
landuse               object
lanes                 object
link                  object
maxspeed              object
name                  object
nodeIds               object
oneWay                object
ref                   object
roadway               object
roundabout            object
service               object
shstGeometryId        object
shstReferenceId       object
toIntersectionId      object
tunnel                object
u                     int64
v                     int64
walk_access           int64
wayId                 object
width         

In [8]:
pems_file = "../../data/external/mtc/pems_period.csv"

pems_df = pd.read_csv(pems_file)

In [9]:
pems_df["geometry"] = [Point(xy) for xy in zip(pems_df.longitude, pems_df.latitude)]

pems_gdf = gpd.GeoDataFrame(pems_df, geometry = pems_df["geometry"],
                            crs={"init" : "epsg:4326"})

pems_gdf = pems_gdf[~((pems_gdf.longitude.isnull()) | (pems_gdf.latitude.isnull()))]

# Write out pems for shst conflation

In [None]:
# keep unique

pems_gdf.drop_duplicates(subset = ["station", "longitude", "latitude"])[["station", "longitude", "latitude", "geometry"]].to_file("../../data/external/mtc/pems.in.geojson",
                                                driver = "GeoJSON")

# Prepare for nearest match

In [10]:
# convert crs to meter based

pems_gdf = pems_gdf.to_crs(epsg = 26915)

link_df = link_df.to_crs(epsg = 26915)

In [11]:
pems_gdf.shape

(76057, 23)

In [12]:
pems_gdf.route.value_counts().sum()

76057

In [13]:
link_df.sindex

<geopandas.sindex.SpatialIndex at 0x17b0ca21d30>

In [14]:
pems_gdf.iloc[2]

station                                               400000
district                                                   4
route                                                    101
direction                                                  S
type                                                      ML
time_period                                               EV
lanes                                                      3
median_flow                                             8523
avg_flow                                             8434.26
sd_flow                                              420.136
median_speed                                         67.3375
avg_speed                                            67.2283
sd_speed                                             0.69445
median_occup                                        0.026681
avg_occup                                           0.026709
sd_occupancy                                      0.00136179
days_observed           

In [15]:
link_df[link_df.tomtom_shieldnum == "101"][["tomtom_rtedir", "roadway"]]

Unnamed: 0,tomtom_rtedir,roadway
4970,S,motorway
6024,N,motorway
6125,S,motorway
6956,S,motorway
6968,S,motorway
8399,,primary
9141,N,motorway
9512,N,motorway
9548,,primary
10390,N,motorway


In [16]:
link_df[link_df.tomtom_shieldnum == "101"].roadway.value_counts()

motorway         570
primary          180
motorway_link     33
trunk             16
secondary          4
residential        2
primary_link       1
Name: roadway, dtype: int64

# Write out links that have shieldnum same as pems route

In [17]:
[c for c in pems_gdf.route.unique().astype(str) if c not in link_df.tomtom_shieldnum.unique()]

['948']

In [18]:
link_df.tomtom_shieldnum.unique()
pems_gdf.route.unique().astype(str)

array(['101', '80', '680', '280', '880', '580', '24', '4', '238', '85',
       '87', '17', '237', '242', '92', '1', '980', '25', '37', '948',
       '780', '84', '29', '156', '205', '380', '12', '152', '160'],
      dtype='<U21')

In [19]:
interest_facility_df = link_df[link_df.tomtom_shieldnum.isin(pems_gdf.route.unique().astype(str))]

In [20]:
interest_facility_df = interest_facility_df.to_crs(epsg = 4326)

In [None]:
interest_facility_df[
    ["roadway", "shstReferenceId", "geometry"]].to_file(data_interim_dir + "mtc/link_candidates_for_pems.geojson",
                            driver = "GeoJSON")

In [22]:
interest_facility_df.groupby(["tomtom_shieldnum", "tomtom_rtedir"]).count()

Unnamed: 0_level_0,Unnamed: 1_level_0,access,area,bike_access,bridge,drive_access,est_width,fromIntersectionId,highway,id_x,junction,...,shstReferenceId,toIntersectionId,tunnel,u,v,walk_access,wayId,width,id_y,geometry
tomtom_shieldnum,tomtom_rtedir,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
1,,1205,1205,1205,1205,1205,1205,1205,1205,1205,1205,...,1205,1205,1205,1205,1205,1205,1205,1205,1205,1205
1,N,121,121,121,121,121,121,121,121,121,121,...,121,121,121,121,121,121,121,121,121,121
1,S,140,140,140,140,140,140,140,140,140,140,...,140,140,140,140,140,140,140,140,140,140
101,,205,205,205,205,205,205,205,205,205,205,...,205,205,205,205,205,205,205,205,205,205
101,N,284,284,284,284,284,284,284,284,284,284,...,284,284,284,284,284,284,284,284,284,284
101,S,317,317,317,317,317,317,317,317,317,317,...,317,317,317,317,317,317,317,317,317,317
12,,1007,1007,1007,1007,1007,1007,1007,1007,1007,1007,...,1007,1007,1007,1007,1007,1007,1007,1007,1007,1007
12,E,17,17,17,17,17,17,17,17,17,17,...,17,17,17,17,17,17,17,17,17,17
12,W,18,18,18,18,18,18,18,18,18,18,...,18,18,18,18,18,18,18,18,18,18
152,,388,388,388,388,388,388,388,388,388,388,...,388,388,388,388,388,388,388,388,388,388


In [23]:
pems_gdf.groupby(["route", "direction"]).count()

Unnamed: 0_level_0,Unnamed: 1_level_0,station,district,type,time_period,lanes,median_flow,avg_flow,sd_flow,median_speed,avg_speed,...,median_occup,avg_occup,sd_occupancy,days_observed,state_pm,abs_pm,latitude,longitude,year,geometry
route,direction,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
1,N,35,35,35,35,35,35,35,35,35,35,...,35,35,35,35,35,35,35,35,35,35
1,S,30,30,30,30,30,30,30,30,30,30,...,30,30,30,30,30,30,30,30,30,30
4,E,1756,1756,1756,1756,1756,1756,1756,1756,1442,1442,...,1756,1756,1756,1756,1756,1756,1756,1756,1756,1756
4,W,1800,1800,1800,1800,1800,1800,1800,1800,1477,1477,...,1800,1800,1800,1800,1800,1800,1800,1800,1800,1800
12,E,110,110,110,110,110,110,110,110,110,110,...,110,110,110,110,110,110,110,110,110,110
12,W,100,100,100,100,100,100,100,100,100,100,...,100,100,100,100,100,100,100,100,100,100
17,N,700,700,700,700,700,700,700,700,445,445,...,700,700,700,700,700,700,700,700,700,700
17,S,439,439,439,439,439,439,439,439,284,284,...,439,439,439,439,439,439,439,439,439,439
24,E,1221,1221,1221,1221,1221,1221,1221,1221,1041,1041,...,1221,1221,1221,1221,1221,1221,1221,1221,1221,1221
24,W,1455,1455,1455,1455,1455,1455,1455,1455,1336,1336,...,1455,1455,1455,1455,1455,1455,1455,1455,1455,1455


In [24]:
link_df.tomtom_shieldnum.value_counts()

        862208
82        2703
1         1466
12        1042
84         849
101        806
185        768
123        739
116        716
128        611
238        514
35         513
29         507
152        388
130        387
G4         317
121        294
G2         278
80         272
580        252
J2         251
280        251
61         242
92         235
680        218
113        214
880        211
4          210
13         209
9          189
         ...  
131        125
237        122
G3         106
85         102
112         88
260         77
2063        74
37          73
G8          64
10          62
17          61
24          60
109         59
87          51
780         38
221         29
220         27
G9          27
93          24
25          20
505         19
262         19
242         15
980         14
G7          12
77          12
160          9
380          9
156          4
205          3
Name: tomtom_shieldnum, Length: 61, dtype: int64

In [26]:
pems_gdf[(pems_gdf.route == 101) & (pems_gdf.direction == "S")]["type"].value_counts()

ML    7112
OR    1287
FR     885
FF      60
Name: type, dtype: int64

In [27]:
pems_gdf[(pems_gdf["type"] == "HV")]

Unnamed: 0,station,district,route,direction,type,time_period,lanes,median_flow,avg_flow,sd_flow,...,median_occup,avg_occup,sd_occupancy,days_observed,state_pm,abs_pm,latitude,longitude,year,geometry
59758,407332,4,85,N,HV,AM,1,3927.5,3898.166667,258.362514,...,0.084976,0.086073,0.011432,24,1439.0,23.886,37.407311,-122.069566,2015,POINT (-2098485.810211925 4561187.769523272)
59759,407332,4,85,N,HV,EA,1,474.5,472.041667,23.456675,...,0.014065,0.013993,0.001079,24,1439.0,23.886,37.407311,-122.069566,2015,POINT (-2098485.810211925 4561187.769523272)
59760,407332,4,85,N,HV,EV,1,1872.5,1862.409091,111.75128,...,0.026427,0.026462,0.001526,22,1439.0,23.886,37.407311,-122.069566,2015,POINT (-2098485.810211925 4561187.769523272)
59761,407332,4,85,N,HV,MD,1,4246.0,4265.304348,186.052152,...,0.05981,0.060197,0.003599,23,1439.0,23.886,37.407311,-122.069566,2015,POINT (-2098485.810211925 4561187.769523272)
59762,407332,4,85,N,HV,PM,1,1531.0,1511.695652,111.579745,...,0.026375,0.026147,0.002041,23,1439.0,23.886,37.407311,-122.069566,2015,POINT (-2098485.810211925 4561187.769523272)
59818,407359,4,85,S,HV,AM,1,1006.5,991.458333,58.596616,...,0.023288,0.023117,0.001375,24,1439.0,23.886,37.407147,-122.070055,2015,POINT (-2098536.5811206 4561184.103187212)
59819,407359,4,85,S,HV,EA,1,1.5,1.791667,1.559798,...,0.0001,9.9e-05,8.6e-05,24,1439.0,23.886,37.407147,-122.070055,2015,POINT (-2098536.5811206 4561184.103187212)
59820,407359,4,85,S,HV,EV,1,2763.5,2715.0,262.26831,...,0.054162,0.0538,0.007297,22,1439.0,23.886,37.407147,-122.070055,2015,POINT (-2098536.5811206 4561184.103187212)
59821,407359,4,85,S,HV,MD,1,2970.0,3014.391304,226.936262,...,0.046906,0.047906,0.00418,23,1439.0,23.886,37.407147,-122.070055,2015,POINT (-2098536.5811206 4561184.103187212)
59822,407359,4,85,S,HV,PM,1,3095.0,3111.695652,230.054625,...,0.061583,0.063031,0.006688,23,1439.0,23.886,37.407147,-122.070055,2015,POINT (-2098536.5811206 4561184.103187212)


# Match Pems to the nearest link that has same shieldnum and direction

In [28]:
pems_gdf["type"].value_counts()

ML    60319
OR     9309
FR     5733
FF      666
HV       30
Name: type, dtype: int64

In [29]:
# dictionary for pems type and roadway type
roadway_type_list = link_df.roadway.unique()

pems_type_list = pems_gdf["type"].unique()

pems_roadway_crosswalk = {"ML" : ['tertiary', 'primary', 'secondary', 'motorway', 'trunk'],
                          "HV" : [c for c in roadway_type_list if c.endswith("_link")],
                          "FF" : [c for c in roadway_type_list if c.endswith("_link")],
                          "OR" : [c for c in roadway_type_list if c.endswith("_link")],
                          "FR" : [c for c in roadway_type_list if c.endswith("_link")]
                         }

In [54]:
# match based on pems route, direction, and tomtom shieldnum, rtedir

offset = 100

pems_route_list = pems_gdf.route.unique().tolist()

pems_match_gdf = gpd.GeoDataFrame()

for route in pems_route_list:
#for route in [80]:
    
    print("pems route id {}".format(route))
    
    pems_subset_gdf = pems_gdf[(pems_gdf.route == route)].copy()
    
    dir_list = pems_subset_gdf.direction.unique().tolist()
    
    for direction in dir_list:
    #for direction in ["W"]:
        
        print("\t pems direction {}".format(direction))
        
        pems_subset_gdf = pems_gdf[(pems_gdf.route == route) & 
                                   (pems_gdf.direction == direction)].copy()
        
        type_list = pems_subset_gdf["type"].unique().tolist()
        
        for ptype in type_list:
        #for ptype in ["ML"]:
            
            print("\t\t pems type {}".format(ptype))
            
            pems_subset_gdf = pems_gdf[(pems_gdf.route == route) &
                                       (pems_gdf.direction == direction) &
                                       (pems_gdf["type"] == ptype)].copy()
            
            bbox = pems_subset_gdf.bounds + [-offset, -offset, offset, offset]
    
            line = link_df[(link_df.tomtom_shieldnum == str(route)) & 
                           (link_df.tomtom_rtedir == direction) &
                           (link_df.roadway.isin(pems_roadway_crosswalk[ptype])
                           )].copy()
            
            if len(line) == 0:
                print("\t\t there is no link with tomtom label direction {}, route {}, roadway {}, matching to the closest\
                      {}".format(
                    direction, route, ptype, pems_roadway_crosswalk[ptype]
                ))
                
                line = link_df[(link_df.roadway.isin(pems_roadway_crosswalk[ptype]))
                              ].copy()
        
            hits = bbox.apply(lambda row: list(line.sindex.intersection(row)),
                  axis = 1)
            
            tmp = pd.DataFrame({
                # index of points table
                "pt_idx": np.repeat(hits.index, hits.apply(len)),
                # ordinal position of line - access via iloc later
                "line_i": np.concatenate(hits.values)
            })
        
            # join with pems
        
            tmp.set_index(["pt_idx"], inplace = True)
        
            tmp = tmp.join(pems_subset_gdf[["station", "longitude", "latitude", "route", "direction", "type","geometry"]
                                          ].rename(
                                columns = {"geometry" : "point"}), 
                           how = "left")
            
            # join with links
        
            tmp.set_index(["line_i"], inplace = True)
        
            tmp = tmp.join(line[["shstReferenceId", "roadway","tomtom_shieldnum", "tomtom_rtedir", "geometry"]
                           ].reset_index(drop=True), 
                       how="left")
        
            # find closest line to point
        
            tmp = gpd.GeoDataFrame(tmp, geometry = tmp["geometry"], crs = pems_gdf.crs)
        
            tmp["snap_distance"]  = tmp.geometry.distance(gpd.GeoSeries(tmp.point))
        
            tmp.sort_values(by = ["snap_distance"], inplace = True)
        
            closest = tmp.groupby(["station", "longitude", "latitude"]).first().reset_index()
        
            pems_match_gdf = pd.concat([pems_match_gdf, closest],
                                       sort = False,
                                       ignore_index = True)

pems route id 101
	 pems direction S
		 pems type ML
		 pems type FR
		 pems type OR
		 pems type FF
	 pems direction N
		 pems type ML
		 pems type FR
		 pems type OR
		 pems type FF
pems route id 80
	 pems direction W
		 pems type ML
		 pems type FR
		 pems type FF
		 pems type OR
	 pems direction E
		 pems type ML
		 pems type FF
		 pems type OR
		 pems type FR
pems route id 680
	 pems direction N
		 pems type ML
		 pems type OR
		 there is no link with tomtom label direction N, route 680, roadway OR, matching to the closest                      ['primary_link', 'motorway_link', 'trunk_link', 'secondary_link', 'tertiary_link']
		 pems type FR
		 there is no link with tomtom label direction N, route 680, roadway FR, matching to the closest                      ['primary_link', 'motorway_link', 'trunk_link', 'secondary_link', 'tertiary_link']
		 pems type FF
		 there is no link with tomtom label direction N, route 680, roadway FF, matching to the closest                      ['primary

pems route id 780
	 pems direction E
		 pems type ML
	 pems direction W
		 pems type ML
pems route id 84
	 pems direction E
		 pems type ML
	 pems direction W
		 pems type ML
pems route id 29
	 pems direction N
		 pems type ML
		 pems type FR
		 pems type OR
	 pems direction S
		 pems type ML
		 pems type FR
		 pems type OR
pems route id 156
	 pems direction W
		 pems type ML
		 there is no link with tomtom label direction W, route 156, roadway ML, matching to the closest                      ['tertiary', 'primary', 'secondary', 'motorway', 'trunk']
	 pems direction E
		 pems type ML
		 there is no link with tomtom label direction E, route 156, roadway ML, matching to the closest                      ['tertiary', 'primary', 'secondary', 'motorway', 'trunk']
pems route id 205
	 pems direction E
		 pems type ML
	 pems direction W
		 pems type ML
pems route id 380
	 pems direction E
		 pems type ML
	 pems direction W
		 pems type ML
pems route id 12
	 pems direction E
		 pems type ML
	 pe

In [55]:
pems_match_gdf

Unnamed: 0,station,longitude,latitude,route,direction,type,point,shstReferenceId,roadway,tomtom_shieldnum,tomtom_rtedir,geometry,snap_distance
0,400000.0,-122.547963,38.081498,101,S,ML,POINT (-2115334.47138468 4652792.447341145),df209cac09d05a2f7e0a22dc8e931df8,motorway,101,S,LINESTRING (-2115401.040652251 4652959.3807638...,2.867350
1,400000.0,-122.547606,38.081167,101,S,ML,POINT (-2115315.56035313 4652743.598747026),df209cac09d05a2f7e0a22dc8e931df8,motorway,101,S,LINESTRING (-2115401.040652251 4652959.3807638...,3.174880
2,400002.0,-122.328465,37.584097,101,S,ML,POINT (-2115131.410841958 4589330.976028608),dfd26b2508d008ad867b43ce2eae005f,motorway,101,S,LINESTRING (-2117606.566968283 4590672.7751965...,3.106433
3,400043.0,-122.403092,37.753591,101,S,ML,POINT (-2115244.798598309 4610955.231681113),6d8b2509ddc5a1a71b55ff5b569e136a,motorway,101,S,LINESTRING (-2114949.61504092 4612440.25763794...,2.475063
4,400069.0,-121.977493,37.385758,101,S,ML,POINT (-2090942.69189868 4555908.530250972),7dc48aa303c48fb6eea805245d5af370,motorway,101,S,LINESTRING (-2091431.901169152 4556300.2670397...,4.480192
5,400098.0,-122.359083,37.588702,101,S,ML,POINT (-2117726.824923707 4590807.944753751),78430402f0e1d13185c6afffd6adae54,motorway,101,S,LINESTRING (-2118058.793265421 4591315.5275253...,4.836421
6,400106.0,-122.332169,37.586325,101,S,ML,POINT (-2115380.128588265 4589699.913306671),dfd26b2508d008ad867b43ce2eae005f,motorway,101,S,LINESTRING (-2117606.566968283 4590672.7751965...,2.404988
7,400106.0,-122.330411,37.585602,101,S,ML,POINT (-2115249.019242223 4589562.902332258),dfd26b2508d008ad867b43ce2eae005f,motorway,101,S,LINESTRING (-2117606.566968283 4590672.7751965...,3.498507
8,400109.0,-121.889426,37.362706,101,S,ML,POINT (-2083815.400358296 4550591.954518915),b638c32df10b558df568f0ec0b5c5aa9,motorway,101,S,LINESTRING (-2084331.15084435 4550835.52599223...,2.448930
9,400116.0,-122.403213,37.634035,101,S,ML,POINT (-2119949.813978697 4597346.423843719),de5839cde1e16bbac638045817fb0fe3,motorway,101,S,LINESTRING (-2119948.934662843 4597881.8194357...,3.546147


In [56]:
pems_nearest_gdf = pd.merge(pems_gdf,
                            pems_match_gdf.drop(["point", "geometry"], axis = 1),
                            how = "left",
                            on = ["station", "longitude", "latitude", "route", "direction", "type"])

In [57]:
pems_gdf.shape

(76057, 23)

In [58]:
pems_nearest_gdf.shape

(76057, 28)

In [59]:
pems_match_gdf[pems_match_gdf.station == 400260]

Unnamed: 0,station,longitude,latitude,route,direction,type,point,shstReferenceId,roadway,tomtom_shieldnum,tomtom_rtedir,geometry,snap_distance
22,400260.0,-122.393983,37.703072,101,S,ML,POINT (-2116405.519119958 4604919.938460853),e0baa6ac9c09cf1bf3a86f61942ae2a0,motorway,101,S,LINESTRING (-2116265.063105721 4605784.1925672...,1.093781
384,400260.0,-122.394455,37.706585,101,N,ML,POINT (-2116310.310158523 4605334.623277964),b958a1373a6c1ce3a73a6452ad1ddb96,motorway,101,N,LINESTRING (-2116435.958658942 4604533.4837011...,1.892163
385,400260.0,-122.393983,37.703072,101,N,ML,POINT (-2116405.519119958 4604919.938460853),b958a1373a6c1ce3a73a6452ad1ddb96,motorway,101,N,LINESTRING (-2116435.958658942 4604533.4837011...,28.904669


In [60]:
pems_match_gdf[pems_match_gdf.station.isin([407358, 407359])]

Unnamed: 0,station,longitude,latitude,route,direction,type,point,shstReferenceId,roadway,tomtom_shieldnum,tomtom_rtedir,geometry,snap_distance


In [61]:
pems_gdf[pems_gdf.station == 407359]

Unnamed: 0,station,district,route,direction,type,time_period,lanes,median_flow,avg_flow,sd_flow,...,median_occup,avg_occup,sd_occupancy,days_observed,state_pm,abs_pm,latitude,longitude,year,geometry
59818,407359,4,85,S,HV,AM,1,1006.5,991.458333,58.596616,...,0.023288,0.023117,0.001375,24,1439.0,23.886,37.407147,-122.070055,2015,POINT (-2098536.5811206 4561184.103187212)
59819,407359,4,85,S,HV,EA,1,1.5,1.791667,1.559798,...,0.0001,9.9e-05,8.6e-05,24,1439.0,23.886,37.407147,-122.070055,2015,POINT (-2098536.5811206 4561184.103187212)
59820,407359,4,85,S,HV,EV,1,2763.5,2715.0,262.26831,...,0.054162,0.0538,0.007297,22,1439.0,23.886,37.407147,-122.070055,2015,POINT (-2098536.5811206 4561184.103187212)
59821,407359,4,85,S,HV,MD,1,2970.0,3014.391304,226.936262,...,0.046906,0.047906,0.00418,23,1439.0,23.886,37.407147,-122.070055,2015,POINT (-2098536.5811206 4561184.103187212)
59822,407359,4,85,S,HV,PM,1,3095.0,3111.695652,230.054625,...,0.061583,0.063031,0.006688,23,1439.0,23.886,37.407147,-122.070055,2015,POINT (-2098536.5811206 4561184.103187212)
69413,407359,4,85,S,HV,AM,1,1064.0,1066.929577,78.441484,...,0.024396,0.024497,0.002095,71,1665.0,23.886,37.407147,-122.070055,2016,POINT (-2098536.5811206 4561184.103187212)
69414,407359,4,85,S,HV,EA,1,1.0,2.27027,2.400327,...,0.0001,0.000128,0.00015,74,1665.0,23.886,37.407147,-122.070055,2016,POINT (-2098536.5811206 4561184.103187212)
69415,407359,4,85,S,HV,EV,1,2906.0,2867.746479,275.940714,...,0.054315,0.054549,0.006617,71,1665.0,23.886,37.407147,-122.070055,2016,POINT (-2098536.5811206 4561184.103187212)
69416,407359,4,85,S,HV,MD,1,3110.0,3126.53125,266.418454,...,0.049394,0.049818,0.004694,64,1665.0,23.886,37.407147,-122.070055,2016,POINT (-2098536.5811206 4561184.103187212)
69417,407359,4,85,S,HV,PM,1,3218.5,3197.764706,206.22384,...,0.064881,0.067903,0.011518,68,1665.0,23.886,37.407147,-122.070055,2016,POINT (-2098536.5811206 4561184.103187212)


In [62]:
pems_nearest_gdf.info()

<class 'geopandas.geodataframe.GeoDataFrame'>
Int64Index: 76057 entries, 0 to 76056
Data columns (total 28 columns):
station             76057 non-null int64
district            76057 non-null int64
route               76057 non-null int64
direction           76057 non-null object
type                76057 non-null object
time_period         76057 non-null object
lanes               76057 non-null int64
median_flow         76057 non-null float64
avg_flow            76057 non-null float64
sd_flow             76057 non-null float64
median_speed        60350 non-null float64
avg_speed           60350 non-null float64
sd_speed            60350 non-null float64
median_occup        76057 non-null float64
avg_occup           76057 non-null float64
sd_occupancy        76057 non-null float64
days_observed       76057 non-null int64
state_pm            76057 non-null float64
abs_pm              76057 non-null float64
latitude            76057 non-null float64
longitude           76057 non-null f

In [63]:
pems_nearest_gdf[pems_nearest_gdf.shstReferenceId.notnull()].station.nunique()

2440

In [64]:
pems_nearest_gdf[pems_nearest_gdf.shstReferenceId.isnull()].station.nunique()

1286

In [65]:
pems_nearest_gdf[pems_nearest_gdf.shstReferenceId.notnull()]["type"].value_counts()

ML    54576
OR     1476
FR      658
FF      201
HV       15
Name: type, dtype: int64

In [69]:
pems_nearest_not_matched_df = pems_nearest_gdf[pems_nearest_gdf.shstReferenceId.isnull()].copy()

In [70]:
pems_nearest_not_matched_df

Unnamed: 0,station,district,route,direction,type,time_period,lanes,median_flow,avg_flow,sd_flow,...,abs_pm,latitude,longitude,year,geometry,shstReferenceId,roadway,tomtom_shieldnum,tomtom_rtedir,snap_distance
15,400009,4,80,W,ML,AM,5,16085.0,15930.111111,577.366524,...,10.940,37.863905,-122.303083,2005,POINT (-2101884.901027649 4620392.963378702),,,,,
16,400009,4,80,W,ML,EA,5,4374.0,4376.000000,95.348801,...,10.940,37.863905,-122.303083,2005,POINT (-2101884.901027649 4620392.963378702),,,,,
17,400009,4,80,W,ML,EV,5,10881.0,11121.137931,2305.214349,...,10.940,37.863905,-122.303083,2005,POINT (-2101884.901027649 4620392.963378702),,,,,
18,400009,4,80,W,ML,MD,5,17734.0,17643.592593,953.089319,...,10.940,37.863905,-122.303083,2005,POINT (-2101884.901027649 4620392.963378702),,,,,
19,400009,4,80,W,ML,PM,5,12425.0,12358.562500,972.879893,...,10.940,37.863905,-122.303083,2005,POINT (-2101884.901027649 4620392.963378702),,,,,
106,400060,4,80,W,ML,AM,5,11553.0,11468.783784,404.328946,...,11.450,37.870977,-122.305289,2005,POINT (-2101806.466966349 4621266.028712609),,,,,
107,400060,4,80,W,ML,EA,5,1876.0,1874.604651,91.501012,...,11.450,37.870977,-122.305289,2005,POINT (-2101806.466966349 4621266.028712609),,,,,
108,400060,4,80,W,ML,EV,5,15009.0,15072.648649,1057.086831,...,11.450,37.870977,-122.305289,2005,POINT (-2101806.466966349 4621266.028712609),,,,,
109,400060,4,80,W,ML,MD,5,17398.0,17044.432432,1494.305218,...,11.450,37.870977,-122.305289,2005,POINT (-2101806.466966349 4621266.028712609),,,,,
110,400060,4,80,W,ML,PM,5,15896.0,15203.186047,1968.801185,...,11.450,37.870977,-122.305289,2005,POINT (-2101806.466966349 4621266.028712609),,,,,


In [72]:
pems_nearest_not_matched_df.head(3)

Unnamed: 0,station,district,route,direction,type,time_period,lanes,median_flow,avg_flow,sd_flow,...,abs_pm,latitude,longitude,year,geometry,shstReferenceId,roadway,tomtom_shieldnum,tomtom_rtedir,snap_distance
15,400009,4,80,W,ML,AM,5,16085.0,15930.111111,577.366524,...,10.94,37.863905,-122.303083,2005,POINT (-2101884.901027649 4620392.963378702),,,,,
16,400009,4,80,W,ML,EA,5,4374.0,4376.0,95.348801,...,10.94,37.863905,-122.303083,2005,POINT (-2101884.901027649 4620392.963378702),,,,,
17,400009,4,80,W,ML,EV,5,10881.0,11121.137931,2305.214349,...,10.94,37.863905,-122.303083,2005,POINT (-2101884.901027649 4620392.963378702),,,,,


## For Pems not matched using the nearest match method, match them to the closest link with the same facility type, within x meter buffer

In [151]:
from functools import partial
import pyproj
from shapely.ops import transform
from shapely.geometry import Point, Polygon

proj_wgs84 = pyproj.Proj('+proj=longlat +datum=WGS84')

offset = 100

unique_pems_nearest_not_matched_df = pems_nearest_not_matched_df.drop_duplicates(
    subset = ["station", "type", "latitude", "longitude"]
)

link_df = link_df.to_crs({'init': 'epsg:4326'})

In [126]:
def geodesic_point_buffer(lat, lon, meters):
    # Azimuthal equidistant projection
    aeqd_proj = '+proj=aeqd +lat_0={lat} +lon_0={lon} +x_0=0 +y_0=0'
    project = partial(
        pyproj.transform,
        pyproj.Proj(aeqd_proj.format(lat=lat, lon=lon)),
        proj_wgs84)
    buf = Point(0, 0).buffer(meters)  # distance in metres
    return Polygon(transform(project, buf).exterior.coords[:])

def links_within_stop_buffer(drive_link_df, stop, buffer_radius = 25):
    """
    find the links that are within buffer of nodes
    """
    
    stop_buffer_df = stop.copy()
    stop_buffer_df["geometry"] = stop_buffer_df.apply(lambda x: geodesic_point_buffer(x.latitude, x.longitude, buffer_radius), 
                                                      axis = 1)
    
    stop_buffer_df = gpd.GeoDataFrame(stop_buffer_df, geometry = stop_buffer_df["geometry"], crs = {'init' : 'epsg:4326'})
    
    stop_buffer_link_df = gpd.sjoin(drive_link_df, stop_buffer_df[["geometry", "type"]], 
                                             how = "left", op = "intersects")
    
    stop_buffer_link_df = stop_buffer_link_df[stop_buffer_link_df["type"].notnull()]
    
    return stop_buffer_link_df["shstReferenceId"].tolist()

In [161]:
import warnings
warnings.filterwarnings('ignore')

In [167]:
%%time

# match based on facility type

pems_matched_ft_gdf = gpd.GeoDataFrame()

for i in range(len(unique_pems_nearest_not_matched_df)):
    
    row_df = unique_pems_nearest_not_matched_df.iloc[[i]][["station", "type", "latitude", "longitude"]].copy()
    
    print("process station {}, type {}".format(row_df["station"].iloc[0], row_df["type"].iloc[0]))
    
    link = link_df[(link_df.roadway.isin(pems_roadway_crosswalk[row_df["type"].iloc[0]]))].copy()
    
    links_within_buffer = links_within_stop_buffer(link, row_df, offset)
    
    row_df["geometry"] = row_df.apply(lambda x: Point(x.longitude, x.latitude), axis = 1)
    
    # find closest line to point
    
    links_within_buffer_gdf = link_df[link_df.shstReferenceId.isin(links_within_buffer)].copy()
    
    links_within_buffer_gdf["station"] = row_df["station"].iloc[0]
    links_within_buffer_gdf["type"] = row_df["type"].iloc[0]
    links_within_buffer_gdf["latitude"] = row_df["latitude"].iloc[0]
    links_within_buffer_gdf["longitude"] = row_df["longitude"].iloc[0]
    links_within_buffer_gdf["point"] = row_df["geometry"].iloc[0]
        
    links_within_buffer_gdf["snap_distance"] = links_within_buffer_gdf.geometry.distance(
        gpd.GeoSeries(links_within_buffer_gdf.point)
    )
        
    links_within_buffer_gdf.sort_values(by = ["snap_distance"], inplace = True)
        
    closest_link_gdf = links_within_buffer_gdf.groupby(["station", "longitude", "latitude"]).first().reset_index()
    
    pems_matched_ft_gdf = pd.concat([pems_matched_ft_gdf, closest_link_gdf], sort = False, ignore_index = True)

process station 400009, type ML
process station 400060, type ML
process station 400108, type ML
process station 400126, type ML
process station 400170, type ML
process station 400176, type ML
process station 400214, type ML
process station 400242, type ML
process station 400280, type ML
process station 400283, type ML
process station 400340, type ML
process station 400349, type ML
process station 400367, type ML
process station 400403, type ML
process station 400432, type ML
process station 400433, type FF
process station 400467, type FF
process station 400553, type ML
process station 400612, type ML
process station 400679, type ML
process station 400691, type ML
process station 400696, type ML
process station 400704, type ML
process station 400712, type ML
process station 400728, type ML
process station 400749, type ML
process station 400768, type ML
process station 400803, type ML
process station 400808, type ML
process station 400923, type ML
process station 400962, type ML
process 

process station 402954, type OR
process station 402955, type OR
process station 402956, type FR
process station 402957, type OR
process station 402959, type OR
process station 402960, type OR
process station 402961, type OR
process station 402962, type FR
process station 402963, type OR
process station 402964, type OR
process station 402965, type OR
process station 402966, type OR
process station 402967, type FR
process station 402970, type OR
process station 402971, type OR
process station 402974, type FR
process station 402975, type OR
process station 402976, type OR
process station 402977, type OR
process station 402978, type FR
process station 402979, type OR
process station 402980, type FR
process station 402982, type OR
process station 402984, type OR
process station 402989, type OR
process station 402990, type OR
process station 402991, type FR
process station 402993, type OR
process station 402994, type OR
process station 402995, type OR
process station 402996, type OR
process 

process station 403182, type FR
process station 403184, type OR
process station 403185, type FR
process station 403187, type OR
process station 403189, type FR
process station 403214, type FF
process station 403239, type OR
process station 403300, type OR
process station 403322, type OR
process station 403330, type FR
process station 403331, type OR
process station 403339, type OR
process station 403341, type FR
process station 403342, type OR
process station 403344, type FR
process station 403345, type OR
process station 403347, type FR
process station 403348, type OR
process station 403350, type OR
process station 403358, type ML
process station 403359, type ML
process station 403360, type ML
process station 403394, type FR
process station 403410, type OR
process station 403411, type FR
process station 403413, type OR
process station 403415, type FR
process station 403416, type FR
process station 403454, type OR
process station 403998, type ML
process station 404291, type ML
process 

process station 407340, type FR
process station 407349, type OR
process station 407351, type FR
process station 407353, type OR
process station 407358, type FF
process station 407359, type HV
process station 407363, type FR
process station 407365, type OR
process station 407368, type OR
process station 407745, type FF
process station 407746, type FR
process station 407749, type OR
process station 407863, type ML
process station 407864, type FR
process station 407867, type OR
process station 407868, type FR
process station 407870, type OR
process station 407882, type ML
process station 407948, type FR
process station 407949, type OR
process station 407953, type OR
process station 407954, type OR
process station 407955, type FR
process station 407980, type FR
process station 407981, type OR
process station 407991, type OR
process station 407993, type FR
process station 407994, type OR
process station 408004, type FR
process station 408005, type OR
process station 408006, type FR
process 

process station 409937, type OR
process station 409939, type OR
process station 409944, type OR
process station 409945, type FR
process station 409947, type FR
process station 409957, type FR
process station 409967, type OR
process station 409999, type OR
process station 410000, type FR
process station 410002, type FR
process station 410085, type OR
process station 410086, type FR
process station 410087, type OR
process station 410094, type FR
process station 410095, type OR
process station 410101, type OR
process station 410103, type FR
process station 410107, type OR
process station 410109, type FR
process station 410110, type OR
process station 410111, type FR
process station 410113, type OR
process station 410115, type OR
process station 410116, type FR
process station 410141, type FR
process station 410144, type FR
process station 410145, type OR
process station 410160, type OR
process station 410164, type FR
process station 410166, type OR
process station 410211, type OR
process 

process station 412440, type OR
process station 412441, type FR
process station 412442, type OR
process station 412445, type FR
process station 413037, type OR
process station 413038, type FR
process station 413206, type OR
process station 413819, type OR
process station 413845, type ML
process station 413869, type FR
process station 413879, type OR
process station 413880, type FR
process station 414029, type OR
process station 414030, type FR
process station 414045, type OR
process station 414304, type OR
process station 414310, type OR
process station 414478, type OR
process station 414479, type FR
process station 414488, type OR
process station 414695, type FR
process station 414696, type OR
process station 414706, type OR
process station 414796, type FR
process station 414797, type FR
process station 414798, type OR
process station 414799, type OR
process station 414820, type FR
process station 414821, type OR
process station 414822, type FR
process station 414918, type FR
process 

In [169]:
pems_use_ft_result_df = pd.merge(pems_nearest_not_matched_df.drop("shstReferenceId", axis = 1),
                                   pems_matched_ft_gdf[["shstReferenceId", 'station', 'longitude', 'latitude', 'type']],
                                   how = "left",
                                   on = ['station', 'type','longitude', 'latitude'])

In [181]:
pems_use_ft_result_df[(pems_use_ft_result_df.shstReferenceId.isnull()) & (pems_use_ft_result_df["type"] == "ML")]

Unnamed: 0,station,district,route,direction,type,time_period,lanes,median_flow,avg_flow,sd_flow,...,abs_pm,latitude,longitude,year,geometry,roadway,tomtom_shieldnum,tomtom_rtedir,snap_distance,shstReferenceId
4429,403471,4,17,N,ML,AM,2,8293.0,7839.237288,1056.806728,...,5.45,37.06113,-122.004466,2013,POINT (-2105821.363014829 4519711.59987066),,,,,
4430,403471,4,17,N,ML,EA,2,1618.0,1604.821429,63.59656,...,5.45,37.06113,-122.004466,2013,POINT (-2105821.363014829 4519711.59987066),,,,,
4431,403471,4,17,N,ML,EV,2,2773.5,2720.727273,460.612344,...,5.45,37.06113,-122.004466,2013,POINT (-2105821.363014829 4519711.59987066),,,,,
4432,403471,4,17,N,ML,MD,2,6175.0,6005.883333,883.55906,...,5.45,37.06113,-122.004466,2013,POINT (-2105821.363014829 4519711.59987066),,,,,
4433,403471,4,17,N,ML,PM,2,5133.0,5090.069767,662.887385,...,5.45,37.06113,-122.004466,2013,POINT (-2105821.363014829 4519711.59987066),,,,,
4434,403475,4,17,S,ML,AM,2,3395.0,3221.186441,501.359066,...,5.45,37.061194,-122.004672,2013,POINT (-2105837.76253553 4519725.204552194),,,,,
4435,403475,4,17,S,ML,EA,2,213.0,198.051724,93.424784,...,5.45,37.061194,-122.004672,2013,POINT (-2105837.76253553 4519725.204552194),,,,,
4436,403475,4,17,S,ML,EV,2,5174.5,5208.636364,373.960852,...,5.45,37.061194,-122.004672,2013,POINT (-2105837.76253553 4519725.204552194),,,,,
4437,403475,4,17,S,ML,MD,2,6120.0,6097.9,1141.555016,...,5.45,37.061194,-122.004672,2013,POINT (-2105837.76253553 4519725.204552194),,,,,
4438,403475,4,17,S,ML,PM,2,7447.0,7321.744186,566.643421,...,5.45,37.061194,-122.004672,2013,POINT (-2105837.76253553 4519725.204552194),,,,,


In [170]:
pems_use_ft_result_df.info()

<class 'geopandas.geodataframe.GeoDataFrame'>
Int64Index: 19131 entries, 0 to 19130
Data columns (total 28 columns):
station             19131 non-null int64
district            19131 non-null int64
route               19131 non-null int64
direction           19131 non-null object
type                19131 non-null object
time_period         19131 non-null object
lanes               19131 non-null int64
median_flow         19131 non-null float64
avg_flow            19131 non-null float64
sd_flow             19131 non-null float64
median_speed        5759 non-null float64
avg_speed           5759 non-null float64
sd_speed            5759 non-null float64
median_occup        19131 non-null float64
avg_occup           19131 non-null float64
sd_occupancy        19131 non-null float64
days_observed       19131 non-null int64
state_pm            19131 non-null float64
abs_pm              19131 non-null float64
latitude            19131 non-null float64
longitude           19131 non-null floa

In [54]:
pems_nearest_not_matched_df = pems_nearest_not_matched_df.round({"longitude" : 6, "latitude" : 6})

# Concat match result from two methods

In [171]:
pems_nearest_matched_df = pems_nearest_gdf[pems_nearest_gdf.shstReferenceId.notnull()].copy()

In [172]:
pems_conflation_result_df = pd.concat(
        [pems_nearest_matched_df, pems_use_ft_result_df],
        sort = False,
        ignore_index = True
)

In [173]:
pems_conflation_result_df.info()

<class 'geopandas.geodataframe.GeoDataFrame'>
RangeIndex: 76057 entries, 0 to 76056
Data columns (total 28 columns):
station             76057 non-null int64
district            76057 non-null int64
route               76057 non-null int64
direction           76057 non-null object
type                76057 non-null object
time_period         76057 non-null object
lanes               76057 non-null int64
median_flow         76057 non-null float64
avg_flow            76057 non-null float64
sd_flow             76057 non-null float64
median_speed        60350 non-null float64
avg_speed           60350 non-null float64
sd_speed            60350 non-null float64
median_occup        76057 non-null float64
avg_occup           76057 non-null float64
sd_occupancy        76057 non-null float64
days_observed       76057 non-null int64
state_pm            76057 non-null float64
abs_pm              76057 non-null float64
latitude            76057 non-null float64
longitude           76057 non-null f

In [176]:
pems_conflation_result_df= pems_conflation_result_df.to_crs(epsg = 4326)

In [None]:
pems_conflation_result_df.to_file(data_interim_dir + "mtc/pems_conflation_result.geojson",
                                  driver = "GeoJSON")

In [179]:
pems_conflation_result_df.head(3)

Unnamed: 0,station,district,route,direction,type,time_period,lanes,median_flow,avg_flow,sd_flow,...,abs_pm,latitude,longitude,year,geometry,shstReferenceId,roadway,tomtom_shieldnum,tomtom_rtedir,snap_distance
0,400000,4,101,S,ML,AM,3,16383.0,16237.843137,691.559263,...,459.52,38.081167,-122.547606,2005,POINT (-122.547606 38.081167),df209cac09d05a2f7e0a22dc8e931df8,motorway,101,S,3.17488
1,400000,4,101,S,ML,EA,3,4366.5,4376.357143,102.251993,...,459.52,38.081167,-122.547606,2005,POINT (-122.547606 38.081167),df209cac09d05a2f7e0a22dc8e931df8,motorway,101,S,3.17488
2,400000,4,101,S,ML,EV,3,8523.0,8434.255814,420.135754,...,459.52,38.081167,-122.547606,2005,POINT (-122.547606 38.081167),df209cac09d05a2f7e0a22dc8e931df8,motorway,101,S,3.17488


In [64]:
pems_gdf.station.nunique()

3700

In [65]:
pems_conflation_result_df.station.nunique()

3700

In [69]:
pems_conflation_result_df.groupby(["shstReferenceId", "station", "year"]).count()

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,district,route,direction,type,time_period,lanes,median_flow,avg_flow,sd_flow,median_speed,...,days_observed,state_pm,abs_pm,latitude,longitude,geometry,tomtom_shieldnum,tomtom_rtedir,snap_distance,source
shstReferenceId,station,year,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1
000619909cfedfdc3ae846759247e09f,404433,2014,5,5,5,5,5,5,5,5,5,5,...,5,5,5,5,5,5,5,5,5,0
000619909cfedfdc3ae846759247e09f,407341,2016,5,5,5,5,5,5,5,5,5,5,...,5,5,5,5,5,5,5,5,5,0
000619909cfedfdc3ae846759247e09f,407341,2017,5,5,5,5,5,5,5,5,5,5,...,5,5,5,5,5,5,5,5,5,0
001faa1f8bf0fafb4298b7438a83b506,400615,2005,5,5,5,5,5,5,5,5,5,5,...,5,5,5,5,5,5,5,5,5,0
001faa1f8bf0fafb4298b7438a83b506,400615,2006,5,5,5,5,5,5,5,5,5,5,...,5,5,5,5,5,5,5,5,5,0
001faa1f8bf0fafb4298b7438a83b506,400615,2007,5,5,5,5,5,5,5,5,5,5,...,5,5,5,5,5,5,5,5,5,0
001faa1f8bf0fafb4298b7438a83b506,400615,2008,5,5,5,5,5,5,5,5,5,5,...,5,5,5,5,5,5,5,5,5,0
001faa1f8bf0fafb4298b7438a83b506,400615,2009,5,5,5,5,5,5,5,5,5,5,...,5,5,5,5,5,5,5,5,5,0
001faa1f8bf0fafb4298b7438a83b506,400615,2010,5,5,5,5,5,5,5,5,5,5,...,5,5,5,5,5,5,5,5,5,0
001faa1f8bf0fafb4298b7438a83b506,400615,2011,5,5,5,5,5,5,5,5,5,5,...,5,5,5,5,5,5,5,5,5,0


# Create Pems station and model link id crosswalk
Added 11/11/2024
This is used for roadway acceptance criteria

In [None]:
# local i/o path
data_interim_dir = r"\\corp.pbwan.net\us\CentralData\DCCLDA00\Standard\sag\projects\MTC\31000152\Network_Rebuild\travel-model-two-networks\data\interim"
links_df = r"\\corp.pbwan.net\us\CentralData\DCCLDA00\Standard\sag\projects\MTC\31000152\Network_Rebuild\LP_Local_folders\data\processed\version_12_with_link21_fixes"

In [4]:
# https://mtcdrive.box.com/s/etw75p1dj0h2f8wpqctwmn9zjykhzr95 (data_interim_dir)
pems_conflation_result_df = gpd.read_file(data_interim_dir + "/mtc/pems_conflation_result.geojson")

# https://mtcdrive.box.com/s/iw85645eet3zvsw59wnvfgkprbmg0uix (links_dir)
links_df = pd.read_json(links_dir + "/v12_link.json")

In [5]:
pems_conflation_result_df.columns

Index(['station', 'district', 'route', 'direction', 'type', 'time_period',
       'lanes', 'median_flow', 'avg_flow', 'sd_flow', 'median_speed',
       'avg_speed', 'sd_speed', 'median_occup', 'avg_occup', 'sd_occupancy',
       'days_observed', 'state_pm', 'abs_pm', 'latitude', 'longitude', 'year',
       'shstReferenceId', 'roadway', 'tomtom_shieldnum', 'tomtom_rtedir',
       'snap_distance', 'geometry'],
      dtype='object')

In [46]:
# there are stations (e.g., 402095) that have records for multiple years, but have different lat & long and are on different freeway segments among those years.
stations_2015 = pems_conflation_result_df[pems_conflation_result_df.year == 2015].station.unique()
stations_not_2015 = pems_conflation_result_df[pems_conflation_result_df.year != 2015].station.unique()

# drop station records if they already have a 2015 record
pems_df = pems_conflation_result_df[
    ~(
        pems_conflation_result_df.station.isin(stations_2015) & 
        pems_conflation_result_df.station.isin(stations_not_2015) &
        (pems_conflation_result_df.year != 2015)
    )
].copy()

pems_df = pems_df.sort_values("year", ascending=False)

In [48]:
pems_df.groupby(["station","district","route","direction","type","shstReferenceId"])["shstReferenceId"].count()

station  district  route  direction  type  shstReferenceId                 
400000   4         101    S          ML    df209cac09d05a2f7e0a22dc8e931df8    15
400001   4         101    N          ML    b884a990917d6d09cb42cf2241b25fea     5
400002   4         101    S          ML    dfd26b2508d008ad867b43ce2eae005f    10
400005   4         80     W          ML    f278b4aca20da2c831169c28f84d7620    10
400006   4         880    S          ML    a76e5d7afe487513999d25dd5f29c474     5
                                                                               ..
419490   4         80     W          OR    a05a0ec01eac17f2145d51dc7df1cc73     5
419660   4         37     E          FR    ed170334b0ed05f7da63824ddbdacaca     1
419661   4         37     E          OR    42ddf62777fd2b0b16bb20056419139a     1
419663   4         37     W          OR    6f95d5934be755865d535cf56e2baf0c     1
419665   4         37     W          OR    a2218a5209d557045ff639979883f8eb     1
Name: shstReferenceId,

In [62]:
# create a pems station - model link id crosswalk, columns include
# station,district,route,direction,type,latitude,longitude,shstReferenceId,A,B,model_link_id,A_B

unique_pems_df = pems_df[
    ["station","district","route","direction","type","year","latitude","longitude","shstReferenceId"]
    ].drop_duplicates(
    subset=["station","district","route","direction","type"],
).copy()

crosswalk_df = pd.merge(
    unique_pems_df[unique_pems_df.shstReferenceId.notnull()],
    links_df[["shstReferenceId","model_link_id","A","B"]],
    how = "inner",
    on = "shstReferenceId"
)
crosswalk_df = crosswalk_df[crosswalk_df.model_link_id.notnull()].sort_values("station")

In [63]:
crosswalk_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 3627 entries, 216 to 1340
Data columns (total 12 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   station          3627 non-null   int64  
 1   district         3627 non-null   int64  
 2   route            3627 non-null   int64  
 3   direction        3627 non-null   object 
 4   type             3627 non-null   object 
 5   year             3627 non-null   int64  
 6   latitude         3627 non-null   float64
 7   longitude        3627 non-null   float64
 8   shstReferenceId  3627 non-null   object 
 9   model_link_id    3627 non-null   int64  
 10  A                3627 non-null   int64  
 11  B                3627 non-null   int64  
dtypes: float64(2), int64(7), object(3)
memory usage: 368.4+ KB


In [64]:
crosswalk_df

Unnamed: 0,station,district,route,direction,type,year,latitude,longitude,shstReferenceId,model_link_id,A,B
216,400000,4,101,S,ML,2017,38.081498,-122.547963,df209cac09d05a2f7e0a22dc8e931df8,8029515,5014656,5009642
2908,400001,4,101,N,ML,2015,37.364085,-121.901149,b884a990917d6d09cb42cf2241b25fea,2196953,2144635,2119598
138,400002,4,101,S,ML,2014,37.584097,-122.328465,dfd26b2508d008ad867b43ce2eae005f,1066084,1545450,1512814
888,400005,4,80,W,ML,2007,37.954132,-122.328900,f278b4aca20da2c831169c28f84d7620,4120358,3021213,3036831
2906,400006,4,880,S,ML,2015,37.605003,-122.065542,a76e5d7afe487513999d25dd5f29c474,3111133,2584626,2592869
...,...,...,...,...,...,...,...,...,...,...,...,...
1117,419490,4,80,W,OR,2017,38.099842,-122.229874,a05a0ec01eac17f2145d51dc7df1cc73,5038509,3530171,3533168
1118,419660,4,37,E,FR,2017,38.136770,-122.257039,ed170334b0ed05f7da63824ddbdacaca,5056971,3517907,3522915
1119,419661,4,37,E,OR,2017,38.137844,-122.255865,42ddf62777fd2b0b16bb20056419139a,5016067,3514841,3522281
0,419663,4,37,W,OR,2017,38.137248,-122.256532,6f95d5934be755865d535cf56e2baf0c,5026774,3522915,3520415


In [65]:
crosswalk_df[crosswalk_df.station == 402840]

Unnamed: 0,station,district,route,direction,type,year,latitude,longitude,shstReferenceId,model_link_id,A,B
1954,402840,4,17,N,FR,2015,37.190197,-121.992771,1390a3fabec4fb1ae0d56e2725dda348,2021039,2020767,2007280


In [66]:
crosswalk_df[crosswalk_df.station == 400078]

Unnamed: 0,station,district,route,direction,type,year,latitude,longitude,shstReferenceId,model_link_id,A,B
3290,400078,4,680,N,ML,2014,37.913307,-122.066752,992bd6db93e406f66531b96063a89987,4076162,3060168,3044735


In [67]:
crosswalk_df.to_csv(data_interim_dir + "/mtc/pems_station_to_TM2_links_crosswalk.csv", index=False)

In [68]:
links_df[links_df.model_link_id.isin([2188319,2217718])]

Unnamed: 0,access,bike_access,drive_access,fromIntersectionId,lanes,maxspeed,name,oneWay,ref,roadway,...,transit_access,managed,ML_lanes,segment_id,ML_tollbooth,ML_useclass,ML_access,ML_egress,ML_tollseg,tollseg
598003,"['nan', 'nan', 'nan']",True,True,c940c03d00ca0c96c95ff26452dabc7a,2,"['nan', 'nan', 'nan']","['nan', 'nan', 'nan']","[True, True, True]","['nan', 'nan', 'nan']",motorway_link,...,,0,,,,,,,,
691876,"['nan', 'nan']",True,True,ed3e55f1a8563a9bf25aba345400938c,2,"['nan', 'nan']","['nan', 'nan']","[True, True]","['nan', 'nan']",motorway_link,...,,0,,,,,,,,
