# Write functions to find the closest point on a route to our actual data point

Started by Nathaniel on Sunday, June 9, 2019

In [1]:
%load_ext autoreload
%autoreload 2

!date
!whoami

import numpy as np
import pandas as pd

Sat Jun 15 18:17:55 PDT 2019
ndbs


## Import my closest point module and read in a GTFS `shapes.txt` file

In [2]:
import find_closest_route_point as f

In [3]:
!ls ../data/source/gtfs_20180815/

[31magency.txt[m[m          [31mcalendar.txt[m[m        [31mfare_rules.txt[m[m      [31mstop_times.txt[m[m
[31mblock.txt[m[m           [31mcalendar_dates.txt[m[m  [31mroutes.txt[m[m          [31mstops.txt[m[m
[31mblock_trip.txt[m[m      [31mfare_attributes.txt[m[m [31mshapes.txt[m[m          [31mtrips.txt[m[m


In [4]:
shapes_df = pd.read_csv('../data/source/gtfs_20180815/shapes.txt')
shapes_df.head()

Unnamed: 0,shape_id,shape_pt_lat,shape_pt_lon,shape_pt_sequence,shape_dist_traveled
0,10002011,47.612137,-122.281769,1,0.0
1,10002011,47.612144,-122.281784,2,5.8
2,10002011,47.612148,-122.28183,3,13.5
3,10002011,47.612141,-122.281853,4,22.0
4,10002011,47.612102,-122.281921,5,45.0


## Explore some Python syntax: Test how list unpacking works with assignment

In [5]:
# Unpacking a 1-D NumPy array binds its elements to the targets in order
a = np.array([4, 8])
x, y = a
x

4

In [6]:
# Test more argument unpacking
# b starts out as an array, presumably so we can see whether the starred target below
# reuses that array or simply rebinds the name.
b = np.array([0,0])
# c takes the first value; *b collects the remaining values into a new list.
c, *b = x, y, 5
b

[8, 5]

In [7]:
*b, = a
b

[4, 8]

In [8]:
b is a

False

In [9]:
# The starred assignment rebinds b to a new list (hence `list` below); it does not copy
# the values of a into whatever object b referred to before.
type(b)

list

In [10]:
type(a)

numpy.ndarray

## Get data for a sample point (at index 2) for testing

In [11]:
# NOTE: unpacking a row Series yields float64 values for every field (shape_id displays as
# 10002011.0 below), because the row mixes int and float columns and is upcast to one dtype.
shape_id, lat, lon, seq, dist = shapes_df.iloc[2]

In [12]:
shape_id

10002011.0

In [13]:
lat

47.612148299999994

## Try finding adjacent points

Still working out the best method for this part. It might be best to have one function that finds the adjacent points and also computes the total shape distance to the projected point. Alternatively, the function could just return the indices of these points, so that we can look up `shape_dist_traveled` for them later.

In [14]:
# Select the row(s) of this shape whose coordinates exactly match the sample point
same_position = (shapes_df.shape_pt_lat == lat) & (shapes_df.shape_pt_lon == lon)
same_shape = shapes_df.shape_id == shape_id
point_data = shapes_df[same_position & same_shape]
point_data

Unnamed: 0,shape_id,shape_pt_lat,shape_pt_lon,shape_pt_sequence,shape_dist_traveled
2,10002011,47.612148,-122.28183,3,13.5


In [15]:
point_data.shape_pt_sequence

2    3
Name: shape_pt_sequence, dtype: int64

In [16]:
mask = (shapes_df.shape_pt_lat==lat) & (shapes_df.shape_pt_lon==lon) & (shapes_df.shape_id==shape_id)
type(mask)

pandas.core.series.Series

In [17]:
mask.head()

0    False
1    False
2     True
3    False
4    False
dtype: bool

In [19]:
# 2x2 float array pre-filled with NaN; row 0 is later filled with the previous point's (lat, lon)
adjacent_points = np.full((2, 2), np.nan)

In [20]:
prev_mask = (shapes_df.shape_id==shape_id) & (shapes_df.shape_pt_sequence == seq-1)
prev_mask.head()

0    False
1     True
2    False
3    False
4    False
dtype: bool

In [21]:
# Use the vectorised Series.any() rather than the builtin any(), which walks the whole
# boolean mask element by element in Python. bool() keeps the displayed result a plain True/False.
bool(prev_mask.any())

True

In [22]:
# Row 0 holds the previous point's (lat, lon); it stays NaN when the shape has no earlier point.
# Select both coordinate columns once and take the first match as a plain array, instead of
# assigning a tuple of one-element Series and relying on implicit coercion.
prev_coords = shapes_df.loc[prev_mask, ['shape_pt_lat', 'shape_pt_lon']].values
adjacent_points[0] = prev_coords[0] if len(prev_coords) else np.nan
adjacent_points

array([[  47.6121445, -122.281784 ],
       [         nan,          nan]])

In [23]:
shapes_df.loc[prev_mask, ['shape_pt_lat', 'shape_pt_lon']]

Unnamed: 0,shape_pt_lat,shape_pt_lon
1,47.612144,-122.281784


In [24]:
shapes_df[prev_mask]

Unnamed: 0,shape_id,shape_pt_lat,shape_pt_lon,shape_pt_sequence,shape_dist_traveled
1,10002011,47.612144,-122.281784,2,5.8


In [25]:
# Flag rows of the same shape whose sequence number is exactly one step away from the sample point
seq_offset = shapes_df.shape_pt_sequence - seq
adjacent_mask = (seq_offset.abs() == 1) & (shapes_df.shape_id == shape_id)
adjacent_mask.head()

0    False
1     True
2    False
3     True
4    False
dtype: bool

In [26]:
shapes_df[adjacent_mask]

Unnamed: 0,shape_id,shape_pt_lat,shape_pt_lon,shape_pt_sequence,shape_dist_traveled
1,10002011,47.612144,-122.281784,2,5.8
3,10002011,47.612141,-122.281853,4,22.0


In [27]:
#Note: The 'find_adjacent_shape_point_data' function is deprecated in favor of the newer pair of functions:
#point_data = f.get_shape_point_data(shapes_df, shape_id, lat, lon) and
#adjacent_point_data = f.get_adjacent_shape_point_data(shapes_df, point_data.index[0])
f.find_adjacent_shape_point_data(lat, lon, shapes_df, shape_id)

Unnamed: 0,shape_id,shape_pt_lat,shape_pt_lon,shape_pt_sequence,shape_dist_traveled
1,10002011,47.612144,-122.281784,2,5.8
3,10002011,47.612141,-122.281853,4,22.0


## Check beginning and end of shape

In [28]:
shape_id, lat, lon, seq, dist = shapes_df.iloc[0]
#Note: The 'find_adjacent_shape_point_data' function is deprecated in favor of the newer pair of functions:
#point_data = f.get_shape_point_data(shapes_df, shape_id, lat, lon) and
#adjacent_point_data = f.get_adjacent_shape_point_data(shapes_df, point_data.index[0])
f.find_adjacent_shape_point_data(lat, lon, shapes_df, shape_id)

Unnamed: 0,shape_id,shape_pt_lat,shape_pt_lon,shape_pt_sequence,shape_dist_traveled
1,10002011,47.612144,-122.281784,2,5.8


In [29]:
shapes_df.loc[shapes_df.shape_id==shape_id,'shape_pt_sequence'].max()

201

In [30]:
shapes_df.iloc[200]

shape_id               1.000201e+07
shape_pt_lat           4.760922e+01
shape_pt_lon          -1.223301e+02
shape_pt_sequence      2.010000e+02
shape_dist_traveled    1.605600e+04
Name: 200, dtype: float64

In [31]:
# Row 200 is the last point of the first shape (shape_pt_sequence 201, the maximum found above),
# so only the previous point is expected back.
# These unpacked values stay bound in the kernel and are reused by the later
# f.get_shape_point_data(shapes_df, shape_id, lat, lon) cells.
shape_id, lat, lon, seq, dist = shapes_df.iloc[200]
#Note: The 'find_adjacent_shape_point_data' function is deprecated in favor of the newer pair of functions:
#point_data = f.get_shape_point_data(shapes_df, shape_id, lat, lon) and
#adjacent_point_data = f.get_adjacent_shape_point_data(shapes_df, point_data.index[0])
f.find_adjacent_shape_point_data(lat, lon, shapes_df, shape_id)

Unnamed: 0,shape_id,shape_pt_lat,shape_pt_lon,shape_pt_sequence,shape_dist_traveled
199,10002011,47.609356,-122.329788,200,15959.7


## Try new index-based method

In [39]:
point_data.index

Int64Index([2], dtype='int64')

In [34]:
point_data.index +1

Int64Index([3], dtype='int64')

In [37]:
# point_data.index is itself an Index, so this builds an object-dtype Index of one-element
# lists (see output) rather than plain row labels; the scalar idx approach below replaces it.
adjacent_indices = pd.Index([point_data.index-1, point_data.index+1])
adjacent_indices

Index([[1], [3]], dtype='object')

In [42]:
idx = point_data.index[0]
idx

2

In [43]:
# Neighbouring row labels of idx. Out-of-range neighbours are dropped rather than clamped:
# clamping with max()/min() would return idx itself as a "neighbour" at the first or last row.
last_idx = len(shapes_df) - 1
adjacent_indices = [i for i in (idx - 1, idx + 1) if 0 <= i <= last_idx]
adjacent_indices

[1, 3]

In [45]:
shapes_df.loc[adjacent_indices]

Unnamed: 0,shape_id,shape_pt_lat,shape_pt_lon,shape_pt_sequence,shape_dist_traveled
1,10002011,47.612144,-122.281784,2,5.8
3,10002011,47.612141,-122.281853,4,22.0


In [71]:
point_data.at[2,'shape_pt_sequence']

3

## Test the new method

In [85]:
# Index 2 is the same sample row used earlier, so rows 1 and 3 are the expected neighbours.
# NOTE(review): the two positional booleans presumably select index-based vs. sequence-based
# matching (cf. the df_i / df_s names) — confirm against the function signature.
df_i = f.get_adjacent_shape_point_data(shapes_df, 2, True, False)
df_i

Unnamed: 0,shape_id,shape_pt_lat,shape_pt_lon,shape_pt_sequence,shape_dist_traveled
1,10002011,47.612144,-122.281784,2,5.8
3,10002011,47.612141,-122.281853,4,22.0


In [86]:
df_s = f.get_adjacent_shape_point_data(shapes_df, 2, False, True)
df_s

Unnamed: 0,shape_id,shape_pt_lat,shape_pt_lon,shape_pt_sequence,shape_dist_traveled
1,10002011,47.612144,-122.281784,2,5.8
3,10002011,47.612141,-122.281853,4,22.0


In [87]:
df_i == df_s

Unnamed: 0,shape_id,shape_pt_lat,shape_pt_lon,shape_pt_sequence,shape_dist_traveled
1,True,True,True,True,True
3,True,True,True,True,True


In [88]:
# The builtin all() iterates over a DataFrame's column labels (non-empty strings, always truthy),
# so it reports True even when values differ. Reduce over the values instead: first per column,
# then across columns. bool() keeps the displayed result a plain True/False.
bool((df_i == df_s).all().all())

True

In [90]:
f.get_adjacent_shape_point_data(shapes_df, 2, True, True)

Unnamed: 0,shape_id,shape_pt_lat,shape_pt_lon,shape_pt_sequence,shape_dist_traveled
1,10002011,47.612144,-122.281784,2,5.8
3,10002011,47.612141,-122.281853,4,22.0


## Test some edge cases

In [91]:
#First point in the dataframe - only the next point should be returned
f.get_adjacent_shape_point_data(shapes_df, 0, True, True)

Unnamed: 0,shape_id,shape_pt_lat,shape_pt_lon,shape_pt_sequence,shape_dist_traveled
1,10002011,47.612144,-122.281784,2,5.8


In [92]:
#Last point for the first shape - only the previous point should be returned
f.get_adjacent_shape_point_data(shapes_df, 200, True, True)

Unnamed: 0,shape_id,shape_pt_lat,shape_pt_lon,shape_pt_sequence,shape_dist_traveled
199,10002011,47.609356,-122.329788,200,15959.7


In [93]:
#First point for the second shape - only the next point should be returned
f.get_adjacent_shape_point_data(shapes_df, 201, True, True)

Unnamed: 0,shape_id,shape_pt_lat,shape_pt_lon,shape_pt_sequence,shape_dist_traveled
202,10002012,47.612144,-122.281784,2,5.8


In [94]:
#Last point in the dataframe  - only the previous point should be returned
f.get_adjacent_shape_point_data(shapes_df, len(shapes_df)-1, True, True)

Unnamed: 0,shape_id,shape_pt_lat,shape_pt_lon,shape_pt_sequence,shape_dist_traveled
373893,41982011,47.673569,-122.101059,610,131409.5


In [96]:
#Second to last point in the dataframe - both previous and next point should be returned
f.get_adjacent_shape_point_data(shapes_df, len(shapes_df)-2, True, True)

Unnamed: 0,shape_id,shape_pt_lat,shape_pt_lon,shape_pt_sequence,shape_dist_traveled
373892,41982011,47.673351,-122.10099,609,131328.0
373894,41982011,47.673599,-122.101059,611,131419.0


## Test the two-step lookup: find the point, then its adjacent points

In [98]:
# shape_id, lat and lon still hold the values unpacked from shapes_df.iloc[200] in the earlier
# end-of-shape cell, so this looks up the last point of the first shape (row 200 below).
point_data = f.get_shape_point_data(shapes_df, shape_id, lat, lon)
point_data

Unnamed: 0,shape_id,shape_pt_lat,shape_pt_lon,shape_pt_sequence,shape_dist_traveled
200,10002011,47.609219,-122.330116,201,16056.0


In [99]:
point_data.index[0]

200

In [100]:
#This is how we can call the two point-finding functions from a higher-level function:
point_data = f.get_shape_point_data(shapes_df, shape_id, lat, lon)
adjacent_point_data = f.get_adjacent_shape_point_data(shapes_df, point_data.index[0])

In [101]:
point_data

Unnamed: 0,shape_id,shape_pt_lat,shape_pt_lon,shape_pt_sequence,shape_dist_traveled
200,10002011,47.609219,-122.330116,201,16056.0


In [102]:
adjacent_point_data

Unnamed: 0,shape_id,shape_pt_lat,shape_pt_lon,shape_pt_sequence,shape_dist_traveled
199,10002011,47.609356,-122.329788,200,15959.7


In [105]:
point_data[['shape_pt_lat', 'shape_pt_lon']].values

array([[  47.6092186, -122.330116 ]])

In [106]:
point_data[['shape_pt_lat', 'shape_pt_lon']].values.shape

(1, 2)

In [107]:
adjacent_point_data[['shape_pt_lat', 'shape_pt_lon']].values

array([[  47.6093559, -122.329788 ]])

In [111]:
adjacent_point_data[['shape_pt_lat', 'shape_pt_lon']].values.reshape((2,))

array([  47.6093559, -122.329788 ])

In [112]:
adjacent_point_data[['shape_pt_lat', 'shape_pt_lon']].values[0]

array([  47.6093559, -122.329788 ])

In [113]:
adjacent_point_data[['shape_pt_lat', 'shape_pt_lon']].values[0] - adjacent_point_data[['shape_pt_lat', 'shape_pt_lon']].values

array([[0., 0.]])

In [118]:
any([False, False])

False