In [1]:
# Enable IPython's autoreload extension in mode 2 so that edits to imported
# modules (such as the local `util` package) are picked up without a kernel restart.
%load_ext autoreload
%autoreload 2

# Render matplotlib figures inline in the notebook output.
%matplotlib inline

In [69]:
import requests
import pandas as pd
import numpy as np
import os

import sklearn.neighbors

import pydeck as pdk
import seaborn as sns
import matplotlib.pyplot as plt

from util import config
from util import mapping
from util import clean_data
from util import process_data

In [31]:
def load_data():
    """Read the scaled trip table and the two "1000" grid tables from feather files.

    Returns:
        tuple: ``(trips, grid_pts_1000, gridpts_at_rte_1000)``. ``trips`` and
        ``gridpts_at_rte_1000`` are indexed by ``rte_id``; ``grid_pts_1000`` is
        indexed by ``grid_id``.
    """
    print('Loading data!')

    def _read_indexed(base_dir, file_name, index_col):
        # Load one feather file and promote `index_col` to the frame's index.
        return pd.read_feather(base_dir + file_name).set_index(index_col)

    trips = _read_indexed(
        config.PROCESSED_DATA_PATH, 'trips_culled_scaled.feather', 'rte_id')
    gridpts_at_rte_1000 = _read_indexed(
        config.PROCESSED_DATA_PATH, 'gridpts_at_rte_culled_1000.feather', 'rte_id')
    grid_pts_1000 = _read_indexed(
        config.MODEL_PATH, 'grid_points_culled_1000.feather', 'grid_id')

    return trips, grid_pts_1000, gridpts_at_rte_1000

def load_coarse_grid():
    """Read the two "75" grid tables and build a KD-tree over the grid point coordinates.

    Returns:
        tuple: ``(grid_pts_75, rtes_at_grid_75, loc_tree)``. Both frames are indexed
        by ``grid_id``; ``loc_tree`` is a ``sklearn.neighbors.KDTree`` fitted on the
        ``lat`` and ``lon`` columns of ``grid_pts_75``.
    """
    print('Loading coarse grid!')

    def _read_by_grid_id(file_name):
        # Both coarse tables live under MODEL_PATH and are keyed by grid_id.
        return pd.read_feather(config.MODEL_PATH + file_name).set_index('grid_id')

    grid_pts_75 = _read_by_grid_id('grid_points_culled_75.feather')
    rtes_at_grid_75 = _read_by_grid_id('rtes_at_grid_culled_75.feather')

    coords = grid_pts_75[['lat', 'lon']]
    loc_tree = sklearn.neighbors.KDTree(coords)

    return grid_pts_75, rtes_at_grid_75, loc_tree

def load_presets():
    """Fetch the presets from ``process_data`` and return them scaled.

    Returns:
        tuple: ``(presets, presets_labels)`` where ``presets`` is the output of
        ``process_data.apply_scaling`` applied to the presets from
        ``process_data.set_presets`` and ``presets_labels`` is passed through unchanged.
    """
    raw_presets, labels = process_data.set_presets()
    scaled_presets = process_data.apply_scaling(raw_presets)
    return scaled_presets, labels

def fit_tree(df, feature_importance, leaf_size=20):
    """Fit a KD-tree on ``df`` after weighting its columns.

    Args:
        df: Feature table to index, one row per item.
        feature_importance: Weights multiplied into ``df`` before fitting
            (``df * feature_importance``). The callers in this notebook pass a
            plain list with one weight per column of ``df``, in column order.
        leaf_size: Forwarded to ``sklearn.neighbors.KDTree``. Defaults to 20,
            the value that used to be hard-coded here.

    Returns:
        sklearn.neighbors.KDTree: Tree built on the weighted feature values.
        Query points must be multiplied by the same weights before querying.
    """
    weighted = df * feature_importance
    return sklearn.neighbors.KDTree(weighted, leaf_size=leaf_size)


In [32]:
# Load trip data (fine): the scaled trip table plus the two "1000" grid tables
# returned by load_data(), in the order (trips, grid points, grid points at route).
trips, grid_pts_fine, gridpts_at_rte_fine = load_data()

Loading data!


In [33]:
# Load coarser grid data for calculating distances: the two "75" grid tables and
# a KD-tree built on the coarse grid points' lat/lon columns.
grid_pts_coarse, rtes_at_grid_coarse, loc_tree = load_coarse_grid()

Loading coarse grid!


In [119]:
# Toggle filtering by distance to some start location
start_location_yn = True
start_lat, start_lon = (42.3, -74.25)
if start_location_yn:
    MAX_DIST_FROM_START = 10 # miles
    # Returns the trips to search over plus the unscaled distances to the start
    # (presumably keyed by route id, as they are looked up by route id later -- confirm).
    trips_use, unscaled_dists = process_data.add_distance_to_start_feature(
        start_lat, start_lon, trips, grid_pts_coarse, rtes_at_grid_coarse, loc_tree, MAX_DIST_FROM_START
    )
    # No KD-tree is fit here: the nearest-neighbour cell below fits it once the
    # feature weights exist. The previous call referenced `feature_importance`,
    # which is never defined, and raised NameError on a fresh kernel.
else:
    trips_use = trips.copy()
    # Empty placeholders are handed to the plotting call when no start location is used.
    start_lat, start_lon = ('', '')

In [125]:
presets, presets_labels = load_presets()

# Target ride profile to search for, given in unscaled units and scaled below.
# (A row of `presets` could be used instead; those are already scaled by
# load_presets(), so they must not be passed through apply_scaling again.)
chosen = pd.DataFrame({'dist': 20.,
                      'avg_slope_climbing': 5.,
                      'avg_slope_descending': -5.,
                      'max_slope': 10,
                      'dist_climbing': 0.4,
                      'dist_downhill': 0.4,
                      'dist_6percent': 0.3,
                      'dist_9percent': 0.1,
                      'dist_12percent': 0.05,
                      'avg_speed': 15},
                     index=['chosen'])
chosen = process_data.apply_scaling(chosen)

# Calculate nearest neighbours
N_NEIGHBOURS = 5
feature_importance_dict = {'dist': 1.,
                      'avg_slope_climbing': 0.2,
                      'avg_slope_descending': 0.01,
                      'max_slope': 0.2,
                      'dist_climbing': 0.3,
                      'dist_downhill': 0.01,
                      'dist_6percent': 0.2,
                      'dist_9percent': 0.2,
                      'dist_12percent': 0.1,
                      'avg_speed': 0.}
if start_location_yn:
    # The ideal route starts exactly at the requested start location.
    chosen['dist_to_start'] = 0.
    feature_importance_dict['dist_to_start'] = 1.

# The weights are applied positionally, so order both the weights and the query
# row by the columns of `trips_use` instead of relying on dict insertion order.
# A column without a weight fails loudly with KeyError rather than misaligning.
feature_cols = list(trips_use.columns)
feature_sc = [feature_importance_dict[col] for col in feature_cols]
tree = fit_tree(trips_use, feature_sc)
dists, df_inds = tree.query(chosen[feature_cols] * feature_sc, k=N_NEIGHBOURS)
dists, df_inds = dists.flatten(), df_inds.flatten()
neighbour_rte_ids = trips_use.index[df_inds].tolist()

# Find original values of the returned routes
trips_unscaled = process_data.remove_scaling(trips_use.loc[neighbour_rte_ids])
if start_location_yn:
    trips_unscaled['dist_to_start'] = unscaled_dists.loc[neighbour_rte_ids]

# `dist_to_start` only exists when a start location is in use, so drop it
# tolerantly and only re-add it in that case.
chosen_unscaled = process_data.remove_scaling(
    chosen.drop(columns='dist_to_start', errors='ignore'))
if start_location_yn:
    chosen_unscaled['dist_to_start'] = 0.

# Show the neighbours with the target profile as the last row.
# DataFrame.append was removed in pandas 2.0; pd.concat is the replacement.
pd.concat([trips_unscaled, chosen_unscaled])


Unnamed: 0,dist,avg_slope_climbing,avg_slope_descending,max_slope,dist_climbing,dist_downhill,dist_6percent,dist_9percent,dist_12percent,avg_speed,dist_to_start
6493727,19.810377,9.689496,-7.64223,19.110922,0.358986,0.249865,0.249584,0.131412,0.06513,10.97336,0.416979
2925278,31.106491,8.695008,-4.941953,20.508899,0.437674,0.248509,0.242811,0.092031,0.032211,15.319856,1.347562
38169100,26.76013,8.247612,-7.203394,19.66199,0.247669,0.229735,0.159288,0.076356,0.021788,12.33191,0.416979
10833164,26.981697,6.568081,-5.095841,19.915955,0.339644,0.054992,0.11382,0.038078,0.01239,16.292124,0.416979
33497172,15.888882,6.750927,-5.055755,14.509788,0.265827,0.15778,0.130706,0.04915,0.013664,11.97166,1.849431
chosen,20.0,5.0,-5.0,10.0,0.4,0.4,0.3,0.1,0.05,15.0,0.0


In [126]:
# Plot the nearest-neighbour routes using the fine grid tables; the tuple passes
# the start coordinates together with the flag saying whether a start location is used.
# NOTE(review): plot_NN presumably returns a pydeck Deck (pydeck is imported above) -- confirm.
r = process_data.plot_NN(
    neighbour_rte_ids, grid_pts_fine, gridpts_at_rte_fine,
    (start_lat, start_lon, start_location_yn),
)
# Render the returned object as HTML for display in the notebook.
# NOTE(review): if this is pydeck's Deck.to_html, it may also write an HTML file to disk -- confirm.
r.to_html()
