In [1]:
%load_ext autoreload
%autoreload 2

%matplotlib inline

In [2]:
import requests
import pandas as pd
import numpy as np
import os

import sklearn.neighbors

import pydeck as pdk
import seaborn as sns
import matplotlib.pyplot as plt

from util import config
from util import mapping
from util import clean_data
from util import process_data

In [38]:
def load_data():
    trips = pd.read_feather(config.PROCESSED_DATA_PATH + 'trips_scaled.feather')
    trips.set_index('rte_id', inplace=True)

    gridpts_at_rte_500 = pd.read_feather(config.PROCESSED_DATA_PATH + 'gridpts_at_rte_500.feather')
    gridpts_at_rte_500.set_index('rte_id', inplace=True)

    grid_pts_500 = pd.read_feather(config.MODEL_PATH + 'grid_points_500.feather')
    grid_pts_500.set_index('grid_id', inplace=True)

    feature_sc = pd.read_feather(config.MODEL_PATH + 'feature_importance.feather')
    feature_scaling = dict()
    for col in trips.columns:
        if col in feature_sc.feature_names.tolist():
            print(col)
            feature_scaling[col] = abs(
                feature_sc[feature_sc.feature_names == col].scaling.values[0])
        else:
            feature_scaling[col] = 0.
    # Other features
    feature_scaling['popularity'] = 0.5
    feature_scaling['detour_score'] = 0.5


    return trips, grid_pts_500, gridpts_at_rte_500, feature_scaling

def load_coarse_grid():
    grid_pts_75 = pd.read_feather(config.MODEL_PATH + 'grid_points_culled_75.feather')
    grid_pts_75.set_index('grid_id', inplace=True)

    rtes_at_grid_75 = pd.read_feather(config.MODEL_PATH + 'rtes_at_grid_culled_75.feather')
    rtes_at_grid_75.set_index('grid_id', inplace=True)

    loc_tree = sklearn.neighbors.KDTree(grid_pts_75[['lat', 'lon']])

    return grid_pts_75, rtes_at_grid_75, loc_tree

def load_presets():
    presets, presets_labels = process_data.set_presets()
    presets = process_data.apply_scaling(presets)
    return (presets, presets_labels)

def fit_tree(df, feature_importance):
    LEAF_SIZE = 20
    return sklearn.neighbors.KDTree(df * feature_importance, leaf_size=LEAF_SIZE)


In [39]:
# Load trip data (fine)
trips, grid_pts_fine, gridpts_at_rte_fine, fs = load_data()
feature_scaling = fs.copy()

dist
avg_slope_descending
max_slope
dist_6percent
dist_12percent


In [40]:
fs

{'dist': 1.4241848199263392,
 'avg_slope_climbing': 0.0,
 'avg_slope_descending': -0.8510543703883755,
 'max_slope': 0.6191108468100139,
 'dist_climbing': 0.0,
 'dist_downhill': 0.0,
 'dist_6percent': 0.4696368120089681,
 'dist_9percent': 0.0,
 'dist_12percent': 0.19518107303913015,
 'detour_score': 0.5,
 'popularity': 0.5}

In [5]:
# Load coarser grid data for calculating distances
grid_pts_coarse, rtes_at_grid_coarse, loc_tree = load_coarse_grid()

In [6]:
# Toggle filtering by distance to some start location
start_location_yn = False
start_lat, start_lon = (42.3, -74.25)
if start_location_yn:
    MAX_DIST_FROM_START = 10 # miles    
    trips_use, unscaled_dists = process_data.add_distance_to_start_feature(
        start_lat, start_lon, trips, grid_pts_coarse, rtes_at_grid_coarse, loc_tree, MAX_DIST_FROM_START
    )
else:
    trips_use = trips.copy()
    start_lat, start_lon = ('', '')

In [8]:
presets, presets_labels = load_presets()

chosen = presets.loc[[0]]
chosen = pd.DataFrame({'dist': 20.,
                      'avg_slope_climbing': 5.,
                      'avg_slope_descending': -5.,
                      'max_slope': 10,
                      'dist_climbing': 0.4,
                      'dist_downhill': 0.4,
                      'dist_6percent': 0.3,
                      'dist_9percent': 0.1,
                      'dist_12percent': 0.05,
                      'detour_score': 0.3,
                      'popularity': 100},
                     index=['chosen'])
chosen = process_data.engineer_features(chosen)
chosen = process_data.apply_scaling(chosen)   

# Calculate nearest neighbours
if start_location_yn:
    chosen['dist_to_start'] = 0.
    feature_scaling['dist_to_start'] = 2.
feature_sc = [v for v in feature_scaling.values()]

tree = fit_tree(trips_use, feature_sc)
dists, df_inds = tree.query(chosen * feature_sc, k=5)
dists, df_inds = dists.flatten(), df_inds.flatten()
neighbour_rte_ids = trips_use.index[df_inds].tolist()

# Find original values of the returned routes
trips_unscaled = process_data.remove_scaling(trips_use.loc[neighbour_rte_ids])
trips_unscaled = process_data.reverse_engineer_features(trips_unscaled)
if start_location_yn:
    trips_unscaled['dist_to_start'] = unscaled_dists.loc[neighbour_rte_ids]
    chosen_unscaled = process_data.remove_scaling(chosen.drop('dist_to_start', axis=1))
    chosen_unscaled['dist_to_start'] = 0.
else:
    chosen_unscaled = process_data.remove_scaling(chosen)
trips_unscaled.append(chosen_unscaled)


Unnamed: 0,dist,avg_slope_climbing,avg_slope_descending,max_slope,dist_climbing,dist_downhill,dist_6percent,dist_9percent,dist_12percent,detour_score,popularity
18443773,19.481254,6.613961,-5.452964,14.756364,0.272679,0.330908,0.118321,0.063404,0.0108545,0.206052,111.115607
18671576,20.436146,5.807739,-3.965766,15.480596,0.313336,0.259006,0.155255,0.05886,0.03033528,0.23568,91.745342
52777940,33.296581,6.344464,-4.601841,11.529203,0.386165,0.341107,0.271303,0.020947,3.469447e-18,0.288634,51.333333
42228380,14.997048,5.705369,-4.02056,19.551986,0.296277,0.286654,0.138923,0.055443,0.0244952,0.253941,83.058824
39329897,12.588419,6.032077,-4.779729,20.197519,0.363914,0.304657,0.150816,0.067633,0.02896897,0.036286,7.529412
chosen,2.996232,5.0,-5.0,10.0,0.4,0.4,-1.171183,-2.207275,-2.813411,0.3,100.0


In [11]:
feature_sc = pd.read_feather(config.MODEL_PATH + 'feature_importance.feather')
feature_sc

Unnamed: 0,feature_names,scaling
0,dist,1.424185
1,avg_slope_descending,-0.851054
2,max_slope,0.619111
3,dist_6percent,0.469637
4,dist_12percent,0.195181


In [None]:
v0, v2, v3, v6, v8, v9, v10 = 0., 0., 0., 0., 0., 0., 0.
chosen = pd.DataFrame([[v0, 0., v2, v3, 0., 0., v6, 0., v8, v9, v10]],
                    columns=presets.columns)
chosen = process_data.engineer_features(chosen)
chosen

In [None]:
r = process_data.plot_NN(
    neighbour_rte_ids, grid_pts_fine, gridpts_at_rte_fine,
    (start_lat, start_lon, start_location_yn),
)
r.to_html()
