In [1]:
%load_ext autoreload
%autoreload 2

%matplotlib inline

In [2]:
import requests
import pandas as pd
import numpy as np
import os

import sklearn.neighbors

import pydeck as pdk
import seaborn as sns
import matplotlib.pyplot as plt

from util import config
from util import mapping
from util import clean_data
from util import process_data

In [3]:
def load_data():
    trips = pd.read_feather(config.PROCESSED_DATA_PATH + 'trips_scaled.feather')
    trips.set_index('rte_id', inplace=True)

    gridpts_at_rte_500 = pd.read_feather(config.PROCESSED_DATA_PATH + 'gridpts_at_rte_500.feather')
    gridpts_at_rte_500.set_index('rte_id', inplace=True)

    grid_pts_500 = pd.read_feather(config.MODEL_PATH + 'grid_points_500.feather')
    grid_pts_500.set_index('grid_id', inplace=True)

    feature_sc = pd.read_feather(config.MODEL_PATH + 'feature_importance.feather')
    feature_scaling = dict()
    for col in trips.columns:
        if col in feature_sc.feature_names.tolist():
            print(col)
            feature_scaling[col] = abs(
                feature_sc[feature_sc.feature_names == col].scaling.values[0])
        else:
            feature_scaling[col] = 0.
    # Other features
    feature_scaling['popularity'] = 0.5
    feature_scaling['detour_score'] = 0.5


    return trips, grid_pts_500, gridpts_at_rte_500, feature_scaling

def load_coarse_grid():
    grid_pts_75 = pd.read_feather(config.MODEL_PATH + 'grid_points_75.feather')
    grid_pts_75.set_index('grid_id', inplace=True)

    rtes_at_grid_75 = pd.read_feather(config.MODEL_PATH + 'rtes_at_grid_75.feather')
    rtes_at_grid_75.set_index('grid_id', inplace=True)

    loc_tree = sklearn.neighbors.KDTree(grid_pts_75[['lat', 'lon']])

    return grid_pts_75, rtes_at_grid_75, loc_tree

def load_presets():
    presets, presets_labels = process_data.set_presets()
    presets = process_data.apply_scaling(presets)
    return (presets, presets_labels)

def fit_tree(df, feature_importance):
    LEAF_SIZE = 20
    return sklearn.neighbors.KDTree(df * feature_importance, leaf_size=LEAF_SIZE)


In [4]:
# Load trip data (fine)
trips, grid_pts_fine, gridpts_at_rte_fine, fs = load_data()
feature_scaling = fs.copy()

dist
avg_slope_descending
max_slope
dist_6percent
dist_12percent


In [5]:
# Load coarser grid data for calculating distances
grid_pts_coarse, rtes_at_grid_coarse, loc_tree = load_coarse_grid()

In [19]:
# Toggle filtering by distance to some start location
start_location_yn = True
start_lat, start_lon = (42., -74.25)
if start_location_yn:
    MAX_DIST_FROM_START = 10 # miles    
    trips_use, unscaled_dists = process_data.add_distance_to_start_feature(
        start_lat, start_lon, trips, grid_pts_coarse, rtes_at_grid_coarse, loc_tree, MAX_DIST_FROM_START
    )
else:
    trips_use = trips.copy()
    start_lat, start_lon = ('', '')

In [20]:
presets, presets_labels = load_presets()

chosen = presets.loc[[0]]
chosen = pd.DataFrame({'dist': 70.,
                      'avg_slope_climbing': 5.,
                      'avg_slope_descending': -6.,
                      'max_slope': 15,
                      'dist_climbing': 0.4,
                      'dist_downhill': 0.4,
                      'dist_6percent': 0.3,
                      'dist_9percent': 0.1,
                      'dist_12percent': 0.05,
                      'detour_score': 0.3,
                      'popularity': 70},
                     index=['chosen'])
chosen = process_data.engineer_features(chosen)
chosen = process_data.apply_scaling(chosen)   

# Calculate nearest neighbours
if start_location_yn:
    chosen['dist_to_start'] = 0.
    feature_scaling['dist_to_start'] = 2.
feature_sc = [v for v in feature_scaling.values()]
feature_sc[2] = 0
tree = fit_tree(trips_use, feature_sc)
dists, df_inds = tree.query(chosen * feature_sc, k=5)
dists, df_inds = dists.flatten(), df_inds.flatten()
neighbour_rte_ids = trips_use.index[df_inds].tolist()

# Find original values of the returned routes
trips_unscaled = process_data.remove_scaling(trips_use.loc[neighbour_rte_ids])
trips_unscaled = process_data.reverse_engineer_features(trips_unscaled)
if start_location_yn:
    trips_unscaled['dist_to_start'] = unscaled_dists.loc[neighbour_rte_ids]
    chosen_unscaled = process_data.remove_scaling(chosen.drop('dist_to_start', axis=1))
    chosen_unscaled['dist_to_start'] = 0.
else:
    chosen_unscaled = process_data.remove_scaling(chosen)
trips_unscaled.append(chosen_unscaled)


Unnamed: 0,dist,avg_slope_climbing,avg_slope_descending,max_slope,dist_climbing,dist_downhill,dist_6percent,dist_9percent,dist_12percent,detour_score,popularity,dist_to_start
25043103,64.144737,5.531562,-1.662393,15.86953,0.214345,0.165534,0.106311,0.034038,0.003439,0.27717,94.540091,0.397698
43315360,60.784937,5.356316,-1.403017,18.344489,0.186639,0.152792,0.102238,0.031232,0.004851,0.280303,90.308057,0.397698
16944069,89.076969,5.305654,-1.783137,17.473353,0.155671,0.15202,0.078709,0.043516,0.01503,0.291877,74.869087,0.397698
15409814,59.666949,4.332393,-1.341559,13.460252,0.179918,0.13869,0.063654,0.024305,0.002613,0.308856,86.527687,0.397698
1794558,66.199011,4.662143,-1.895591,17.006514,0.146652,0.14354,0.062256,0.032553,0.01055,0.228759,96.436709,0.397698
chosen,4.248638,5.0,-6.0,15.0,0.4,0.4,-1.171183,-2.207275,-2.813411,0.3,70.0,0.0


In [12]:
feature_scaling

{'dist': 1.4241848199263392,
 'avg_slope_climbing': 0.0,
 'avg_slope_descending': 0.8510543703883755,
 'max_slope': 0.6191108468100139,
 'dist_climbing': 0.0,
 'dist_downhill': 0.0,
 'dist_6percent': 0.4696368120089681,
 'dist_9percent': 0.0,
 'dist_12percent': 0.19518107303913015,
 'detour_score': 0.5,
 'popularity': 0.5,
 'dist_to_start': 2.0}

In [7]:
v0, v2, v3, v6, v8, v9, v10 = 0., 0., 0., 0., 0., 0., 0.
chosen = pd.DataFrame([[v0, 0., v2, v3, 0., 0., v6, 0., v8, v9, v10]],
                    columns=presets.columns)
chosen = process_data.engineer_features(chosen)
chosen

NameError: name 'presets' is not defined

In [28]:
r = process_data.plot_NN(
    neighbour_rte_ids, grid_pts_fine, gridpts_at_rte_fine,
    (start_lat, start_lon, start_location_yn),
)
r.to_html('demo.html')


In [None]:
r.

In [26]:
colours = sns.color_palette(["#7A2008", "#d4350b", "#ff5224",  "#b68679", "#df8770"])