In [None]:
!pip install umap-learn

In [1]:
# Reload imported modules automatically before each cell runs, so edits to the
# local `util` package are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

# Render matplotlib figures inline in the notebook output.
%matplotlib inline

In [12]:
import requests
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import os
import datetime
import re

import hdbscan
import sklearn.cluster
import sklearn.manifold
import umap
import joblib


from util import get_data
from util import mapping
from util import clean_data
from util import config
from util import plot_clusters

In [5]:
# Read the cleaned ridewgps trips through the project's loader, then preview
# the first five rows to sanity-check the available columns.
trips = clean_data.load_clean_ridewgps_trips()
trips.head(n=5)

Unnamed: 0,id,departed_at,duration,distance,elevation_gain,description,name,avg_speed,max_speed,moving_time,...,update_days,if_updated,elevation_net,elevation_total,avg_slope,photos,big_user,crow_distance,if_weekend,prop_moving
1,54372107,2020-08-15T03:04:55-07:00,0.540556,2.762263,142.176885,,08/15/20,5.171163,5.960829,0.534167,...,0,False,-30.236712,314.590483,2.156983,False,True,0.012711,False,0.988181
3,40433541,2019-09-23T11:49:34-07:00,0.58,2.779512,57.147474,,09/23/19,5.19265,5.432662,0.535278,...,0,False,15.291757,99.003191,0.674601,False,True,0.002539,False,0.922893


In [None]:
def speed_to_label(speed):
    """Map an average speed to a coarse, human-readable speed band.

    Bands are half-open on the upper side: a speed exactly equal to a
    threshold belongs to the next band up (e.g. 10 -> '10-15 mph').

    Args:
        speed: numeric speed value (the labels assume mph).

    Returns:
        One of '< 10 mph', '10-15 mph', '15-20 mph' or '20+ mph'.

    Note:
        A NaN speed fails every `<` comparison and therefore falls through
        to '20+ mph'.
    """
    # (exclusive upper bound, label) pairs, checked in ascending order.
    bands = (
        (10, '< 10 mph'),
        (15, '10-15 mph'),
        (20, '15-20 mph'),
    )
    for upper_bound, label in bands:
        if speed < upper_bound:
            return label
    return '20+ mph'
    

In [None]:
# Bucket every trip's average speed into a band (adds a 'speed' column to
# `trips`), then compare the slope distribution across those bands.
trips['speed'] = trips['avg_speed'].apply(speed_to_label)
sns.histplot(
    x='avg_slope',
    hue='speed',
    data=trips,
    element='step',
    palette='husl',
)


In [6]:
# Alternative, wider feature set (currently disabled):
# useful_cols = ['duration', 'distance', 'elevation_gain', 'avg_speed',
#               'max_speed', 'if_updated', 'photos', 'big_user',
#               'crow_distance', 'if_weekend', 'prop_moving']

# Gain per unit distance is divided by 52.8 (= 5280 / 100). Presumably
# elevation_gain is in feet and distance in miles, which would make avg_slope a
# percent grade -- TODO confirm against the data dictionary.
SLOPE_UNIT_DIVISOR = 52.8
CAP_SLOPE = 10     # upper bound applied to avg_slope before scaling
SLOPE_WEIGHT = 2   # extra weight given to the slope feature after scaling

# NOTE: this overwrites the avg_slope column that came with the loaded trips.
trips['avg_slope'] = trips['elevation_gain'] / trips['distance'] / SLOPE_UNIT_DIVISOR

# Features fed to the clusterer; work on a copy so `trips` keeps raw values.
useful_cols = ['distance', 'avg_slope', 'avg_speed', 'prop_moving']
rides_use = trips[useful_cols].copy()

# Cap slopes at CAP_SLOPE. (The previous inline comment said "cap at 12%", but
# the value actually applied has always been CAP_SLOPE = 10.) clip() leaves NaN
# untouched and always yields a float column, so the float64-only scaling loop
# below can never skip it.
rides_use['avg_slope'] = rides_use['avg_slope'].clip(upper=CAP_SLOPE)

# Scale every float64 feature by its own maximum.
for col in rides_use.columns:
    if rides_use[col].dtype in ['float64']:
        rides_use[col] = rides_use[col] / rides_use[col].max()

# Up-weight slope relative to the other (max-scaled) features.
rides_use['avg_slope'] *= SLOPE_WEIGHT

In [None]:
# Slope distribution per cluster, ignoring negative (noise) labels.
# NOTE: `df` is created by the clustering cell further down; run that first.
plt.figure(figsize=(10, 5))
ax = plt.axes()
assigned = df[df['labels'] >= 0]
sns.boxplot(data=assigned, x='labels', y='avg_slope', ax=ax)

In [None]:
# Distribution of the capped, scaled and weighted slope feature used for clustering.
plt.hist(rides_use['avg_slope'])

In [2]:
# Scratch check: passing a NumPy array to set() yields its distinct elements.
# The clustering cell relies on this via `len(set(clusterer.labels_))`.
import numpy as np
set(np.array([1, 2, 3]))

{1, 2, 3}

In [None]:

# Per-cluster slope distribution, coloured with the shared cluster palette.
# NOTE: both `df` and `colours` are defined in cells further down the notebook.
sns.boxplot(
    x='labels',
    y='avg_slope',
    data=df,
    palette=colours,
)
# plt.ylim([0, 15])

In [None]:
# Trip duration distribution, one step-histogram per cluster label.
sns.histplot(
    x='duration',
    hue='labels',
    data=df,
    palette='dark',  # alternative: palette=colours
    element='step',
)

In [None]:
# One colour per distinct cluster label. nunique() already counts every label
# (including a negative noise label such as -1 when present), so nothing needs
# to be added for it; the previous `- min(df.labels)` over-counted in that case.
n_labels = df['labels'].nunique()
colours = sns.color_palette('Paired', n_labels)

# Lower-triangle pair plot of every numeric column, coloured by cluster:
# scatter plots off the diagonal, histograms on the diagonal.
g = sns.PairGrid(df, hue='labels', palette=colours, corner=True)
g.map_offdiag(sns.scatterplot, edgecolor=None, s=10)
g.map_diag(sns.histplot)

In [21]:
# Alternative density-based clusterer (currently disabled):
# clusterer = hdbscan.HDBSCAN(min_cluster_size=100,
#                             min_samples=1,
#                            cluster_selection_epsilon=0.)

# Fix the seed so cluster numbering is reproducible across runs: the labels are
# written to CSV and the fitted model is persisted, so run-to-run relabelling
# would silently invalidate both. n_init is pinned explicitly because its
# default differs between scikit-learn versions.
RANDOM_STATE = 42
clusterer = sklearn.cluster.KMeans(n_clusters=6, n_init=10, random_state=RANDOM_STATE)
clusterer.fit(rides_use)

# Number of distinct labels actually produced.
print(len(set(clusterer.labels_)))

# Attach the labels to a copy of the unscaled trips for inspection/plotting.
df = trips.copy()
df['labels'] = clusterer.labels_

# Rows with a non-negative label (relevant when a clusterer emits -1 for noise).
print(df[df['labels'] >= 0].shape)

# Only list per-cluster sizes when there are few enough clusters to read.
if df.labels.nunique() < 20:
    print(df.labels.value_counts())

6
(20894, 23)
0    7262
4    4484
3    4235
1    1841
2    1700
5    1372
Name: labels, dtype: int64


In [22]:
# Persist the fitted clusterer. os.path.join gives the same path as string
# concatenation when config.MODEL_PATH ends with a separator, and still builds
# a valid path when it does not.
joblib.dump(clusterer, os.path.join(config.MODEL_PATH, 'clustered_trips.joblib'))

['/home/emily/Documents/ViewFinder/models/clustered_trips.joblib']

In [23]:
# Reload the persisted clusterer. os.path.join keeps the path valid whether or
# not config.MODEL_PATH ends with a separator.
# NOTE: joblib.load unpickles the file; only load model files you trust.
clf = joblib.load(os.path.join(config.MODEL_PATH, 'clustered_trips.joblib'))

In [27]:
# Smoke-test the reloaded model on the first ride: predict() expects a 2-D
# input, so the single feature row is reshaped to (1, n_features).
first_features = rides_use.iloc[0]
print(clf.predict(first_features.values.reshape(1, -1)))
print(first_features)
print(df.iloc[0])

[3]
distance       0.021886
avg_slope      0.175787
avg_speed      0.202650
prop_moving    0.812699
Name: 0, dtype: float64
id                                  23209396
departed_at        2018-04-30T02:50:43-07:00
duration                            0.619444
distance                             2.65985
elevation_gain                       123.437
description                                 
name                                04/30/18
avg_speed                            5.00285
max_speed                            7.04885
moving_time                         0.531667
is_stationary                          False
user_id                                50396
update_days                                0
if_updated                             False
elevation_net                        17.2735
elevation_total                      229.601
avg_slope                           0.878934
photos                                 False
big_user                                True
crow_distance        

In [None]:
# Project the scaled ride features to 2-D with t-SNE (cosine metric, PCA
# initialisation) and split the embedding into its two coordinate arrays.
tsne = sklearn.manifold.TSNE(n_components=2, metric='cosine', init='pca')
mapped_clusters = tsne.fit_transform(rides_use)
x, y = mapped_clusters[:, 0], mapped_clusters[:, 1]

In [None]:
# `affinity` only exists on some clusterers; the KMeans instance fitted in this
# notebook has no such attribute, so plain attribute access raised
# AttributeError and broke Run All. Fall back to None instead.
getattr(clusterer, 'affinity', None)

In [None]:
# Export only the trip id and its cluster label (no index column).
df_save = df.loc[:, ['id', 'labels']]
df_save.to_csv(
    'data/processed/ridewgps_labelled.csv',
    index=False,
)

In [None]:
# UMAP projection of the scaled ride features with default settings.
# NOTE: despite its name, `reducer` holds the value returned by fit_transform
# (the embedding that is indexed by column below), not the UMAP estimator.
reducer = umap.UMAP().fit_transform(rides_use)
xu, yu = reducer[:, 0], reducer[:, 1]

In [None]:
# Draw the clusters in the UMAP plane using the project's plotting helper.
plot_clusters.plot_cluster_2d(
    rides_use,
    clusterer.labels_,
    xu,
    yu,
)

In [None]:
# Report how many rides fall into each cluster, highest label first.
for lab in sorted(df['labels'].unique(), reverse=True):
    n_rides = (df['labels'] == lab).sum()
    print('{:2.0f}: {:5.0f} values'.format(lab, n_rides))

In [None]:

# Per-cluster mean +- std for every numeric/boolean column.
# String-valued columns (e.g. departed_at, name, description, speed) are
# skipped: calling .mean() on them raises TypeError and aborted the loop.
# 'labels' is skipped too, since its per-cluster mean is just the label itself.
summary_cols = [
    col for col in df.columns
    if col != 'labels' and df[col].dtype.kind in {'b', 'i', 'u', 'f'}
]

# Slice the frame once per label instead of once per (column, label) pair.
label_slices = {
    lab: df[df['labels'] == lab] for lab in sorted(df['labels'].unique())
}

for col in summary_cols:
    for lab, dfslice in label_slices.items():
        print('{:15s} ({:2.0f}): {:.2f} +- {:.2f}'.format(
            col, lab, dfslice[col].mean(), dfslice[col].std(),
        ))
    print('\n')

In [None]:
# Distance distribution of the rides assigned to one chosen cluster.
# The boolean mask comes from `df`, which shares its index with `trips`.
lab = 1
in_cluster = df['labels'] == lab
plt.hist(trips.loc[in_cluster, 'distance'])