# Data Science Take Home

In [None]:
from geopy.distance import great_circle
import logging
import math
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from pykalman import KalmanFilter
import pylab
from shapely.geometry import MultiPoint
from sklearn.cluster import DBSCAN
import sys

# Default figure size for every plot in this notebook (matplotlib reads
# figsize as (width, height) in inches).
pylab.rcParams['figure.figsize'] = (13, 10)

In [None]:
# Route INFO-and-above log records to stdout. Each record is tagged with
# timestamp, process id / thread name, logger name and level.
logging.basicConfig(
    stream=sys.stdout,
    level=logging.INFO,
    format='%(asctime)s - %(process)d/%(threadName)s - %(name)s - %(levelname)s - %(message)s',
)

# Logger shared by the cells below.
logger = logging.getLogger('main()')

## Take a look at the input data

In [None]:
# Load the tab-separated input, parsing the column at position 2 as
# datetimes (presumably the 'time' column), then make 'time' the index and
# sort the rows by it.
df = pd.read_csv('../../../data/TakeHomeData.txt', sep='\t', parse_dates=[2])
df = df.set_index(['time']).sort_index()

df.head()

In [None]:
# Collect the distinct sessionid values. The bare name on the last line
# shows the values themselves (not a count) as the cell output.
sessionids = df['sessionid'].unique()
sessionids

In [None]:
# Collect the distinct installid values. The bare name on the last line
# shows the values themselves (not a count) as the cell output.
installids = df['installid'].unique()
installids

In [None]:
# Sanity check: inspect the type of the first index entry to confirm that
# the time column was parsed into timestamps rather than left as strings.
type(df.index[0])

In [None]:
# Follow 1 sessionid: pick the entry at position 5 of the distinct sessionid
# values and preview its rows.
# NOTE(review): the position 5 is hard-coded, so this raises IndexError when
# the data holds fewer than six distinct sessions.
sessionid_test = sessionids[5]
df[ df['sessionid']==sessionid_test ].head()

In [None]:
# Plot the raw latitude and longitude of the selected session against time,
# one stacked panel per coordinate.
df_sessionid_01 = df[df['sessionid'] == sessionid_test]

# (subplot position, column to plot, y-axis label) for each panel.
panels = [(1, 'lat', 'lat (degrees)'),
          (2, 'lng', 'lng (degrees)')]
for position, column, y_label in panels:
    plt.subplot(2, 1, position)
    plt.plot(df_sessionid_01.index, df_sessionid_01[column], 'x-')
    plt.title('')
    plt.ylabel(y_label)

# Only the bottom panel (the current axes once the loop ends) gets an x label.
plt.xlabel('Datetime')

plt.show()

## Test the pykalman library

In [None]:
# Smoke-test pykalman on a toy model with a 2-dimensional state and
# 2-dimensional observations.
transition = [[1, 1], [0, 1]]
observation = [[0.1, 0.5], [-0.3, 0.0]]
kf = KalmanFilter(transition_matrices=transition,
                  observation_matrices=observation)

# 3 observations
measurements = np.asarray([[1, 0], [0, 0], [0, 1]])

# Run 5 EM iterations, then the filter and the smoother on the same data.
kf = kf.em(measurements, n_iter=5)
filtered_state_means, filtered_state_covariances = kf.filter(measurements)
smoothed_state_means, smoothed_state_covariances = kf.smooth(measurements)

smoothed_state_means

## Apply the Kalman filter to the given problem

In [None]:
# Follow 1 sessionid: select the rows of a single session.
sessionid_test = sessionids[5]
df_sessionid_01 = df[df['sessionid'] == sessionid_test]

# Constant-velocity model. The state vector is
# [lat, lng, lat velocity, lng velocity]; each position advances by its
# velocity times delta_t at every step.
# NOTE(review): delta_t = 1 treats consecutive rows as exactly one time step
# apart -- confirm that the samples are evenly spaced in time.
delta_t = 1
F = [[1, 0, delta_t, 0],
     [0, 1, 0, delta_t],
     [0, 0, 1, 0],
     [0, 0, 0, 1]]
# Only the two position components are observed.
H = [[1, 0, 0, 0],
     [0, 1, 0, 0]]

# DataFrame.as_matrix() was deprecated in pandas 0.23 and removed in 1.0.
# Selecting the columns and taking .values yields the same (n, 2) array on
# old and new pandas versions alike.
measurements = df_sessionid_01[['lat', 'lng']].values

# Start at the first observed position with zero velocity.
kf = KalmanFilter(n_dim_state=4,
                  n_dim_obs=2,
                  transition_matrices=F,
                  observation_matrices=H,
                  initial_state_mean=[measurements[0, 0], measurements[0, 1], 0, 0])

# Run 5 EM iterations, then the filter and the smoother on the same data.
kf = kf.em(measurements, n_iter=5)
(filtered_state_means, filtered_state_covariances) = kf.filter(measurements)
(smoothed_state_means, smoothed_state_covariances) = kf.smooth(measurements)

In [None]:
# Five stacked panels for the selected session: observed vs. smoothed
# position and smoothed velocity for each coordinate, plus a stopped flag.
timestamps = df_sessionid_01.index
lat_velocity = smoothed_state_means[:, 2]
lng_velocity = smoothed_state_means[:, 3]

# A sample counts as "stopped" when both velocity components are below the
# threshold in absolute value.
v_threshold = 0.00025
b_stop = np.logical_and(np.fabs(lat_velocity) < v_threshold,
                        np.fabs(lng_velocity) < v_threshold)

# Panel 1: observed latitude (markers) and smoothed latitude (dashed).
plt.subplot(5, 1, 1)
plt.plot(timestamps, df_sessionid_01['lat'], 'x',
         timestamps, smoothed_state_means[:, 0], '--')
plt.title('sessionid = ' + str(sessionid_test))
plt.ylabel('lat (degrees)')

# Panel 2: smoothed latitude velocity.
plt.subplot(5, 1, 2)
plt.plot(timestamps, lat_velocity, '--')
plt.ylabel('lat velocity')

# Panel 3: observed longitude (markers) and smoothed longitude (dashed).
plt.subplot(5, 1, 3)
plt.plot(timestamps, df_sessionid_01['lng'], 'x',
         timestamps, smoothed_state_means[:, 1], '--')
plt.ylabel('lng (degrees)')

# Panel 4: smoothed longitude velocity.
plt.subplot(5, 1, 4)
plt.plot(timestamps, lng_velocity, '--')
plt.ylabel('lng velocity')

# Panel 5: the stopped flag over time.
plt.subplot(5, 1, 5)
plt.plot(timestamps, b_stop, '--')
plt.ylabel('Stopped?')
plt.xlabel('Datetime')

plt.show()

## Write code to extract stopping locations

In [None]:
# installid describes a user, while sessionid describes a session with a user.
class LocationTracker(object):
    """Load location records from a tab-separated file for stop extraction.

    The file is read when the object is constructed. It must provide
    'time' and 'sessionid' columns; the column at position 2 is parsed as
    dates. The two processing methods are placeholders that do nothing yet.
    """

    def __init__(self, input_file):
        """Read ``input_file`` and cache its rows and distinct session ids.

        Args:
            input_file: Path to the tab-separated data file.
        """
        self.input_file = input_file

        # Index the rows by 'time' and keep them sorted by that index.
        records = pd.read_csv(self.input_file, sep='\t', parse_dates=[2])
        self.df = records.set_index(['time']).sort_index()

        self.sessionids = self.df['sessionid'].unique()

    def apply_Kalman_filter(self):
        """Placeholder: not implemented yet; returns None."""
        return None

    def find_stopping_locations(self):
        """Placeholder: not implemented yet; returns None.

        Intended output is a .csv file with one row per stop containing:
        installid, start time, end time, latitude, longitude.
        """
        return None
        

In [None]:
# Build a tracker on the take-home data file; the constructor reads the file
# immediately.
location_tracker = LocationTracker(input_file='../../../data/TakeHomeData.txt')

## Use the DBSCAN algorithm
See Geoff Boeing's write-up on clustering to reduce spatial data set size:
  http://geoffboeing.com/2014/08/clustering-to-reduce-spatial-data-set-size/

In [None]:
# Take the first distinct installid and gather its rows as an (n, 2) array
# whose columns are [lat, lng].
installid_test = installids[0]
# DataFrame.as_matrix() was deprecated in pandas 0.23 and removed in 1.0.
# Selecting the columns and taking .values yields the same array on old and
# new pandas versions alike.
coords = df[df['installid'] == installid_test][['lat', 'lng']].values

coords

In [None]:
# Scatter the raw points of the selected install: first coords column (lat)
# on the x axis, second column (lng) on the y axis.
plt.plot(coords[:, 0], coords[:, 1], 'x')
plt.title('installid =' + str(installid_test))
plt.xlabel('lat')
plt.ylabel('lng')

plt.show()

In [None]:
# Earth radius in km, used to convert a distance in km to an angle in radians.
km_per_radian = 6371.0088
# Neighbourhood radius of 0.04 km, expressed in radians because the
# haversine metric works on [lat, lng] pairs given in radians.
epsilon = 0.04 / km_per_radian
db = DBSCAN(eps=epsilon,
            min_samples=4,
            algorithm='ball_tree',
            metric='haversine').fit(np.radians(coords))

cluster_labels = db.labels_
# DBSCAN labels noise points -1 and real clusters 0..k-1. Counting the noise
# label as a cluster would make range(num_clusters) end on a label with no
# members, producing a trailing empty cluster and an inflated count.
num_clusters = len(set(cluster_labels) - {-1})
# One array of [lat, lng] rows per real cluster, indexed by cluster label.
clusters = pd.Series([coords[cluster_labels == n] for n in range(num_clusters)])
print('Number of clusters: {}'.format(num_clusters))

In [None]:
# Preview the first few clusters; each entry is the array of coords rows
# assigned to that cluster label.
clusters.head()

In [None]:
# Plot every raw point of the selected install (lat on x, lng on y).
plt.plot(coords[:, 0], coords[:, 1], '.')
plt.xlabel('lat')
plt.ylabel('lng')
plt.title('installid =' + str(installid_test))

# Overlay the centroid of each cluster as a red dot.
for cluster in clusters:
    # A cluster with no member points has no centroid. Skip it explicitly
    # instead of relying on the exception handler below to absorb the failure.
    if len(cluster) == 0:
        continue
    try:
        # Build the geometry once and reuse its centroid for both coordinates.
        centroid = MultiPoint(cluster).centroid
        plt.plot(centroid.x, centroid.y, 'ro')
    except Exception as e:
        # Best effort: one bad cluster should not abort the whole plot.
        logger.exception('Caught exception ' + str(e))
plt.show()

In [None]:
# Spot-check the centroid of the cluster stored at label 2. coords columns
# are [lat, lng], so the geometry's y coordinate is the longitude.
MultiPoint(clusters[2]).centroid.y