# Treating hotspot detection as a classification problem

In [1]:
import pandas as pd
import matplotlib.pyplot as plt

In [2]:
# Load ./data/NY_Taxi_Rich.csv into the DataFrame `df` that the cells below work on.
df = pd.read_csv(r'./data/NY_Taxi_Rich.csv')

In [3]:
import pandas as pd
from shapely.geometry import Point
import geopandas as gpd
from geopandas import GeoDataFrame
from sklearn.cluster import KMeans


def plot_on_ny_map(df, points=10, figsize=(12,12), save=False, title='NYC data', fname='plot.png', cols=['startLongitude', 'startLatitude'], colored=False, color_col=None, size=12):
    """Scatter the first ``points`` rows of ``df`` on top of the NYC borough map.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing the two coordinate columns named in ``cols``.
    points : int
        Number of leading rows of ``df`` to draw.
    figsize : tuple
        Size of the matplotlib figure.
    save : bool
        If True the figure is written to ``fname``; otherwise it is shown.
    title : str
        Title set on the axes.
    fname : str
        Output path used when ``save`` is True.
    cols : list of str
        ``[x_column, y_column]``; the first entry is used as x, the second as y.
    colored : bool
        If True, markers are colored by the values of ``color_col``.
    color_col : str or None
        Column of ``df`` providing the color values; required when ``colored``.
    size : int
        Marker size.

    Raises
    ------
    ValueError
        If ``colored`` is True but ``color_col`` is not given.
    """
    # Validate first so that no file is read and no figure is created on bad input.
    if colored and color_col is None:
        raise ValueError("color_col arg is missing, but colored is set to True.")

    x_col, y_col = cols[0], cols[1]
    # Every per-row input (coordinates AND colors) is taken from the same
    # positional slice so their lengths always agree.
    subset = df.iloc[:points]

    # NOTE(review): gpd.datasets is no longer shipped with recent GeoPandas
    # releases — confirm the installed version still provides 'nybb'.
    ny = gpd.read_file(gpd.datasets.get_path('nybb'))
    geometry = [Point(x, y) for x, y in zip(subset[x_col], subset[y_col])]
    crs = 'EPSG:4326'

    gdf = GeoDataFrame(subset[[x_col, y_col]], crs=crs, geometry=geometry)

    # Honor the caller-supplied figure size.
    fig, ax = plt.subplots(figsize=figsize)
    ax.set_title(title)
    # Re-project the borough outlines to the CRS of the points before drawing.
    ny.to_crs(epsg=4326).plot(ax=ax, color='lightgrey', edgecolor='black')

    if colored:
        gdf.plot(ax=ax, markersize=size, c=subset[color_col])
    else:
        gdf.plot(ax=ax, markersize=size)

    if save:
        fig.savefig(fname)
    else:
        plt.show()

def clean_spatial_data(df, cols=['startLatitude', 'startLongitude'], stddevs=3):
    """Keep only rows lying strictly within ``stddevs`` standard deviations of the mean.

    The columns in ``cols`` are processed one after another; the mean and
    standard deviation of each column are computed on the rows that survived
    the filters of the preceding columns. Both bounds are exclusive.

    Returns the filtered frame (``df`` itself when ``cols`` is empty).
    """
    filtered = df
    for name in cols:
        values = filtered[name]
        center = values.mean()
        spread = values.std()
        low = center - stddevs * spread
        high = center + stddevs * spread
        inside = (values > low) & (values < high)
        filtered = filtered.loc[inside]
    return filtered

def encode_text_fields(df, cols=['season', 'dayName', 'dayPeriod']):
    """Return a copy of ``df`` in which the columns ``cols`` hold integer codes.

    Each distinct value of a column is mapped to an integer in order of first
    appearance (0, 1, 2, ...). The mapping is computed in a single pass from
    the original values, so columns whose values are already small integers
    are encoded correctly as well.

    Parameters
    ----------
    df : pandas.DataFrame
        Input frame; it is NOT modified.
    cols : list of str
        Names of the columns to encode.

    Returns
    -------
    pandas.DataFrame
        Copy of ``df`` with the listed columns replaced by integer codes.
        Missing values receive the code -1 (the ``pd.factorize`` sentinel).
    """
    # Work on a copy: writing into the argument would silently change the
    # caller's frame (and triggers SettingWithCopyWarning on slices).
    encoded = df.copy()
    for col in cols:
        # factorize assigns codes by order of first appearance.
        encoded[col] = pd.factorize(df[col])[0]
    return encoded

def label_data(df, n = 120, random_state=None):
    """Attach a k-means cluster id to every row of ``df``.

    The rows are clustered on the ``startLatitude`` / ``startLongitude``
    columns and the resulting cluster id is stored in a ``label`` column.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing ``startLatitude`` and ``startLongitude``.
    n : int
        Number of clusters.
    random_state : int or None
        Seed forwarded to ``KMeans``; pass an int for reproducible labels.
        The default ``None`` keeps the previous unseeded behavior.

    Returns
    -------
    pandas.DataFrame
        New frame equal to ``df`` plus the ``label`` column; ``df`` itself is
        left untouched.
    """
    coords = df[['startLatitude', 'startLongitude']]
    kmeans = KMeans(n_clusters=n, random_state=random_state).fit(coords)
    # kmeans.labels_ is in row order of `coords`, so it is attached
    # positionally. Unlike an index join this cannot duplicate rows when the
    # index is not unique, and it does not fail when the function is re-run on
    # a frame that already has a 'label' column.
    return df.assign(label=kmeans.labels_)

def get_training_and_test_data(labeled_df, class_cols=['year', 'month', 'day', 'hour', 'minute', 'second', 'season', 'dayName', 'dayPeriod', 'temperature', 'rain', 'snow'], label_col='label', split = 0.8, random = True, random_state=None):
    """Split a labeled frame into training and test features / labels.

    Parameters
    ----------
    labeled_df : pandas.DataFrame
        Frame containing the feature columns and the label column.
    class_cols : list of str
        Names of the feature columns.
    label_col : str
        Name of the label column.
    split : float
        Fraction of rows (0..1) assigned to the training set.
    random : bool
        If True the rows are shuffled before splitting.
    random_state : int or None
        Seed for the shuffle; ``None`` keeps it unseeded.

    Returns
    -------
    tuple
        ``(X_train, y_train, X_test, y_test)``.

    Raises
    ------
    ValueError
        If ``split`` is outside the range [0, 1].
    """
    if not 0 <= split <= 1:
        raise ValueError("split must be between 0 and 1, got {!r}".format(split))

    if random:
        labeled_df = labeled_df.sample(frac=1.0, random_state=random_state)

    # Use the caller-supplied fraction (previously fixed at 0.8).
    training_rows = int(len(labeled_df) * split)

    training_data = labeled_df.iloc[:training_rows]
    test_data = labeled_df.iloc[training_rows:]

    # Use the caller-supplied label column (previously fixed at 'label').
    return training_data[class_cols], training_data[label_col], test_data[class_cols], test_data[label_col]

Basic model idea: The columns **year**, **month**, **day**, **hour**, **minute**, **second**, **season (encoded)**, **dayName (encoded)**, **dayPeriod (encoded)**, **temperature**, **rain** and **snow** determine the class, i.e. the location of the trip. Since the classifier predicts a probability for every class at the same time, it directly outputs the probability for each location. A location whose probability is above a certain threshold can be considered a hotspot.

The data insights below show that there are no invalid values in the relevant columns. There is therefore no need to preprocess the data any further at the moment.

In [4]:
# Show the distinct values of the columns used as features.
print('Year: \t', df['year'].unique())
# Month, day and hour are sorted in place before being printed.
for prefix, column in (('Month: \t', 'month'), ('Day: \t', 'day'), ('Hour: \t', 'hour')):
    value = df[column].unique()
    value.sort()
    print(prefix, value)
print('Rain: \t', df['rain'].unique())
print('Snow: \t', df['snow'].unique())

Year: 	 [2016]
Month: 	 [1 2 3 4 5 6]
Day: 	 [ 1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24
 25 26 27 28 29 30 31]
Hour: 	 [ 0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19 20 21 22 23]
Rain: 	 [1 0]
Snow: 	 [0 1]


In [5]:
# Prepare a slice of df (first 20000 rows): attach cluster labels, encode the
# text columns as integers, then split into training and test sets.
# (A bare `df3.head()` in the middle of the cell was removed: only the last
# expression of a cell is displayed, so it had no effect.)

df3 = df[:20000]
df3 = label_data(df3)
df3 = encode_text_fields(df3)
X_tr, y_tr, X_test, y_test = get_training_and_test_data(df3)

In [6]:
# set up SVM as classifier

from sklearn import svm

# All hyper-parameters are passed to the constructor instead of being patched
# onto the estimator afterwards. probability=True enables predict_proba;
# random_state seeds the internal shuffling used for the probability
# estimates so that re-running the cell gives the same probabilities.
clf = svm.SVC(kernel='poly', decision_function_shape='ovr', probability=True, random_state=0)

clf.fit(X_tr, y_tr)
predictions = clf.predict(X_test)
probabilities = clf.predict_proba(X_test)

In [7]:
# First row of the predict_proba output: the per-class probabilities for the first test sample.
probabilities[0]

array([0.00759351, 0.01939613, 0.00852676, 0.01074235, 0.01104069,
       0.00369098, 0.01585716, 0.01315833, 0.00107017, 0.02634778,
       0.01460706, 0.00202821, 0.00486618, 0.00872068, 0.00225122,
       0.01505428, 0.01207196, 0.01656893, 0.01232817, 0.02304663,
       0.01246093, 0.00040374, 0.00023088, 0.01020624, 0.01473615,
       0.00148761, 0.0087028 , 0.01231541, 0.00221089, 0.00119856,
       0.00046724, 0.001327  , 0.01256582, 0.01914482, 0.00159776,
       0.01375627, 0.01228814, 0.01320093, 0.00168494, 0.00927564,
       0.0006076 , 0.00011573, 0.01125696, 0.00961119, 0.00943441,
       0.00126004, 0.01044554, 0.00437813, 0.01033246, 0.01087538,
       0.01291434, 0.00013412, 0.01229846, 0.01810545, 0.0009958 ,
       0.0226185 , 0.00296616, 0.01933703, 0.02309613, 0.00055392,
       0.01070386, 0.01029328, 0.00047874, 0.01112175, 0.00077048,
       0.01971891, 0.01034809, 0.01509344, 0.01628585, 0.00569401,
       0.00284521, 0.01220359, 0.01146758, 0.01387692, 0.01979