In [152]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import geohash

In [153]:
featuredDataset = pd.read_csv('featured-dataset.csv')
featuredDataset = featuredDataset.drop(featuredDataset.columns[0], axis=1)
featuredDataset.head(5)

Unnamed: 0,year,month,day,time_cat,time_num,time_cos,time_sin,day_cat,day_num,day_cos,day_sin,weekend,location_start,location_end
0,2017,3,22,16.833333333333332:49.999999999999886,0.746528,-0.021815,-0.999762,Wednesday,0.392361,-0.779884,0.625923,0,eyckx6,eycs8b
1,2017,5,24,12.333333333333334:20.0,0.538194,-0.971342,-0.237686,Wednesday,0.362599,-0.649922,0.760001,0,u0qjdq,u0qjf9
2,2017,5,24,12.35:21.0,0.539583,-0.969231,-0.246153,Wednesday,0.362798,-0.650869,0.75919,0,u0qjdq,u0qjf9
3,2017,5,24,12.35:21.0,0.539583,-0.969231,-0.246153,Wednesday,0.362798,-0.650869,0.75919,0,u0qjdq,u0qjf9
4,2017,5,24,12.35:21.0,0.539583,-0.969231,-0.246153,Wednesday,0.362798,-0.650869,0.75919,0,u0qjdq,u0qjf9


In [154]:
featuredDataset.shape

(1537, 14)

### Feature extraction

In [155]:

# Get the longitude and latitude from the geohash
def decodegeo(geo, which):
    if len(geo) >= 6:
        geodecoded = geohash.decode(geo)
        return geodecoded[which]
    else:
        return 0
    
def further_data_prep(df):  

    df['start_lat'] = df['location_start'].apply(lambda geo: decodegeo(geo, 0))
    df['start_lon'] = df['location_start'].apply(lambda geo: decodegeo(geo, 1))
    df['end_lat'] = df['location_end'].apply(lambda geo: decodegeo(geo, 0))
    df['end_lon'] = df['location_end'].apply(lambda geo: decodegeo(geo, 1))
    
    return df

In [156]:
featuredDataset = further_data_prep(featuredDataset)
featuredDataset.head(5)

Unnamed: 0,year,month,day,time_cat,time_num,time_cos,time_sin,day_cat,day_num,day_cos,day_sin,weekend,location_start,location_end,start_lat,start_lon,end_lat,end_lon
0,2017,3,22,16.833333333333332:49.999999999999886,0.746528,-0.021815,-0.999762,Wednesday,0.392361,-0.779884,0.625923,0,eyckx6,eycs8b,38.773499,-9.168091,38.762512,-9.102173
1,2017,5,24,12.333333333333334:20.0,0.538194,-0.971342,-0.237686,Wednesday,0.362599,-0.649922,0.760001,0,u0qjdq,u0qjf9,47.408752,8.54187,47.425232,8.552856
2,2017,5,24,12.35:21.0,0.539583,-0.969231,-0.246153,Wednesday,0.362798,-0.650869,0.75919,0,u0qjdq,u0qjf9,47.408752,8.54187,47.425232,8.552856
3,2017,5,24,12.35:21.0,0.539583,-0.969231,-0.246153,Wednesday,0.362798,-0.650869,0.75919,0,u0qjdq,u0qjf9,47.408752,8.54187,47.425232,8.552856
4,2017,5,24,12.35:21.0,0.539583,-0.969231,-0.246153,Wednesday,0.362798,-0.650869,0.75919,0,u0qjdq,u0qjf9,47.408752,8.54187,47.425232,8.552856


### Train-test split

In [157]:
columns_all_features = featuredDataset.columns
columns_X = ['time_num', 'time_sin', 'time_cos', 'day_num', 'start_lat', 'start_lon']
columns_y = ['end_lat', 'end_lon']
X = featuredDataset[columns_X]
y = featuredDataset[columns_y]

In [158]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [159]:
print ('X: ({}, {})'.format(*X.shape))
print ('y: ({}, {})'.format(*y.shape))
print ('X_train: ({}, {})'.format(*X_train.shape))
print ('y_train: ({}, {})'.format(*y_train.shape))
print ('X_test: ({}, {})'.format(*X_test.shape))
print ('y_test: ({}, {})'.format(*y_test.shape))

X: (1537, 6)
y: (1537, 2)
X_train: (1229, 6)
y_train: (1229, 2)
X_test: (308, 6)
y_test: (308, 2)


### Machine Learning

In [160]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error

In [162]:
reg = RandomForestRegressor(n_estimators=1, max_depth=20, verbose=4, warm_start=True)
best_rmse = 0
for n in range(1,40):
   reg.set_params(n_estimators=n)
   reg.fit(X_train,y_train) #, sample_weight=pickup_count_train.values)
   training_accuracy = reg.score(X_train, y_train)
   valid_accuracy = reg.score(X_test, y_test)
   rmsetrain = np.sqrt(mean_squared_error(reg.predict(X_train),y_train))
   rmsevalid = np.sqrt(mean_squared_error(reg.predict(X_test),y_test))
   best_rmse = rmsevalid
   print (" R^2 (train) = %0.3f, R^2 (valid) = %0.3f, RMSE (train) = %0.3f, RMSE (valid) = %0.3f" % (training_accuracy, valid_accuracy, rmsetrain, rmsevalid))

[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.0s finished
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.0s finished
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.0s finished
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.0s finished
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.0s finished
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.0s finished
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=1)]

building tree 1 of 1
 R^2 (train) = 1.000, R^2 (valid) = 1.000, RMSE (train) = 0.013, RMSE (valid) = 0.015
building tree 1 of 1
 R^2 (train) = 0.999, R^2 (valid) = 1.000, RMSE (train) = 0.199, RMSE (valid) = 0.013
building tree 1 of 1
 R^2 (train) = 0.999, R^2 (valid) = 1.000, RMSE (train) = 0.265, RMSE (valid) = 0.020
building tree 1 of 1
 R^2 (train) = 0.999, R^2 (valid) = 1.000, RMSE (train) = 0.199, RMSE (valid) = 0.020
building tree 1 of 1
 R^2 (train) = 0.999, R^2 (valid) = 1.000, RMSE (train) = 0.238, RMSE (valid) = 0.021
building tree 1 of 1
 R^2 (train) = 1.000, R^2 (valid) = 1.000, RMSE (train) = 0.135, RMSE (valid) = 0.020
building tree 1 of 1
 R^2 (train) = 0.996, R^2 (valid) = 0.996, RMSE (train) = 0.546, RMSE (valid) = 0.630
building tree 1 of 1
 R^2 (train) = 0.989, R^2 (valid) = 0.981, RMSE (train) = 0.896, RMSE (valid) = 1.436
building tree 1 of 1
 R^2 (train) = 0.991, R^2 (valid) = 0.985, RMSE (train) = 0.797, RMSE (valid) = 1.278
building tree 1 of 1
 R^2 (train) = 0

[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   3 out of   3 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   6 out of   6 | elapsed:    0.0s finished
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   3 out of   3 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   6 out of   6 | elapsed:    0.0s finished
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   3 out of   3 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   6 out of   6 | elapsed:    0.0s finished
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.0s remaining

[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   3 out of   3 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=1)]: Done  16 out of  16 | elapsed:    0.0s finished
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   3 out of   3 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=1)]: Done  16 out of  16 | elapsed:    0.0s finished
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   3 out of   3 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=1)]: Done  16 out of  16 | elapsed:    0.0s finished
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.0s remaining

 R^2 (train) = 0.997, R^2 (valid) = 0.995, RMSE (train) = 0.499, RMSE (valid) = 0.725
building tree 1 of 1
 R^2 (train) = 0.997, R^2 (valid) = 0.996, RMSE (train) = 0.462, RMSE (valid) = 0.684
building tree 1 of 1
 R^2 (train) = 0.997, R^2 (valid) = 0.996, RMSE (train) = 0.436, RMSE (valid) = 0.647
building tree 1 of 1
 R^2 (train) = 0.998, R^2 (valid) = 0.997, RMSE (train) = 0.413, RMSE (valid) = 0.614
building tree 1 of 1
 R^2 (train) = 0.998, R^2 (valid) = 0.997, RMSE (train) = 0.387, RMSE (valid) = 0.584
building tree 1 of 1
 R^2 (train) = 0.998, R^2 (valid) = 0.997, RMSE (train) = 0.369, RMSE (valid) = 0.557
building tree 1 of 1
 R^2 (train) = 0.996, R^2 (valid) = 0.992, RMSE (train) = 0.523, RMSE (valid) = 0.909
building tree 1 of 1
 R^2 (train) = 0.996, R^2 (valid) = 0.993, RMSE (train) = 0.522, RMSE (valid) = 0.871
building tree 1 of 1
 R^2 (train) = 0.997, R^2 (valid) = 0.994, RMSE (train) = 0.500, RMSE (valid) = 0.835
building tree 1 of 1
 R^2 (train) = 0.997, R^2 (valid) = 0

[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   3 out of   3 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=1)]: Done  22 out of  22 | elapsed:    0.0s finished
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   3 out of   3 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=1)]: Done  22 out of  22 | elapsed:    0.0s finished
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.0s finished
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   3 out of   3 | elapsed:    0.0s remaining

 R^2 (train) = 0.997, R^2 (valid) = 0.995, RMSE (train) = 0.462, RMSE (valid) = 0.772
building tree 1 of 1
 R^2 (train) = 0.997, R^2 (valid) = 0.995, RMSE (train) = 0.445, RMSE (valid) = 0.744
building tree 1 of 1


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.0s finished
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   3 out of   3 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=1)]: Done  28 out of  28 | elapsed:    0.0s finished
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   3 out of   3 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=1)]: Done  28 out of  28 | elapsed:    0.0s finished
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   3 out of   3 | elapsed:    0.0s remaining

 R^2 (train) = 0.996, R^2 (valid) = 0.990, RMSE (train) = 0.530, RMSE (valid) = 1.020
building tree 1 of 1
 R^2 (train) = 0.996, R^2 (valid) = 0.991, RMSE (train) = 0.531, RMSE (valid) = 0.985
building tree 1 of 1
 R^2 (train) = 0.996, R^2 (valid) = 0.992, RMSE (train) = 0.513, RMSE (valid) = 0.953
building tree 1 of 1
 R^2 (train) = 0.996, R^2 (valid) = 0.992, RMSE (train) = 0.520, RMSE (valid) = 0.922
building tree 1 of 1
 R^2 (train) = 0.997, R^2 (valid) = 0.993, RMSE (train) = 0.504, RMSE (valid) = 0.894
building tree 1 of 1
 R^2 (train) = 0.995, R^2 (valid) = 0.988, RMSE (train) = 0.576, RMSE (valid) = 1.125
building tree 1 of 1
 R^2 (train) = 0.995, R^2 (valid) = 0.989, RMSE (train) = 0.581, RMSE (valid) = 1.092
building tree 1 of 1
 R^2 (train) = 0.996, R^2 (valid) = 0.990, RMSE (train) = 0.564, RMSE (valid) = 1.061
building tree 1 of 1


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   3 out of   3 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=1)]: Done  32 out of  32 | elapsed:    0.0s finished
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   3 out of   3 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=1)]: Done  32 out of  32 | elapsed:    0.0s finished
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   3 out of   3 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=1)]: Done  32 out of  32 | elapsed:    0.0s finished
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.0s remaining

 R^2 (train) = 0.996, R^2 (valid) = 0.990, RMSE (train) = 0.549, RMSE (valid) = 1.032
building tree 1 of 1
 R^2 (train) = 0.996, R^2 (valid) = 0.991, RMSE (train) = 0.557, RMSE (valid) = 1.005
building tree 1 of 1
 R^2 (train) = 0.996, R^2 (valid) = 0.991, RMSE (train) = 0.542, RMSE (valid) = 0.979
building tree 1 of 1
 R^2 (train) = 0.996, R^2 (valid) = 0.992, RMSE (train) = 0.528, RMSE (valid) = 0.955


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.0s finished
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   3 out of   3 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=1)]: Done  37 out of  37 | elapsed:    0.0s finished
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   3 out of   3 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=1)]: Done  37 out of  37 | elapsed:    0.0s finished
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   3 out of   3 | elapsed:    0.0s remaining

In [163]:
print("RMSE of the model: ", best_rmse)

RMSE of the model:  0.954509525118


In [174]:
sampleds = pd.DataFrame(featuredDataset, columns=(columns_X + columns_y))
sampleds = sampleds.sample(10)
sampleds

Unnamed: 0,time_num,time_sin,time_cos,day_num,start_lat,start_lon,end_lat,end_lon
843,0.685417,-0.918791,-0.394744,0.383631,47.408752,8.54187,47.425232,8.552856
733,0.670139,-0.876727,-0.480989,0.381448,47.408752,8.54187,47.425232,8.552856
213,0.564583,-0.394744,-0.918791,0.366369,47.408752,8.54187,47.425232,8.552856
338,0.586806,-0.518773,-0.854912,0.369544,47.408752,8.54187,47.425232,8.552856
512,0.640972,-0.774393,-0.632705,0.377282,47.408752,8.54187,47.425232,8.552856
513,0.640972,-0.774393,-0.632705,0.377282,47.408752,8.54187,47.425232,8.552856
986,0.707639,-0.964787,-0.263031,0.386806,47.408752,8.54187,47.425232,8.552856
395,0.597917,-0.577145,-0.816642,0.371131,47.408752,8.54187,47.425232,8.552856
1527,0.020139,0.126199,0.992005,0.288591,32.989197,-97.267456,32.857361,-97.300415
1333,0.71875,-0.980785,-0.19509,0.388393,47.408752,8.54187,47.425232,8.552856


In [175]:
y_pred = reg.predict(sampleds.iloc[:,:-2])
y_pred

[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   3 out of   3 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=1)]: Done  39 out of  39 | elapsed:    0.0s finished


array([[ 47.42523193,   8.55285645],
       [ 47.42523193,   8.55285645],
       [ 47.42523193,   8.55285645],
       [ 47.42523193,   8.55285645],
       [ 47.42523193,   8.55285645],
       [ 47.42523193,   8.55285645],
       [ 47.42523193,   8.55285645],
       [ 47.42523193,   8.55285645],
       [ 33.28089787, -94.4338285 ],
       [ 47.42523193,   8.55285645]])

### Save the model
We save the tra

In [176]:
from sklearn.externals import joblib
joblib.dump(reg, 'trained_model.pkl') 

['trained_model.pkl']