In [1]:
%matplotlib inline

import numpy as np
import scipy as sp
import matplotlib as mpl
import matplotlib.cm as cm
import matplotlib.pyplot as plt
import pandas as pd
pd.set_option('display.width', 500)
pd.set_option('display.max_columns', 100)
pd.set_option('display.notebook_repr_html', True)
import seaborn as sns
sns.set_style("whitegrid")
sns.set_context("poster")
from sklearn.grid_search import GridSearchCV
from sklearn.cross_validation import train_test_split
from sklearn.metrics import confusion_matrix
from sklearn.ensemble import RandomForestRegressor
from sklearn.pipeline import Pipeline
from sklearn.cross_validation import cross_val_score
import geohash
from sklearn.metrics import mean_squared_error
import random
import math
import datetime


### Goal: Explore the possiblity of predicting a destination (lat/long) based on the pickup location, time & day

## 1. Load the aggregated data from the CSV file

In [19]:
# Each line is of the format:

#pickupGeohash, dropOffGeohash,time_num,day_of_week, count
# File is also available here: https://s3.amazonaws.com/testsetu/nyc/final/groupbydestn/singlefile/part-00000
names = ["pickup_geohash","dropoff_geohash","time_num","day_of_week", "count"]
df=pd.read_csv("./tmplocaldata/final/singlefile/part-00000", header=None, names = names)
#df = df.sample(n=10000000,weights=df['count'], replace=True)
print df.shape

(15285988, 5)


## 2. Feature Extraction

In [57]:
# Get the longitude and latitude from the geohash
def decodegeo(geo, which):
    if len(geo) >= 6:
        geodecoded = geohash.decode(geo)
        return geodecoded[which]
    else:
        return 0
    
def further_data_prep(df):
  
    df['time_sin'] = (df['time_num'] * 2 * math.pi).apply(math.sin)
    df['time_cos'] = (df['time_num'] * 2 * math.pi).apply(math.cos)
    df['pickup_lat'] = df['pickup_geohash'].apply(lambda geo: decodegeo(geo, 0))
    df['pickup_long'] = df['pickup_geohash'].apply(lambda geo: decodegeo(geo, 1))
    df['dropoff_lat'] = df['dropoff_geohash'].apply(lambda geo: decodegeo(geo, 0))
    df['dropoff_long'] = df['dropoff_geohash'].apply(lambda geo: decodegeo(geo, 1))
    
    return df

In [58]:
df = further_data_prep(df)

## 3. Train-test split

In [59]:
trainSetSampleSize = 10000000
testSetSize = 2000000
testSetPosns = np.random.choice(df.shape[0],testSetSize, replace = False)

In [60]:
testSet = df.iloc[testSetPosns]
trainSet = df.drop(testSetPosns)

In [61]:
print df.shape
print trainSet.shape
print testSet.shape

(15285988, 11)
(13285988, 11)
(2000000, 11)


In [62]:
#sample with replacement
trainSet = trainSet.sample(n=trainSetSampleSize,weights=trainSet['count'], replace=True)

In [63]:
X_train = trainSet[['time_num', 'time_sin', 'time_cos','day_of_week', 'pickup_lat', 'pickup_long', 'count']]
y_train = trainSet[['dropoff_lat', 'dropoff_long']]

X_test = testSet[['time_num', 'time_sin', 'time_cos','day_of_week', 'pickup_lat', 'pickup_long', 'count']]
y_test = testSet[['dropoff_lat', 'dropoff_long']]

In [64]:
pickup_count_train = X_train[['count']]
X_train.drop('count', axis=1, inplace=True)

pickup_count_test = X_test[['count']]
X_test.drop('count', axis=1, inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  from ipykernel import kernelapp as app
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


## 4. Machine learning

In [72]:
reg = RandomForestRegressor(n_estimators=1, max_depth=20, n_jobs=-1, verbose=4, warm_start=True)
for n in range(1,20):
   reg.set_params(n_estimators=n)
   reg.fit(X_train,y_train) #, sample_weight=pickup_count_train.values)
   training_accuracy = reg.score(X_train, y_train)
   valid_accuracy = reg.score(X_test, y_test)
   rmsetrain = np.sqrt(mean_squared_error(reg.predict(X_train),y_train))
   rmsevalid = np.sqrt(mean_squared_error(reg.predict(X_test),y_test))
   print " R^2 (train) = %0.3f, R^2 (valid) = %0.3f, RMSE (train) = %0.3f, RMSE (valid) = %0.3f" % (training_accuracy, valid_accuracy, rmsetrain, rmsevalid)

[Parallel(n_jobs=-1)]: Done   1 out of   1 | elapsed:   41.5s remaining:    0.0s
[Parallel(n_jobs=-1)]: Done   1 out of   1 | elapsed:   41.5s finished
[Parallel(n_jobs=1)]: Done   1 jobs       | elapsed:    2.0s
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    2.0s finished
[Parallel(n_jobs=1)]: Done   1 jobs       | elapsed:    0.4s
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.4s finished
[Parallel(n_jobs=1)]: Done   1 jobs       | elapsed:    1.9s
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    1.9s finished
[Parallel(n_jobs=1)]: Done   1 jobs       | elapsed:    0.4s


building tree 1 of 1
 R^2 (train) = 0.234, R^2 (valid) = -0.057, RMSE (train) = 0.036, RMSE (valid) = 0.128

[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.4s finished
[Parallel(n_jobs=-1)]: Done   1 out of   1 | elapsed:   41.2s remaining:    0.0s
[Parallel(n_jobs=-1)]: Done   1 out of   1 | elapsed:   41.2s finished
[Parallel(n_jobs=2)]: Done   1 out of   2 | elapsed:    2.0s remaining:    2.0s
[Parallel(n_jobs=2)]: Done   2 out of   2 | elapsed:    2.0s finished
[Parallel(n_jobs=2)]: Done   1 out of   2 | elapsed:    0.5s remaining:    0.5s
[Parallel(n_jobs=2)]: Done   2 out of   2 | elapsed:    0.5s finished
[Parallel(n_jobs=2)]: Done   1 out of   2 | elapsed:    2.1s remaining:    2.1s
[Parallel(n_jobs=2)]: Done   2 out of   2 | elapsed:    2.1s finished
[Parallel(n_jobs=2)]: Done   1 out of   2 | elapsed:    0.5s remaining:    0.5s
[Parallel(n_jobs=2)]: Done   2 out of   2 | elapsed:    0.5s finished



building tree 1 of 1
 R^2 (train) = 0.275, R^2 (valid) = 0.016, RMSE (train) = 0.035, RMSE (valid) = 0.123

[Parallel(n_jobs=-1)]: Done   1 out of   1 | elapsed:   39.4s remaining:    0.0s
[Parallel(n_jobs=-1)]: Done   1 out of   1 | elapsed:   39.4s finished
[Parallel(n_jobs=3)]: Done   2 out of   3 | elapsed:    2.2s remaining:    1.1s
[Parallel(n_jobs=3)]: Done   1 out of   3 | elapsed:    2.3s remaining:    4.7s
[Parallel(n_jobs=3)]: Done   3 out of   3 | elapsed:    2.3s finished
[Parallel(n_jobs=3)]: Done   1 out of   3 | elapsed:    0.5s remaining:    1.2s
[Parallel(n_jobs=3)]: Done   2 out of   3 | elapsed:    0.5s remaining:    0.2s
[Parallel(n_jobs=3)]: Done   3 out of   3 | elapsed:    0.6s finished
[Parallel(n_jobs=3)]: Done   2 out of   3 | elapsed:    2.2s remaining:    1.1s
[Parallel(n_jobs=3)]: Done   1 out of   3 | elapsed:    2.3s remaining:    4.7s
[Parallel(n_jobs=3)]: Done   3 out of   3 | elapsed:    2.3s finished
[Parallel(n_jobs=3)]: Done   2 out of   3 | elapsed:    0.5s remaining:    0.2s
[Parallel(n_jobs=3)]: Done   1 out of   3 | elapsed:    0.6s remaining:    1.3


building tree 1 of 1
 R^2 (train) = 0.283, R^2 (valid) = 0.035, RMSE (train) = 0.035, RMSE (valid) = 0.122

[Parallel(n_jobs=-1)]: Done   1 out of   1 | elapsed:   42.0s remaining:    0.0s
[Parallel(n_jobs=-1)]: Done   1 out of   1 | elapsed:   42.0s finished
[Parallel(n_jobs=4)]: Done   1 out of   4 | elapsed:    2.7s remaining:    8.3s
[Parallel(n_jobs=4)]: Done   2 out of   4 | elapsed:    2.7s remaining:    2.7s
[Parallel(n_jobs=4)]: Done   4 out of   4 | elapsed:    2.7s finished
[Parallel(n_jobs=4)]: Done   2 out of   4 | elapsed:    0.6s remaining:    0.6s
[Parallel(n_jobs=4)]: Done   1 out of   4 | elapsed:    0.7s remaining:    2.3s
[Parallel(n_jobs=4)]: Done   4 out of   4 | elapsed:    0.7s finished
[Parallel(n_jobs=4)]: Done   2 out of   4 | elapsed:    2.5s remaining:    2.5s
[Parallel(n_jobs=4)]: Done   1 out of   4 | elapsed:    2.6s remaining:    7.9s
[Parallel(n_jobs=4)]: Done   4 out of   4 | elapsed:    2.6s finished



building tree 1 of 1
 R^2 (train) = 0.286, R^2 (valid) = 0.055, RMSE (train) = 0.035, RMSE (valid) = 0.121

[Parallel(n_jobs=4)]: Done   2 out of   4 | elapsed:    0.7s remaining:    0.7s
[Parallel(n_jobs=4)]: Done   1 out of   4 | elapsed:    0.7s remaining:    2.3s
[Parallel(n_jobs=4)]: Done   4 out of   4 | elapsed:    0.7s finished
[Parallel(n_jobs=-1)]: Done   1 out of   1 | elapsed:   39.2s remaining:    0.0s
[Parallel(n_jobs=-1)]: Done   1 out of   1 | elapsed:   39.2s finished
[Parallel(n_jobs=5)]: Done   1 out of   5 | elapsed:    2.9s remaining:   11.8s
[Parallel(n_jobs=5)]: Done   3 out of   5 | elapsed:    2.9s remaining:    1.9s
[Parallel(n_jobs=5)]: Done   5 out of   5 | elapsed:    3.0s finished
[Parallel(n_jobs=5)]: Done   3 out of   5 | elapsed:    0.7s remaining:    0.5s
[Parallel(n_jobs=5)]: Done   1 out of   5 | elapsed:    0.8s remaining:    3.4s
[Parallel(n_jobs=5)]: Done   5 out of   5 | elapsed:    0.8s finished
[Parallel(n_jobs=5)]: Done   3 out of   5 | elapsed:    2.9s remaining:    1.9s
[Parallel(n_jobs=5)]: Done   1 out of   5 | elapsed:    3.0s remaining:   12.3


building tree 1 of 1
 R^2 (train) = 0.286, R^2 (valid) = 0.068, RMSE (train) = 0.035, RMSE (valid) = 0.120

[Parallel(n_jobs=5)]: Done   3 out of   5 | elapsed:    0.7s remaining:    0.5s
[Parallel(n_jobs=5)]: Done   1 out of   5 | elapsed:    0.7s remaining:    3.3s
[Parallel(n_jobs=5)]: Done   5 out of   5 | elapsed:    0.7s finished
[Parallel(n_jobs=-1)]: Done   1 out of   1 | elapsed:   41.4s remaining:    0.0s
[Parallel(n_jobs=-1)]: Done   1 out of   1 | elapsed:   41.4s finished
[Parallel(n_jobs=6)]: Done   2 out of   6 | elapsed:    2.8s remaining:    5.7s
[Parallel(n_jobs=6)]: Done   4 out of   6 | elapsed:    2.9s remaining:    1.4s
[Parallel(n_jobs=6)]: Done   1 out of   6 | elapsed:    3.0s remaining:   15.5s
[Parallel(n_jobs=6)]: Done   6 out of   6 | elapsed:    3.1s finished
[Parallel(n_jobs=6)]: Done   4 out of   6 | elapsed:    0.7s remaining:    0.3s
[Parallel(n_jobs=6)]: Done   2 out of   6 | elapsed:    0.7s remaining:    1.5s
[Parallel(n_jobs=6)]: Done   1 out of   6 | elapsed:    0.8s remaining:    4.4s
[Parallel(n_jobs=6)]: Done   6 out of   6 | elapsed:    0.9s finishe


building tree 1 of 1
 R^2 (train) = 0.288, R^2 (valid) = 0.068, RMSE (train) = 0.035, RMSE (valid) = 0.120

[Parallel(n_jobs=-1)]: Done   1 out of   1 | elapsed:   41.9s remaining:    0.0s
[Parallel(n_jobs=-1)]: Done   1 out of   1 | elapsed:   41.9s finished
[Parallel(n_jobs=7)]: Done   3 out of   7 | elapsed:    3.4s remaining:    4.6s
[Parallel(n_jobs=7)]: Done   1 out of   7 | elapsed:    3.4s remaining:   21.2s
[Parallel(n_jobs=7)]: Done   5 out of   7 | elapsed:    3.5s remaining:    1.3s
[Parallel(n_jobs=7)]: Done   7 out of   7 | elapsed:    3.5s finished
[Parallel(n_jobs=7)]: Done   3 out of   7 | elapsed:    0.8s remaining:    1.1s
[Parallel(n_jobs=7)]: Done   5 out of   7 | elapsed:    0.9s remaining:    0.3s
[Parallel(n_jobs=7)]: Done   1 out of   7 | elapsed:    0.9s remaining:    5.9s
[Parallel(n_jobs=7)]: Done   7 out of   7 | elapsed:    0.9s finished
[Parallel(n_jobs=7)]: Done   3 out of   7 | elapsed:    3.4s remaining:    4.5s
[Parallel(n_jobs=7)]: Done   1 out of   7 | elapsed:    3.4s remaining:   20.8s
[Parallel(n_jobs=7)]: Done   5 out of   7 | elapsed:    3.4s remaini


building tree 1 of 1
 R^2 (train) = 0.288, R^2 (valid) = 0.069, RMSE (train) = 0.035, RMSE (valid) = 0.120

[Parallel(n_jobs=-1)]: Done   1 out of   1 | elapsed:   41.5s remaining:    0.0s
[Parallel(n_jobs=-1)]: Done   1 out of   1 | elapsed:   41.5s finished
[Parallel(n_jobs=8)]: Done   7 out of   8 | elapsed:    3.5s remaining:    0.4s
[Parallel(n_jobs=8)]: Done   4 out of   8 | elapsed:    3.9s remaining:    3.9s
[Parallel(n_jobs=8)]: Done   1 out of   8 | elapsed:    4.2s remaining:   30.3s
[Parallel(n_jobs=8)]: Done   8 out of   8 | elapsed:    4.2s finished
[Parallel(n_jobs=8)]: Done   4 out of   8 | elapsed:    0.9s remaining:    0.9s
[Parallel(n_jobs=8)]: Done   1 out of   8 | elapsed:    1.0s remaining:    7.5s
[Parallel(n_jobs=8)]: Done   7 out of   8 | elapsed:    1.0s remaining:    0.1s
[Parallel(n_jobs=8)]: Done   8 out of   8 | elapsed:    1.0s finished
[Parallel(n_jobs=8)]: Done   4 out of   8 | elapsed:    3.5s remaining:    3.5s
[Parallel(n_jobs=8)]: Done   1 out of   8 | elapsed:    3.9s remaining:   27.7s
[Parallel(n_jobs=8)]: Done   7 out of   8 | elapsed:    3.9s remaini


building tree 1 of 1
 R^2 (train) = 0.289, R^2 (valid) = 0.069, RMSE (train) = 0.035, RMSE (valid) = 0.120

[Parallel(n_jobs=-1)]: Done   1 out of   1 | elapsed:   41.4s remaining:    0.0s
[Parallel(n_jobs=-1)]: Done   1 out of   1 | elapsed:   41.4s finished
[Parallel(n_jobs=8)]: Done   1 out of   9 | elapsed:    3.7s remaining:   30.7s
[Parallel(n_jobs=8)]: Done   8 out of   9 | elapsed:    3.9s remaining:    0.4s
[Parallel(n_jobs=8)]: Done   2 out of   9 | elapsed:    3.9s remaining:   13.8s
[Parallel(n_jobs=8)]: Done   5 out of   9 | elapsed:    3.9s remaining:    3.1s
[Parallel(n_jobs=8)]: Done   9 out of   9 | elapsed:    5.9s finished
[Parallel(n_jobs=8)]: Done   2 out of   9 | elapsed:    1.0s remaining:    3.6s
[Parallel(n_jobs=8)]: Done   5 out of   9 | elapsed:    1.0s remaining:    0.8s
[Parallel(n_jobs=8)]: Done   8 out of   9 | elapsed:    1.0s remaining:    0.0s
[Parallel(n_jobs=8)]: Done   1 out of   9 | elapsed:    1.0s remaining:    8.6s
[Parallel(n_jobs=8)]: Done   9 out of   9 | elapsed:    1.4s finished
[Parallel(n_jobs=8)]: Done   2 out of   9 | elapsed:    3.7s remaini


building tree 1 of 1
 R^2 (train) = 0.289, R^2 (valid) = 0.067, RMSE (train) = 0.035, RMSE (valid) = 0.120

[Parallel(n_jobs=-1)]: Done   1 out of   1 | elapsed:   41.8s remaining:    0.0s
[Parallel(n_jobs=-1)]: Done   1 out of   1 | elapsed:   41.8s finished
[Parallel(n_jobs=8)]: Done   6 out of  10 | elapsed:    3.9s remaining:    2.5s
[Parallel(n_jobs=8)]: Done   1 out of  10 | elapsed:    4.0s remaining:   36.4s
[Parallel(n_jobs=8)]: Done   3 out of  10 | elapsed:    4.2s remaining:    9.9s
[Parallel(n_jobs=8)]: Done   9 out of  10 | elapsed:    6.0s remaining:    0.6s
[Parallel(n_jobs=8)]: Done  10 out of  10 | elapsed:    6.2s finished
[Parallel(n_jobs=8)]: Done   6 out of  10 | elapsed:    0.9s remaining:    0.6s
[Parallel(n_jobs=8)]: Done   1 out of  10 | elapsed:    0.9s remaining:    9.3s
[Parallel(n_jobs=8)]: Done   3 out of  10 | elapsed:    1.2s remaining:    3.0s
[Parallel(n_jobs=8)]: Done   9 out of  10 | elapsed:    1.6s remaining:    0.1s
[Parallel(n_jobs=8)]: Done  10 out of  10 | elapsed:    1.6s finished
[Parallel(n_jobs=8)]: Done   3 out of  10 | elapsed:    3.8s remaini


building tree 1 of 1
 R^2 (train) = 0.289, R^2 (valid) = 0.065, RMSE (train) = 0.035, RMSE (valid) = 0.120

[Parallel(n_jobs=-1)]: Done   1 out of   1 | elapsed:   39.5s remaining:    0.0s
[Parallel(n_jobs=-1)]: Done   1 out of   1 | elapsed:   39.5s finished
[Parallel(n_jobs=8)]: Done   4 out of  11 | elapsed:    3.5s remaining:    6.1s
[Parallel(n_jobs=8)]: Done   1 out of  11 | elapsed:    3.7s remaining:   38.4s
[Parallel(n_jobs=8)]: Done   7 out of  11 | elapsed:    3.8s remaining:    2.1s
[Parallel(n_jobs=8)]: Done  10 out of  11 | elapsed:    6.0s remaining:    0.5s
[Parallel(n_jobs=8)]: Done  11 out of  11 | elapsed:    6.1s finished
[Parallel(n_jobs=8)]: Done   4 out of  11 | elapsed:    0.9s remaining:    1.6s
[Parallel(n_jobs=8)]: Done   7 out of  11 | elapsed:    0.9s remaining:    0.5s
[Parallel(n_jobs=8)]: Done   1 out of  11 | elapsed:    1.0s remaining:   10.4s
[Parallel(n_jobs=8)]: Done  10 out of  11 | elapsed:    1.6s remaining:    0.1s
[Parallel(n_jobs=8)]: Done  11 out of  11 | elapsed:    1.6s finished
[Parallel(n_jobs=8)]: Done   1 out of  11 | elapsed:    3.8s remaini


building tree 1 of 1
 R^2 (train) = 0.290, R^2 (valid) = 0.063, RMSE (train) = 0.035, RMSE (valid) = 0.120

[Parallel(n_jobs=-1)]: Done   1 out of   1 | elapsed:   41.4s remaining:    0.0s
[Parallel(n_jobs=-1)]: Done   1 out of   1 | elapsed:   41.4s finished
[Parallel(n_jobs=8)]: Done   6 out of  12 | elapsed:    3.7s remaining:    3.7s
[Parallel(n_jobs=8)]: Done   1 out of  12 | elapsed:    3.8s remaining:   42.5s
[Parallel(n_jobs=8)]: Done   2 out of  12 | elapsed:    3.8s remaining:   19.3s
[Parallel(n_jobs=8)]: Done  10 out of  12 | elapsed:    6.4s remaining:    1.2s
[Parallel(n_jobs=8)]: Done  12 out of  12 | elapsed:    6.5s finished
[Parallel(n_jobs=8)]: Done   1 out of  12 | elapsed:    0.9s remaining:   11.2s
[Parallel(n_jobs=8)]: Done   2 out of  12 | elapsed:    1.0s remaining:    5.2s
[Parallel(n_jobs=8)]: Done   6 out of  12 | elapsed:    1.0s remaining:    1.0s
[Parallel(n_jobs=8)]: Done  10 out of  12 | elapsed:    1.7s remaining:    0.3s
[Parallel(n_jobs=8)]: Done  12 out of  12 | elapsed:    1.7s finished
[Parallel(n_jobs=8)]: Done   6 out of  12 | elapsed:    3.6s remaini


building tree 1 of 1
 R^2 (train) = 0.290, R^2 (valid) = 0.063, RMSE (train) = 0.035, RMSE (valid) = 0.120

[Parallel(n_jobs=-1)]: Done   1 out of   1 | elapsed:   39.7s remaining:    0.0s
[Parallel(n_jobs=-1)]: Done   1 out of   1 | elapsed:   39.7s finished
[Parallel(n_jobs=8)]: Done   3 out of  13 | elapsed:    3.7s remaining:   12.7s
[Parallel(n_jobs=8)]: Done   7 out of  13 | elapsed:    3.8s remaining:    3.2s
[Parallel(n_jobs=8)]: Done   1 out of  13 | elapsed:    4.0s remaining:   48.9s
[Parallel(n_jobs=8)]: Done  11 out of  13 | elapsed:    6.8s remaining:    1.2s
[Parallel(n_jobs=8)]: Done  13 out of  13 | elapsed:    6.8s finished
[Parallel(n_jobs=8)]: Done   3 out of  13 | elapsed:    0.9s remaining:    3.2s
[Parallel(n_jobs=8)]: Done   7 out of  13 | elapsed:    1.0s remaining:    0.8s
[Parallel(n_jobs=8)]: Done   1 out of  13 | elapsed:    1.0s remaining:   12.5s
[Parallel(n_jobs=8)]: Done  11 out of  13 | elapsed:    1.7s remaining:    0.2s
[Parallel(n_jobs=8)]: Done  13 out of  13 | elapsed:    1.7s finished
[Parallel(n_jobs=8)]: Done   1 out of  13 | elapsed:    3.8s remaini


building tree 1 of 1
 R^2 (train) = 0.291, R^2 (valid) = 0.062, RMSE (train) = 0.035, RMSE (valid) = 0.120

[Parallel(n_jobs=-1)]: Done   1 out of   1 | elapsed:   39.0s remaining:    0.0s
[Parallel(n_jobs=-1)]: Done   1 out of   1 | elapsed:   39.0s finished
[Parallel(n_jobs=8)]: Done   4 out of  14 | elapsed:    3.7s remaining:    9.4s
[Parallel(n_jobs=8)]: Done   8 out of  14 | elapsed:    3.8s remaining:    2.8s
[Parallel(n_jobs=8)]: Done   1 out of  14 | elapsed:    3.8s remaining:   51.1s
[Parallel(n_jobs=8)]: Done  12 out of  14 | elapsed:    6.9s remaining:    1.1s
[Parallel(n_jobs=8)]: Done  14 out of  14 | elapsed:    7.0s finished
[Parallel(n_jobs=8)]: Done   4 out of  14 | elapsed:    0.9s remaining:    2.3s
[Parallel(n_jobs=8)]: Done   1 out of  14 | elapsed:    0.9s remaining:   13.1s
[Parallel(n_jobs=8)]: Done   8 out of  14 | elapsed:    1.0s remaining:    0.7s
[Parallel(n_jobs=8)]: Done  12 out of  14 | elapsed:    1.7s remaining:    0.2s
[Parallel(n_jobs=8)]: Done  14 out of  14 | elapsed:    1.8s finished
[Parallel(n_jobs=8)]: Done   4 out of  14 | elapsed:    3.8s remaini


building tree 1 of 1
 R^2 (train) = 0.291, R^2 (valid) = 0.063, RMSE (train) = 0.035, RMSE (valid) = 0.120

[Parallel(n_jobs=-1)]: Done   1 out of   1 | elapsed:   40.8s remaining:    0.0s
[Parallel(n_jobs=-1)]: Done   1 out of   1 | elapsed:   40.8s finished
[Parallel(n_jobs=8)]: Done   1 out of  15 | elapsed:    3.7s remaining:   52.7s
[Parallel(n_jobs=8)]: Done   5 out of  15 | elapsed:    3.8s remaining:    7.6s
[Parallel(n_jobs=8)]: Done   9 out of  15 | elapsed:    7.1s remaining:    4.7s
[Parallel(n_jobs=8)]: Done  13 out of  15 | elapsed:    7.1s remaining:    1.0s
[Parallel(n_jobs=8)]: Done  15 out of  15 | elapsed:    7.2s finished
[Parallel(n_jobs=8)]: Done   5 out of  15 | elapsed:    1.0s remaining:    2.1s
[Parallel(n_jobs=8)]: Done   1 out of  15 | elapsed:    1.0s remaining:   15.1s
[Parallel(n_jobs=8)]: Done   9 out of  15 | elapsed:    1.8s remaining:    1.1s
[Parallel(n_jobs=8)]: Done  13 out of  15 | elapsed:    1.9s remaining:    0.2s
[Parallel(n_jobs=8)]: Done  15 out of  15 | elapsed:    1.9s finished
[Parallel(n_jobs=8)]: Done   1 out of  15 | elapsed:    3.8s remaini


building tree 1 of 1
 R^2 (train) = 0.291, R^2 (valid) = 0.063, RMSE (train) = 0.035, RMSE (valid) = 0.120

[Parallel(n_jobs=-1)]: Done   1 out of   1 | elapsed:   41.1s remaining:    0.0s
[Parallel(n_jobs=-1)]: Done   1 out of   1 | elapsed:   41.1s finished
[Parallel(n_jobs=8)]: Done   8 out of  16 | elapsed:    3.8s remaining:    3.8s
[Parallel(n_jobs=8)]: Done   1 out of  16 | elapsed:    3.9s remaining:  1.0min
[Parallel(n_jobs=8)]: Done   3 out of  16 | elapsed:    4.1s remaining:   18.1s
[Parallel(n_jobs=8)]: Done  13 out of  16 | elapsed:    7.5s remaining:    1.6s
[Parallel(n_jobs=8)]: Done  16 out of  16 | elapsed:    7.5s finished
[Parallel(n_jobs=8)]: Done   3 out of  16 | elapsed:    1.0s remaining:    4.6s
[Parallel(n_jobs=8)]: Done   1 out of  16 | elapsed:    1.0s remaining:   16.1s
[Parallel(n_jobs=8)]: Done   8 out of  16 | elapsed:    1.0s remaining:    1.0s
[Parallel(n_jobs=8)]: Done  13 out of  16 | elapsed:    1.9s remaining:    0.4s
[Parallel(n_jobs=8)]: Done  16 out of  16 | elapsed:    2.0s finished
[Parallel(n_jobs=8)]: Done   3 out of  16 | elapsed:    3.7s remaini


building tree 1 of 1
 R^2 (train) = 0.291, R^2 (valid) = 0.065, RMSE (train) = 0.035, RMSE (valid) = 0.120

[Parallel(n_jobs=-1)]: Done   1 out of   1 | elapsed:   39.4s remaining:    0.0s
[Parallel(n_jobs=-1)]: Done   1 out of   1 | elapsed:   39.4s finished
[Parallel(n_jobs=8)]: Done   4 out of  17 | elapsed:    3.5s remaining:   11.7s
[Parallel(n_jobs=8)]: Done   1 out of  17 | elapsed:    3.8s remaining:  1.0min
[Parallel(n_jobs=8)]: Done   9 out of  17 | elapsed:    7.1s remaining:    6.3s
[Parallel(n_jobs=8)]: Done  14 out of  17 | elapsed:    7.5s remaining:    1.5s
[Parallel(n_jobs=8)]: Done  17 out of  17 | elapsed:    9.3s finished
[Parallel(n_jobs=8)]: Done   4 out of  17 | elapsed:    0.8s remaining:    2.8s
[Parallel(n_jobs=8)]: Done   1 out of  17 | elapsed:    1.0s remaining:   18.2s
[Parallel(n_jobs=8)]: Done   9 out of  17 | elapsed:    1.8s remaining:    1.6s
[Parallel(n_jobs=8)]: Done  14 out of  17 | elapsed:    1.9s remaining:    0.3s
[Parallel(n_jobs=8)]: Done  17 out of  17 | elapsed:    2.4s finished
[Parallel(n_jobs=8)]: Done   4 out of  17 | elapsed:    3.6s remaini


building tree 1 of 1
 R^2 (train) = 0.291, R^2 (valid) = 0.065, RMSE (train) = 0.035, RMSE (valid) = 0.120

[Parallel(n_jobs=-1)]: Done   1 out of   1 | elapsed:   39.1s remaining:    0.0s
[Parallel(n_jobs=-1)]: Done   1 out of   1 | elapsed:   39.1s finished
[Parallel(n_jobs=8)]: Done   1 out of  18 | elapsed:    3.9s remaining:  1.1min
[Parallel(n_jobs=8)]: Done   5 out of  18 | elapsed:    4.0s remaining:   10.7s
[Parallel(n_jobs=8)]: Done  10 out of  18 | elapsed:    7.6s remaining:    6.1s
[Parallel(n_jobs=8)]: Done  15 out of  18 | elapsed:    7.9s remaining:    1.5s
[Parallel(n_jobs=8)]: Done  18 out of  18 | elapsed:    9.8s finished
[Parallel(n_jobs=8)]: Done   1 out of  18 | elapsed:    0.9s remaining:   16.8s
[Parallel(n_jobs=8)]: Done   5 out of  18 | elapsed:    1.0s remaining:    2.7s
[Parallel(n_jobs=8)]: Done  10 out of  18 | elapsed:    1.9s remaining:    1.5s
[Parallel(n_jobs=8)]: Done  15 out of  18 | elapsed:    2.0s remaining:    0.3s
[Parallel(n_jobs=8)]: Done  18 out of  18 | elapsed:    2.5s finished
[Parallel(n_jobs=8)]: Done   5 out of  18 | elapsed:    3.9s remaini


building tree 1 of 1
 R^2 (train) = 0.291, R^2 (valid) = 0.065, RMSE (train) = 0.035, RMSE (valid) = 0.120

[Parallel(n_jobs=-1)]: Done   1 out of   1 | elapsed:   40.1s remaining:    0.0s
[Parallel(n_jobs=-1)]: Done   1 out of   1 | elapsed:   40.1s finished
[Parallel(n_jobs=8)]: Done   6 out of  19 | elapsed:    3.9s remaining:    8.6s
[Parallel(n_jobs=8)]: Done   1 out of  19 | elapsed:    3.9s remaining:  1.2min
[Parallel(n_jobs=8)]: Done  11 out of  19 | elapsed:    7.5s remaining:    5.5s
[Parallel(n_jobs=8)]: Done  16 out of  19 | elapsed:    8.0s remaining:    1.4s
[Parallel(n_jobs=8)]: Done  19 out of  19 | elapsed:   10.3s finished
[Parallel(n_jobs=8)]: Done   6 out of  19 | elapsed:    1.1s remaining:    2.4s
[Parallel(n_jobs=8)]: Done   1 out of  19 | elapsed:    1.1s remaining:   20.9s
[Parallel(n_jobs=8)]: Done  11 out of  19 | elapsed:    2.1s remaining:    1.5s
[Parallel(n_jobs=8)]: Done  16 out of  19 | elapsed:    2.3s remaining:    0.3s
[Parallel(n_jobs=8)]: Done  19 out of  19 | elapsed:    2.8s finished
[Parallel(n_jobs=8)]: Done   6 out of  19 | elapsed:    3.8s remaini


building tree 1 of 1
 R^2 (train) = 0.291, R^2 (valid) = 0.062, RMSE (train) = 0.035, RMSE (valid) = 0.120


### Outcome

The best RMSE value that we got was 0.120. In NYC each longitude is approx 53 miles & latitude is approx 69 miles (see reference below). This gives an **error range of 6.36 x 8.28 square miles**. So we do not have a great predictor here (but a great learning experience in modeling this problem)

note: 1. We have not standardized lat & long in same scale - ideally this shd be done part of data prep. This would have given us a RMSE that we can apply properly to find exact error in distance.

#### Reference [link](http://geography.about.com/library/faq/blqzdistancedegree.htm)
Each degree of latitude is approximately 69 miles (111 kilometers) apart. The range varies (due to the earth's slightly ellipsoid shape) from 68.703 miles (110.567 km) at the equator to 69.407 (111.699 km) at the poles. 
A degree of longitude is widest at the equator at 69.172 miles (111.321) and gradually shrinks to zero at the poles. At 40° north or south the distance between a degree of longitude is 53 miles (85 km).