In [21]:
from sklearn.externals import joblib
from sklearn.model_selection import \
train_test_split, GridSearchCV, cross_val_score
from sklearn.ensemble import RandomForestRegressor, \
GradientBoostingRegressor
from sklearn.linear_model import LassoCV, \
ElasticNetCV
from sklearn.kernel_ridge import KernelRidge
import xgboost as xgb
from scipy import sparse
import numpy as np
import pandas as pd
from keras.models import load_model
from keras.layers import Input, Dense, \
BatchNormalization, Dropout, Activation
from keras.models import Model
from keras import backend as K
from tqdm import tqdm

# Motivation

My motivation behind this last processing step is that taxi journeys which start and end at similar points should have similar durations / trajectory lengths. Even though this might not always be true, after exploring the training dataset I observed that for some pairs of starting and ending points all taxi drivers take similar routes - meaning that the duration and trajlength values are the same. However, for other pairs of starting and ending points, the duration and trajlength vary a lot - I suspect that there might be a jam / an accident / a poor route taken by the taxi driver. That is where other information, such as the date and the taxi driver's characteristics, comes into play.

In the Stage 2 and Stage 3 models that we have created, we didn't consider the exact values of the starting point and ending point of the taxi ride - we only considered the neighbourhoods in which the starting point and ending point are located. Therefore, I want to try to use this information (the exact start and end points) to refine the predictions for the test data.

However, after exploring the data further, I found that only a small percentage of the starting and ending locations in the test data also appear in the training data. Therefore, I only refine the result for these particular observations of the test data.

In [2]:
from sklearn.metrics import make_scorer

def rmpse_loss_func(ground_truth, predictions):
    """Root mean squared percentage error between predictions and truth.

    Computes ``sqrt(mean((predictions / ground_truth - 1) ** 2))``.

    Parameters
    ----------
    ground_truth : array-like
        Observed target values (used as the divisor).
    predictions : array-like
        Predicted target values.

    Returns
    -------
    float
        The error; 0 when every prediction equals its ground truth.
    """
    relative_error = np.true_divide(predictions, ground_truth) - 1.
    return np.sqrt(np.mean(relative_error ** 2))

# Scorer built from rmpse_loss_func; greater_is_better=False declares it a
# loss (lower is better) to scikit-learn's model-selection tools.
rmpse_loss = make_scorer(rmpse_loss_func, greater_is_better=False)

def rmpse(preds, dtrain):
    """Evaluation metric in ``(preds, dtrain)`` form returning a named score.

    Parameters
    ----------
    preds : array-like
        Predicted values.
    dtrain : object
        Any object exposing ``get_label()`` that returns the true values
        (presumably an ``xgboost.DMatrix`` - confirm against the caller).

    Returns
    -------
    tuple
        ``('error', value)`` where ``value`` is
        ``sqrt(mean((preds / labels - 1) ** 2))``.
    """
    relative_error = np.true_divide(preds, dtrain.get_label()) - 1.
    return 'error', np.sqrt(np.mean(relative_error ** 2))

I will load the Stage 3 predictions for the test data.

In [3]:
# Raw training set and test set.
df_train = pd.read_csv('train_data.csv')
df_test = pd.read_csv('test.csv')
# Presumably the Stage 3 submission file (judging by its name).
# NOTE(review): df_target is not referenced again in the cells shown here.
df_target = pd.read_csv('stage_3_v2.csv')

In [4]:
# Stage 3 predictions for the test set, loaded from pickles.
# Presumably stored in log space (per the variable names) - TODO confirm.
pred_log_dur = joblib.load('Y_test_dur_pred_stage3v2.pkl')
pred_log_traj = joblib.load('Y_test_traj_pred_stage3v2.pkl')
# Undo the log so the values can be blended with raw training targets.
pred_dur = np.exp(pred_log_dur)
pred_traj = np.exp(pred_log_traj)
# Number of test rows; main() uses it to size its output arrays.
n_test = df_test.shape[0]

Once again, we will only use the non-outlier training data (defined in Stage 1) to refine the prediction for the test data

In [5]:
# Row labels of the training observations kept as non-outliers in Stage 1.
non_outlier_index_stage1 = joblib.load('non_outlier_index_stage1.pkl')

In [6]:
# Keep only the Stage 1 non-outlier rows, then renumber them from 0.
df_train = (
    df_train
    .loc[non_outlier_index_stage1, :]
    .reset_index(drop=True)
)
n_train = len(df_train)

In [10]:
# Pull the training coordinates and targets out as plain NumPy arrays so
# they can be compared element-wise inside the refinement loop.
(train_x_start, train_y_start,
 train_x_end, train_y_end,
 train_duration, train_trajlength) = (
    df_train[column].values
    for column in ('X_START', 'Y_START', 'X_END', 'Y_END',
                   'DURATION', 'TRAJ_LENGTH')
)

In [11]:
# Start / end coordinates of every test journey as plain NumPy arrays.
(test_x_start, test_y_start,
 test_x_end, test_y_end) = (
    df_test[column].values
    for column in ('X_START', 'Y_START', 'X_END', 'Y_END')
)

The way we refine our prediction is as follows:
    
- For each observation in the test dataset:
    - Check the `X_START`, `X_END`, `Y_START`, `Y_END` of the observation in the test dataset
    - Check whether training data with similar `X_START`, `X_END`, `Y_START`, `Y_END` exist. We define a similar point as one where each of these four coordinates is at most 1 degree away from the corresponding `X_START`, `X_END`, `Y_START`, `Y_END` of the test observation.
    - If it does not exist, we take the value of prediction from Stage 3 as the final prediction.
    - If it exists, we check the number of similar observations in the training data. We will use a weighted average of (a) the median of the duration and trajlength values from the similar training data and (b) the final prediction of duration and trajlength from Stage 3.
        - If there is only 1 similar observation in the training data, we will use a low weight for the trajlength / duration values from the training data. We are unsure about the accuracy of that value, as it might represent only one of multiple possible routes / it might be an outlier value.
        - If there is more than 1 similar observation in the training data, we will check the standard deviation of the duration values and the standard deviation of the trajlength values. If the standard deviation is high (above 10 in the code below), we assume that there are many routes which the taxi driver can take, and thus these values might not be accurate for refining our prediction. In that case, we will use a middle weight for the trajlength / duration values from the training data to refine our prediction of the trajlength / duration values from Stage 3. Lastly, if the standard deviation is small, we can assume that most taxi drivers take similar routes from the starting point to the destination point. In that case, we will use a high weight for the trajlength / duration values from the training data when combining them with the predicted trajlength / duration values from Stage 3.

In [12]:
def _blend_prediction(weight, neighbour_value, model_value):
    """Weighted average of a training-neighbour value and a model prediction."""
    return (weight * neighbour_value) + ((1. - weight) * model_value)


def main(param_low, param_med, param_high,
         match_radius=1.0, std_threshold=10.0):
    """Refine the Stage 3 test predictions using similar training journeys.

    For each test observation, the training rows whose four coordinates
    (start x/y, end x/y) each lie within ``match_radius`` of the test row's
    coordinates are collected. The final prediction is then:

    * no similar row: the Stage 3 prediction, unchanged;
    * exactly one similar row: a blend of that row's value (weight
      ``param_low``) and the Stage 3 prediction;
    * several similar rows: a blend of their median and the Stage 3
      prediction, weighted ``param_med`` when their standard deviation
      exceeds ``std_threshold`` and ``param_high`` otherwise. Duration and
      trajectory length are judged independently.

    Reads the module-level arrays ``test_x_start``, ``test_y_start``,
    ``test_x_end``, ``test_y_end``, ``train_x_start``, ``train_y_start``,
    ``train_x_end``, ``train_y_end``, ``train_duration``,
    ``train_trajlength``, ``pred_dur``, ``pred_traj`` and the count
    ``n_test``.

    Parameters
    ----------
    param_low, param_med, param_high : float
        Weights given to the training-derived value in the three cases
        above; the Stage 3 prediction receives ``1 - weight``.
    match_radius : float, optional
        Maximum absolute difference allowed in each coordinate for a
        training row to count as similar (default 1.0).
    std_threshold : float, optional
        Standard deviation above which the similar rows are considered
        too spread out to deserve ``param_high`` (default 10.0).

    Returns
    -------
    tuple of numpy.ndarray
        ``(final_pred_dur, final_pred_traj)``, each of length ``n_test``.
    """
    final_pred_dur = np.zeros(n_test)
    final_pred_traj = np.zeros(n_test)
    for idx in tqdm(range(n_test)):
        # Compare the signed coordinates directly. Taking the absolute value
        # of each coordinate before subtracting would also match points that
        # are mirrored across an axis (e.g. -5 and 5), which are not close.
        is_similar = (
            (np.abs(train_x_start - test_x_start[idx]) <= match_radius)
            & (np.abs(train_y_start - test_y_start[idx]) <= match_radius)
            & (np.abs(train_x_end - test_x_end[idx]) <= match_radius)
            & (np.abs(train_y_end - test_y_end[idx]) <= match_radius)
        )
        df_small_idx = np.where(is_similar)[0]
        md_pred_dur = pred_dur[idx]
        md_pred_traj = pred_traj[idx]
        n_similar = df_small_idx.shape[0]

        if n_similar == 0:
            # Nothing comparable in the training data: keep Stage 3 as is.
            final_pred_dur[idx] = md_pred_dur
            final_pred_traj[idx] = md_pred_traj
            continue

        all_nb_pred_dur = train_duration[df_small_idx]
        all_nb_pred_traj = train_trajlength[df_small_idx]

        if n_similar == 1:
            # A single neighbour may be an outlier or just one of several
            # possible routes, so it gets the lowest weight.
            weight_dur = weight_traj = param_low
        else:
            # Widely spread neighbours suggest several different routes,
            # so they are trusted less than tightly clustered ones.
            weight_dur = (param_med
                          if np.std(all_nb_pred_dur) > std_threshold
                          else param_high)
            weight_traj = (param_med
                           if np.std(all_nb_pred_traj) > std_threshold
                           else param_high)

        # The median of a single value is that value, returned as a scalar,
        # so a scalar (not a length-1 array) is stored in the output slot.
        final_pred_dur[idx] = _blend_prediction(
            weight_dur, np.median(all_nb_pred_dur), md_pred_dur)
        final_pred_traj[idx] = _blend_prediction(
            weight_traj, np.median(all_nb_pred_traj), md_pred_traj)
    return final_pred_dur, final_pred_traj

In [13]:
# Weights for the training-derived value: one match / spread-out matches /
# tightly clustered matches.
final_pred_dur, final_pred_traj = main(
    param_low=0.2, param_med=0.3, param_high=0.4)

100%|██████████| 465172/465172 [40:15<00:00, 192.56it/s]


In [14]:
# Persist the refined predictions (original scale) so the long-running
# refinement loop does not have to be repeated.
joblib.dump(final_pred_dur, 'final_pred_dur.pkl')
joblib.dump(final_pred_traj, 'final_pred_traj.pkl')

['final_pred_traj.pkl']

In [15]:
# Also store the refined predictions in log space, mirroring how the
# Stage 3 pickles are consumed in this notebook (np.exp applied on load).
Y_dur_stage4 = np.log(final_pred_dur)
Y_traj_stage4 = np.log(final_pred_traj)
joblib.dump(Y_dur_stage4, 'Y_dur_stage4v3.pkl')
joblib.dump(Y_traj_stage4, 'Y_traj_stage4v3.pkl')

['Y_traj_stage4v3.pkl']

In [16]:
# The submitted PRICE is the element-wise sum of the refined duration and
# the refined trajectory length.
final_pred_price = final_pred_dur + final_pred_traj

We can then submit our refined predictions to Kaggle.

In [17]:
# Build the submission file: one row per test ID with its refined PRICE.
test_id = pd.read_csv("test.csv")['ID'].values
data = {'ID': test_id, 'PRICE': final_pred_price}
submission_df = pd.DataFrame(data=data)
submission_df.to_csv("stage_4_v3.csv", index=False)

In [23]:
# Sanity check: read the written submission back and preview its first rows.
pd.read_csv('stage_4_v3.csv').head()

Unnamed: 0,ID,PRICE
0,465173,301.033741
1,465174,274.367411
2,465175,446.473332
3,465176,853.708869
4,465177,432.465489
