In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import datasist as ds
import warnings

# Silence every warning for the rest of the session (this also hides deprecation notices).
warnings.filterwarnings('ignore')
%matplotlib inline
# Allow up to 100 characters per cell when pandas renders a column.
pd.set_option('max_colwidth', 100)

In [None]:
# Read the five input CSV files from the working directory in one pass.
train, test, riders, sample, datadef = (
    pd.read_csv(csv_name)
    for csv_name in ("Train.csv", "Test.csv", "Riders.csv",
                     "SampleSubmission.csv", "VariableDefinitions.csv")
)

In [None]:
# Sanity check 1: every training row should carry its own 'Order No'.
order_ids_unique = train['Order No'].nunique() == train.shape[0]
if order_ids_unique:
    print('Id is unique.')
else:
    print('Id is not unique')

# Sanity check 2: no 'Order No' may appear in both the train and the test file.
shared_order_ids = np.intersect1d(train['Order No'], test['Order No'])
if len(shared_order_ids) == 0:
    print('Train and test sets are distinct.')
else:
    print('oops')

In [None]:
# Preview the first rows of the raw training frame.
train.head()

In [None]:
# Remove the two "Arrival at Destination" calendar columns from the training frame.
col_2_drop = [
    'Arrival at Destination - Day of Month',
    'Arrival at Destination - Weekday (Mo = 1)',
]
train = train.drop(columns=col_2_drop)

In [None]:
# Attach the rider attributes to each order; a left join keeps every order row.
train_merged = pd.merge(train, riders, how='left', on='Rider Id')
test_merged = pd.merge(test, riders, how='left', on='Rider Id')

In [None]:
# Report (rows, columns) of both merged frames.
print(f"shape of train data {train_merged.shape}")
print(f"shape of test data {test_merged.shape}")

In [None]:
# Summarise the merged training frame with datasist's describe helper.
ds.structdata.describe(train_merged)

In [None]:
# Ask datasist to drop redundant columns from each frame.
# NOTE(review): the return values are discarded, so this relies on datasist mutating
# the frame in place — confirm against the installed datasist version.
# NOTE(review): train and test are processed independently, so they may end up with
# different columns removed — verify.
ds.feature_engineering.drop_redundant(train_merged)
ds.feature_engineering.drop_redundant(test_merged)

In [None]:
# Drop columns via datasist's drop_missing with percent=80
# (presumably columns with at least 80% missing values — confirm the threshold semantics).
# NOTE(review): the return values are discarded, so this relies on datasist mutating
# the frame in place — confirm against the installed datasist version.
ds.feature_engineering.drop_missing(train_merged, percent=80)
ds.feature_engineering.drop_missing(test_merged, percent=80)

In [None]:
#Get columns
# Categorical and numerical feature names, as detected by datasist on the merged train frame.
cat_cols = ds.structdata.get_cat_feats(train_merged)
num_cols = ds.structdata.get_num_feats(train_merged)

# Time columns that a later cell parses with pd.to_datetime; that cell skips
# 'Arrival at Destination - Time' for the test frame.
time_cols = ['Pickup - Time', 'Arrival at Pickup - Time', 
             'Confirmation - Time', 'Placement - Time', 'Arrival at Destination - Time']

In [None]:
# Show the detected categorical column names.
cat_cols

Get information from date columns

In [None]:
# Preview the merged test frame before the time columns are parsed.
test_merged.head()

In [None]:
# Parse every time column to datetime. The arrival-at-destination column is
# converted for the training frame only.
for col in time_cols:
    train_merged[col] = pd.to_datetime(train_merged[col])
    if col != 'Arrival at Destination - Time':
        test_merged[col] = pd.to_datetime(test_merged[col])

Get total seconds from Arrival at Destination Time and use it to calculate statistics on the train set.

In [None]:
# Elapsed-time features, in seconds, between pairs of timestamps.
# Each entry is (new column, start timestamp column, end timestamp column); the order
# of the list fixes the order in which the new columns are appended to each frame.
duration_specs = [
    ('Time from Placement to Arrival', 'Placement - Time', 'Arrival at Destination - Time'),
    ('Time from Confirmation to Arrival', 'Confirmation - Time', 'Arrival at Destination - Time'),
    ('Time from Arrival Pickup to Arrival', 'Arrival at Pickup - Time', 'Arrival at Destination - Time'),
    ('Time From Placement to Confirmation', 'Placement - Time', 'Confirmation - Time'),
    ('Time From Placement to Arrival at Pickup', 'Placement - Time', 'Arrival at Pickup - Time'),
    ('Time From Placement to Pickup', 'Placement - Time', 'Pickup - Time'),
    ('Time From Arrival Pickup to Pickup', 'Arrival at Pickup - Time', 'Pickup - Time'),
    ('Time From Confirmation to Arrival Pickup', 'Confirmation - Time', 'Arrival at Pickup - Time'),
]

for frame in (train_merged, test_merged):
    for new_col, start_col, end_col in duration_specs:
        if end_col not in frame.columns:
            # The datetime-parsing cell never converts the arrival timestamp for the
            # test frame, so it is presumably absent there (TODO confirm). Indexing it
            # unconditionally raised a KeyError; keep the column as all-NaN instead so
            # train and test stay column-aligned and the later per-rider / per-user
            # means are driven by the training rows only.
            frame[new_col] = np.nan
            continue
        frame[new_col] = (frame[end_col] - frame[start_col]).dt.total_seconds()


In [None]:
#get hours, minute and seconds from time columns
# Build a fresh list rather than aliasing time_cols: `cols = time_cols` followed by
# `cols.remove(...)` silently mutated time_cols and raised ValueError when the cell
# was run a second time.
cols = [col for col in time_cols if col != 'Arrival at Destination - Time']
train_merged = ds.timeseries.extract_time(data=train_merged, time_cols=cols)
test_merged = ds.timeseries.extract_time(data=test_merged, time_cols=cols)

In [None]:
# Show missing-value information for the merged training frame.
ds.structdata.display_missing(train_merged)

In [None]:
# Show missing-value information for the merged test frame.
ds.structdata.display_missing(test_merged)

In [None]:
#fill missing
# Impute 'Temperature' in each frame using datasist's default strategy.
# NOTE(review): the return values are discarded, so this relies on datasist filling
# in place; train and test are imputed independently of each other — confirm both.
ds.feature_engineering.fill_missing_num(train_merged, features=['Temperature'])
ds.feature_engineering.fill_missing_num(test_merged, features=['Temperature'])

In [None]:
# The arrival timestamp has served its purpose for the duration features; remove it
# from the training frame.
train_merged = train_merged.drop(columns=['Arrival at Destination - Time'])

In [None]:
# (rows, columns) of the training frame after feature extraction.
train_merged.shape

In [None]:
# (rows, columns) of the test frame after feature extraction.
test_merged.shape

## Validation Strategy
First, let's compare the train and test sets. This helps us decide on a validation strategy and gives ideas for feature engineering.

In [None]:
# Number of records per pickup hour, one line per data set.
for frame, frame_label in ((train_merged, 'train'), (test_merged, 'test')):
    hourly_counts = frame.groupby('Pickup - Time_hours').count()[['Rider Id']]
    plt.plot(hourly_counts, '-o', label=frame_label)

plt.title('Train and test hours overlap.')
plt.legend(loc=0)
plt.ylabel('number of records')
plt.show()

In [None]:
# Number of records per pickup day of month, one line per data set.
for frame, frame_label in ((train_merged, 'train'), (test_merged, 'test')):
    daily_counts = frame.groupby('Pickup - Day of Month').count()[['Rider Id']]
    plt.plot(daily_counts, '-o', label=frame_label)

plt.title('Train and test day overlap.')
plt.legend(loc=0)
plt.ylabel('number of records')
plt.show()

In [None]:
# Side-by-side scatter of pickup locations for train (left) and test (right).
# Both coordinates are now read from the same merged frame: the original took the
# longitude from the merged frame but the latitude from the raw `train` / `test`
# variables, which are rebound to other objects later in the notebook, so the cell
# broke (or silently mispaired points) when re-run after that.
fig, ax = plt.subplots(ncols=2, sharex=True, sharey=True)
ax[0].scatter(train_merged['Pickup Long'].values, train_merged['Pickup Lat'].values,
              color='blue', s=1, label='train', alpha=0.1)

ax[1].scatter(test_merged['Pickup Long'].values, test_merged['Pickup Lat'].values,
              color='green', s=1, label='test', alpha=0.1)

fig.suptitle('Train and test area complete overlap.')
ax[0].legend(loc=0)
ax[1].legend(loc=0)
ax[0].set_ylabel('latitude')
ax[0].set_xlabel('longitude')
ax[1].set_xlabel('longitude')
plt.show()

The train and test sets look similar, which suggests that both were sampled at random from the same underlying data, i.e. train and test are i.i.d. We can therefore extend statistics calculated on the train set to the test set.

This allows us to apply unsupervised learning and feature extraction to the full data set (train and test combined).

## Feature Extraction

### PCA

We use PCA to transform the longitude and latitude coordinates. In this case it is not about dimension reduction, since we transform 2D -> 2D. The rotation could help decision-tree splits.

In [None]:
from sklearn.decomposition import PCA 

In [None]:
# Stack every (lat, long) pair — pickup and destination, train and test — into one
# (n_points, 2) array so the rotation is fitted on all known locations.
coords = np.vstack((train_merged[['Pickup Lat', 'Pickup Long']].values,
                    train_merged[['Destination Lat', 'Destination Long']].values,
                    test_merged[['Pickup Lat', 'Pickup Long']].values,
                    test_merged[['Destination Lat', 'Destination Long']].values))

pca = PCA().fit(coords)

#Get the new components and save in individual columns
# Each point set is transformed once (the original ran every transform twice), and
# plain arrays are passed so the input type matches what the PCA was fitted on.
for frame in (train_merged, test_merged):
    pickup_pcs = pca.transform(frame[['Pickup Lat', 'Pickup Long']].values)
    dropoff_pcs = pca.transform(frame[['Destination Lat', 'Destination Long']].values)
    frame['pickup_pca0'] = pickup_pcs[:, 0]
    frame['pickup_pca1'] = pickup_pcs[:, 1]
    frame['dropoff_pca0'] = dropoff_pcs[:, 0]
    frame['dropoff_pca1'] = dropoff_pcs[:, 1]

## Distance
Let's calculate distance metrics between the pickup and destination points. Currently the Haversine distance is used; geopy offers other implementations (vincenty() or great_circle()) if you prefer. We could check the Manhattan (L1) distance too.

pd.DataFrame.apply() would be too slow so the haversine function is rewritten to handle arrays.
We extract the midpoint of the path as a feature as well.

In [None]:
def haversine_array(lat1, lng1, lat2, lng2):
    """Great-circle distance in kilometres between two sets of points.

    All four arguments are latitudes / longitudes in degrees and may be scalars or
    NumPy arrays of matching shape; the result has the broadcast shape of the inputs.
    Uses the haversine formula with a mean Earth radius of 6371 km.
    """
    earth_radius_km = 6371
    phi1, lam1, phi2, lam2 = (np.radians(angle) for angle in (lat1, lng1, lat2, lng2))
    half_dphi = (phi2 - phi1) * 0.5
    half_dlam = (lam2 - lam1) * 0.5
    hav = np.sin(half_dphi) ** 2 + np.cos(phi1) * np.cos(phi2) * np.sin(half_dlam) ** 2
    return 2 * earth_radius_km * np.arcsin(np.sqrt(hav))

def dummy_manhattan_distance(lat1, lng1, lat2, lng2):
    """Approximate L1 ("Manhattan") distance in kilometres between two points.

    Sums two haversine legs that both start at (lat1, lng1): one changing only the
    longitude to lng2, the other changing only the latitude to lat2.
    """
    east_west_leg = haversine_array(lat1, lng1, lat1, lng2)
    north_south_leg = haversine_array(lat1, lng1, lat2, lng1)
    return east_west_leg + north_south_leg

def bearing_array(lat1, lng1, lat2, lng2):
    """Initial compass bearing, in degrees, from point 1 towards point 2.

    Arguments are latitudes / longitudes in degrees (scalars or NumPy arrays of
    matching shape). The result comes from ``arctan2`` and therefore lies in
    [-180, 180]: 0 is due north, 90 due east, -90 due west.
    """
    # Only the longitude difference and the two latitudes enter the formula, so the
    # unused Earth-radius constant and the radian conversions of lng1 / lng2 were dropped.
    lng_delta_rad = np.radians(lng2 - lng1)
    lat1, lat2 = np.radians(lat1), np.radians(lat2)
    y = np.sin(lng_delta_rad) * np.cos(lat2)
    x = np.cos(lat1) * np.sin(lat2) - np.sin(lat1) * np.cos(lat2) * np.cos(lng_delta_rad)
    return np.degrees(np.arctan2(y, x))



# Distance, direction and midpoint features, computed identically for both frames.
# The loop replaces two copy-pasted groups of statements; in the test copy
# 'center_longitude' averaged 'Pickup Long' with itself instead of with
# 'Destination Long', which made the feature disagree between train and test.
for frame in (train_merged, test_merged):
    pickup_lat = frame['Pickup Lat'].values
    pickup_lng = frame['Pickup Long'].values
    dest_lat = frame['Destination Lat'].values
    dest_lng = frame['Destination Long'].values

    frame['distance_haversine'] = haversine_array(pickup_lat, pickup_lng, dest_lat, dest_lng)
    frame['distance_dummy_manhattan'] = dummy_manhattan_distance(pickup_lat, pickup_lng, dest_lat, dest_lng)
    frame['direction'] = bearing_array(pickup_lat, pickup_lng, dest_lat, dest_lng)
    # L1 distance in the PCA-rotated coordinate system.
    frame['pca_manhattan'] = (np.abs(frame['dropoff_pca1'] - frame['pickup_pca1'])
                              + np.abs(frame['dropoff_pca0'] - frame['pickup_pca0']))

    # Midpoint of the straight line between pickup and destination.
    frame['center_latitude'] = (pickup_lat + dest_lat) / 2
    frame['center_longitude'] = (pickup_lng + dest_lng) / 2

## Clustering

In [None]:
from sklearn.cluster import KMeans

# Cluster every known location (the `coords` array built in the PCA cell) into 30
# groups. A fixed random_state makes the cluster ids reproducible across kernel
# restarts; without it the 'pickup_cluster' / 'dropoff_cluster' features, and every
# model trained on them, changed from run to run.
kmeans = KMeans(n_clusters=30, random_state=42).fit(coords)

# Predict on plain arrays so the input type matches what the model was fitted on.
for frame in (train_merged, test_merged):
    frame['pickup_cluster'] = kmeans.predict(frame[['Pickup Lat', 'Pickup Long']].values)
    frame['dropoff_cluster'] = kmeans.predict(frame[['Destination Lat', 'Destination Long']].values)


In [None]:
# Pickup locations coloured by their k-means cluster.
# Coordinates and colours are both read from train_merged: the original took the
# coordinates from the raw `train` variable, which is rebound to a different object
# further down, so re-running this cell after that point failed.
fig, ax = plt.subplots(ncols=1, nrows=1)
ax.scatter(train_merged['Pickup Long'], train_merged['Pickup Lat'], s=10, lw=0,
           c=train_merged['pickup_cluster'], cmap='tab20', alpha=0.2)
ax.set_xlabel('Longitude')
ax.set_ylabel('Latitude')
plt.show()

In [None]:
# Combine train and test into one frame so later feature engineering is applied to
# both; n_train is used further down to slice all_data back into train and test.
all_data, n_train, n_test = ds.structdata.join_train_and_test(train_merged, test_merged)

In [None]:
# The order identifier is not used as a feature.
all_data = all_data.drop(columns=['Order No'])

In [None]:
# Calendar and hour columns that the author treats as redundant.
redundant_cols = [
    'Confirmation - Day of Month',
    'Confirmation - Weekday (Mo = 1)',
    'Arrival at Pickup - Day of Month',
    'Arrival at Pickup - Weekday (Mo = 1)',
    'Pickup - Day of Month',
    'Pickup - Weekday (Mo = 1)',
    'Pickup - Time_hours',
    'Confirmation - Time_hours',
]

all_data = all_data.drop(columns=redundant_cols)

In [None]:
# Show the first two rows transposed, so every column name is visible as a row.
all_data.head(2).T

In [None]:
# Absolute differences between extracted clock components.
# Each entry is (new column, left operand column, right operand column).
abs_diff_specs = [
    ('minute_diff_pickup_&_arrival', 'Pickup - Time_minutes', 'Arrival at Pickup - Time_minutes'),
    ('seconds_diff_pickup_&_arrival', 'Pickup - Time_seconds', 'Arrival at Pickup - Time_seconds'),
    ('minute_diff_confirm_&_placement', 'Confirmation - Time_minutes', 'Placement - Time_minutes'),
    ('seconds_diff_confirm_&_placement', 'Confirmation - Time_seconds', 'Placement - Time_seconds'),
    ('hour_diff__placement_&_arrivalpickup', 'Placement - Time_hours', 'Arrival at Pickup - Time_hours'),
]
for new_col, left_col, right_col in abs_diff_specs:
    all_data[new_col] = (all_data[left_col] - all_data[right_col]).abs()

# to_drop = ['Pickup - Time_minutes','Arrival at Pickup - Time_minutes', 'Pickup - Time_seconds', 
#            'Arrival at Pickup - Time_seconds', 'Confirmation - Time_minutes', 'Placement - Time_minutes',
#           'Confirmation - Time_seconds', 'Placement - Time_seconds', 'Placement - Time_hours', 'Arrival at Pickup - Time_hours']

# all_data.drop(to_drop, axis=1, inplace=True)

In [None]:
# Raw coordinate columns; they are dropped from all_data in the next cell.
locations = ['Pickup Lat', 'Pickup Long', 'Destination Lat', 'Destination Long']

# Disabled experiment: quantile-bin each coordinate into 20 buckets.
# for loc in locations:
#     all_data[loc + '_binned'] = ds.feature_engineering.get_qcut(all_data, col=loc, q=20)

In [None]:
# Remove the raw coordinates; the PCA, distance and cluster features stay.
all_data = all_data.drop(columns=locations)

In [None]:
# Per-rider and per-user mean features. Each entry is (grouping column, column to
# average); the list order fixes the order in which merged columns are appended.
# NOTE(review): 'Time from Pickup to Arrival' is the prediction target and the
# "... to Arrival" durations are derived from the arrival timestamp, so these means
# are computed over rows that are later held out for validation as well — hold-out
# scores may be optimistic. Confirm whether this is intended.
group_mean_specs = [
    ('Rider Id', 'Time from Pickup to Arrival'),
    ('User Id', 'Time from Pickup to Arrival'),
    ('Rider Id', 'Distance (KM)'),
    ('User Id', 'Distance (KM)'),
    ('Rider Id', 'Time from Placement to Arrival'),
    ('Rider Id', 'Time from Confirmation to Arrival'),
    ('Rider Id', 'Time from Arrival Pickup to Arrival'),
    ('User Id', 'Time from Placement to Arrival'),
    ('User Id', 'Time from Confirmation to Arrival'),
    ('User Id', 'Time from Arrival Pickup to Arrival'),
]

for group_col, value_col in group_mean_specs:
    all_data = ds.feature_engineering.merge_groupby(
        all_data,
        cat_features=[group_col],
        statistics=['mean'],
        col_to_merge=value_col,
    )

# The raw arrival-based durations are only needed to build the means above.
to_drop = [
    'Time from Confirmation to Arrival',
    'Time from Placement to Arrival',
    'Time from Arrival Pickup to Arrival',
]
all_data.drop(to_drop, axis=1, inplace=True)

In [None]:
# Show missing-value information for the combined frame.
ds.structdata.display_missing(all_data)

In [None]:
# Impute missing numeric values in the combined frame; no feature list is passed,
# so presumably every numeric column is filled — confirm against datasist.
# NOTE(review): the return value is discarded, so this relies on in-place filling.
ds.feature_engineering.fill_missing_num(all_data)

In [None]:
from sklearn.preprocessing import LabelEncoder

# Replace the string identifiers with integer codes, using one fresh encoder per column.
for id_col in ('User Id', 'Rider Id'):
    all_data[id_col] = LabelEncoder().fit_transform(all_data[id_col])


In [None]:
# One-hot encode the remaining categorical columns; drop_first=True omits the first
# level of each to avoid a redundant indicator.
all_data = pd.get_dummies(all_data, drop_first=True)
all_data.shape

In [None]:
# Slice the combined frame back into its train and test parts by position.
target_col = 'Time from Pickup to Arrival'
train_rows = all_data.iloc[:n_train]
test_rows = all_data.iloc[n_train:]

# Separate the target from the features.
target = train_rows[target_col]
train = train_rows.drop(columns=[target_col])
test = test_rows.drop(columns=[target_col])

# used for feature importance plot
col_names = train.columns.tolist()


In [None]:
# #get pca train set
# pca2 = PCA(n_components=30)
# pca2.fit(train)

# train_pca = pca2.transform(train)
# test_pca = pca2.transform(test)

In [None]:
# #Try feature selection
# from sklearn.feature_selection import SelectFromModel
# from lightgbm import LGBMRegressor


# lgb_model = LGBMRegressor(boosting_type='gbdt', class_weight=None, colsample_bytree=1.0,
#               importance_type='split', learning_rate=0.01, max_depth=9,
#               min_child_samples=20, min_child_weight=0.001, min_split_gain=0.0,
#               n_estimators=500, n_jobs=-1, num_leaves=31, objective=None,
#               random_state=2, reg_alpha=1, reg_lambda=1, silent=True,
#               subsample=0.5, subsample_for_bin=200000, subsample_freq=0)

# skbest = SelectFromModel(estimator=lgb_model).fit(train, target)
# train_skbest = skbest.transform(train)
# test_skbesr = skbest.transform(test)

In [None]:
from sklearn.preprocessing import MinMaxScaler, StandardScaler
from sklearn.ensemble import RandomForestRegressor, ExtraTreesRegressor,BaggingRegressor, VotingRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV, cross_val_score,train_test_split

# import xgboost as xgb
import lightgbm as lgb
import catboost as cb


In [None]:
# Hold out 30% of the training rows for validation; the seed is fixed for repeatability.
X_train, X_test, y_train, y_test = train_test_split(train, target, test_size=0.3, random_state=42)

In [None]:
# Standardise features. One scaler is fitted on the hold-out training part only and
# applied to both hold-out parts.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# A second scaler is fitted on the full training set and applied to the
# submission test set.
scaler2 = StandardScaler()
train = scaler2.fit_transform(train)
test = scaler2.transform(test)

In [None]:
def rmse(mse):
    """Convert a mean squared error (scalar or array) into a root mean squared error."""
    root = np.sqrt(mse)
    return root

In [None]:
# LightGBM hyper-parameters, collected in one place.
lgb_params = dict(
    boosting_type='gbdt',
    class_weight=None,
    colsample_bytree=1.0,
    importance_type='split',
    learning_rate=0.01,
    max_depth=9,
    min_child_samples=20,
    min_child_weight=0.001,
    min_split_gain=0.0,
    n_estimators=1000,
    n_jobs=-1,
    num_leaves=31,
    objective=None,
    random_state=2,
    reg_alpha=1,
    reg_lambda=1,
    silent=True,
    subsample=0.5,
    subsample_for_bin=200000,
    subsample_freq=0,
)
lgb_best = lgb.LGBMRegressor(**lgb_params)

# xgb_model = xgb.XGBRegressor(n_estimators=2000, max_depth=9, n_jobs=-1, learning_rate=0.01, random_state=21)

cat_model = cb.CatBoostRegressor(n_estimators=2000, silent=True)

# Ensembles built from the base models: a voting regressor over CatBoost and
# LightGBM, and a bagging regressor of six CatBoost models.
# estimators = [("cat", cat_model), ("xgb_model", xgb_model), ("lgb", lgb_best)]
estimators = [("cat", cat_model), ("lgb", lgb_best)]
vr = VotingRegressor(estimators)
bgg = BaggingRegressor(cat_model, n_estimators=6, random_state=1)

In [None]:
# Fit LightGBM on the hold-out training part and report RMSE on the held-out part.
pred = lgb_best.fit(X_train, y_train).predict(X_test)

holdout_mse = mean_squared_error(y_test, pred)
print(rmse(holdout_mse))

In [None]:
# Disabled: `xgb_model` is never defined because both the xgboost import and the
# XGBRegressor constructor are commented out earlier in this notebook, so running
# this cell raised a NameError. Re-enable those two lines together with this cell.
# xgb_model.fit(X_train, y_train)
# pred = xgb_model.predict(X_test)

# print(rmse(mean_squared_error(y_test, pred)))

In [None]:
# Fit CatBoost on the hold-out training part and report RMSE on the held-out part.
pred = cat_model.fit(X_train, y_train).predict(X_test)

holdout_mse = mean_squared_error(y_test, pred)
print(rmse(holdout_mse))

In [None]:
# 5-fold cross-validated RMSE for CatBoost on the full training set: the scorer
# returns negated MSE per fold, so flip the sign, take the root per fold, then average.
cat_cv_mse = -1 * cross_val_score(estimator=cat_model, X=train, y=target, cv=5,
                                  scoring='neg_mean_squared_error')
np.mean(rmse(cat_cv_mse))

In [None]:
# Fit the voting ensemble on the hold-out training part and report RMSE on the held-out part.
pred = vr.fit(X_train, y_train).predict(X_test)

holdout_mse = mean_squared_error(y_test, pred)
print(rmse(holdout_mse))

In [None]:
# #Hyperparameter search
# param_grid = {'n_estimators': [2000, 3000,4000], 
#              'max_depth': [8,9],
#              'learning_rate': [0.1,0.01] }


# grid = GridSearchCV(estimator=cat_model, param_grid=param_grid,scoring='neg_mean_squared_error',cv=2, verbose=1, n_jobs=-1)
# grid.fit(train, target)

In [None]:
# grid2 = GridSearchCV(estimator=xgb_model, param_grid=param_grid,scoring='neg_mean_squared_error', verbose=1, n_jobs=-1)
# grid2.fit(train, target)

In [None]:
# import pickle

# model_name = 'cat_model.sav'

# pickle.dump(cat_model, open(model_name, 'wb'))

In [None]:
# np.mean(rmse(-1 * cross_val_score(estimator=lgb_best, X=train,y=target, cv=5,scoring='neg_mean_squared_error' )))

In [None]:
# Plot CatBoost feature importances, labelled with the column names captured before scaling.
# NOTE(review): this reflects whichever fit cat_model last received — confirm which one that is.
ds.model.plot_feature_importance(cat_model, col_names)

In [None]:
# Refit CatBoost on the full (scaled) training set and predict the submission rows.
final_pred = cat_model.fit(train, target).predict(test)

# Write the predictions into the submission template.
# NOTE(review): assumes the rows of `sample` are in the same order as the test file — confirm.
sample = sample.assign(**{'Time from Pickup to Arrival': final_pred})
sample.head()

In [None]:
# Save the submission file without the pandas index column.
sample.to_csv("cat_model_sendy_sat7.csv", index=False)