In [302]:
import pandas as pd
import numpy as np

In [303]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [304]:
path = "/content/drive/My Drive/Hackhathon 2020 Shared Drive/Dataset/SG_car.csv"

In [305]:
data = pd.read_csv(path)

In [306]:
data = data.drop(['Unnamed: 0', 'trj_id', 'osname', 'pingtimestamp', 'timestamp', 'rawlat', 'rawlng', 'speed', 'bearing', 'accuracy', 'pickup_time', 'dropoff_time'], axis=1)

# Model Check Script

In [None]:
def xgboost(X_train, Y_train, X_val, Y_val, X_test, Y_test):
  dtrain = xgb.DMatrix(X_train, label=Y_train)
  dvalid = xgb.DMatrix(X_val, label=Y_val)
  dtest = xgb.DMatrix(X_test)
  watchlist = [(dtrain, 'train'), (dvalid, 'valid')]

  model1 = xgb.XGBRegressor(n_estimators = 4000, min_child_weight= 22,
                          learning_rate=0.005, max_depth=10,
                          colsample_bytree= 0.9,subsample= 0.9, nthread=-1, 
                          booster= 'gbtree', silent= 1,eval_metric= 'rmse', 
                          objective= 'reg:linear',early_stopping_rounds=40, 
                          maximize = False, verbose_eval=1)
  model1.fit(X_train, Y_train)
  y_pred1 = model1.predict(X_test)
  mse = mean_squared_error(Y_test,y_pred1)
  return [np.sqrt(mse), model1]

In [None]:
def lgbm(X_train, Y_train, X_val, Y_val, X_test, Y_test):
  d_train = lgb.Dataset(X_train, label=Y_train)
  d_valid = lgb.Dataset(X_val, label=Y_val)
  model2 = lgb.LGBMRegressor(learning_rate=0.0011,boosting_type="gbdt",sub_feature=0.5,
                           objective = "regression",num_leaves=60,min_data = 20,
                           max_depth=10,max_bin=10,n_estimators = 13743)
  model2.fit(X_train, Y_train)
  y_pred2 = model2.predict(X_test)
  mse = mean_squared_error(Y_test,y_pred2)
  return [np.sqrt(mse), model2]

In [None]:
def catboost(X_train, Y_train, X_val, Y_val, X_test, Y_test):
  model3 = catb.CatBoostRegressor(iterations=1000,
                          learning_rate=0.02,
                          task_type = "CPU",
                          depth=10,
                         od_type = 'Iter',
                         od_wait=100)
  model3.fit(X_train, Y_train)
  y_pred3 = model3.predict(X_test)
  mse = mean_squared_error(Y_test,y_pred3)
  return [np.sqrt(mse), model3]

In [None]:
def ensemblexl(X_train, Y_train, X_val, Y_val, X_test, Y_test, model1, model2):
  estimators = [("xgb", model1), ("lgbm", model2)]
  model = VotingRegressor(estimators=estimators)
  model.fit(X_train, Y_train)
  y_pred = model.predict(X_test)
  mse = mean_squared_error(Y_test,y_pred)
  return np.sqrt(mse)

In [318]:
def final_modeller(n_modelling):
  from sklearn.model_selection import train_test_split
  from sklearn.metrics import mean_squared_error
  from sklearn import preprocessing
  from sklearn.ensemble import VotingRegressor
  import xgboost as xgb
  import lightgbm as lgb
  import catboost as catb

  result_matrix = [0]*n_modelling
  for i in range(len(result_matrix)):
    result_matrix[i] = [0]*4

  for i in range(n_modelling):
    train, test = train_test_split(data, test_size=0.5, train_size = 0.5)
    validate, test = train_test_split(test, test_size = 0.3, train_size = 0.7)

    train_scaled = preprocessing.scale(train[train.columns.difference(['travel_time'])])
    X_train = train_scaled
    Y_train = train["travel_time"]
    val_scaled = preprocessing.scale(validate[validate.columns.difference(['travel_time'])])
    X_val = val_scaled
    Y_val = validate["travel_time"]
    test_scaled = preprocessing.scale(test[test.columns.difference(['travel_time'])])
    X_test = test_scaled
    Y_test = test["travel_time"]
    xgbResult = xgboost(X_train, Y_train, X_val, Y_val, X_test, Y_test)
    lgbmResult = lgbm(X_train, Y_train, X_val, Y_val, X_test, Y_test)
    catboostResult = catboost(X_train, Y_train, X_val, Y_val, X_test, Y_test)
    result_matrix[i][0] = xgbResult[0]
    result_matrix[i][1] = lgbmResult[0]
    result_matrix[i][2] = catboostResult[0]
    result_matrix[i][3] = ensemblexl(X_train, Y_train, X_val, Y_val, X_test, Y_test, xgbResult[1], lgbmResult[1])

  xgbAvg, lgbmAvg, catbAvg, enseXLAvg = 0,0,0,0
  xgbMin, lgbmMin, catbMin, enseXLMin = 1000,1000,1000,1000
  xgbMax, lgbmMax, catbMax, enseXLMax = 0,0,0,0

  for i in range(n_modelling):
    xgbAvg += result_matrix[i][0]
    if xgbMin > result_matrix[i][0]:
      xgbMin = result_matrix[i][0]
    if xgbMax < result_matrix[i][0]:
      xgbMax = result_matrix[i][0]

    lgbmAvg += result_matrix[i][1]
    if lgbmMin > result_matrix[i][1]:
      lgbmMin = result_matrix[i][1]
    if lgbmMax < result_matrix[i][1]:
      lgbmMax = result_matrix[i][1]

    catbAvg += result_matrix[i][2]
    if catbMin > result_matrix[i][2]:
      catbMin = result_matrix[i][2]
    if catbMax < result_matrix[i][2]:
      catbMax = result_matrix[i][2]

    enseXLAvg += result_matrix[i][3]
    if enseXLMin > result_matrix[i][3]:
      enseXLMin = result_matrix[i][3]
    if enseXLMax < result_matrix[i][3]:
      enseXLMax = result_matrix[i][3]

  xgbAvg = xgbAvg/n_modelling
  lgbmAvg = lgbmAvg/n_modelling
  catbAvg = catbAvg/n_modelling
  enseXLAvg = enseXLAvg/n_modelling

  print("The mean loss statistics for each model:")
  print("xgboost            : Max="+str(round(xgbMax,2))+"; Min="+str(round(xgbMin,2))+"; Avg="+str(round(xgbAvg)))
  print("lightGBM           : Max="+str(round(lgbmMax,2))+"; Min="+str(round(lgbmMin,2))+"; Avg="+str(round(lgbmAvg,2)))
  print("catboost           : Max="+str(round(catbMax,2))+"; Min="+str(round(catbMin,2))+"; Avg="+str(round(catbAvg,2)))
  print("ensemble xgb & lgbm: Max="+str(round(enseXLMax,2))+"; Min="+str(round(enseXLMin,2))+"; Avg="+str(round(enseXLAvg,2)))
  from google.colab import output
  output.eval_js('new Audio("https://upload.wikimedia.org/wikipedia/commons/0/05/Beep-09.ogg").play()')
  return result_matrix

In [319]:
print(final_modeller(8))

[1;30;43mOutput streaming akan dipotong hingga 5000 baris terakhir.[0m
7:	learn: 1119.1795956	total: 953ms	remaining: 1m 58s
8:	learn: 1098.8777194	total: 1.08s	remaining: 1m 58s
9:	learn: 1078.9754487	total: 1.21s	remaining: 1m 59s
10:	learn: 1059.3120944	total: 1.23s	remaining: 1m 50s
11:	learn: 1040.1415788	total: 1.36s	remaining: 1m 51s
12:	learn: 1021.3524242	total: 1.4s	remaining: 1m 45s
13:	learn: 1003.1123931	total: 1.53s	remaining: 1m 47s
14:	learn: 984.9261005	total: 1.65s	remaining: 1m 48s
15:	learn: 967.4179917	total: 1.77s	remaining: 1m 48s
16:	learn: 950.3260458	total: 1.89s	remaining: 1m 49s
17:	learn: 933.3419402	total: 1.92s	remaining: 1m 44s
18:	learn: 916.6150588	total: 1.95s	remaining: 1m 40s
19:	learn: 900.5503995	total: 2.08s	remaining: 1m 42s
20:	learn: 884.8060957	total: 2.21s	remaining: 1m 42s
21:	learn: 869.3975780	total: 2.29s	remaining: 1m 41s
22:	learn: 854.1883611	total: 2.42s	remaining: 1m 42s
23:	learn: 839.5031006	total: 2.55s	remaining: 1m 43s
24:	le

## MODELLING

In [307]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error

train, test = train_test_split(data, test_size=0.5, train_size = 0.5, random_state=10)
validate, test = train_test_split(test, test_size = 0.3, train_size = 0.7, random_state=10)
print(len(train))
print(len(validate))
print(len(test))

from sklearn import preprocessing
train_scaled = preprocessing.scale(train[train.columns.difference(['travel_time'])])
X_train = train_scaled
Y_train = train["travel_time"]
val_scaled = preprocessing.scale(validate[validate.columns.difference(['travel_time'])])
X_val = val_scaled
Y_val = validate["travel_time"]
test_scaled = preprocessing.scale(test[test.columns.difference(['travel_time'])])
X_test = test_scaled
Y_test = test["travel_time"]

14000
9800
4200


In [308]:
import timeit
"""start = timeit.default_timer()
main()
end = timeit.default_timer()
print("Time taken in seconds:{}".format(end-start))"""

'start = timeit.default_timer()\nmain()\nend = timeit.default_timer()\nprint("Time taken in seconds:{}".format(end-start))'

XGBoost

In [238]:
from sklearn.ensemble import VotingRegressor

In [239]:
import xgboost as xgb

dtrain = xgb.DMatrix(X_train, label=Y_train)
dvalid = xgb.DMatrix(X_val, label=Y_val)
dtest = xgb.DMatrix(X_test)
watchlist = [(dtrain, 'train'), (dvalid, 'valid')]

In [241]:
model1 = xgb.XGBRegressor(n_estimators = 4000, min_child_weight= 22,
                          learning_rate=0.005, max_depth=10,
                          colsample_bytree= 0.9,subsample= 0.9, nthread=-1, 
                          booster= 'gbtree', silent= 1,eval_metric= 'rmse', 
                          objective= 'reg:linear',early_stopping_rounds=40, 
                          maximize = False, verbose_eval=1)

In [242]:
model1.fit(X_train, Y_train)
y_pred1 = model1.predict(X_test)
mse = mean_squared_error(Y_test,y_pred1)
print("RMSE: %.2f" % np.sqrt(mse))

RMSE: 239.96


LightGBM

In [309]:
import lightgbm as lgb

d_train = lgb.Dataset(X_train, label=Y_train)
d_valid = lgb.Dataset(X_val, label=Y_val)

In [313]:
model2 = lgb.LGBMRegressor(learning_rate=0.0011,boosting_type="gbdt",sub_feature=0.5,
                           objective = "regression",num_leaves=60,min_data = 20,
                           max_depth=10,max_bin=10,n_estimators = 13743)

In [314]:
model2.fit(X_train, Y_train)
y_pred2 = model2.predict(X_test)
mse = mean_squared_error(Y_test,y_pred2)
print("RMSE: %.2f" % np.sqrt(mse))

RMSE: 237.94


CatBoost

In [15]:
pip install catboost==0.7.2

Collecting catboost==0.7.2
[?25l  Downloading https://files.pythonhosted.org/packages/05/f0/20a4783ec2f409ba8c5eb80f7010df81b5aa10b3ee5a56170b62d75f5230/catboost-0.7.2-cp36-none-manylinux1_x86_64.whl (33.4MB)
[K     |████████████████████████████████| 33.4MB 119kB/s 
Installing collected packages: catboost
Successfully installed catboost-0.7.2


In [246]:
import catboost as catb

In [247]:
model3 = catb.CatBoostRegressor(iterations=1000,
                          learning_rate=0.02,
                          task_type = "CPU",
                          depth=10,
                         od_type = 'Iter',
                         od_wait=100)

In [248]:
model3.fit(X_train, Y_train)
y_pred3 = model3.predict(X_test)
mse = mean_squared_error(Y_test,y_pred3)
print("RMSE: %.2f" % np.sqrt(mse))

0:	learn: 1280.9823792	total: 23.6ms	remaining: 23.6s
1:	learn: 1257.5886805	total: 150ms	remaining: 1m 15s
2:	learn: 1234.4824839	total: 187ms	remaining: 1m 2s
3:	learn: 1211.9798736	total: 319ms	remaining: 1m 19s
4:	learn: 1190.1864689	total: 435ms	remaining: 1m 26s
5:	learn: 1168.3975098	total: 471ms	remaining: 1m 18s
6:	learn: 1147.3740819	total: 593ms	remaining: 1m 24s
7:	learn: 1126.7035013	total: 649ms	remaining: 1m 20s
8:	learn: 1106.4347896	total: 729ms	remaining: 1m 20s
9:	learn: 1086.6703528	total: 779ms	remaining: 1m 17s
10:	learn: 1067.2317951	total: 865ms	remaining: 1m 17s
11:	learn: 1048.2213390	total: 899ms	remaining: 1m 14s
12:	learn: 1029.4095700	total: 944ms	remaining: 1m 11s
13:	learn: 1011.0573530	total: 1.08s	remaining: 1m 16s
14:	learn: 993.2461192	total: 1.21s	remaining: 1m 19s
15:	learn: 975.8502148	total: 1.33s	remaining: 1m 22s
16:	learn: 958.6651814	total: 1.39s	remaining: 1m 20s
17:	learn: 942.2510914	total: 1.52s	remaining: 1m 22s
18:	learn: 925.8729342	to

Ensemble XGBoost & LightGBM

In [249]:
estimators = [("xgb", model1), ("lgbm", model2)]

model = VotingRegressor(estimators=estimators)

In [250]:
model.fit(X_train, Y_train)

VotingRegressor(estimators=[('xgb',
                             XGBRegressor(base_score=0.5, booster='gbtree',
                                          colsample_bylevel=1,
                                          colsample_bynode=1,
                                          colsample_bytree=0.9,
                                          early_stopping_rounds=40,
                                          eval_metric='rmse', gamma=0,
                                          importance_type='gain',
                                          learning_rate=0.005, max_delta_step=0,
                                          max_depth=10, maximize=False,
                                          min_child_weight=22, missing=None,
                                          n_estimators=4000, n_jobs=1,
                                          nthread=-1...
                                           importance_type='split',
                                           learning_rate=0.0011, max_

In [251]:
y_pred = model.predict(X_test)
mse = mean_squared_error(Y_test,y_pred)
print("RMSE: %.2f" % np.sqrt(mse))

RMSE: 241.14


In [252]:
from google.colab import output
output.eval_js('new Audio("https://upload.wikimedia.org/wikipedia/commons/0/05/Beep-09.ogg").play()')

In [None]:
xgb     :230,240
lgbm    :237,252
catb    :233,241
xgb+lgbm:239,241
xgboost            : Max=235.04; Min=235.04; Avg=235.04
lightGBM           : Max=245.69; Min=245.69; Avg=245.69
catboost           : Max=241.27; Min=241.16; Avg=241.21
ensemble xgb & lgbm: Max=236.93; Min=236.93; Avg=236.93