In [1]:
from sklearn.externals import joblib
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.linear_model import LassoCV
import xgboost as xgb
from scipy import sparse
import numpy as np

In [None]:
# Read the stage-0 feature matrices and the three training targets from
# pickle files in the current working directory.
# NOTE(review): joblib.load unpickles its input -- only load trusted files.
X_train_stage0 = joblib.load('X_train_stage0.pkl')
X_test_stage0 = joblib.load('X_test_stage0.pkl')
Y_train_duration = joblib.load('Y_train_duration.pkl')
Y_train_trajlength = joblib.load('Y_train_trajlength.pkl')
Y_train_price = joblib.load('Y_train_price.pkl')

In [None]:
# Row counts of the stage-0 train and test feature matrices.
n_train, n_test = X_train_stage0.shape[0], X_test_stage0.shape[0]

In [None]:
# Sanity check: show the shape of every loaded array.
# Single-argument print(...) produces the same output as the print statement.
print(X_train_stage0.shape)
print(X_test_stage0.shape)
print(Y_train_duration.shape)
print(Y_train_trajlength.shape)
print(Y_train_price.shape)

In [None]:
# Hold out 20% of the training rows (by index) for validation.
# A fixed random_state keeps the split -- and every metric computed from it --
# identical across kernel restarts.
idx_train, idx_val = train_test_split(np.arange(n_train), test_size=0.20,
                                      random_state=42)

In [7]:
# Alias the stage-0 test features under the name used by the rest of the notebook.
X_test = X_test_stage0

In [8]:
# Slice features and both targets with the train / validation row indices.
X_train = X_train_stage0[idx_train]
X_val = X_train_stage0[idx_val]

Y_train_dur = Y_train_duration[idx_train]
Y_val_dur = Y_train_duration[idx_val]

Y_train_traj = Y_train_trajlength[idx_train]
Y_val_traj = Y_train_trajlength[idx_val]

KeyboardInterrupt: 

In [None]:
# Sanity check: shapes of the train / validation splits.
print(X_train.shape)
print(X_val.shape)
print(Y_train_dur.shape)
print(Y_val_dur.shape)
print(Y_train_traj.shape)
print(Y_val_traj.shape)

In [None]:
# Build compressed-sparse-column copies of the three feature matrices.
sX_train, sX_val, sX_test = (
    sparse.csc_matrix(features) for features in (X_train, X_val, X_test)
)

In [None]:
# Unlabelled xgboost matrix for the test features (used for final predictions).
dtest = xgb.DMatrix(sX_test)

In [None]:
# Labelled xgboost matrices: the same features paired with each target.
# Duration target.
dtrain_dur = xgb.DMatrix(sX_train, label=Y_train_dur)
dval_dur = xgb.DMatrix(sX_val, label=Y_val_dur)
# Trajectory-length target.
dtrain_traj = xgb.DMatrix(sX_train, label=Y_train_traj)
dval_traj = xgb.DMatrix(sX_val, label=Y_val_traj)

In [None]:
from sklearn.metrics import make_scorer

def rmpse_loss_func(ground_truth, predictions):
    """Root mean squared percentage error over rows with a positive target.

    Parameters
    ----------
    ground_truth : array-like
        True target values. Rows whose target is not strictly positive are
        ignored, since the relative error is undefined there.
    predictions : array-like
        Predicted values, one per entry of ``ground_truth``.

    Returns
    -------
    float
        ``sqrt(mean((prediction / truth - 1) ** 2))`` over the retained rows,
        or NaN when no row has a positive target.

    Raises
    ------
    ValueError
        If the two inputs do not contain the same number of values.
    """
    # Flatten to 1-D float arrays so that lists work and a column-vector
    # prediction cannot broadcast against a 1-D target into a (k, k) matrix.
    truth = np.asarray(ground_truth, dtype=float).ravel()
    preds = np.asarray(predictions, dtype=float).ravel()
    if truth.shape != preds.shape:
        raise ValueError(
            "ground_truth and predictions must have the same number of "
            "elements, got %d and %d" % (truth.size, preds.size))

    positive = truth > 0.
    if not positive.any():
        # Nothing to score: return NaN explicitly rather than through a
        # mean-of-empty-slice warning.
        return np.nan

    relative_error = np.true_divide(preds[positive], truth[positive]) - 1.
    return np.sqrt(np.mean(relative_error ** 2))

# Wrap the loss as a scikit-learn scorer callable as scorer(estimator, X, y).
# greater_is_better=False marks it as a loss; per make_scorer's documented
# contract the scorer then reports the negated value.
rmpse_loss  = make_scorer(rmpse_loss_func, greater_is_better=False)

In [None]:
def rmpse(preds, dtrain):
    """Custom xgboost evaluation metric: RMSPE over positive labels.

    ``dtrain`` must expose ``get_label()``; entries whose label is not
    strictly positive are dropped before the relative error is computed.
    Returns the ``(metric_name, value)`` pair, with the name ``'error'``.
    """
    targets = dtrain.get_label()
    keep = np.nonzero(targets > 0.)
    ratio = np.true_divide(preds[keep], targets[keep])
    score = np.sqrt(np.mean(np.square(ratio - 1.)))
    return 'error', score

In [None]:
# (DMatrix, display name) pairs passed to xgb.train as `evals`,
# one watchlist per target.
watchlist_dur = [
    (dval_dur, 'eval_dur'),
    (dtrain_dur, 'train_dur'),
]
watchlist_traj = [
    (dval_traj, 'eval_traj'),
    (dtrain_traj, 'train_traj'),
]

# Dur: models for the duration target

In [None]:
# Booster settings for the duration model.
# The thread-count key is spelled 'nthread' in XGBoost; the previous
# 'n_thread' spelling is not a recognised parameter, so the intended limit
# of 8 threads was never applied.
# NOTE(review): newer xgboost releases rename "reg:linear" to
# "reg:squarederror"; kept as-is for the xgboost version this notebook targets.
param = {
    'objective': "reg:linear",
    'booster': "gbtree",
    'eta': 0.01,
    'max_depth': 12,
    'colsample_bytree': 0.7,
    'subsample': 0.7,
    'gamma': 1,
    'min_child_weight': 5,
    'nthread': 8,
}

In [None]:
# Train the duration booster for 350 rounds, reporting the custom RMSPE
# metric on every watchlist entry.
bst_dur = xgb.train(
    param,
    dtrain_dur,
    num_boost_round=350,
    evals=watchlist_dur,
    feval=rmpse,
    maximize=False
)

In [None]:
# Hyper-parameter grid for the random-forest search:
# 5 depths x 4 feature fractions x 2 split criteria, always 1000 trees.
rf_params = dict([
    ("max_depth", [5, 9, 15, 25, None]),
    ("max_features", [0.1, 'sqrt', 0.5, None]),
    ("n_estimators", [1000]),
    ("criterion", ["mae", "mse"]),
])

In [None]:
# Grid-search a random forest for the duration target with 3-fold CV,
# scored by the RMSPE scorer.
# NOTE(review): both the forest and the search request n_jobs=-1; confirm
# this nested parallelism does not oversubscribe the machine.
rf_dur = RandomForestRegressor(verbose=3, n_jobs=-1)
rf_CV_dur = GridSearchCV(
    estimator=rf_dur,
    param_grid=rf_params,
    scoring=rmpse_loss,
    cv=3,
    verbose=2,
    n_jobs=-1,
    return_train_score=True
)
rf_CV_dur.fit(sX_train, Y_train_dur)

In [None]:
# Stack the two base models for the duration target: column 0 holds the
# booster's predictions, column 1 the tuned forest's, and a LassoCV
# meta-model is fit on top of them.
# The matrix is sized from the predictions themselves instead of a
# hard-coded row count, so it stays correct if the split size changes.
# NOTE(review): the meta-model is fit on in-sample base-model predictions;
# out-of-fold predictions would give a less optimistic blend.
X_ensemble_dur = np.column_stack((
    bst_dur.predict(dtrain_dur),
    rf_CV_dur.predict(X_train),
)).astype(np.float64)
lasso_dur = LassoCV().fit(X_ensemble_dur, Y_train_dur)

In [None]:
# Persist the three fitted duration models to pickle files in the
# current working directory.
joblib.dump(bst_dur, 'bst_dur.pkl')
joblib.dump(rf_CV_dur, 'rf_CV_dur.pkl')
joblib.dump(lasso_dur, 'lasso_dur.pkl')

In [None]:
# Sanity check: sizes of the duration targets for both splits.
print(Y_train_dur.shape)
print(Y_val_dur.shape)

In [None]:
# RMSPE of the duration booster on the train and validation splits.
# The values are printed negated.
bst_train = rmpse_loss_func(Y_train_dur, bst_dur.predict(dtrain_dur))
bst_val = rmpse_loss_func(Y_val_dur, bst_dur.predict(dval_dur))
print(-bst_train)
print(-bst_val)

In [None]:
# Scorer output of the tuned duration forest on the train and validation splits.
print(rmpse_loss(rf_CV_dur, sX_train, Y_train_dur))
print(rmpse_loss(rf_CV_dur, sX_val, Y_val_dur))

In [None]:
# Score the stacked duration model on the train and validation splits.
# The forest column must come from rf_CV_dur: rf_dur is only the template
# handed to GridSearchCV, which fits clones of it, so rf_dur itself is never
# fitted and cannot predict. rf_CV_dur is also what lasso_dur was fit on.
# Stacking the prediction vectors directly avoids hard-coded row counts.
X_train_ens = np.column_stack((
    bst_dur.predict(dtrain_dur),
    rf_CV_dur.predict(sX_train),
)).astype(np.float64)
X_val_ens = np.column_stack((
    bst_dur.predict(dval_dur),
    rf_CV_dur.predict(sX_val),
)).astype(np.float64)
print(rmpse_loss(lasso_dur, X_train_ens, Y_train_dur))
print(rmpse_loss(lasso_dur, X_val_ens, Y_val_dur))

# TRAJ: models for the trajectory-length target

In [None]:
# Booster settings for the trajectory-length model (higher learning rate
# and deeper trees than the duration model).
# The thread-count key is spelled 'nthread' in XGBoost; the previous
# 'n_thread' spelling is not a recognised parameter, so the intended limit
# of 8 threads was never applied.
# NOTE(review): newer xgboost releases rename "reg:linear" to
# "reg:squarederror"; kept as-is for the xgboost version this notebook targets.
param = {
    'objective': "reg:linear",
    'booster': "gbtree",
    'eta': 0.02,
    'max_depth': 15,
    'colsample_bytree': 0.7,
    'subsample': 0.7,
    'gamma': 1,
    'min_child_weight': 5,
    'nthread': 8,
}

In [None]:
# Train the trajectory-length booster for 350 rounds, reporting the custom
# RMSPE metric on every watchlist entry.
bst_traj = xgb.train(
    param,
    dtrain_traj,
    num_boost_round=350,
    evals=watchlist_traj,
    feval=rmpse,
    maximize=False
)

In [None]:
# Hyper-parameter grid for the trajectory-length forest search
# (same values as the grid used for the duration forest).
rf_params = dict([
    ("max_depth", [5, 9, 15, 25, None]),
    ("max_features", [0.1, 'sqrt', 0.5, None]),
    ("n_estimators", [1000]),
    ("criterion", ["mae", "mse"]),
])

In [None]:
# Grid-search a random forest for the trajectory-length target with
# 3-fold CV, scored by the RMSPE scorer.
# NOTE(review): both the forest and the search request n_jobs=-1; confirm
# this nested parallelism does not oversubscribe the machine.
rf_traj = RandomForestRegressor(verbose=3, n_jobs=-1)
rf_CV_traj = GridSearchCV(
    estimator=rf_traj,
    param_grid=rf_params,
    scoring=rmpse_loss,
    cv=3,
    verbose=2,
    n_jobs=-1,
    return_train_score=True
)
rf_CV_traj.fit(sX_train, Y_train_traj)

In [None]:
# Stack the two base models for the trajectory-length target: column 0 holds
# the booster's predictions, column 1 the tuned forest's, and a LassoCV
# meta-model is fit on top of them.
# The matrix is sized from the predictions themselves instead of a
# hard-coded row count, so it stays correct if the split size changes.
# NOTE(review): the meta-model is fit on in-sample base-model predictions;
# out-of-fold predictions would give a less optimistic blend.
X_ensemble_traj = np.column_stack((
    bst_traj.predict(dtrain_traj),
    rf_CV_traj.predict(X_train),
)).astype(np.float64)
lasso_traj = LassoCV().fit(X_ensemble_traj, Y_train_traj)

In [None]:
# RMSPE of the trajectory-length booster on the train and validation splits.
# The values are printed negated.
bst_train = rmpse_loss_func(Y_train_traj, bst_traj.predict(dtrain_traj))
bst_val = rmpse_loss_func(Y_val_traj, bst_traj.predict(dval_traj))
print(-bst_train)
print(-bst_val)

In [None]:
# Scorer output of the tuned trajectory-length forest on the train and
# validation splits.
print(rmpse_loss(rf_CV_traj, sX_train, Y_train_traj))
print(rmpse_loss(rf_CV_traj, sX_val, Y_val_traj))

In [None]:
# Score the stacked trajectory-length model on the train and validation splits.
# The forest column must come from rf_CV_traj: rf_traj is only the template
# handed to GridSearchCV, which fits clones of it, so rf_traj itself is never
# fitted and cannot predict. rf_CV_traj is also what lasso_traj was fit on.
# Stacking the prediction vectors directly avoids hard-coded row counts.
X_train_ens = np.column_stack((
    bst_traj.predict(dtrain_traj),
    rf_CV_traj.predict(sX_train),
)).astype(np.float64)
X_val_ens = np.column_stack((
    bst_traj.predict(dval_traj),
    rf_CV_traj.predict(sX_val),
)).astype(np.float64)
print(rmpse_loss(lasso_traj, X_train_ens, Y_train_traj))
print(rmpse_loss(lasso_traj, X_val_ens, Y_val_traj))

# Combine: stacked predictions on the test set

In [None]:
# Predict duration on the test set with the stacked model.
# Column 0: booster predictions; column 1: tuned forest predictions.
# The forest column uses rf_CV_dur (the fitted grid search) -- rf_dur is the
# unfitted template passed to GridSearchCV and would raise on predict.
X_test_ens = np.zeros((n_test, 2))
X_test_ens[:, 0] = bst_dur.predict(dtest)
X_test_ens[:, 1] = rf_CV_dur.predict(sX_test)
Y_test_dur = lasso_dur.predict(X_test_ens)

In [None]:
# Predict trajectory length on the test set with the stacked model.
# Column 0: booster predictions; column 1: tuned forest predictions.
# The forest column uses rf_CV_traj (the fitted grid search) -- rf_traj is the
# unfitted template passed to GridSearchCV and would raise on predict.
X_test_ens = np.zeros((n_test, 2))
X_test_ens[:, 0] = bst_traj.predict(dtest)
X_test_ens[:, 1] = rf_CV_traj.predict(sX_test)
Y_test_traj = lasso_traj.predict(X_test_ens)