In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import cross_validate
from sklearn.model_selection import KFold
from sklearn import metrics

from xgboost import XGBRegressor
from sklearn.linear_model import LinearRegression

from sklearn.metrics import mean_absolute_error
from sklearn.metrics import mean_squared_error

## Part 1. Preprocessing

In [3]:
# Load the training data. File column 1 is parsed as a day-first datetime
# (presumably the 'date' column used below — confirm against train.csv).
trafficFile = 'train.csv'
traffic = pd.read_csv(trafficFile, index_col="id", parse_dates=[1], dayfirst=True)

# Convert once and derive every calendar feature with the vectorised `.dt`
# accessor instead of a per-row Python lambda.
train_dates = pd.to_datetime(traffic['date'])
traffic['hour'] = train_dates.dt.hour / 23           # 0..23 -> [0, 1]
traffic['weekday'] = train_dates.dt.dayofweek / 6    # Monday=0 .. Sunday=6 -> [0, 1]
traffic['month'] = train_dates.dt.month / 12         # 1..12 -> (0, 1]
# NOTE(review): dividing by 30 maps the 31st to a value slightly above 1.
# Kept as-is so the features match the ones the test set is scaled with.
traffic['day'] = train_dates.dt.day / 30

# Position of each sample on the training time span, scaled to [0, 1].
# Series.min()/max() skip NaT, unlike the builtin min()/max().
max_time = traffic['date'].max()
min_time = traffic['date'].min()
traffic['time'] = (traffic['date'] - min_time) / (max_time - min_time)

X = traffic[['time', 'weekday', 'hour', 'month', 'day']]
y = traffic['speed']

# 4% hold-out split with a fixed seed.
x_train, x_test, y_train, y_test = train_test_split(X, y, test_size=0.04, random_state=0)

In [4]:
# Load the test data with the same parsing options as the training file.
data_test = pd.read_csv("test.csv", index_col='id', parse_dates=[1], dayfirst=True)

# Convert once and derive the calendar features with the vectorised `.dt`
# accessor; the divisors are the same ones applied to the training frame.
test_dates = pd.to_datetime(data_test['date'])
data_test['hour'] = test_dates.dt.hour / 23
data_test['weekday'] = test_dates.dt.dayofweek / 6
data_test['month'] = test_dates.dt.month / 12
data_test['day'] = test_dates.dt.day / 30

# Scale with the *training* min/max (min_time / max_time) so both frames
# share one time axis; test timestamps outside that span fall outside [0, 1].
data_test['time'] = (data_test['date'] - min_time) / (max_time - min_time)

predict_x = data_test[['time', 'weekday', 'hour', 'month', 'day']]

## Part 2. Training and Forecasting

We obtained three XGBoost models that each achieved relatively good results. To improve generalization, we ensemble them using two levels of model stacking: Level 1 consists of three XGBoost models with different hyper-parameters, and Level 2 uses LinearRegression to fit the Level 1 predictions.

In [5]:
class Ensemble(object):
    """Two-level stacking ensemble.

    Level 1 fits every base model with K-fold cross-validation and collects
    its out-of-fold predictions; level 2 fits ``stacker`` on those
    predictions.

    Parameters
    ----------
    n_splits : int
        Number of folds passed to ``KFold``.
    stacker : estimator
        Level-2 model; must provide ``fit(X, y)`` and ``predict(X)``.
    base_models : sequence of estimators
        Level-1 models; each must provide ``fit(X, y)`` and ``predict(X)``.
    random_state : int, optional
        Seed for the shuffled ``KFold`` split. Defaults to 50, the value that
        was previously hard-coded.
    """

    def __init__(self, n_splits, stacker, base_models, random_state=50):
        self.n_splits = n_splits
        self.stacker = stacker
        self.base_models = base_models
        self.random_state = random_state

    def fit_predict(self, X, y, T):
        """Fit the stack on ``(X, y)`` and return predictions for ``T``.

        Parameters
        ----------
        X : array-like
            Training features, one row per sample.
        y : array-like
            Training targets, aligned with the rows of ``X``.
        T : array-like
            Features to predict for, with the same columns as ``X``.

        Returns
        -------
        The stacker's predictions for the rows of ``T``.
        """
        X = np.array(X)
        y = np.array(y)
        T = np.array(T)

        # Materialise the folds once so every base model sees the same splits.
        folds = list(
            KFold(n_splits=self.n_splits, shuffle=True,
                  random_state=self.random_state).split(X, y)
        )

        n_models = len(self.base_models)
        # Column i holds model i's out-of-fold predictions for the training
        # rows (S_train) and its fold-averaged predictions for T (S_test).
        S_train = np.zeros((X.shape[0], n_models))
        S_test = np.zeros((T.shape[0], n_models))

        for i, clf in enumerate(self.base_models):
            S_test_i = np.zeros((T.shape[0], self.n_splits))

            for j, (train_idx, test_idx) in enumerate(folds):
                print ("Fit Model %d fold %d" % (i, j))
                # The same estimator object is refit for every fold
                # (assumes fit() discards any previous fit — true for
                # scikit-learn style estimators).
                clf.fit(X[train_idx], y[train_idx])

                # Predictions for rows the model did not train on.
                S_train[test_idx, i] = clf.predict(X[test_idx])
                S_test_i[:, j] = clf.predict(T)

            S_test[:, i] = S_test_i.mean(axis=1)

        self.stacker.fit(S_train, y)
        return self.stacker.predict(S_test)

# Hyper-parameters for the three level-1 XGBoost regressors.
#
# Original note for the first set: "xgb params: 9.86" (presumably a score from
# an earlier variant that used n_estimators=148 with otherwise identical
# settings — TODO confirm which run the number refers to).
xgb_params1 = dict(
    learning_rate=0.1,
    n_estimators=500,
    max_depth=10,
    min_child_weight=1,
    seed=0,
    subsample=0.82,
    colsample_bytree=1.0,
    gamma=0.1,
    reg_alpha=0.39,
    reg_lambda=4.6,
    n_jobs=-1,
)

# Original note for the second set: "xgb params: 9.66".
xgb_params2 = dict(
    learning_rate=0.1,
    n_estimators=500,
    max_depth=9,
    min_child_weight=1,
    seed=0,
    subsample=0.8,
    colsample_bytree=0.8,
    gamma=0.2,
    reg_alpha=3.7,
    reg_lambda=0.36,
    n_jobs=-1,
)

# The third set passes no explicit seed.
xgb_params3 = dict(
    learning_rate=0.1,
    n_estimators=184,
    max_depth=10,
    min_child_weight=1,
    subsample=0.8,
    colsample_bytree=1.0,
    gamma=0,
    reg_alpha=0,
    reg_lambda=1,
    n_jobs=-1,
)

# One regressor per parameter set, built in order.
xgb_model1, xgb_model2, xgb_model3 = [
    XGBRegressor(**params)
    for params in (xgb_params1, xgb_params2, xgb_params3)
]

# Level 2: a linear regression fitted on the 20-fold out-of-fold predictions
# of the three regressors; the result is the prediction for the test features.
level1_models = (xgb_model1, xgb_model2, xgb_model3)
stack = Ensemble(
    n_splits=20,
    stacker=LinearRegression(),
    base_models=level1_models,
)
y_predict = stack.fit_predict(X, y, predict_x)

Fit Model 0 fold 0
Fit Model 0 fold 1
Fit Model 0 fold 2
Fit Model 0 fold 3
Fit Model 0 fold 4
Fit Model 0 fold 5
Fit Model 0 fold 6
Fit Model 0 fold 7
Fit Model 0 fold 8
Fit Model 0 fold 9
Fit Model 0 fold 10
Fit Model 0 fold 11
Fit Model 0 fold 12
Fit Model 0 fold 13
Fit Model 0 fold 14
Fit Model 0 fold 15
Fit Model 0 fold 16
Fit Model 0 fold 17
Fit Model 0 fold 18
Fit Model 0 fold 19
Fit Model 1 fold 0
Fit Model 1 fold 1
Fit Model 1 fold 2
Fit Model 1 fold 3
Fit Model 1 fold 4
Fit Model 1 fold 5
Fit Model 1 fold 6
Fit Model 1 fold 7
Fit Model 1 fold 8
Fit Model 1 fold 9
Fit Model 1 fold 10
Fit Model 1 fold 11
Fit Model 1 fold 12
Fit Model 1 fold 13
Fit Model 1 fold 14
Fit Model 1 fold 15
Fit Model 1 fold 16
Fit Model 1 fold 17
Fit Model 1 fold 18
Fit Model 1 fold 19
Fit Model 2 fold 0
Fit Model 2 fold 1
Fit Model 2 fold 2
Fit Model 2 fold 3
Fit Model 2 fold 4
Fit Model 2 fold 5
Fit Model 2 fold 6
Fit Model 2 fold 7
Fit Model 2 fold 8
Fit Model 2 fold 9
Fit Model 2 fold 10
Fit Model 

In [7]:
# Wrap the stacked predictions in a single-column frame named 'speed'
# (default integer row index), then display it as the cell output.
y_out = pd.DataFrame(data=y_predict, columns=['speed'])
y_out

Unnamed: 0,speed
0,48.447185
1,48.135054
2,35.927021
3,30.940552
4,38.856981
...,...
3499,13.835074
3500,22.457817
3501,45.070239
3502,40.464029


In [8]:
# Write the predictions to disk; the frame's row index is emitted as the
# first column under the header 'id'.
y_out.to_csv(
    "xgboost_final_output.csv",
    index=True,
    index_label='id',
)