In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [3]:
from sklearn.model_selection import train_test_split
from sklearn import tree
from sklearn.metrics import accuracy_score
from sklearn.metrics import mean_squared_error
from sklearn.ensemble import GradientBoostingRegressor
import xgboost as xgb

In [12]:
model_w1 = pd.read_pickle('data/model/model3_w1.pkl')

In [58]:
model_w1['mean_time_events'] = model_w1['mean_time_events'].fillna(0)
model_w1['wifi_mean'] = model_w1['wifi_mean'].fillna(model_w1['wifi_mean'].mean())
model_w1['diff_events'] = model_w1['diff_events'].fillna(0)
model_w1['n_events'] = model_w1['n_events'].fillna(0)
model_w1['first_event_sec'] = model_w1['first_event_sec'].fillna(0)
model_w1['last_event_sec'] = model_w1['last_event_sec'].fillna(0)
model_w1['attributed_mean'] = model_w1['attributed_mean'].fillna(0)

In [105]:
model_w1.to_pickle('data/model/model2_w1.pkl')

**XGBoost**

In [13]:
model_w1.drop(columns='ref_hash', inplace=True)

In [14]:
max_seconds = 129600

In [15]:
train, test = train_test_split(model_w1, test_size=0.2)

In [16]:
train_Y = train['time_appearence']
train_X = train.drop(columns=['time_appearence'])
test_Y = test['time_appearence']
test_X = test.drop(columns=['time_appearence'])

In [17]:
xg_reg = xgb.XGBRegressor(objective ='reg:linear', colsample_bytree = 1, learning_rate = 0.18,
                max_depth = 11, alpha = 0, n_estimators = 150, gamma = 0)
xg_reg.fit(train_X, train_Y)

XGBRegressor(alpha=0, base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bytree=1, gamma=0, learning_rate=0.18, max_delta_step=0,
       max_depth=11, min_child_weight=1, missing=None, n_estimators=150,
       n_jobs=1, nthread=None, objective='reg:linear', random_state=0,
       reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
       silent=True, subsample=1)

In [18]:
np.sqrt(mean_squared_error(train_Y, xg_reg.predict(train_X)))/max_seconds

0.009208830147866822

In [19]:
np.sqrt(mean_squared_error(test_Y, xg_reg.predict(test_X)))/max_seconds

0.664073356049899

In [117]:
mean_squared_error(train_Y, xg_reg.predict(train_X))

544193017.3319901

In [89]:
mean_squared_error(test_Y, xg_reg.predict(test_X))

1212671822.2865417

**Tuning**

In [72]:
from sklearn.model_selection import GridSearchCV

In [123]:
parameters = {'learning_rate':[0.18, 0.2, 0.22], 'max_depth':[11,12], 'gamma':[0], 'n_estimators':[150]}

In [124]:
xg_reg_cv = xgb.XGBRegressor(objective ='reg:linear', colsample_bytree = 1)

In [125]:
clf = GridSearchCV(xg_reg_cv, parameters, scoring='neg_mean_squared_error', verbose=10)
clf.fit(train_X, train_Y)

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


Fitting 3 folds for each of 6 candidates, totalling 18 fits
[CV] gamma=0, learning_rate=0.18, max_depth=11, n_estimators=150 .....
[CV]  gamma=0, learning_rate=0.18, max_depth=11, n_estimators=150, score=-1281755585.9969873, total=  11.4s
[CV] gamma=0, learning_rate=0.18, max_depth=11, n_estimators=150 .....


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:   12.1s remaining:    0.0s


[CV]  gamma=0, learning_rate=0.18, max_depth=11, n_estimators=150, score=-1301970233.8002338, total=  11.5s
[CV] gamma=0, learning_rate=0.18, max_depth=11, n_estimators=150 .....


[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:   24.3s remaining:    0.0s


[CV]  gamma=0, learning_rate=0.18, max_depth=11, n_estimators=150, score=-1342804785.92895, total=  11.6s
[CV] gamma=0, learning_rate=0.18, max_depth=12, n_estimators=150 .....


[Parallel(n_jobs=1)]: Done   3 out of   3 | elapsed:   36.6s remaining:    0.0s


[CV]  gamma=0, learning_rate=0.18, max_depth=12, n_estimators=150, score=-1311504456.394685, total=  12.6s
[CV] gamma=0, learning_rate=0.18, max_depth=12, n_estimators=150 .....


[Parallel(n_jobs=1)]: Done   4 out of   4 | elapsed:   50.0s remaining:    0.0s


[CV]  gamma=0, learning_rate=0.18, max_depth=12, n_estimators=150, score=-1325058356.0538707, total=  12.8s
[CV] gamma=0, learning_rate=0.18, max_depth=12, n_estimators=150 .....


[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed:  1.1min remaining:    0.0s


[CV]  gamma=0, learning_rate=0.18, max_depth=12, n_estimators=150, score=-1335820461.9589322, total=  12.7s
[CV] gamma=0, learning_rate=0.2, max_depth=11, n_estimators=150 ......


[Parallel(n_jobs=1)]: Done   6 out of   6 | elapsed:  1.3min remaining:    0.0s


[CV]  gamma=0, learning_rate=0.2, max_depth=11, n_estimators=150, score=-1293190649.2683094, total=  11.5s
[CV] gamma=0, learning_rate=0.2, max_depth=11, n_estimators=150 ......


[Parallel(n_jobs=1)]: Done   7 out of   7 | elapsed:  1.5min remaining:    0.0s


[CV]  gamma=0, learning_rate=0.2, max_depth=11, n_estimators=150, score=-1356541355.417574, total=  11.4s
[CV] gamma=0, learning_rate=0.2, max_depth=11, n_estimators=150 ......


[Parallel(n_jobs=1)]: Done   8 out of   8 | elapsed:  1.7min remaining:    0.0s


[CV]  gamma=0, learning_rate=0.2, max_depth=11, n_estimators=150, score=-1344515054.9723446, total=  11.4s
[CV] gamma=0, learning_rate=0.2, max_depth=12, n_estimators=150 ......


[Parallel(n_jobs=1)]: Done   9 out of   9 | elapsed:  1.9min remaining:    0.0s


[CV]  gamma=0, learning_rate=0.2, max_depth=12, n_estimators=150, score=-1301724010.7581942, total=  12.7s
[CV] gamma=0, learning_rate=0.2, max_depth=12, n_estimators=150 ......
[CV]  gamma=0, learning_rate=0.2, max_depth=12, n_estimators=150, score=-1348194970.4823806, total=  13.1s
[CV] gamma=0, learning_rate=0.2, max_depth=12, n_estimators=150 ......
[CV]  gamma=0, learning_rate=0.2, max_depth=12, n_estimators=150, score=-1348968277.9211082, total=  13.3s
[CV] gamma=0, learning_rate=0.22, max_depth=11, n_estimators=150 .....
[CV]  gamma=0, learning_rate=0.22, max_depth=11, n_estimators=150, score=-1278433841.2215583, total=  12.4s
[CV] gamma=0, learning_rate=0.22, max_depth=11, n_estimators=150 .....
[CV]  gamma=0, learning_rate=0.22, max_depth=11, n_estimators=150, score=-1341381089.21317, total=  11.8s
[CV] gamma=0, learning_rate=0.22, max_depth=11, n_estimators=150 .....
[CV]  gamma=0, learning_rate=0.22, max_depth=11, n_estimators=150, score=-1347093946.4657004, total=  11.7s
[C

[Parallel(n_jobs=1)]: Done  18 out of  18 | elapsed:  3.9min finished


GridSearchCV(cv='warn', error_score='raise-deprecating',
       estimator=XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bytree=1, gamma=0, learning_rate=0.1, max_delta_step=0,
       max_depth=3, min_child_weight=1, missing=None, n_estimators=100,
       n_jobs=1, nthread=None, objective='reg:linear', random_state=0,
       reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
       silent=True, subsample=1),
       fit_params=None, iid='warn', n_jobs=None,
       param_grid={'learning_rate': [0.18, 0.2, 0.22], 'max_depth': [11, 12], 'gamma': [0], 'n_estimators': [150]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring='neg_mean_squared_error', verbose=10)

In [126]:
np.sqrt(mean_squared_error(train_Y, clf.predict(train_X)))/max_seconds

0.09414913459614753

In [127]:
np.sqrt(mean_squared_error(test_Y, clf.predict(test_X)))/max_seconds

0.2620828689044466

In [128]:
clf.best_params_

{'gamma': 0, 'learning_rate': 0.18, 'max_depth': 11, 'n_estimators': 150}