In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
from skopt.space import Real, Integer
from skopt.utils import use_named_args
from skopt import gp_minimize
from sklearn.model_selection import cross_val_score

In [3]:
from sklearn.model_selection import train_test_split
from sklearn import tree
from sklearn.metrics import accuracy_score
from sklearn.metrics import mean_squared_error
from sklearn.ensemble import GradientBoostingRegressor
import xgboost as xgb
import lightgbm as lgb

In [25]:
model_w1 = pd.read_pickle('data/model/model3_w1.pkl')
model_w2 = pd.read_pickle('data/model/model3_w2.pkl')
model_w3 = pd.read_pickle('data/model/model3_w3.pkl')
model_w4 = pd.read_pickle('data/model/model3_w4.pkl')

In [54]:
model_w1['mean_time_events'] = model_w1['mean_time_events'].fillna(0)
model_w1['wifi_mean'] = model_w1['wifi_mean'].fillna(model_w1['wifi_mean'].mean())
model_w1['diff_events'] = model_w1['diff_events'].fillna(0)
model_w1['first_event_sec'] = model_w1['first_event_sec'].fillna(0)
model_w1['last_event_sec'] = model_w1['last_event_sec'].fillna(0)
model_w1['attributed_mean'] = model_w1['attributed_mean'].fillna(0)

In [55]:
model_w1['n_events'] = model_w1['n_events'].fillna(0)
model_w2['n_events'] = model_w2['n_events'].fillna(0)
model_w3['n_events'] = model_w3['n_events'].fillna(0)
model_w4['n_events'] = model_w4['n_events'].fillna(0)

In [56]:
model = pd.concat([model_w1, model_w2, model_w3, model_w4], ignore_index=True, sort=False)

In [11]:
model.to_pickle('data/model/model1.pkl')

In [64]:
model[model['time_appearence'] < 259200].shape

(37507, 31)

**Modelos**

In [28]:
model.drop(columns='ref_hash', inplace=True)

In [29]:
max_seconds = 87000

In [30]:
train, test = train_test_split(model, test_size=0.2)

In [31]:
train_Y = train['time_appearence']
train_X = train.drop(columns=['time_appearence'])
test_Y = test['time_appearence']
test_X = test.drop(columns=['time_appearence'])

**Base (promedio)**

In [32]:
mean_time = train_Y.mean()
np.sqrt(mean_squared_error(train_Y, np.full(train_Y.shape, mean_time)))

86024.3953094584

In [33]:
np.sqrt(mean_squared_error(test_Y, np.full(test_Y.shape, mean_time)))

86207.59441424183

**XGBoost**

In [65]:
xg_reg = xgb.XGBRegressor(objective ='reg:linear', colsample_bytree = 1, learning_rate = 0.1,
                max_depth = 14, alpha = 0, n_estimators = 200, gamma = 0)
xg_reg.fit(train_X, train_Y)

XGBRegressor(alpha=0, base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bytree=1, gamma=0, learning_rate=0.1, max_delta_step=0,
       max_depth=14, min_child_weight=1, missing=None, n_estimators=200,
       n_jobs=1, nthread=None, objective='reg:linear', random_state=0,
       reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
       silent=True, subsample=1)

In [46]:
np.sqrt(mean_squared_error(train_Y, xg_reg.predict(train_X)))/max_seconds

0.24475502880601538

In [47]:
np.sqrt(mean_squared_error(test_Y, xg_reg.predict(test_X)))/max_seconds

0.8700660512450107

In [48]:
np.sqrt(mean_squared_error(train_Y, xg_reg.predict(train_X)))

21293.687506123337

In [49]:
np.sqrt(mean_squared_error(test_Y, xg_reg.predict(test_X)))

75695.74645831593

In [16]:
space  = [Integer(80, 100, name='n_estimators'),
          Real(10**-5, 10**0, "log-uniform", name='learning_rate'),
          Integer(8, 15, name='max_depth'),
          Real(0.01, 0.5, "log-uniform", name='alpha'),
          Real(0.05, 1, name='colsample_bytree')]

# this decorator allows your objective function to receive a the parameters as
# keyword arguments. This is particularly convenient when you want to set scikit-learn
# estimator parameters
@use_named_args(space)
def objective(**params):
    xg_reg.set_params(**params)

    return -np.mean(cross_val_score(xg_reg, train_X, train_Y, cv=5, n_jobs=-1,
                                    scoring="neg_mean_absolute_error"))

res_gp = gp_minimize(objective, space, n_calls=50, random_state=0)

KeyboardInterrupt: 

In [98]:
res_gp

          fun: 46791.59921416599
    func_vals: array([53302.18774591, 55908.08816728, 49897.50923364, 54500.26530089,
       52474.47354897, 47802.56045926, 51507.2941305 , 56755.03832791,
       54297.66881425, 54950.68425361, 46959.18236723, 57236.31920252,
       46955.38511696, 46959.5869603 , 46957.21574916, 46957.21574916,
       52937.21386543, 46955.47322116, 46952.75822945, 46950.40576502,
       46948.36594847, 46952.19743406, 46952.96500169, 46953.01394625,
       46953.10311067, 46953.92190269, 46953.88803166, 46953.06934106,
       46953.86868935, 46952.96303067, 46952.96353762, 46951.89323727,
       46951.88656024, 46952.14221605, 47052.63103154, 47771.65010067,
       46805.32557035, 46888.49285122, 47274.04960373, 46813.0681015 ,
       48935.72359859, 46811.9639108 , 46888.45301444, 48344.67448439,
       47274.08474032, 46797.39209552, 46886.67221753, 46801.18192607,
       46791.59921417, 46948.50109856])
       models: [GaussianProcessRegressor(alpha=1e-10, copy_X

**lightgbm**

In [60]:
lgb_train = lgb.Dataset(train_X, train_Y)
lgb_eval = lgb.Dataset(test_X, test_Y, reference=lgb_train, silent=True)

In [61]:
params = {
    'boosting_type': 'gbdt',
    'objective': 'regression',
    'metric': {'l2_root'},
    'num_leaves': 30,
    'max_bin': 400,
    'learning_rate': 0.005,
    'verbose': 0
}

gbm_model = lgb.train(params,
                lgb_train,
                num_boost_round=1000,
                valid_sets=lgb_eval,
                early_stopping_rounds=10,
                verbose_eval=50)

pred_X = gbm_model.predict(train_X, num_iteration=gbm_model.best_iteration)
pred_Y = gbm_model.predict(test_X, num_iteration=gbm_model.best_iteration)

Training until validation scores don't improve for 10 rounds.
[50]	valid_0's rmse: 81836.2
[100]	valid_0's rmse: 79031.5
[150]	valid_0's rmse: 77206.5
[200]	valid_0's rmse: 76040.5
[250]	valid_0's rmse: 75268.2
[300]	valid_0's rmse: 74781.1
[350]	valid_0's rmse: 74467.7
[400]	valid_0's rmse: 74256.2
[450]	valid_0's rmse: 74135.5
[500]	valid_0's rmse: 74034.1
[550]	valid_0's rmse: 73936.2
[600]	valid_0's rmse: 73889.2
[650]	valid_0's rmse: 73848.3
[700]	valid_0's rmse: 73828.8
[750]	valid_0's rmse: 73813.9
[800]	valid_0's rmse: 73796.7
[850]	valid_0's rmse: 73783.4
[900]	valid_0's rmse: 73775.8
[950]	valid_0's rmse: 73763.3
[1000]	valid_0's rmse: 73753
Did not meet early stopping. Best iteration is:
[997]	valid_0's rmse: 73753


In [62]:
np.sqrt(mean_squared_error(train_Y, pred_X))

70977.52480586054

In [63]:
np.sqrt(mean_squared_error(test_Y, pred_Y))

73753.02042745467

**Promedio ponderado**

In [79]:
np.sqrt(mean_squared_error(test_Y, (xg_reg.predict(test_X)*0.245 + pred_Y*0.755)))

60849.75144474075

**Tuning XGBoost**

In [None]:
space  = [Integer(1, 5, name='num_leaves'),
          Real(10**-5, 10**0, "log-uniform", name='learning_rate'),
          Integer(1, n_features, name='max_features'),
          Integer(2, 100, name='min_samples_split'),
          Integer(1, 100, name='min_samples_leaf')]

# this decorator allows your objective function to receive a the parameters as
# keyword arguments. This is particularly convenient when you want to set scikit-learn
# estimator parameters
@use_named_args(space)
def objective(**params):
    reg.set_params(**params)

    return -np.mean(cross_val_score(reg, X, y, cv=5, n_jobs=-1,
                                    scoring="neg_mean_absolute_error"))

res_gp = gp_minimize(objective, space, n_calls=50, random_state=0)

In [30]:
xgtrain = xgb.DMatrix(train_X, label=train_Y)
xgb_param = xg_reg.get_xgb_params()

cvresult = xgb.cv(xgb_param, xgtrain, num_boost_round=40, nfold=4, metrics=['mse'],
     early_stopping_rounds=10, stratified=True, seed=1301)



XGBoostError: b'[20:35:32] src/metric/metric.cc:21: Unknown metric function mse'

In [28]:
cvresult.

Unnamed: 0,train-rmse-mean,train-rmse-std,test-rmse-mean,test-rmse-std
0,100281.798828,25613.922274,145665.577149,61941.546968
1,89032.490235,20409.584995,138748.616211,58368.299514
2,80205.820313,16045.716553,134218.974609,53522.321695
3,72801.069336,12425.053146,131373.916992,48979.007946
4,66812.198242,9596.972955,129332.792969,44736.917297
5,61750.365235,7023.738447,127920.806641,41054.549212
6,57445.105469,4770.60072,126959.982422,38009.871599
7,54163.695313,3207.206254,126242.810547,35500.045521
8,51293.398438,1889.452937,125791.90039,33422.193878
9,48998.215821,827.16821,125467.099609,31758.300831


**Tuning**

In [72]:
from sklearn.model_selection import GridSearchCV

In [123]:
parameters = {'learning_rate':[0.18, 0.2, 0.22], 'max_depth':[11,12], 'gamma':[0], 'n_estimators':[150]}

In [124]:
xg_reg_cv = xgb.XGBRegressor(objective ='reg:linear', colsample_bytree = 1)

In [125]:
clf = GridSearchCV(xg_reg_cv, parameters, scoring='neg_mean_squared_error', verbose=10)
clf.fit(train_X, train_Y)

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


Fitting 3 folds for each of 6 candidates, totalling 18 fits
[CV] gamma=0, learning_rate=0.18, max_depth=11, n_estimators=150 .....
[CV]  gamma=0, learning_rate=0.18, max_depth=11, n_estimators=150, score=-1281755585.9969873, total=  11.4s
[CV] gamma=0, learning_rate=0.18, max_depth=11, n_estimators=150 .....


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:   12.1s remaining:    0.0s


[CV]  gamma=0, learning_rate=0.18, max_depth=11, n_estimators=150, score=-1301970233.8002338, total=  11.5s
[CV] gamma=0, learning_rate=0.18, max_depth=11, n_estimators=150 .....


[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:   24.3s remaining:    0.0s


[CV]  gamma=0, learning_rate=0.18, max_depth=11, n_estimators=150, score=-1342804785.92895, total=  11.6s
[CV] gamma=0, learning_rate=0.18, max_depth=12, n_estimators=150 .....


[Parallel(n_jobs=1)]: Done   3 out of   3 | elapsed:   36.6s remaining:    0.0s


[CV]  gamma=0, learning_rate=0.18, max_depth=12, n_estimators=150, score=-1311504456.394685, total=  12.6s
[CV] gamma=0, learning_rate=0.18, max_depth=12, n_estimators=150 .....


[Parallel(n_jobs=1)]: Done   4 out of   4 | elapsed:   50.0s remaining:    0.0s


[CV]  gamma=0, learning_rate=0.18, max_depth=12, n_estimators=150, score=-1325058356.0538707, total=  12.8s
[CV] gamma=0, learning_rate=0.18, max_depth=12, n_estimators=150 .....


[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed:  1.1min remaining:    0.0s


[CV]  gamma=0, learning_rate=0.18, max_depth=12, n_estimators=150, score=-1335820461.9589322, total=  12.7s
[CV] gamma=0, learning_rate=0.2, max_depth=11, n_estimators=150 ......


[Parallel(n_jobs=1)]: Done   6 out of   6 | elapsed:  1.3min remaining:    0.0s


[CV]  gamma=0, learning_rate=0.2, max_depth=11, n_estimators=150, score=-1293190649.2683094, total=  11.5s
[CV] gamma=0, learning_rate=0.2, max_depth=11, n_estimators=150 ......


[Parallel(n_jobs=1)]: Done   7 out of   7 | elapsed:  1.5min remaining:    0.0s


[CV]  gamma=0, learning_rate=0.2, max_depth=11, n_estimators=150, score=-1356541355.417574, total=  11.4s
[CV] gamma=0, learning_rate=0.2, max_depth=11, n_estimators=150 ......


[Parallel(n_jobs=1)]: Done   8 out of   8 | elapsed:  1.7min remaining:    0.0s


[CV]  gamma=0, learning_rate=0.2, max_depth=11, n_estimators=150, score=-1344515054.9723446, total=  11.4s
[CV] gamma=0, learning_rate=0.2, max_depth=12, n_estimators=150 ......


[Parallel(n_jobs=1)]: Done   9 out of   9 | elapsed:  1.9min remaining:    0.0s


[CV]  gamma=0, learning_rate=0.2, max_depth=12, n_estimators=150, score=-1301724010.7581942, total=  12.7s
[CV] gamma=0, learning_rate=0.2, max_depth=12, n_estimators=150 ......
[CV]  gamma=0, learning_rate=0.2, max_depth=12, n_estimators=150, score=-1348194970.4823806, total=  13.1s
[CV] gamma=0, learning_rate=0.2, max_depth=12, n_estimators=150 ......
[CV]  gamma=0, learning_rate=0.2, max_depth=12, n_estimators=150, score=-1348968277.9211082, total=  13.3s
[CV] gamma=0, learning_rate=0.22, max_depth=11, n_estimators=150 .....
[CV]  gamma=0, learning_rate=0.22, max_depth=11, n_estimators=150, score=-1278433841.2215583, total=  12.4s
[CV] gamma=0, learning_rate=0.22, max_depth=11, n_estimators=150 .....
[CV]  gamma=0, learning_rate=0.22, max_depth=11, n_estimators=150, score=-1341381089.21317, total=  11.8s
[CV] gamma=0, learning_rate=0.22, max_depth=11, n_estimators=150 .....
[CV]  gamma=0, learning_rate=0.22, max_depth=11, n_estimators=150, score=-1347093946.4657004, total=  11.7s
[C

[Parallel(n_jobs=1)]: Done  18 out of  18 | elapsed:  3.9min finished


GridSearchCV(cv='warn', error_score='raise-deprecating',
       estimator=XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bytree=1, gamma=0, learning_rate=0.1, max_delta_step=0,
       max_depth=3, min_child_weight=1, missing=None, n_estimators=100,
       n_jobs=1, nthread=None, objective='reg:linear', random_state=0,
       reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
       silent=True, subsample=1),
       fit_params=None, iid='warn', n_jobs=None,
       param_grid={'learning_rate': [0.18, 0.2, 0.22], 'max_depth': [11, 12], 'gamma': [0], 'n_estimators': [150]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring='neg_mean_squared_error', verbose=10)

In [126]:
np.sqrt(mean_squared_error(train_Y, clf.predict(train_X)))/max_seconds

0.09414913459614753

In [127]:
np.sqrt(mean_squared_error(test_Y, clf.predict(test_X)))/max_seconds

0.2620828689044466

In [128]:
clf.best_params_

{'gamma': 0, 'learning_rate': 0.18, 'max_depth': 11, 'n_estimators': 150}