## Import libraries

In [1]:
pip install xgboost

Note: you may need to restart the kernel to use updated packages.


You should consider upgrading via the 'C:\Users\dom\Anaconda3\python.exe -m pip install --upgrade pip' command.


In [2]:
pip install hyperopt

Note: you may need to restart the kernel to use updated packages.


You should consider upgrading via the 'C:\Users\dom\Anaconda3\python.exe -m pip install --upgrade pip' command.


In [3]:
import numpy as np
import pickle
import xgboost as xgb
from sklearn.metrics import mean_squared_error
from hyperopt import fmin, tpe, hp, STATUS_OK, Trials
import json
from sklearn.model_selection import KFold
from tqdm import tqdm
from sklearn.base import clone
from sklearn.model_selection import cross_val_score

  import pandas.util.testing as tm


In [4]:
with open('train_data.npy', 'rb') as f:
    X_train = np.load(f)
    y_train = np.load(f)
    X_test = np.load(f)
    y_test = np.load(f)

In [5]:
#find optimal parameters for xgb.XGBRegressor with 5 evaluations
def acc_model(params):
    reg = xgb.XGBRegressor(**params)
    return cross_val_score(reg, X_train, y_train).mean()

param_space = {
    'objective': 'reg:squarederror',
    'colsample_bytree': hp.uniform('colsample_bytree', 0, 1), 
    'learning_rate': hp.quniform('learning_rate', 0, 1, 0.1),
    'max_depth': hp.choice('max_depth', range(1, 100)),
    'alpha': hp.choice('alpha', range(1, 20))
}

best = 0
def f(params):
    global best
    acc = acc_model(params)
    if acc > best:
        best = acc
    #print ('new best:', best, params)
    return {'loss': -acc, 'status': STATUS_OK}

trials = Trials()
best = fmin(f, param_space, algo=tpe.suggest, max_evals=5, trials=trials)
print ('best:')
print (best)

100%|██████████| 5/5 [05:04<00:00, 47.63s/trial, best loss: -0.761752395556704]
best:
{'alpha': 7, 'colsample_bytree': 0.6141304492838555, 'learning_rate': 0.4, 'max_depth': 31}


In [6]:
model = xgb.XGBRegressor(**best)
model.fit(X_train, y_train)
y_pred = model.predict(X_test)
RMSE1 = round(mean_squared_error(y_test, y_pred, squared=False),2)

In [7]:
print(f'The RMSE = {RMSE1}')

The RMSE = 0.46


The RSME value is low lets see if it gets smaller and the model gets better with cross validation

In [8]:
total_features = np.concatenate((X_train, X_test), axis=0)
total_price = np.concatenate((y_train, y_test), axis=0)

In [9]:
#cross validation for model training indication
data_dmatrix = xgb.DMatrix(data=total_features, label=total_price)
cv_results = xgb.cv(dtrain=data_dmatrix, params=best, nfold=3,
                    num_boost_round=50,early_stopping_rounds=10,metrics="rmse", as_pandas=True, seed=123)

In [10]:
RSME=round(cv_results["test-rmse-mean"][cv_results["test-rmse-mean"]<1].head(1).values[0],2)
rounds=cv_results["test-rmse-mean"][cv_results["test-rmse-mean"]<1].head(1).index[0]

In [11]:
print(f'The RSME improves from {round(cv_results["test-rmse-mean"].head(1).values[0],2)} to {RSME} after {rounds} rounds')

The RSME improves from 7.39 to 0.75 after 5 rounds


In [12]:
with open('model.pkl', 'wb') as output:
       pickle.dump(model, output)

![title](fin.jpg)

The RSME that I've got is 0.46. The model is training and with additional trainings becomes better. 