# Get out-of-fold predictions from xgboost.cv in Python

## OOF Predictions in R

In [None]:
# Out-of-fold predictions with xgb.cv on the mtcars data set.
library(xgboost)
data(mtcars)

# Booster settings: depth-1 trees and a small learning rate.
xgb_params <- list(max_depth = 1, eta = 0.01)

# Design matrix without an intercept column; mpg is the response.
x <- model.matrix(mpg ~ 0 + ., mtcars)
train <- xgb.DMatrix(x, label = mtcars$mpg)

# 100 rounds of 5-fold CV; prediction = TRUE requests the predictions
# that are read back from res$pred below.
res <- xgb.cv(params = xgb_params, data = train, nrounds = 100,
              nfold = 5, prediction = TRUE)
print(head(res$pred))

## OOF Predictions in Python

### 1. Using the sklearn wrapper for xgboost

In [None]:
import xgboost as xgb
# cross_val_predict lives in sklearn.model_selection; the former
# sklearn.cross_validation module is no longer available in scikit-learn.
from sklearn.model_selection import cross_val_predict as cvp
from sklearn import datasets

# Load iris once; keep only the first two feature columns.
iris = datasets.load_iris()
X = iris.data[:, :2]
y = iris.target

# With cv=3 every row is predicted by a model fitted on the other folds,
# so y_pred holds one out-of-fold prediction per row of X.
xgb_model = xgb.XGBRegressor()
y_pred = cvp(xgb_model, X, y, cv=3, n_jobs=1)
y_pred

### 2. A Hacky Callback Solution

In [None]:
def oof_prediction():
    """
    Build an ``xgb.cv`` callback that records out-of-fold predictions.

    Dirty global variable callback hack: the callback writes into the
    module-level ``cv_prediction_dict``, which must already exist when the
    callback runs.

    Returns:
        callable: ``callback(env)``. On every call it predicts each fold's
        ``dtest`` with that fold's booster and stores the results under
        ``cv_prediction_dict['cv']``, replacing the previous value.
    """

    global cv_prediction_dict

    def callback(env):
        """Predict every CV fold's test split and stash the results."""
        # Walk over whatever folds are present instead of assuming exactly
        # five, so any ``nfold`` value works.
        # Each prediction stays wrapped in a one-element list so the stored
        # structure is the same as before.
        cv_prediction_dict['cv'] = [
            [fold.bst.predict(fold.dtest)] for fold in env.cvfolds
        ]

    return callback

Now we can call the callback from xgboost.cv() as follows.

In [None]:
# Container the callback writes into; it has to exist before xgb.cv runs.
cv_prediction_dict = {}
# NOTE(review): in the visible notebook xgb_params and train are only created
# in the R cell; Python equivalents (a params dict and an xgb.DMatrix) must be
# defined before this cell is run - TODO confirm.
xgb.cv(xgb_params, train, 100, callbacks=[oof_prediction()], nfold=5)
# Shallow copy, so a later run that rebinds cv_prediction_dict['cv'] does not
# change this snapshot.
pos_oof_predictions = cv_prediction_dict.copy()

### 3. Model Callback Solution

In [None]:
import os
def cv_misc_callback(model_dir: str = None, oof_preds: list = None, maximize=True):
    """
    Build an ``xgb.cv`` callback that keeps only the best round's artifacts.

    To reduce memory and disk storage, only the models and out-of-fold
    predictions of the best-scoring round so far are stored.

    NOTE(review): the original note says that for classification the stored
    preds are scores before applying sigmoid - confirm for the objective in
    use.

    Args:
        model_dir: Directory that receives one ``{i}.model`` file per fold.
            Created (including missing parents) on the first call. ``None``
            disables model saving.
        oof_preds: List that is extended in place with one slot per fold on
            the first call; slot ``i`` is overwritten with fold ``i``'s
            ``dtest`` predictions whenever the score improves. ``None``
            disables prediction capture.
        maximize: If true a higher score is better, otherwise a lower one.

    Returns:
        callable: ``callback(env)`` with ``before_iteration = False``. The
        score it tracks is the value of the last entry in
        ``env.evaluation_result_list``.
    """
    state = {}

    def init(env):
        # Plain float infinities: no numpy needed just for a sentinel.
        state['best_score'] = float('-inf') if maximize else float('inf')

        if model_dir is not None:
            # Handles nested paths and an already existing directory.
            os.makedirs(model_dir, exist_ok=True)

        if oof_preds is not None:
            # One placeholder per fold, filled in by callback().
            oof_preds.extend([None] * len(env.cvfolds))

    def callback(env):
        # Lazy one-time setup: env is only available once xgb.cv is running.
        if not state:
            init(env)

        score = env.evaluation_result_list[-1][1]
        best_score = state['best_score']
        improved = score > best_score if maximize else score < best_score
        if not improved:
            return

        for i, cvpack in enumerate(env.cvfolds):
            if model_dir is not None:
                cvpack.bst.save_model(f'{model_dir}/{i}.model')
            if oof_preds is not None:
                oof_preds[i] = cvpack.bst.predict(cvpack.dtest)
        state['best_score'] = score

    callback.before_iteration = False
    return callback

CV Code

In [None]:
# NOTE(review): eval_res is not read anywhere in the visible notebook.
eval_res = []
# Filled in place by cv_misc_callback: one entry per CV fold.
oof_preds = []
# NOTE(review): params, dtrain, folds and RANDOM_SEED are not defined in the
# visible cells; they must exist in the kernel before this cell runs.
# Best-round fold models are written to ./models by cv_misc_callback.
history = xgb.cv(params, dtrain, num_boost_round=1000,
                 folds=folds, early_stopping_rounds=40, seed=RANDOM_SEED,
                 callbacks=[cv_misc_callback('./models', oof_preds), xgb.callback.print_evaluation(period=10)])

Map the per-fold prediction list back onto the rows of the training data

In [None]:
# NOTE(review): sigmoid is defined in a later cell and av_data is not defined
# in the visible notebook; both must exist before this cell runs.
# One output slot per row of av_data, filled fold by fold below.
oof_preds_proba = np.zeros(av_data.shape[0])
# Assumes folds is a re-iterable sequence of (train_idx, val_idx) pairs in the
# same order that was passed to xgb.cv - TODO confirm.
for i, (trn_idx, val_idx) in enumerate(folds):
    # Write fold i's predictions into the rows of its validation indices.
    oof_preds_proba[val_idx] = sigmoid(oof_preds[i])

In [None]:
@jit
def sigmoid(x):
    # Logistic function: 1 / (1 + e^(-x)).
    neg_exp = np.exp(-x)
    return 1 / (1 + neg_exp)

In [None]:
import numpy as np 
import pandas as pd 
from xgboost import XGBRegressor
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import cross_val_predict

# Read Data
# NOTE(review): this rebinds `train`, which an earlier cell passes to xgb.cv,
# and below `xgb_model`, which an earlier cell also defines - confirm that
# no cell relies on the old values after this one has run.
print("Reading Dataset...")
train = pd.read_csv("../input/train.csv")
target = np.array(train["target"])
target_log = np.log1p(target) # Log transform target as the evaluation metric uses it
# Features are every column from the third one onward (the first two are skipped).
xtrain = np.array(train.iloc[:,2:])
print("Shape of training data: {}".format(np.shape(xtrain)))

# Define the model; random_state is fixed at 42.
xgb_model = XGBRegressor(max_depth=6, learning_rate=0.1, n_estimators=70,
                         min_child_weight=100, subsample=1.0, 
                         colsample_bytree=0.8, colsample_bylevel=0.8,
                         random_state=42, n_jobs=4)

# Make OOF predictions using 5 folds: each row is predicted by a model that
# was fitted on the other folds. The model is fitted on the log target, so the
# predictions are on the log1p scale as well.
print("Cross Validating...")
oof_preds_log = cross_val_predict(xgb_model, xtrain, target_log, cv=5, 
                                  n_jobs=1, method="predict")
                                  
# Calculate RMSLE (RMSE of Log(1+y)): both arguments are already log1p-scaled,
# so a plain RMSE between them is the RMSLE.
cv_rmsle = np.sqrt(mean_squared_error(target_log, oof_preds_log))
print("\nOOF RMSLE Score: {:.4f}".format(cv_rmsle))

In [None]:
# Print the feature-matrix shape again (same message as in the previous cell).
print("Shape of training data: {}".format(np.shape(xtrain)))

In [None]:
def predicted_vs_actual_sale_price_xgb(self, xgb_params, x_train, y_train, seed, title_name):
    """
    Scatter-plot XGBoost predictions against actual values on a held-out split.

    The training data is split once more into a train and a test part. The
    number of boosting rounds is chosen by 4-fold ``xgb.cv`` with early
    stopping on the train part, a booster is trained with that many rounds,
    and its predictions for the test part are plotted against the true values
    together with a diagonal reference line.

    Args:
        xgb_params: Parameter dict passed to ``xgb.cv`` and ``xgb.train``.
        x_train: Feature matrix to split.
        y_train: Target values matching ``x_train``.
        seed: Seed used for both the train/test split and ``xgb.cv``.
        title_name: Prefix for the figure title.

    Returns:
        None. The figure is drawn on a new matplotlib figure.
    """
    # Split the training data into an extra set of test; seeding the split
    # makes the figure reproducible for a given ``seed``.
    x_train_split, x_test_split, y_train_split, y_test_split = train_test_split(
        x_train, y_train, random_state=seed)
    dtrain_split = xgb.DMatrix(x_train_split, label=y_train_split)
    dtest_split = xgb.DMatrix(x_test_split)

    res = xgb.cv(xgb_params, dtrain_split, num_boost_round=1000, nfold=4, seed=seed, stratified=False,
                 early_stopping_rounds=25, verbose_eval=10, show_stdv=True)

    # With early stopping the returned history ends at the best iteration, so
    # its length is the number of boosting rounds to train.
    best_nrounds = res.shape[0]
    print(np.shape(x_train_split), np.shape(x_test_split), np.shape(y_train_split), np.shape(y_test_split))
    gbdt = xgb.train(xgb_params, dtrain_split, best_nrounds)
    y_predicted = gbdt.predict(dtest_split)

    plt.figure(figsize=(10, 5))
    plt.scatter(y_test_split, y_predicted, s=20)
    rmse_pred_vs_actual = self.rmse(y_predicted, y_test_split)
    plt.title(''.join([title_name, ', Predicted vs. Actual.', ' rmse = ', str(rmse_pred_vs_actual)]))
    plt.xlabel('Actual Sale Price')
    plt.ylabel('Predicted Sale Price')
    # Diagonal reference line: points on it are predicted exactly.
    lowest, highest = min(y_test_split), max(y_test_split)
    plt.plot([lowest, highest], [lowest, highest])
    plt.tight_layout()