# XGB


In [7]:
# Inspect the distinct class labels present in the wine dataset.
# numpy is imported here so the cell runs on a fresh kernel; in document
# order no earlier cell brings `np` into scope.
import numpy as np
from sklearn.datasets import load_wine

data = load_wine()
np.unique(data.target)


array([0, 1, 2])

# XGBoost training with optional K-fold cross-validation

In [35]:
#%%time
import numpy as np
import lightgbm as lgb
import xgboost as xgb
import pandas as pd
from sklearn import datasets
from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold, StratifiedKFold
from sklearn.metrics import roc_auc_score,accuracy_score
import logging
# Log line layout: timestamp, source file, line number, level, message.
format_str = '%(asctime)s %(filename)s[%(lineno)d] %(levelname)s %(message)s'
# NOTE(review): this name shadows the builtin `format` for the rest of the
# kernel session, and the Formatter object is not referenced again in the
# visible cells (basicConfig below takes the string, not this object).
format = logging.Formatter(format_str)
# Configure the root logger at DEBUG level with the layout above.
logging.basicConfig(level=logging.DEBUG, format=format_str)
# Root logger; used by train() for per-fold progress messages.
logger = logging.getLogger()

# Iris dataset, used below to build the train/test frames.
iris=datasets.load_iris()
# NOTE(review): train() creates its own local `folds` (a KFold), so this
# module-level StratifiedKFold is not used by train() in the visible code.
folds = StratifiedKFold(n_splits=3, shuffle=True, random_state = 5)



def train(X_data,  y_data,  X_test, cv=False, num_fold=5, num_class=3, seed=15):
    """Train a multi-class XGBoost model and predict class probabilities.

    Parameters
    ----------
    X_data : pandas.DataFrame
        Training features (accessed through ``.values`` and ``.iloc``).
    y_data : pandas.DataFrame or pandas.Series
        Training labels, row-aligned with ``X_data``.
    X_test : pandas.DataFrame
        Features to predict on.
    cv : bool, default False
        If True, one model is trained per fold and the test predictions of
        all folds are averaged. If False, only the first fold is trained
        (early stopping picks the number of rounds), then a model is
        retrained on all of ``X_data`` for that many rounds and used alone.
    num_fold : int, default 5
        Number of KFold splits.
    num_class : int, default 3
        Number of target classes (width of the probability outputs).
    seed : int, default 15
        ``random_state`` for the shuffled KFold split.

    Returns
    -------
    predictions : numpy.ndarray
        Array of shape ``(len(X_test), num_class)`` with class probabilities.
    score : float
        With ``cv=True``, accuracy of the out-of-fold predictions over all of
        ``y_data``; with ``cv=False``, validation accuracy of the first fold.
    """
    folds = KFold(n_splits=num_fold, shuffle=True, random_state=seed)
    oof = np.zeros((len(y_data), num_class))
    predictions = np.zeros((len(X_test), num_class))

    # Hyper-parameters do not vary between folds, so build them once.
    # NOTE(review): `silent`, `ntree_limit` and `best_ntree_limit` belong to
    # the older xgboost API this notebook was run with; newer releases may
    # not accept them - confirm against the installed version.
    params = {'eta': 0.01,
              'max_depth': 11,
              'subsample': 0.8,
              'colsample_bytree': 0.8,
              'objective': 'multi:softprob',
              'num_class': num_class,
              'eval_metric': 'mlogloss',
              'silent': True,
              'nthread': 4}
    num_round = 30000

    # The test matrix is identical for every fold; construct it a single time.
    test_data = xgb.DMatrix(X_test)
    models_used = 0

    for fold_, (trn_idx, val_idx) in enumerate(folds.split(X_data.values, y_data.values)):
        logger.info("fold n°{}".format(fold_))

        trn_data = xgb.DMatrix(X_data.iloc[trn_idx], y_data.iloc[trn_idx])
        val_data = xgb.DMatrix(X_data.iloc[val_idx], y_data.iloc[val_idx])
        # The last entry of the watchlist drives early stopping.
        watchlist = [(trn_data, 'train'), (val_data, 'valid_data')]

        clf = xgb.train(params,
                        dtrain=trn_data,
                        num_boost_round=num_round,
                        evals=watchlist,
                        early_stopping_rounds=200,
                        verbose_eval=500,
                        )
        # Out-of-fold probabilities from the best iteration found.
        oof[val_idx] = clf.predict(val_data, ntree_limit=clf.best_ntree_limit)

        # Score only this fold's rows instead of arg-maxing the whole matrix.
        score = accuracy_score(y_data.values[val_idx], oof[val_idx].argmax(axis=1))
        logger.info(f'fold n{fold_}, best_iter:{clf.best_iteration}, score:{score:6.4f} val shape:{X_data.iloc[val_idx].shape}')

        if cv:
            predictions += clf.predict(test_data, ntree_limit=clf.best_ntree_limit)
            models_used += 1
        else:
            logger.info('Cv is disable, will train with full train data')
            # Reuse the round count chosen by early stopping on the first
            # fold, but fit on every training row (no validation set).
            all_train = xgb.DMatrix(X_data, y_data)
            clf = xgb.train(params,
                            dtrain=all_train,
                            num_boost_round=clf.best_ntree_limit,
                            verbose_eval=500,
                            )
            predictions += clf.predict(test_data, ntree_limit=clf.best_ntree_limit)
            models_used += 1
            break

    # Average the accumulated probabilities over the models that contributed.
    predictions = predictions / models_used
    if cv:
        # Overall accuracy across all out-of-fold predictions.
        score = accuracy_score(y_data.values, oof.argmax(axis=1))
    return predictions, score

# Wrap the iris arrays in pandas objects; train() indexes with .iloc/.values.
train_data = pd.DataFrame(iris.data, columns=iris.feature_names)
train_target = pd.DataFrame(iris.target, columns=['label'])

# Hold out 30% of the rows. A fixed random_state makes the split, and
# therefore the reported score, reproducible across kernel restarts.
X_train, X_test, y_train, y_test = train_test_split(
    train_data, train_target, test_size=0.3, random_state=15)

# Cross-validated training: returns averaged test probabilities and the
# out-of-fold accuracy on the training portion.
predictions, score = train(X_train, y_train, X_test, cv=True)

print(predictions.shape, score)
 

2019-04-17 06:03:29,492 <ipython-input-35-656408a59eee>[32] INFO fold n°0


[0]	train-mlogloss:1.08553	valid_data-mlogloss:1.08534
Multiple eval metrics have been passed: 'valid_data-mlogloss' will be used for early stopping.

Will train until valid_data-mlogloss hasn't improved in 200 rounds.
[500]	train-mlogloss:0.042973	valid_data-mlogloss:0.030253
[1000]	train-mlogloss:0.028812	valid_data-mlogloss:0.022452
[1500]	train-mlogloss:0.025265	valid_data-mlogloss:0.021684
[2000]	train-mlogloss:0.023576	valid_data-mlogloss:0.020708
[2500]	train-mlogloss:0.022735	valid_data-mlogloss:0.020126
[3000]	train-mlogloss:0.022149	valid_data-mlogloss:0.019587
[3500]	train-mlogloss:0.021808	valid_data-mlogloss:0.019356
Stopping. Best iteration:
[3619]	train-mlogloss:0.021757	valid_data-mlogloss:0.019316



2019-04-17 06:03:30,111 <ipython-input-35-656408a59eee>[65] INFO fold n0, best_iter:3619, score:1.0000 val shape:(21, 4)
2019-04-17 06:03:30,115 <ipython-input-35-656408a59eee>[32] INFO fold n°1


[0]	train-mlogloss:1.085	valid_data-mlogloss:1.08626
Multiple eval metrics have been passed: 'valid_data-mlogloss' will be used for early stopping.

Will train until valid_data-mlogloss hasn't improved in 200 rounds.
[500]	train-mlogloss:0.026665	valid_data-mlogloss:0.163585
Stopping. Best iteration:
[517]	train-mlogloss:0.025913	valid_data-mlogloss:0.163016



2019-04-17 06:03:30,273 <ipython-input-35-656408a59eee>[65] INFO fold n1, best_iter:517, score:0.9524 val shape:(21, 4)
2019-04-17 06:03:30,274 <ipython-input-35-656408a59eee>[32] INFO fold n°2


[0]	train-mlogloss:1.08517	valid_data-mlogloss:1.08504
Multiple eval metrics have been passed: 'valid_data-mlogloss' will be used for early stopping.

Will train until valid_data-mlogloss hasn't improved in 200 rounds.
[500]	train-mlogloss:0.039932	valid_data-mlogloss:0.068637
[1000]	train-mlogloss:0.026786	valid_data-mlogloss:0.06184
[1500]	train-mlogloss:0.023694	valid_data-mlogloss:0.05968
[2000]	train-mlogloss:0.022064	valid_data-mlogloss:0.058123
[2500]	train-mlogloss:0.021288	valid_data-mlogloss:0.057166
[3000]	train-mlogloss:0.020879	valid_data-mlogloss:0.056038
Stopping. Best iteration:
[3078]	train-mlogloss:0.02084	valid_data-mlogloss:0.055746



2019-04-17 06:03:30,862 <ipython-input-35-656408a59eee>[65] INFO fold n2, best_iter:3078, score:1.0000 val shape:(21, 4)
2019-04-17 06:03:30,866 <ipython-input-35-656408a59eee>[32] INFO fold n°3


[0]	train-mlogloss:1.08553	valid_data-mlogloss:1.08519
Multiple eval metrics have been passed: 'valid_data-mlogloss' will be used for early stopping.

Will train until valid_data-mlogloss hasn't improved in 200 rounds.
[500]	train-mlogloss:0.041934	valid_data-mlogloss:0.04188
Stopping. Best iteration:
[740]	train-mlogloss:0.032161	valid_data-mlogloss:0.036333



2019-04-17 06:03:31,078 <ipython-input-35-656408a59eee>[65] INFO fold n3, best_iter:740, score:1.0000 val shape:(21, 4)
2019-04-17 06:03:31,079 <ipython-input-35-656408a59eee>[32] INFO fold n°4


[0]	train-mlogloss:1.08548	valid_data-mlogloss:1.0853
Multiple eval metrics have been passed: 'valid_data-mlogloss' will be used for early stopping.

Will train until valid_data-mlogloss hasn't improved in 200 rounds.
[500]	train-mlogloss:0.043437	valid_data-mlogloss:0.024832
[1000]	train-mlogloss:0.029696	valid_data-mlogloss:0.013904
[1500]	train-mlogloss:0.02638	valid_data-mlogloss:0.011739
[2000]	train-mlogloss:0.024558	valid_data-mlogloss:0.010513
[2500]	train-mlogloss:0.023615	valid_data-mlogloss:0.01
[3000]	train-mlogloss:0.022976	valid_data-mlogloss:0.009582
[3500]	train-mlogloss:0.022571	valid_data-mlogloss:0.009378
[4000]	train-mlogloss:0.02228	valid_data-mlogloss:0.009254
[4500]	train-mlogloss:0.022054	valid_data-mlogloss:0.0091
Stopping. Best iteration:
[4647]	train-mlogloss:0.021983	valid_data-mlogloss:0.009026



2019-04-17 06:03:31,946 <ipython-input-35-656408a59eee>[65] INFO fold n4, best_iter:4647, score:1.0000 val shape:(21, 4)


(45, 3) 0.9904761904761905


In [6]:
# Interactive IPython help lookup for xgb.train (development aid only).
xgb.train?

# Normal