# Idea

Research applicability of XGBoost package.

TODO:
- [x] Shall add Cut Offs
- [ ] Shall add a single-model version based on CV

# Import Utils

In [1]:
from woe_utils import WOENumericalComplex

# Import Standard Libs

In [2]:
import tensorflow as tf
import tensorflow_decision_forests as tfdf

from keras import metrics # accuracy
from keras import backend as K

import keras_tuner as kt

import pandas as pd
from pandas.api.types import is_numeric_dtype

import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt


from sklearn.base import BaseEstimator 
from sklearn.base import RegressorMixin
from sklearn.metrics import log_loss,accuracy_score
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import Normalizer
from sklearn.model_selection import StratifiedKFold, KFold

import xgboost
from xgboost import XGBClassifier, XGBRFClassifier, DMatrix

import warnings
from tqdm.notebook import tqdm

import joblib
import os
import shutil
import itertools

pd.set_option('display.max_rows', 500)
warnings.simplefilter(action='ignore', category=pd.errors.PerformanceWarning)
warnings.simplefilter(action='ignore', category=UserWarning)
warnings.simplefilter(action='ignore', category=FutureWarning)

caused by: ['/opt/conda/lib/python3.10/site-packages/tensorflow_io/python/ops/libtensorflow_io_plugins.so: undefined symbol: _ZN3tsl6StatusC1EN10tensorflow5error4CodeESt17basic_string_viewIcSt11char_traitsIcEENS_14SourceLocationE']
caused by: ['/opt/conda/lib/python3.10/site-packages/tensorflow_io/python/ops/libtensorflow_io.so: undefined symbol: _ZTVN10tensorflow13GcsFileSystemE']


# Load the Data

In [3]:
# Load the pre-processed artifacts from the Kaggle input dataset.
# NOTE(review): `features` is later passed to `fit(features=...)`, so it is presumably
# the list of feature column names - confirm against the feature-engineering notebook.
# NOTE: `read_pickle` can execute arbitrary code; only load files from a trusted source.
features = pd.read_pickle('/kaggle/input/invitro-train-feature-engineer/features.pickle')
test = pd.read_pickle('/kaggle/input/invitro-train-feature-engineer/test_processed.pickle')
train = pd.read_pickle('/kaggle/input/invitro-train-feature-engineer/train_processed.pickle')

# Train Model

We train an XGBoost classifier (`XGBClassifier`) with the hyperparameters shown below.
One model is trained per cross-validation fold; after training, each fold's model is stored together with its metrics. The metric reported for the train and validation part of every fold is the balanced log loss.

```python
xgboost.XGBClassifier(n_estimators=100,max_depth=3,learning_rate=0.2,subsample=0.9,colsample_bytree=0.85) 
```

In [4]:
class BalancedLogLoss(tf.keras.metrics.Metric):
    """Streaming balanced log loss for binary classification.

    The balanced log loss is the mean of the two per-class log losses::

        0.5 * ( -sum(y * log(p)) / n_1  +  -sum((1 - y) * log(1 - p)) / n_0 )

    The metric accumulates the per-class loss sums and the per-class sample
    counts across batches, so `result()` is the balanced log loss over all
    samples seen since the last reset (not a sum of per-batch values).
    If a class has not been seen yet, its term contributes 0 instead of NaN.
    """

    def __init__(self, name='balanced_log_loss', **kwargs):
        super(BalancedLogLoss, self).__init__(name=name, **kwargs)
        # Running numerators (negative log-likelihood per class) ...
        self.loss_sum_1 = self.add_weight(name='loss_sum_1', initializer='zeros')
        self.loss_sum_0 = self.add_weight(name='loss_sum_0', initializer='zeros')
        # ... and running denominators (number of samples per class).
        self.count_1 = self.add_weight(name='count_1', initializer='zeros')
        self.count_0 = self.add_weight(name='count_0', initializer='zeros')

    def update_state(self, y_true: tf.Tensor, y_pred: tf.Tensor, sample_weight=None):
        """Accumulate the statistics of one batch.

        Args:
            y_true: binary labels (0/1); flattened internally.
            y_pred: predicted probabilities of class 1; flattened internally.
            sample_weight: accepted for API compatibility, not used.
        """
        # Compute in float64: in float32 the upper bound 0.999999999999999
        # rounds to 1.0, which would make log(1 - p) = -inf.
        y_true = tf.reshape(tf.cast(y_true, tf.float64), [-1])
        y_pred = tf.reshape(tf.cast(y_pred, tf.float64), [-1])

        # Keep probabilities strictly inside (0, 1) so both logs are finite.
        min_val = 1e-15
        max_val = 0.999999999999999
        y_pred = tf.clip_by_value(y_pred, min_val, max_val)

        y_1 = y_true
        y_0 = 1.0 - y_true

        batch_loss_1 = -tf.reduce_sum(y_1 * tf.math.log(y_pred))
        batch_loss_0 = -tf.reduce_sum(y_0 * tf.math.log(1.0 - y_pred))

        state_dtype = self.loss_sum_1.dtype
        self.loss_sum_1.assign_add(tf.cast(batch_loss_1, state_dtype))
        self.loss_sum_0.assign_add(tf.cast(batch_loss_0, state_dtype))
        self.count_1.assign_add(tf.cast(tf.reduce_sum(y_1), state_dtype))
        self.count_0.assign_add(tf.cast(tf.reduce_sum(y_0), state_dtype))

    def result(self):
        """Return the balanced log loss over everything accumulated so far."""
        # divide_no_nan yields 0 for a class that has no samples yet.
        logloss_1 = tf.math.divide_no_nan(self.loss_sum_1, self.count_1)
        logloss_0 = tf.math.divide_no_nan(self.loss_sum_0, self.count_0)
        return (logloss_1 + logloss_0) / 2

    def reset_state(self):
        # The state of the metric will be reset at the start of each epoch.
        for state_var in (self.loss_sum_1, self.loss_sum_0, self.count_1, self.count_0):
            state_var.assign(0.)
        
def balanced_logloss_np(y_true: np.array, y_pred: np.array) -> float:
    """Balanced log loss of binary predictions, computed with NumPy.

    The log loss is evaluated separately on the positive (label 1) and the
    negative (label 0) samples, each normalised by the size of its class,
    and the two values are averaged.

    Args:
        y_true: binary labels (0/1).
        y_pred: predicted probabilities of class 1.

    Returns:
        The mean of the two per-class log losses.
    """
    # The bounds are deliberately one-element (float64) arrays rather than
    # Python scalars: combining them with a float32 `y_pred` promotes the
    # result to float64, where the upper bound is still strictly below 1.
    upper_bound = [0.999999999999999]
    lower_bound = [1e-15]
    proba = np.maximum(np.minimum(y_pred, upper_bound), lower_bound)

    # Column vectors of log-probabilities for each class.
    log_pos = np.reshape(np.log(proba), [-1, 1])
    log_neg = np.reshape(np.log(1 - proba), [-1, 1])

    # Row vectors acting as class-membership indicators.
    is_pos = np.reshape(y_true, [1, -1])
    is_neg = (is_pos - 1) * (-1)

    per_class_loss = [
        -np.dot(indicator, log_proba)[0][0] / np.sum(indicator)
        for indicator, log_proba in ((is_pos, log_pos), (is_neg, log_neg))
    ]

    return (per_class_loss[0] + per_class_loss[1]) / 2

def plot_train_logs(model) -> None:
    """Plot the training logs of `model` as two side-by-side line charts.

    The logs are read via `model.make_inspector().training_logs()`. The left
    panel shows the logged accuracy and the right panel the logged loss, both
    against the number of trees.
    """
    training_logs = model.make_inspector().training_logs()
    tree_counts = [entry.num_trees for entry in training_logs]

    # (y-axis label, series) for the left and the right panel.
    panels = (
        ("Accuracy (out-of-bag)", [entry.evaluation.accuracy for entry in training_logs]),
        ("Logloss (out-of-bag)", [entry.evaluation.loss for entry in training_logs]),
    )

    plt.figure(figsize=(12, 4))

    for position, (y_label, series) in enumerate(panels, start=1):
        plt.subplot(1, 2, position)
        plt.plot(tree_counts, series)
        plt.xlabel("Number of trees")
        plt.ylabel(y_label)

    plt.show()

class XGB_CV_Ensemble(RegressorMixin,BaseEstimator):
    """Cross-validated ensemble: one model per fold, predictions aggregated.

    `fit` trains one `model_obj(**model_kwargs)` per fold of the splitter and
    keeps every fitted model. `predict_proba` scores new data with all fold
    models and combines the per-model probabilities with `predict_func`.

    Args:
        model_obj: estimator class (not instance) exposing `fit`,
            `predict_proba`, `save_model` and `load_model`.
        label: name of the binary target column inside the training frame.
        predict_func: aggregation called as `predict_func(frame, axis=1)` on
            the (n_samples, n_models) probability frame.
    """

    def __init__(self, model_obj = XGBClassifier, label = "Class", predict_func = np.mean):
        self.label: str = label
        self.model_obj = model_obj
        self.predict_func = predict_func

        # State filled in by fit() / load().
        self.X_summary: pd.DataFrame = pd.DataFrame()      # per-fold-model predictions for every training row
        self.valid_summary: pd.DataFrame = pd.DataFrame()  # out-of-fold prediction for every training row
        self.features: list = list()                       # feature column names used by the models
        self.models: dict = dict()                         # fold number -> fitted model
        self.metrics: dict = dict()                        # 'train' / 'val' -> {fold number: balanced log loss}

        # Probabilities below `cut_off_lower` are snapped to 0 and above
        # `cut_off_upper` to 1 (predict_proba); `cut_off` is the decision
        # threshold used by predict().
        self.cut_off_lower: float = 0.5
        self.cut_off_upper: float = 0.5
        self.cut_off: float = 0.5

    def _compute_weights(self, df: pd.DataFrame) -> dict:
        """Return balanced class weights `{0: w0, 1: w1}` for the label column.

        Each weight is `total / (2 * class_count)`. Raises if the label does
        not contain exactly the two classes 0 and 1 (tuple unpacking of
        `np.bincount`).
        """
        # Calculate the number of samples for each label.
        neg, pos = np.bincount(df[self.label])
        total = neg + pos
        weight_for_0 = (1 / neg) * (total / 2.0)
        weight_for_1 = (1 / pos) * (total / 2.0)

        return {0: weight_for_0, 1: weight_for_1}

    def fit(self, X: pd.DataFrame, features: list, splitter = None,
            model_kwargs = None):
        """Train one model per cross-validation fold.

        Args:
            X: training frame containing the feature columns and `self.label`.
            features: names of the feature columns.
            splitter: scikit-learn style splitter; defaults to a fresh
                `StratifiedKFold()` when omitted.
            model_kwargs: keyword arguments for `model_obj`; defaults to none.

        Returns:
            self, with `models`, `metrics`, `X_summary` and `valid_summary`
            populated.
        """
        # Build the defaults per call instead of sharing objects created at
        # function-definition time.
        if splitter is None:
            splitter = StratifiedKFold()
        if model_kwargs is None:
            model_kwargs = dict()

        n_splits = splitter.get_n_splits()

        # Prediction of every fold model for every training row (in- and out-of-sample).
        self.X_summary = pd.DataFrame(data=np.full((len(X.index),n_splits), np.nan), index=X.index)
        # Out-of-sample prediction of every training row (from the fold that held it out).
        self.valid_summary = pd.DataFrame(data=np.full((len(X.index),1), np.nan), index=X.index)
        # Force a list so that `self.features + [self.label]` concatenates
        # instead of broadcasting when an Index / array is passed.
        self.features: list = list(features)

        self.models = {}
        self.metrics = {}
        balanced_logloss_train = {}
        balanced_logloss_val = {}

        # NOTE(review): the class weights are computed but not passed to the
        # model; the call is kept because it rejects non-binary labels early.
        class_weight: dict = self._compute_weights(X)

        columns = self.features + [self.label]

        # Stratify on the configured label column (was hard-coded to 'Class').
        for i, (train_index, valid_index) in enumerate(splitter.split(X=X, y=X[self.label])):
            print('##### Fold',i+1)

            # Rows of this fold, restricted to features + label.
            train_df = X.iloc[train_index][columns]
            valid_df = X.iloc[valid_index][columns]
            train_ids = train_df.index.values
            valid_ids = valid_df.index.values

            # Define & train the model, then keep it.
            model = self.model_obj(**model_kwargs)
            model.fit(X=train_df[self.features], y=train_df[self.label])
            self.models[i] = model

            # Probability of class 1 for both parts of the fold.
            y_pred_train = model.predict_proba(X=train_df[self.features])[:,1]
            y_pred_valid = model.predict_proba(X=valid_df[self.features])[:,1]
            y_true_train = train_df[self.label].values
            y_true_valid = valid_df[self.label].values

            self.X_summary.loc[train_ids, i] = y_pred_train
            self.X_summary.loc[valid_ids, i] = y_pred_valid
            self.valid_summary.loc[valid_ids, 0] = y_pred_valid

            # Evaluate and store the metrics in the respective dicts.
            train_metric = balanced_logloss_np(y_pred=y_pred_train, y_true=y_true_train)
            val_metric = balanced_logloss_np(y_pred=y_pred_valid, y_true=y_true_valid)

            balanced_logloss_train[i] = train_metric
            balanced_logloss_val[i] = val_metric

            print(f"\nTrain: {train_metric:.4f} Validation: {val_metric:.4f}")

        self.metrics['train'] = balanced_logloss_train
        self.metrics['val'] = balanced_logloss_val

        print(f"\nTrain mean: {pd.Series(self.metrics['train']).mean():.4f} std: {pd.Series(self.metrics['train']).std():.4f}")
        print(f"\nValidation mean: {pd.Series(self.metrics['val']).mean():.4f} std: {pd.Series(self.metrics['val']).std():.4f}")

        return self

    def set_cut_offs(self, lower: float, upper: float):
        """Set the probability cut-offs applied by `predict_proba`."""
        self.cut_off_lower: float = lower
        self.cut_off_upper: float = upper

    def predict_proba(self, X: pd.DataFrame, use_cut_offs: bool = True) -> pd.Series:
        """Aggregate the class-1 probability of all fold models.

        Args:
            X: frame containing at least the feature columns.
            use_cut_offs: when True, probabilities below `cut_off_lower`
                become 0 and probabilities above `cut_off_upper` become 1.

        Returns:
            A Series named 'y_hat' indexed like `X` when `use_cut_offs` is
            False; a NumPy array (result of `np.where`) when it is True.
        """
        n_splits = len(self.models)
        # One column of predictions per fold model.
        y_probas = pd.DataFrame(data=np.full((len(X.index),n_splits), np.nan),index=X.index)

        for i, model in enumerate(self.models.values()):
            y_probas[i] = model.predict_proba(X=X[self.features])[:,1]

        # Wrap the aggregate explicitly: reducers such as np.median return a
        # plain ndarray, which has no `.name` to set.
        y_proba = pd.Series(np.asarray(self.predict_func(y_probas, axis=1)),
                            index=X.index, name='y_hat')

        # If cut-offs accepted
        if use_cut_offs:
            # lower
            y_proba = np.where(y_proba < self.cut_off_lower, 0, y_proba)
            # upper
            y_proba = np.where(y_proba > self.cut_off_upper, 1, y_proba)

        return y_proba

    def predict(self, X: pd.DataFrame) -> pd.Series:
        """Return hard 0/1 predictions (NumPy array) using `self.cut_off`."""
        # Raw aggregated probabilities, without the lower/upper snapping.
        y_pred = self.predict_proba(X, use_cut_offs=False)

        # Round by threshold.
        y_pred = np.where(y_pred < self.cut_off,0,1)

        return y_pred

    def save(self, save_path: str) -> None:
        """Write the ensemble to `save_path`, replacing any previous content.

        Fold models are stored as `models/<fold>.json` via the model's own
        `save_model`; the remaining state (including the cut-offs) is stored
        with joblib. `predict_func` is not persisted.
        """
        # Start from a clean directory; a missing directory is fine.
        try:
            shutil.rmtree(save_path)
        except FileNotFoundError:
            pass

        os.makedirs(f'{save_path}/models', exist_ok=True)

        for fold, model in self.models.items():
            # An explicit extension selects a well-defined, portable format.
            model.save_model(f'{save_path}/models/{fold}.json')

        joblib.dump(value=self.label, filename=f'{save_path}/label.pickle')
        joblib.dump(value=self.model_obj, filename=f'{save_path}/model_obj.pickle')

        joblib.dump(value=self.X_summary, filename=f'{save_path}/X_summary.pickle')
        joblib.dump(value=self.valid_summary, filename=f'{save_path}/valid_summary.pickle')
        joblib.dump(value=self.features, filename=f'{save_path}/features.pickle')
        joblib.dump(value=self.metrics, filename=f'{save_path}/metrics.pickle')

        # Persist the cut-offs too, so a reloaded ensemble predicts the same.
        cut_offs = {'cut_off_lower': self.cut_off_lower,
                    'cut_off_upper': self.cut_off_upper,
                    'cut_off': self.cut_off}
        joblib.dump(value=cut_offs, filename=f'{save_path}/cut_offs.pickle')

        return None

    def load(self, save_path: str):
        """Restore an ensemble written by `save` and return self.

        Each file in `models/` is loaded into a fresh `model_obj()` through
        its `load_model` method; the fold number is the file name up to the
        first dot. Directories written without `cut_offs.pickle` keep the
        current cut-off values.
        """
        self.label = joblib.load(filename=f'{save_path}/label.pickle')
        self.model_obj = joblib.load(filename=f'{save_path}/model_obj.pickle')

        self.X_summary = joblib.load(filename=f'{save_path}/X_summary.pickle')
        self.valid_summary = joblib.load(filename=f'{save_path}/valid_summary.pickle')
        self.features = joblib.load(filename=f'{save_path}/features.pickle')
        self.metrics = joblib.load(filename=f'{save_path}/metrics.pickle')

        cut_offs_path = f'{save_path}/cut_offs.pickle'
        if os.path.exists(cut_offs_path):
            cut_offs = joblib.load(filename=cut_offs_path)
            self.cut_off_lower = cut_offs['cut_off_lower']
            self.cut_off_upper = cut_offs['cut_off_upper']
            self.cut_off = cut_offs['cut_off']

        self.models = dict()

        models_dir = f'{save_path}/models'
        # Load in fold order so the model columns line up deterministically.
        for name in sorted(os.listdir(models_dir), key=lambda file_name: int(file_name.split('.')[0])):
            model = self.model_obj()
            # Use the full file name: the extension is part of the path.
            model.load_model(os.path.join(models_dir, name))
            self.models[int(name.split('.')[0])] = model

        return self

# Train

In [5]:
# XGBClassifier(n_estimators=100,max_depth=3,learning_rate=0.2,subsample=0.9,colsample_bytree=0.85, verbosity=1)

# Cross-validation splitter: 6 stratified, shuffled folds with a fixed seed for reproducibility
my_splitter = StratifiedKFold(n_splits=6,shuffle=True, random_state=1902)

# initialise the ensemble: one XGBClassifier per fold, target column "Class"
CV_Ensemble_1 = XGB_CV_Ensemble(model_obj=XGBClassifier, label="Class")

# train one model per fold; model_kwargs are forwarded to every XGBClassifier
CV_Ensemble_1 = CV_Ensemble_1.fit(X=train, features=features, 
                                  splitter=my_splitter,
                                  model_kwargs=dict(
                                      n_estimators=100,max_depth=3,learning_rate=0.2,subsample=0.9,
                                      colsample_bytree=0.85, verbosity=1)
                                       )

# save the fitted ensemble (any existing directory at this path is removed first)
CV_Ensemble_1.save(save_path='/kaggle/working/XGB/1')

# train_summary_rf_3, valid_summary_rf_3, test_summary_rf_3, model_rf_3,metrics_rf_3 

# train_summary_rf_1 = CV_Ensemble_1.X_summary
# valid_summary_rf_1 = CV_Ensemble_1.valid_summary
# test_summary_rf_1 = CV_Ensemble_1.predict_proba(X=test_out, use_cut_offs=False)
# model_rf_1 = CV_Ensemble_1.models
# metrics_rf_1 = CV_Ensemble_1.metrics

##### Fold 1

Train: 0.0169 Validation: 0.3197
##### Fold 2

Train: 0.0165 Validation: 0.3352
##### Fold 3

Train: 0.0177 Validation: 0.3826
##### Fold 4

Train: 0.0175 Validation: 0.3242
##### Fold 5

Train: 0.0157 Validation: 0.4855
##### Fold 6

Train: 0.0163 Validation: 0.5578

Train mean: 0.0168 std: 0.0008

Validation mean: 0.4008 std: 0.0989


# Cut-Offs

In [6]:
def compute_cut_off(y_pred: pd.Series, y_true: pd.Series, q: int = 100) -> pd.DataFrame:
    """Build a cumulative positive-rate table over quantile bins of the score.

    The predictions are cut into `q` quantile bins (lowest scores first). For
    every bin the table reports the number of rows and of positives, their
    running totals and the cumulative positive rate up to and including that
    bin - useful for choosing probability cut-offs.

    Args:
        y_pred: predicted scores; aligned with `y_true` on the index.
        y_true: binary labels (0/1).
        q: number of quantile bins (default 100, i.e. percentiles).

    Returns:
        DataFrame indexed by 'Pred_Bins' with the columns
        `count`, `sum`, `count_cumsum`, `sum_cumsum`, `bads_rate` and
        `perc_sum` (1-based rank of the bin).

    Raises:
        ValueError: from `pd.qcut` when the scores do not yield unique bin edges.
    """
    # rename() returns renamed copies, so the caller's Series keep their names.
    scored = pd.concat([y_pred.rename('Pred'), y_true.rename('Class')], axis=1).sort_index(ascending=True)

    scored['Pred_Bins'] = pd.qcut(x=scored['Pred'], q=q)

    # A list (not a set) fixes the column order; observed=False keeps every
    # bin category, independent of the pandas default.
    table = scored.groupby('Pred_Bins', observed=False)['Class'].agg(['count', 'sum'])
    table['count_cumsum'] = table['count'].cumsum()
    table['sum_cumsum'] = table['sum'].cumsum()

    table['bads_rate'] = table['sum_cumsum'] / table['count_cumsum']

    table['perc_sum'] = list(range(1, len(table) + 1))

    return table

In [7]:
# Raw ensemble probabilities on the training frame (no cut-off snapping) and the true labels.
# Note: these are predictions on data the fold models were (mostly) trained on.
y_pred = CV_Ensemble_1.predict_proba(train, use_cut_offs=False)
y_true = train['Class']

# Per-bin table of counts and cumulative positive rate, used to pick the cut-offs.
compute_cut_off(y_pred=y_pred, y_true=y_true)

Unnamed: 0_level_0,count,sum,count_cumsum,sum_cumsum,bads_rate,perc_sum
Pred_Bins,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
"(-0.000903, 0.000138]",7,0,7,0,0.0,1
"(0.000138, 0.000192]",6,0,13,0,0.0,2
"(0.000192, 0.000231]",6,0,19,0,0.0,3
"(0.000231, 0.000276]",6,0,25,0,0.0,4
"(0.000276, 0.000294]",6,0,31,0,0.0,5
"(0.000294, 0.000322]",6,0,37,0,0.0,6
"(0.000322, 0.000349]",7,0,44,0,0.0,7
"(0.000349, 0.00039]",6,0,50,0,0.0,8
"(0.00039, 0.000418]",6,0,56,0,0.0,9
"(0.000418, 0.000453]",6,0,62,0,0.0,10


In [8]:
# Snap probabilities below 0.01 to 0 and above 0.9 to 1.
CV_Ensemble_1.set_cut_offs(lower=0.01,upper=0.9) # 95 perc 0.892 and 52% 0.11

print(CV_Ensemble_1.cut_off_lower)
print(CV_Ensemble_1.cut_off_upper)
 
# Same training-frame predictions as before, now with the cut-offs applied.
y_pred_rf_train = CV_Ensemble_1.predict_proba(train, use_cut_offs=True)

# Bare expression in the middle of a cell: it is not displayed (only a cell's last expression is).
y_pred_rf_train

# Compare the balanced log loss on the training frame without and with cut-offs.
print('Before')
print(balanced_logloss_np(y_pred=y_pred.values,y_true=y_true.values))
print('After')
print(balanced_logloss_np(y_pred=y_pred_rf_train,y_true=y_true.values))

0.01
0.9
Before
0.04835704356732542
After
0.035423191502991724


In [9]:
# Re-save the ensemble to the same directory after the cut-offs were set.
CV_Ensemble_1.save(save_path='/kaggle/working/XGB/1')

In [10]:
# Pickle the whole ensemble object (all of its attributes) into a single file
# in the current working directory; `_` swallows the returned list of file names.

_ = joblib.dump(value=CV_Ensemble_1, filename='Full_Model.pickle')