# Read Before

- https://www.kaggle.com/code/raddar/icr-competition-analysis-and-findings/notebook
- https://www.tensorflow.org/guide/core/logistic_regression_core

Plan:
- [x] Feature Engineering (1 day)
- [x] CV and Model Selection (1 day)
- [x] Validation (1 day)
- [x] Review
- [ ] Found that some variables are constant as they represent some features for categroical column which takes only two values, so makes sense to drop them in order not overtrain. I will make split -> make two versions of model which is run if Group A (on all data), which is Group B ( which finetuned after )


## Install Private utils

In [1]:
!pip install auto-ml --no-index --find-links=file:///kaggle/input/private-utils

Looking in links: file:///kaggle/input/private-utils
Processing /kaggle/input/private-utils/auto_ml-0.1.0-py3-none-any.whl
Installing collected packages: auto-ml
Successfully installed auto-ml-0.1.0
[0m

In [2]:
from auto_ml.encoding_and_transforms import WOENumericalComplex

# Import the libraries

In [3]:
import tensorflow as tf
import tensorflow_decision_forests as tfdf

import keras_tuner as kt

import pandas as pd
from pandas.api.types import is_numeric_dtype

import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.metrics import log_loss
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import Normalizer
from sklearn.model_selection import StratifiedKFold, KFold

import warnings
from tqdm.notebook import tqdm

pd.set_option('display.max_rows', 500)

caused by: ['/opt/conda/lib/python3.10/site-packages/tensorflow_io/python/ops/libtensorflow_io_plugins.so: undefined symbol: _ZN3tsl6StatusC1EN10tensorflow5error4CodeESt17basic_string_viewIcSt11char_traitsIcEENS_14SourceLocationE']
caused by: ['/opt/conda/lib/python3.10/site-packages/tensorflow_io/python/ops/libtensorflow_io.so: undefined symbol: _ZTVN10tensorflow13GcsFileSystemE']


In [4]:
warnings.simplefilter(action='ignore', category=pd.errors.PerformanceWarning)

# Load the Dataset

In [5]:
dataset_df = pd.read_csv('/kaggle/input/icr-identify-age-related-conditions/train.csv', index_col='Id')
dataset_df.columns = dataset_df.columns.str.rstrip()
print("Full train dataset shape is {}".format(dataset_df.shape))

dataset_test_df = pd.read_csv('/kaggle/input/icr-identify-age-related-conditions/test.csv', index_col='Id')
dataset_test_df.columns = dataset_test_df.columns.str.rstrip()
print("Full test dataset shape is {}".format(dataset_test_df.shape))

Full train dataset shape is (617, 57)
Full test dataset shape is (5, 56)


The data is composed of 58 columns and 617 entries. We can see all 58 dimensions(results will be truncated since the number of columns is big) of our dataset by printing out the first 5 entries using the following code:

# Feature Engineering

## Filter Monotonic Features

In [6]:
def compute_basic_stats(columns, df):
    out = {}
    
    for i in tqdm(columns):
        mask = df[i].notna()
        
        out[i] = {'nunique':df[i].nunique(),
                  'na_share':round(100*df[i].isna().sum()/df[i].count(),1),
                  'dtype':df[i].dtype
                 }
        if is_numeric_dtype(df[i]):
            out[i]['correlation'] = round(np.corrcoef(x=df.loc[mask,i],y=df.loc[mask,'Class'])[0,1],2)
            out[i]['min'] = df.loc[mask,i].min()
            out[i]['max'] = df.loc[mask,i].max()
            out[i]['std'] = df.loc[mask,i].std()
            out[i]['mean'] = df.loc[mask,i].mean()
            i_lorreg = LogisticRegression()
            X = df.loc[mask,i].values.reshape(-1,1)
            y = df.loc[mask,'Class'].values
            i_lorreg.fit(X=X, y=y)
            y_pred = i_lorreg.predict(X)
            out[i]['logloss'] = log_loss(y_true=y, y_pred=y_pred)
            
            
    out = pd.DataFrame(out).T
    
    out = out.sort_values('logloss',ascending=True)
    
    return out

basic_stats_1 = compute_basic_stats(
    columns=[i for i in dataset_df.columns if i not in ["Id","Class"]],         
    df=dataset_df)

  0%|          | 0/56 [00:00<?, ?it/s]

* Only one variable looks constrant over the target -> better to omit it.
* Realised Better to add LogLoss metric for each feature -> loggloss


## Normalise Features

In [7]:
def preprocess(train: pd.DataFrame, test: pd.DataFrame, stats: pd.DataFrame) -> (pd.DataFrame, pd.DataFrame, dict):
    info = dict()
    numeric_features = stats[stats.logloss.notna()].index.tolist() # Cause for EJ logloss is null
    train_out = train.copy()
    test_out = test.copy()
    
    out_features = list()
    
    # Make WoE Columns
    for i in tqdm(numeric_features, 'WoE Encoding: '):
        tmp_woe = WOENumericalComplex()
        tmp_woe.fit(x=train_out[i], y=train_out['Class'])
        train_out[i + '_WoE'] = tmp_woe.transform(X=train_out[i])
        test_out[i + '_WoE'] = tmp_woe.transform(X=test_out[i])
        out_features.append(i + '_WoE')
    
    
    # Make NA columns
    for i in tqdm(['DU', 'FC', 'FS', 'CC', 'FL', 'GL', 'CB', 'EL', 'BQ'], 'Split by NA: '):
        train_out[i+'_na'] = np.where(train_out[i].isna(),1,0)
        test_out[i+'_na'] = np.where(test_out[i].isna(),1,0)
        out_features.append(i + '_na')
    
    # Basic Logic -> normalise
    for i in tqdm(numeric_features,'Normalise Numeric: '):
        if stats.loc[i,'correlation'] > 0:
            na_value = stats.loc[i,'max']
        else:
            na_value = stats.loc[i,'min']

        train_out[i] = train_out[i].fillna(na_value)
        test_out[i] = test_out[i].fillna(na_value)

        train_out[i] = (train_out[i]-stats.loc[i,'mean'])/stats.loc[i,'std']
        test_out[i] = (test_out[i]-stats.loc[i,'mean'])/stats.loc[i,'std']

        out_features.append(i)
            
            
    # Addition EJ -> has only two values, so if EJ == 'A'
    train_out['EJ' + '_A'] = np.where(train_out['EJ'] == 'A',1,0)
    test_out['EJ' + '_A'] = np.where(test_out['EJ'] == 'A',1,0)
    out_features.append('EJ' + '_A')
    
    return train_out,test_out,out_features


train_out,test_out,features = preprocess(train=dataset_df, test=dataset_test_df, stats=basic_stats_1)

WoE Encoding:   0%|          | 0/55 [00:00<?, ?it/s]

Split by NA:   0%|          | 0/9 [00:00<?, ?it/s]

Normalise Numeric:   0%|          | 0/55 [00:00<?, ?it/s]

In [8]:
basic_stats_2 = compute_basic_stats(
    columns=[i for i in train_out.columns if i not in ["Id","Class"]],         
    df=train_out)

basic_stats_2

  0%|          | 0/121 [00:00<?, ?it/s]

Unnamed: 0,nunique,na_share,dtype,correlation,min,max,std,mean,logloss
DU_WoE,4,0.0,float64,-0.52,-2.678782,2.977892,1.042059,0.286027,4.790242
FL_WoE,4,0.0,float64,-0.42,-2.045013,2.977892,0.86161,0.214552,5.374418
DI_WoE,3,0.0,float64,-0.37,-2.361247,0.382521,0.698812,0.124925,5.432836
DA_WoE,3,0.0,float64,-0.37,-2.093932,0.463416,0.719046,0.14452,5.549671
GL_WoE,4,0.0,float64,-0.4,-1.805209,2.977892,0.865805,0.230269,5.783341
FD_WoE,3,0.0,float64,-0.33,-1.996604,0.379845,0.635814,0.115675,5.783341
EH_WoE,3,0.0,float64,-0.32,-2.061142,0.343225,0.622624,0.111704,5.841759
EH,127,0.0,float64,0.18,-0.1635,22.876681,1.0,-0.0,5.900177
AM,605,0.0,float64,0.24,-0.513293,8.483647,1.0,0.0,5.900177
BC_WoE,3,0.0,float64,-0.3,-1.9811,0.486565,0.630381,0.116679,5.900177


# Select a Model

There are several tree-based models for you to choose from.

* RandomForestModel
* GradientBoostedTreesModel
* CartModel
* DistributedGradientBoostedTreesModel

To start, we'll work with a Random Forest. This is the most well-known of the Decision Forest training algorithms.

A Random Forest is a collection of decision trees, each trained independently on a random subset of the training dataset (sampled with replacement). The algorithm is unique in that it is robust to overfitting, and easy to use.

We can list the all the available models in TensorFlow Decision Forests using the following code:

In [9]:
tfdf.keras.get_all_models()

[tensorflow_decision_forests.keras.RandomForestModel,
 tensorflow_decision_forests.keras.GradientBoostedTreesModel,
 tensorflow_decision_forests.keras.CartModel,
 tensorflow_decision_forests.keras.DistributedGradientBoostedTreesModel]

# Train Model

Today, we will use the defaults to create the Random Forest Model. By default the model is set to train for a classification task.
We will train a model for each fold and after training we will store the model and metrics. Here, we have chosen `accuracy` and `binary_crossentropy` as the metrics.

In [10]:
def train_model(train: pd.DataFrame, test: pd.DataFrame, features: list, label = "Class",
                n_splits: int = 6,
                model_obj = tfdf.keras.RandomForestModel,
                model_kwargs = dict(),
                model_compile_kwargs = dict()) -> (pd.DataFrame, pd.DataFrame, pd.DataFrame, dict):

    # Create a dataframe of required size with zero values.
    oof = pd.DataFrame(data=np.zeros((len(train.index),1)), index=train.index)
    submition = pd.DataFrame(data=np.zeros((len(test.index),n_splits)),index=test.index)

    # Create an empty dictionary to store the models trained for each fold.
    models = {}

    # Create empty dict to save metircs for the models trained for each fold.
    accuracy = {}
    cross_entropy = {}
    
    # Calculate the number of samples for each label.
    neg, pos = np.bincount(train[label])
    total = neg + pos
    weight_for_0 = (1 / neg) * (total / 2.0)
    weight_for_1 = (1 / pos) * (total / 2.0)
    class_weight = {0: weight_for_0, 1: weight_for_1}

    print('Weight for class 0: {:.2f}'.format(weight_for_0))
    print('Weight for class 1: {:.2f}'.format(weight_for_1))
    
    # Names of columns
    submition_ds = tfdf.keras.pd_dataframe_to_tf_dataset(test)
    # Loop through each fold
    skf = StratifiedKFold(n_splits=n_splits)
    
    for i, (train_index, valid_index) in enumerate(skf.split(X=train,y=train['Class'])):
            print('##### Fold',i+1)

            # Fetch values corresponding to the index 
            train_df = train.iloc[train_index]
            valid_df = train.iloc[valid_index]
            valid_ids = valid_df.index.values

            # Select only feature columns for training.
            train_df = train_df[features+[label]]
            valid_df = valid_df[features+[label]]

            # We need to convert the datatset from Pandas format (pd.DataFrame)
            # into TensorFlow Datasets format (tf.data.Dataset).
            train_ds = tfdf.keras.pd_dataframe_to_tf_dataset(train_df, label=label)
            valid_ds = tfdf.keras.pd_dataframe_to_tf_dataset(valid_df, label=label)

            # Define & Train the model and metrics
            model = model_obj(**model_kwargs)
            model.compile(**model_compile_kwargs) 
            model.fit(x=train_ds, class_weight=class_weight)

            # Store the model
            models[f"fold_{i+1}"] = model

            # Predict value for validation/Submition data
            # Store the predictions in oof dataframe
            oof.loc[valid_ids, 0] = model.predict(x=valid_ds).flatten()
            submition[i] = model.predict(x=submition_ds).flatten() 

            # Evaluate and store the metrics in respective dicts
            evaluation = model.evaluate(x=valid_ds,return_dict=True)
            accuracy[f"fold_{i+1}"] = evaluation["accuracy"]
            cross_entropy[f"fold_{i+1}"] = evaluation["binary_crossentropy"]
            
    return submition,models,accuracy,cross_entropy


def print_average_accuracy(models,cross_entropy,accuracy):
    average_loss = 0
    average_acc = 0

    for _model in  models:
        average_loss += cross_entropy[_model]
        average_acc += accuracy[_model]
        print(f"{_model}: acc: {accuracy[_model]:.4f} loss: {cross_entropy[_model]:.4f}")

    print(f"\nAverage accuracy: {average_acc/len(models):.4f}  Average loss: {average_loss/len(models):.4f}")

In [11]:
# RandomForestModel
submition_1, model_1,accuracy_1,cross_entropy_1 = train_model(train=train_out,test=test_out, features=features,
                                                              n_splits=10, 
                                                              model_obj=tfdf.keras.RandomForestModel,
                                                              model_kwargs=dict(max_depth=6, num_trees=1000),
                                                              model_compile_kwargs=dict(metrics=["accuracy", "binary_crossentropy"]))

print('Type 1')
print_average_accuracy(models=model_1, cross_entropy=cross_entropy_1, accuracy=accuracy_1)

Weight for class 0: 0.61
Weight for class 1: 2.86
##### Fold 1
Use /tmp/tmpmt1kp804 as temporary training directory
Reading training dataset...
Training dataset read in 0:00:09.567122. Found 555 examples.
Training model...
Model trained in 0:00:00.800597
Compiling model...


[INFO 23-07-14 21:42:09.0797 UTC kernel.cc:1242] Loading model from path /tmp/tmpmt1kp804/model/ with prefix a73193a8646644a5
[INFO 23-07-14 21:42:09.2214 UTC decision_forest.cc:660] Model loaded with 1000 root(s), 34210 node(s), and 113 input feature(s).
[INFO 23-07-14 21:42:09.2216 UTC abstract_model.cc:1311] Engine "RandomForestOptPred" built
[INFO 23-07-14 21:42:09.2217 UTC kernel.cc:1074] Use fast generic engine


Please report this to the TensorFlow team. When filing the bug, set the verbosity to 10 (on Linux, `export AUTOGRAPH_VERBOSITY=10`) and attach the full output.
Cause: could not get source code
Model compiled.
##### Fold 2
Use /tmp/tmpwjzumvc7 as temporary training directory
Reading training dataset...
Training dataset read in 0:00:02.065911. Found 555 examples.
Training model...
Model trained in 0:00:00.753648
Compiling model...


[INFO 23-07-14 21:42:18.0118 UTC kernel.cc:1242] Loading model from path /tmp/tmpwjzumvc7/model/ with prefix a7069ee287b64fa8
[INFO 23-07-14 21:42:18.1513 UTC decision_forest.cc:660] Model loaded with 1000 root(s), 34512 node(s), and 113 input feature(s).
[INFO 23-07-14 21:42:18.1517 UTC kernel.cc:1074] Use fast generic engine


Model compiled.
##### Fold 3
Use /tmp/tmp36feihp6 as temporary training directory
Reading training dataset...
Training dataset read in 0:00:02.067853. Found 555 examples.
Training model...
Model trained in 0:00:00.774769
Compiling model...


[INFO 23-07-14 21:42:24.5504 UTC kernel.cc:1242] Loading model from path /tmp/tmp36feihp6/model/ with prefix 094b1736c4a34ffe
[INFO 23-07-14 21:42:24.6938 UTC decision_forest.cc:660] Model loaded with 1000 root(s), 34332 node(s), and 113 input feature(s).
[INFO 23-07-14 21:42:24.6940 UTC abstract_model.cc:1311] Engine "RandomForestOptPred" built
[INFO 23-07-14 21:42:24.6941 UTC kernel.cc:1074] Use fast generic engine


Model compiled.
##### Fold 4
Use /tmp/tmp3ox2qtli as temporary training directory
Reading training dataset...
Training dataset read in 0:00:02.034917. Found 555 examples.
Training model...
Model trained in 0:00:00.740642
Compiling model...


[INFO 23-07-14 21:42:30.6493 UTC kernel.cc:1242] Loading model from path /tmp/tmp3ox2qtli/model/ with prefix ef6f991f512840cc
[INFO 23-07-14 21:42:30.7850 UTC decision_forest.cc:660] Model loaded with 1000 root(s), 34210 node(s), and 113 input feature(s).
[INFO 23-07-14 21:42:30.7851 UTC kernel.cc:1074] Use fast generic engine


Model compiled.
##### Fold 5
Use /tmp/tmpkis0asko as temporary training directory
Reading training dataset...
Training dataset read in 0:00:02.099864. Found 555 examples.
Training model...
Model trained in 0:00:00.923149
Compiling model...


[INFO 23-07-14 21:42:37.7360 UTC kernel.cc:1242] Loading model from path /tmp/tmpkis0asko/model/ with prefix 3153935f9c6344a1
[INFO 23-07-14 21:42:37.8771 UTC decision_forest.cc:660] Model loaded with 1000 root(s), 34614 node(s), and 113 input feature(s).
[INFO 23-07-14 21:42:37.8772 UTC abstract_model.cc:1311] Engine "RandomForestOptPred" built
[INFO 23-07-14 21:42:37.8772 UTC kernel.cc:1074] Use fast generic engine


Model compiled.
##### Fold 6
Use /tmp/tmptwszxqg1 as temporary training directory
Reading training dataset...
Training dataset read in 0:00:02.047828. Found 555 examples.
Training model...
Model trained in 0:00:00.751859
Compiling model...


[INFO 23-07-14 21:42:43.8565 UTC kernel.cc:1242] Loading model from path /tmp/tmptwszxqg1/model/ with prefix 16c82d061fb1494b
[INFO 23-07-14 21:42:43.9959 UTC decision_forest.cc:660] Model loaded with 1000 root(s), 34414 node(s), and 113 input feature(s).
[INFO 23-07-14 21:42:43.9960 UTC kernel.cc:1074] Use fast generic engine


Model compiled.
##### Fold 7
Use /tmp/tmpgqgl_ves as temporary training directory
Reading training dataset...
Training dataset read in 0:00:02.093338. Found 555 examples.
Training model...
Model trained in 0:00:00.751193
Compiling model...


[INFO 23-07-14 21:42:50.5206 UTC kernel.cc:1242] Loading model from path /tmp/tmpgqgl_ves/model/ with prefix 6746e12ffce3456a
[INFO 23-07-14 21:42:50.6593 UTC decision_forest.cc:660] Model loaded with 1000 root(s), 34550 node(s), and 114 input feature(s).
[INFO 23-07-14 21:42:50.6593 UTC abstract_model.cc:1311] Engine "RandomForestOptPred" built
[INFO 23-07-14 21:42:50.6594 UTC kernel.cc:1074] Use fast generic engine


Model compiled.
##### Fold 8
Use /tmp/tmpxnt8shxp as temporary training directory
Reading training dataset...
Training dataset read in 0:00:02.111905. Found 556 examples.
Training model...
Model trained in 0:00:00.745257
Compiling model...


[INFO 23-07-14 21:42:56.7451 UTC kernel.cc:1242] Loading model from path /tmp/tmpxnt8shxp/model/ with prefix 0caabc5ab2484d14
[INFO 23-07-14 21:42:56.8819 UTC decision_forest.cc:660] Model loaded with 1000 root(s), 34432 node(s), and 113 input feature(s).
[INFO 23-07-14 21:42:56.8820 UTC kernel.cc:1074] Use fast generic engine


Model compiled.
##### Fold 9
Use /tmp/tmpyyrujk04 as temporary training directory
Reading training dataset...
Training dataset read in 0:00:02.108944. Found 556 examples.
Training model...
Model trained in 0:00:00.747010
Compiling model...


[INFO 23-07-14 21:43:02.9625 UTC kernel.cc:1242] Loading model from path /tmp/tmpyyrujk04/model/ with prefix 30b1ebb10cde4b8a
[INFO 23-07-14 21:43:03.1036 UTC decision_forest.cc:660] Model loaded with 1000 root(s), 34142 node(s), and 113 input feature(s).
[INFO 23-07-14 21:43:03.1037 UTC abstract_model.cc:1311] Engine "RandomForestOptPred" built
[INFO 23-07-14 21:43:03.1037 UTC kernel.cc:1074] Use fast generic engine


Model compiled.
##### Fold 10
Use /tmp/tmpubfd2fer as temporary training directory
Reading training dataset...
Training dataset read in 0:00:02.207174. Found 556 examples.
Training model...
Model trained in 0:00:00.936267
Compiling model...


[INFO 23-07-14 21:43:09.5036 UTC kernel.cc:1242] Loading model from path /tmp/tmpubfd2fer/model/ with prefix 766c87ed10b144ca
[INFO 23-07-14 21:43:09.6428 UTC decision_forest.cc:660] Model loaded with 1000 root(s), 34820 node(s), and 113 input feature(s).
[INFO 23-07-14 21:43:09.6431 UTC kernel.cc:1074] Use fast generic engine


Model compiled.
Type 1
fold_1: acc: 0.9839 loss: 0.2010
fold_2: acc: 0.9355 loss: 0.2160
fold_3: acc: 0.8548 loss: 0.2781
fold_4: acc: 0.8710 loss: 0.2768
fold_5: acc: 0.9355 loss: 0.2335
fold_6: acc: 0.9355 loss: 0.2744
fold_7: acc: 0.9194 loss: 0.2641
fold_8: acc: 0.9344 loss: 0.2546
fold_9: acc: 0.9344 loss: 0.2580
fold_10: acc: 0.9672 loss: 0.2148

Average accuracy: 0.9272  Average loss: 0.2471


In [12]:
def train_model(train: pd.DataFrame, test: pd.DataFrame, features: list, label = "Class",
                n_splits: int = 6,
                model_obj = tfdf.keras.RandomForestModel,
                model_kwargs = dict(),
                model_compile_kwargs = dict()) -> (pd.DataFrame, pd.DataFrame, pd.DataFrame, dict):

    # Create a dataframe of required size with zero values.
    oof = pd.DataFrame(data=np.zeros((len(train.index),1)), index=train.index)
    submition = pd.DataFrame(data=np.zeros((len(test.index),n_splits)),index=test.index)

    # Create an empty dictionary to store the models trained for each fold.
    models = {}

    # Create empty dict to save metircs for the models trained for each fold.
    accuracy = {}
    cross_entropy = {}
    
    # Calculate the number of samples for each label.
    neg, pos = np.bincount(train[label])
    total = neg + pos
    class_weight = {0: (1 / neg) * (total / 2.0), 1: (1 / pos) * (total / 2.0)}

    print('Weight for class 0: {:.2f}'.format(class_weight[0]))
    print('Weight for class 1: {:.2f}'.format(class_weight[1]))
    
    # Names of columns
    submition_ds = tfdf.keras.pd_dataframe_to_tf_dataset(test)
    # Loop through each fold
    skf = StratifiedKFold(n_splits=n_splits)
    
    for i, (train_index, valid_index) in enumerate(skf.split(X=train,y=train['Class'])):
            print('##### Fold',i+1)

            # Fetch values corresponding to the index 
            train_df = train.iloc[train_index]
            valid_df = train.iloc[valid_index]
            valid_ids = valid_df.index.values

            # Select only feature columns for training.
            train_df = train_df[features+[label]]
            valid_df = valid_df[features+[label]]

            # We need to convert the datatset from Pandas format (pd.DataFrame)
            # into TensorFlow Datasets format (tf.data.Dataset).
            train_ds = tfdf.keras.pd_dataframe_to_tf_dataset(train_df, label=label)
            valid_ds = tfdf.keras.pd_dataframe_to_tf_dataset(valid_df, label=label)

            # Define & Train the model and metrics
            model = model_obj(**model_kwargs)
            model.compile(**model_compile_kwargs) 
            model.fit(x=train_ds, class_weight=class_weight)

            # Store the model
            models[f"fold_{i+1}"] = model

            # Predict value for validation/Submition data
            # Store the predictions in oof dataframe
            oof.loc[valid_ids, 0] = model.predict(x=valid_ds).flatten()
#             submition.loc[:, i] = model.predict(x=submition_ds).flatten() 
            submition[i] = model.predict(x=submition_ds).flatten() 

            # Evaluate and store the metrics in respective dicts
            evaluation = model.evaluate(x=valid_ds,return_dict=True)
            accuracy[f"fold_{i+1}"] = evaluation["accuracy"]
            cross_entropy[f"fold_{i+1}"] = evaluation["binary_crossentropy"]
            
    return submition,models,accuracy,cross_entropy


def print_average_accuracy(models,cross_entropy,accuracy):
    average_loss = 0
    average_acc = 0

    for _model in  models:
        average_loss += cross_entropy[_model]
        average_acc += accuracy[_model]
        print(f"{_model}: acc: {accuracy[_model]:.4f} loss: {cross_entropy[_model]:.4f}")

    print(f"\nAverage accuracy: {average_acc/len(models):.4f}  Average loss: {average_loss/len(models):.4f}")

In [13]:
# GradientBoostedTreesModel
submition_2, model_2,accuracy_2,cross_entropy_2 = train_model(train=train_out,test=test_out, features=features,
                                                              n_splits=3, 
                                                              model_obj=tfdf.keras.GradientBoostedTreesModel,
                                                              model_kwargs=dict(max_depth=1,
                                                                                num_trees=3000,
                                                                                sampling_method='GOSS'),
                                                              model_compile_kwargs=dict(metrics=["accuracy", "binary_crossentropy"]))

print('Type 1')
print_average_accuracy(models=model_2, cross_entropy=cross_entropy_2, accuracy=accuracy_2)

Weight for class 0: 0.61
Weight for class 1: 2.86
##### Fold 1
Use /tmp/tmpctkvbudc as temporary training directory
Reading training dataset...




Training dataset read in 0:00:02.159345. Found 411 examples.
Training model...
Model trained in 0:00:00.044359
Compiling model...


[INFO 23-07-14 21:43:17.0752 UTC kernel.cc:1242] Loading model from path /tmp/tmpctkvbudc/model/ with prefix 7f88d4b6692148b5
[INFO 23-07-14 21:43:17.0762 UTC abstract_model.cc:1311] Engine "GradientBoostedTreesQuickScorerExtended" built
[INFO 23-07-14 21:43:17.0763 UTC kernel.cc:1074] Use fast generic engine


Model compiled.
##### Fold 2
Use /tmp/tmpssbjuzlo as temporary training directory
Reading training dataset...




Training dataset read in 0:00:02.107840. Found 411 examples.
Training model...
Model trained in 0:00:00.048196
Compiling model...


[INFO 23-07-14 21:43:24.1868 UTC kernel.cc:1242] Loading model from path /tmp/tmpssbjuzlo/model/ with prefix 40ac9d65cd66431a
[INFO 23-07-14 21:43:24.1877 UTC kernel.cc:1074] Use fast generic engine


Model compiled.
##### Fold 3
Use /tmp/tmpmqcim2sz as temporary training directory
Reading training dataset...




Training dataset read in 0:00:02.171954. Found 412 examples.
Training model...
Model trained in 0:00:00.043361
Compiling model...


[INFO 23-07-14 21:43:30.9505 UTC kernel.cc:1242] Loading model from path /tmp/tmpmqcim2sz/model/ with prefix 673b651efa2c4d76
[INFO 23-07-14 21:43:30.9514 UTC abstract_model.cc:1311] Engine "GradientBoostedTreesQuickScorerExtended" built
[INFO 23-07-14 21:43:30.9515 UTC kernel.cc:1074] Use fast generic engine


Model compiled.
Type 1
fold_1: acc: 0.8252 loss: 0.6741
fold_2: acc: 0.8252 loss: 0.6699
fold_3: acc: 0.1756 loss: 0.7170

Average accuracy: 0.6087  Average loss: 0.6870


In [14]:
# GradientBoostedTreesModel
submition_3, model_3,accuracy_3,cross_entropy_3 = train_model(train=train_out,test=test_out, features=features,
                                                              n_splits=3, 
                                                              model_obj=tfdf.keras.GradientBoostedTreesModel,
                                                              model_kwargs=dict(max_depth=1,
                                                                                num_trees=3000,
                                                                                sampling_method='GOSS'),
                                                              model_compile_kwargs=dict(metrics=["accuracy", "binary_crossentropy"]))

print('Type 1')
print_average_accuracy(models=model_2, cross_entropy=cross_entropy_2, accuracy=accuracy_2)

Weight for class 0: 0.61
Weight for class 1: 2.86
##### Fold 1
Use /tmp/tmp3sv5w3jq as temporary training directory
Reading training dataset...




Training dataset read in 0:00:02.046136. Found 411 examples.
Training model...
Model trained in 0:00:00.041085
Compiling model...


[INFO 23-07-14 21:43:37.7627 UTC kernel.cc:1242] Loading model from path /tmp/tmp3sv5w3jq/model/ with prefix 87a2f83f794647c1
[INFO 23-07-14 21:43:37.7637 UTC kernel.cc:1074] Use fast generic engine


Model compiled.
##### Fold 2
Use /tmp/tmp7al1hgkj as temporary training directory
Reading training dataset...




Training dataset read in 0:00:02.084737. Found 411 examples.
Training model...
Model trained in 0:00:00.052540
Compiling model...


[INFO 23-07-14 21:43:44.4770 UTC kernel.cc:1242] Loading model from path /tmp/tmp7al1hgkj/model/ with prefix 407415812c4e44bd
[INFO 23-07-14 21:43:44.4783 UTC abstract_model.cc:1311] Engine "GradientBoostedTreesQuickScorerExtended" built
[INFO 23-07-14 21:43:44.4783 UTC kernel.cc:1074] Use fast generic engine


Model compiled.
##### Fold 3
Use /tmp/tmpdeu8ulbb as temporary training directory
Reading training dataset...




Training dataset read in 0:00:02.107809. Found 412 examples.
Training model...
Model trained in 0:00:00.039914
Compiling model...


[INFO 23-07-14 21:43:51.0820 UTC kernel.cc:1242] Loading model from path /tmp/tmpdeu8ulbb/model/ with prefix 393a389c341e421c
[INFO 23-07-14 21:43:51.0832 UTC kernel.cc:1074] Use fast generic engine


Model compiled.
Type 1
fold_1: acc: 0.8252 loss: 0.6741
fold_2: acc: 0.8252 loss: 0.6699
fold_3: acc: 0.1756 loss: 0.7170

Average accuracy: 0.6087  Average loss: 0.6870


# Visualize the model
One benefit of tree-based models is that we can easily visualize them. The default number of trees used in the Random Forests is 300. We can select any tree for display.

Let us pick one model from the `models` dict and select a tree for display.

In [15]:
# tfdf.model_plotter.plot_model_in_colab(model_2['fold_1'], tree_idx=20, max_depth=10)

# Evaluate the model on the Out of bag (OOB) data and the validation dataset

Before training, we have manually seperated 20% of the dataset for validation named as `valid_ds`.

We can also use Out of bag (OOB) score to validate our RandomForestModel.
To train a Random Forest Model, a set of random samples from training set are choosen by the algorithm and the rest of the samples are used to finetune the model. The subset of data that is not chosen is known as Out of bag data (OOB).
OOB score is computed on the OOB data.

Read more about OOB data [here](https://developers.google.com/machine-learning/decision-forests/out-of-bag).

The training logs show the `binary_crossentropy` evaluated on the out of bag dataset according to the number of trees in the model. Let us plot this for the models of each fold.

Note: Smaller values are better for this hyperparameter.

In [16]:
# figure, axis = plt.subplots(3, 2, figsize=(10, 10))
# plt.subplots_adjust(hspace=0.5, wspace=0.3)

# for i, fold_no in enumerate(model_2.keys()):
#     row = i//2
#     col = i % 2
#     logs = model_1[fold_no].make_inspector().training_logs()
#     axis[row, col].plot([log.num_trees for log in logs], [log.evaluation.loss for log in logs])
#     axis[row, col].set_title(f"Fold {i+1}")
#     axis[row, col].set_xlabel('Number of trees')
#     axis[row, col].set_ylabel('Loss (out-of-bag)')

# # axis[2][1].set_visible(False)
# plt.show()

We can also see some general stats on the OOB dataset:

In [17]:
# for _model in model_1:
#     inspector = model_1[_model].make_inspector()
#     print(_model, inspector.evaluation())

# Variable importances

Variable importances generally indicate how much a feature contributes to the model predictions or quality. There are several ways to identify important features using TensorFlow Decision Forests. Let us pick one model from models dict and inspect it.

Let us list the available `Variable Importances` for Decision Trees:

In [18]:
# inspector = model_1['fold_1'].make_inspector()

# print(f"Available variable importances:")
# for importance in inspector.variable_importances().keys():
#     print("\t", importance)

As an example, let us display the important features for the Variable Importance `NUM_AS_ROOT`.

The larger the importance score for `NUM_AS_ROOT`, the more impact it has on the outcome of the model.

By default, the list is sorted from the most important to the least. From the output you can infer that the feature at the top of the list is used as the root node in most number of trees in the random forest than any other feature.

In [19]:
# Each line is: (feature name, (index of the feature), importance score)
# inspector.variable_importances()["NUM_AS_ROOT"]

# Submission

In [20]:
!rm -rf AutoML-main

In [21]:
def create_submitions(submition: pd.DataFrame) -> pd.DataFrame:
    
    df = pd.DataFrame(data=np.zeros((len(submition.index),2)),index=submition.index, columns=['class_0','class_1'])
    
    df['class_1'] = submition_1.mean(axis=1)
    df['class_0'] = 1 - df['class_1']
    
    return df

submition_1_final = create_submitions(submition=submition_1)

submition_1_final.to_csv('/kaggle/working/submission.csv', index=True)

In [22]:
submition_1_final

Unnamed: 0_level_0,class_0,class_1
Id,Unnamed: 1_level_1,Unnamed: 2_level_1
00eed32682bb,0.656801,0.343199
010ebe33f668,0.656801,0.343199
02fa521e1838,0.656801,0.343199
040e15f562a2,0.656801,0.343199
046e85c7cc7f,0.656801,0.343199
