# Read Before

- https://www.kaggle.com/code/raddar/icr-competition-analysis-and-findings/notebook
- https://www.tensorflow.org/guide/core/logistic_regression_core
- https://www.kaggle.com/code/muelsamu/simple-tabpfn-approach-for-score-of-15-in-1-min/notebook

Plan:
- [x] Feature Engineering (1 day)
- [x] CV and Model Selection (1 day)
- [x] Validation (1 day)
- [x] Review
- [x] Make Artefacts -> Made Utility script for WoE
- [x] Solve Error With Solution -> Made If new catgory then choose worst WoEs (Can make two splits woth worst and other value )
- [x] Add TabPFN (Added Private Sample with package files) and No CV TabPFN preds
- [x] Added Weighted Submition with respect to Competion Metric Mean
- [ ] Found that some variables are constant as they represent some features for categroical column which takes only two values, so makes sense to drop them in order not overtrain. I will make split -> make two versions of model which is run if Group A (on all data), which is Group B ( which finetuned after )

## Install TabPFN offline

In [1]:
!pip install tabpfn --no-index --find-links=file:///kaggle/input/pip-packages-icr

!mkdir -p /opt/conda/lib/python3.10/site-packages/tabpfn/models_diff

!cp /kaggle/input/pip-packages-icr/prior_diff_real_checkpoint_n_0_epoch_100.cpkt /opt/conda/lib/python3.10/site-packages/tabpfn/models_diff/

Looking in links: file:///kaggle/input/pip-packages-icr
Processing /kaggle/input/pip-packages-icr/tabpfn-0.1.9-py3-none-any.whl
Installing collected packages: tabpfn
Successfully installed tabpfn-0.1.9
[0m

# Import Utils

In [2]:
from woe_utils import WOENumericalComplex

# Import Standard Libs

In [3]:
import tensorflow as tf
import tensorflow_decision_forests as tfdf

from keras import metrics
from keras import backend as K

import keras_tuner as kt

import pandas as pd
from pandas.api.types import is_numeric_dtype

import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.metrics import log_loss
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import Normalizer
from sklearn.model_selection import StratifiedKFold, KFold

import warnings
from tqdm.notebook import tqdm

pd.set_option('display.max_rows', 500)

warnings.simplefilter(action='ignore', category=pd.errors.PerformanceWarning)

caused by: ['/opt/conda/lib/python3.10/site-packages/tensorflow_io/python/ops/libtensorflow_io_plugins.so: undefined symbol: _ZN3tsl6StatusC1EN10tensorflow5error4CodeESt17basic_string_viewIcSt11char_traitsIcEENS_14SourceLocationE']
caused by: ['/opt/conda/lib/python3.10/site-packages/tensorflow_io/python/ops/libtensorflow_io.so: undefined symbol: _ZTVN10tensorflow13GcsFileSystemE']


# Load the Dataset

In [4]:
dataset_df = pd.read_csv('/kaggle/input/icr-identify-age-related-conditions/train.csv', index_col='Id')
dataset_df.columns = dataset_df.columns.str.rstrip()
print("Full train dataset shape is {}".format(dataset_df.shape))

dataset_test_df = pd.read_csv('/kaggle/input/icr-identify-age-related-conditions/test.csv', index_col='Id')
dataset_test_df.columns = dataset_test_df.columns.str.rstrip()
print("Full test dataset shape is {}".format(dataset_test_df.shape))

Full train dataset shape is (617, 57)
Full test dataset shape is (5, 56)


# Feature Engineering

## Compute Basic Info

In [5]:
def compute_basic_stats(columns, df):
    out = {}
    
    for i in tqdm(columns):
        mask = df[i].notna()
        
        out[i] = {'nunique':df[i].nunique(),
                  'na_share':round(100*df[i].isna().sum()/df[i].count(),1),
                  'dtype':df[i].dtype
                 }
        if is_numeric_dtype(df[i]):
            out[i]['correlation'] = round(np.corrcoef(x=df.loc[mask,i],y=df.loc[mask,'Class'])[0,1],2)
            out[i]['min'] = df.loc[mask,i].min()
            out[i]['max'] = df.loc[mask,i].max()
            out[i]['std'] = df.loc[mask,i].std()
            out[i]['mean'] = df.loc[mask,i].mean()
            i_lorreg = LogisticRegression()
            X = df.loc[mask,i].values.reshape(-1,1)
            y = df.loc[mask,'Class'].values
            i_lorreg.fit(X=X, y=y)
            y_pred = i_lorreg.predict(X)
            out[i]['logloss'] = log_loss(y_true=y, y_pred=y_pred)
            
            
    out = pd.DataFrame(out).T
    
    out = out.sort_values('logloss',ascending=True)
    
    return out

basic_stats_1 = compute_basic_stats(
    columns=[i for i in dataset_df.columns if i not in ["Id","Class"]],         
    df=dataset_df)

  0%|          | 0/56 [00:00<?, ?it/s]

* Only one variable looks constrant over the target -> better to omit it.
* Realised Better to add LogLoss metric for each feature -> loggloss


## Create Features

In [6]:
def preprocess(train: pd.DataFrame, test: pd.DataFrame, stats: pd.DataFrame) -> (pd.DataFrame, pd.DataFrame, dict):
    info = dict()
    woes = dict()
    numeric_features = stats[stats.logloss.notna()].index.tolist() # Cause for EJ logloss is null
    train_out = train.copy()
    test_out = test.copy()
    
    out_features = list()
    
    # Make WoE Columns
    for i in tqdm(numeric_features, 'WoE Encoding: '):
        tmp_woe = WOENumericalComplex()
        tmp_woe.fit(x=train_out[i], y=train_out['Class'])
        train_out[i + '_WoE'] = tmp_woe.transform(X=train_out[i])
        test_out[i + '_WoE'] = tmp_woe.transform(X=test_out[i])
        out_features.append(i + '_WoE')
        woes[i] = tmp_woe
    
    
    # Make NA columns
    for i in tqdm(['DU', 'FC', 'FS', 'CC', 'FL', 'GL', 'CB', 'EL', 'BQ'], 'Split by NA: '):
        train_out[i+'_na'] = np.where(train_out[i].isna(),1,0)
        test_out[i+'_na'] = np.where(test_out[i].isna(),1,0)
        out_features.append(i + '_na')
    
    # Basic Logic -> normalise
    for i in tqdm(numeric_features,'Normalise Numeric: '):
        if stats.loc[i,'correlation'] > 0:
            na_value = stats.loc[i,'max']
        else:
            na_value = stats.loc[i,'min']

        train_out[i] = train_out[i].fillna(na_value)
        test_out[i] = test_out[i].fillna(na_value)

        train_out[i] = (train_out[i]-stats.loc[i,'mean'])/stats.loc[i,'std']
        test_out[i] = (test_out[i]-stats.loc[i,'mean'])/stats.loc[i,'std']

        out_features.append(i)
            
            
    # Addition EJ -> has only two values, so if EJ == 'A'
    train_out['EJ' + '_A'] = np.where(train_out['EJ'] == 'A',1,0)
    test_out['EJ' + '_A'] = np.where(test_out['EJ'] == 'A',1,0)
    out_features.append('EJ' + '_A')
    
    return train_out,test_out,out_features, woes


train_out,test_out,features, woes = preprocess(train=dataset_df, test=dataset_test_df, stats=basic_stats_1)

WoE Encoding:   0%|          | 0/55 [00:00<?, ?it/s]

Split by NA:   0%|          | 0/9 [00:00<?, ?it/s]

Normalise Numeric:   0%|          | 0/55 [00:00<?, ?it/s]

In [7]:
basic_stats_2 = compute_basic_stats(
    columns=[i for i in train_out.columns if i not in ["Id","Class"]],         
    df=train_out)

basic_stats_2

  0%|          | 0/121 [00:00<?, ?it/s]

Unnamed: 0,nunique,na_share,dtype,correlation,min,max,std,mean,logloss
DU_WoE,4,0.0,float64,-0.52,-2.678782,2.977892,1.042059,0.286027,4.790242
FL_WoE,4,0.0,float64,-0.42,-2.045013,2.977892,0.86161,0.214552,5.374418
DI_WoE,3,0.0,float64,-0.37,-2.361247,0.382521,0.698812,0.124925,5.432836
DA_WoE,3,0.0,float64,-0.37,-2.093932,0.463416,0.719046,0.14452,5.549671
GL_WoE,4,0.0,float64,-0.4,-1.805209,2.977892,0.865805,0.230269,5.783341
FD_WoE,3,0.0,float64,-0.33,-1.996604,0.379845,0.635814,0.115675,5.783341
EH_WoE,3,0.0,float64,-0.32,-2.061142,0.343225,0.622624,0.111704,5.841759
EH,127,0.0,float64,0.18,-0.1635,22.876681,1.0,-0.0,5.900177
AM,605,0.0,float64,0.24,-0.513293,8.483647,1.0,0.0,5.900177
BC_WoE,3,0.0,float64,-0.3,-1.9811,0.486565,0.630381,0.116679,5.900177


# Train Model

Today, we will use the defaults to create the Random Forest Model. By default the model is set to train for a classification task.
We will train a model for each fold and after training we will store the model and metrics. Here, we have chosen `accuracy` and `binary_crossentropy` as the metrics.

In [8]:
def train_model(train: pd.DataFrame, test: pd.DataFrame, features: list, label = "Class",
                n_splits: int = 6,
                model_obj = tfdf.keras.RandomForestModel,
                model_kwargs = dict(),
                model_compile_kwargs = dict()) -> (pd.DataFrame, pd.DataFrame, pd.DataFrame, dict):

    # Create a dataframe of required size with zero values.
    oof = pd.DataFrame(data=np.zeros((len(train.index),1)), index=train.index)
    submition = pd.DataFrame(data=np.zeros((len(test.index),n_splits)),index=test.index)

    # Create an empty dictionary to store the models trained for each fold.
    models = {}

    # Create empty dict to save metircs for the models trained for each fold.
    accuracy = {}
    cross_entropy = {}
    balanced_logloss = {}
    
    # Calculate the number of samples for each label.
    neg, pos = np.bincount(train[label])
    total = neg + pos
    weight_for_0 = (1 / neg) * (total / 2.0)
    weight_for_1 = (1 / pos) * (total / 2.0)
    class_weight = {0: weight_for_0, 1: weight_for_1}

    print('Weight for class 0: {:.2f}'.format(weight_for_0))
    print('Weight for class 1: {:.2f}'.format(weight_for_1))
    
    # Names of columns
    submition_ds = tfdf.keras.pd_dataframe_to_tf_dataset(test)
    # Loop through each fold
    skf = StratifiedKFold(n_splits=n_splits)
    
    for i, (train_index, valid_index) in enumerate(skf.split(X=train,y=train['Class'])):
            print('##### Fold',i+1)

            # Fetch values corresponding to the index 
            train_df = train.iloc[train_index]
            valid_df = train.iloc[valid_index]
            valid_ids = valid_df.index.values

            # Select only feature columns for training.
            train_df = train_df[features+[label]]
            valid_df = valid_df[features+[label]]

            # We need to convert the datatset from Pandas format (pd.DataFrame)
            # into TensorFlow Datasets format (tf.data.Dataset).
            train_ds = tfdf.keras.pd_dataframe_to_tf_dataset(train_df, label=label)
            valid_ds = tfdf.keras.pd_dataframe_to_tf_dataset(valid_df, label=label)

            # Define & Train the model and metrics
            model = model_obj(**model_kwargs)
            model.compile(**model_compile_kwargs) 
            model.fit(x=train_ds, class_weight=class_weight)

            # Store the model
            models[f"fold_{i+1}"] = model

            # Predict value for validation/Submition data
            # Store the predictions in oof dataframe
            oof.loc[valid_ids, 0] = model.predict(x=valid_ds).flatten()
            submition[i] = model.predict(x=submition_ds).flatten() 

            # Evaluate and store the metrics in respective dicts
            evaluation = model.evaluate(x=valid_ds,return_dict=True)
            accuracy[f"fold_{i+1}"] = evaluation["accuracy"]
            cross_entropy[f"fold_{i+1}"] = evaluation["binary_crossentropy"]
            balanced_logloss[f"fold_{i+1}"] = evaluation["balanced_logloss"]
            
    return submition,models,accuracy,cross_entropy,balanced_logloss

def balanced_logloss(y_true: tf.Tensor, y_pred: tf.Tensor) -> tf.Tensor:
    # y_true is prob that y is equals to 1, we assume that final probs would be P(class_1) = 1 - P(class_0)
    y_pred_1 = y_pred
    y_pred_0 = 1-y_pred

    log_y_pred_1 = tf.reshape(K.log(y_pred_1),[-1,1])
    log_y_pred_0 = tf.reshape(K.log(y_pred_0),[-1,1])

    y_1 = tf.reshape(y_true,[1,-1])
    y_0 = (y_1-1)*(-1)

    logloss_1 = -K.dot(y_1,log_y_pred_1)[0][0]/K.sum(y_1)
    logloss_0 = -K.dot(y_0,log_y_pred_0)[0][0]/K.sum(y_0)

    av_logloss = (logloss_1+logloss_0)/2
    
    return av_logloss

def print_average_accuracy(models,cross_entropy,accuracy,balanced_logloss):
    average_loss = 0
    average_acc = 0
    average_bal = 0

    for _model in  models:
        average_loss += cross_entropy[_model]
        average_acc += accuracy[_model]
        average_bal += balanced_logloss[_model]
        
        print(f"{_model}: acc: {accuracy[_model]:.4f} loss: {cross_entropy[_model]:.4f} balanced loss: {balanced_logloss[_model]:.4f}")

    print(f"\nAverage accuracy: {average_acc/len(models):.4f}  Average loss: {average_loss/len(models):.4f} Average balanced loss: {average_bal/len(models):.4f}")

In [9]:
# RandomForestModel
submition_1, model_1,accuracy_1,cross_entropy_1, bal_logloss_1 = train_model(
    train=train_out,test=test_out, features=features,                                             
    n_splits=10,                    
    model_obj=tfdf.keras.RandomForestModel,           
    model_kwargs=dict(max_depth=6, num_trees=1000),                                                  
    model_compile_kwargs=dict(metrics=[metrics.accuracy, metrics.binary_crossentropy,balanced_logloss]))
# metrics
print('Type 1')
print_average_accuracy(models=model_1, cross_entropy=cross_entropy_1, accuracy=accuracy_1, balanced_logloss=bal_logloss_1)

Weight for class 0: 0.61
Weight for class 1: 2.86
##### Fold 1
Use /tmp/tmpx08krtsj as temporary training directory
Reading training dataset...
Training dataset read in 0:00:09.108487. Found 555 examples.
Training model...
Model trained in 0:00:00.785434
Compiling model...


[INFO 23-07-19 08:10:24.7263 UTC kernel.cc:1242] Loading model from path /tmp/tmpx08krtsj/model/ with prefix db2f240de5c04f89
[INFO 23-07-19 08:10:24.8647 UTC decision_forest.cc:660] Model loaded with 1000 root(s), 34210 node(s), and 113 input feature(s).
[INFO 23-07-19 08:10:24.8649 UTC abstract_model.cc:1311] Engine "RandomForestOptPred" built
[INFO 23-07-19 08:10:24.8649 UTC kernel.cc:1074] Use fast generic engine


Please report this to the TensorFlow team. When filing the bug, set the verbosity to 10 (on Linux, `export AUTOGRAPH_VERBOSITY=10`) and attach the full output.
Cause: could not get source code
Model compiled.
##### Fold 2
Use /tmp/tmpawoeuqv0 as temporary training directory
Reading training dataset...
Training dataset read in 0:00:02.141483. Found 555 examples.
Training model...
Model trained in 0:00:00.929544
Compiling model...


[INFO 23-07-19 08:10:34.3922 UTC kernel.cc:1242] Loading model from path /tmp/tmpawoeuqv0/model/ with prefix 511113b9d2a1413d
[INFO 23-07-19 08:10:34.5279 UTC decision_forest.cc:660] Model loaded with 1000 root(s), 34512 node(s), and 113 input feature(s).
[INFO 23-07-19 08:10:34.5280 UTC kernel.cc:1074] Use fast generic engine


Model compiled.
##### Fold 3
Use /tmp/tmp19atz4ks as temporary training directory
Reading training dataset...
Training dataset read in 0:00:02.052767. Found 555 examples.
Training model...
Model trained in 0:00:00.736610
Compiling model...


[INFO 23-07-19 08:10:40.6284 UTC kernel.cc:1242] Loading model from path /tmp/tmp19atz4ks/model/ with prefix 1f9c88268ed84e50
[INFO 23-07-19 08:10:40.7634 UTC decision_forest.cc:660] Model loaded with 1000 root(s), 34332 node(s), and 113 input feature(s).
[INFO 23-07-19 08:10:40.7636 UTC abstract_model.cc:1311] Engine "RandomForestOptPred" built
[INFO 23-07-19 08:10:40.7638 UTC kernel.cc:1074] Use fast generic engine


Model compiled.
##### Fold 4
Use /tmp/tmpl_x_9dnx as temporary training directory
Reading training dataset...
Training dataset read in 0:00:02.072811. Found 555 examples.
Training model...
Model trained in 0:00:00.730514
Compiling model...


[INFO 23-07-19 08:10:46.8769 UTC kernel.cc:1242] Loading model from path /tmp/tmpl_x_9dnx/model/ with prefix a4761e35dbaf4eef
[INFO 23-07-19 08:10:47.0116 UTC decision_forest.cc:660] Model loaded with 1000 root(s), 34210 node(s), and 113 input feature(s).
[INFO 23-07-19 08:10:47.0119 UTC kernel.cc:1074] Use fast generic engine


Model compiled.
##### Fold 5
Use /tmp/tmp546dqeow as temporary training directory
Reading training dataset...
Training dataset read in 0:00:02.048419. Found 555 examples.
Training model...
Model trained in 0:00:00.745758
Compiling model...


[INFO 23-07-19 08:10:53.1926 UTC kernel.cc:1242] Loading model from path /tmp/tmp546dqeow/model/ with prefix f4a23422ac4a44c9
[INFO 23-07-19 08:10:53.3282 UTC decision_forest.cc:660] Model loaded with 1000 root(s), 34614 node(s), and 113 input feature(s).
[INFO 23-07-19 08:10:53.3282 UTC abstract_model.cc:1311] Engine "RandomForestOptPred" built
[INFO 23-07-19 08:10:53.3282 UTC kernel.cc:1074] Use fast generic engine


Model compiled.
##### Fold 6
Use /tmp/tmp59cwg31x as temporary training directory
Reading training dataset...
Training dataset read in 0:00:02.055480. Found 555 examples.
Training model...
Model trained in 0:00:00.744051
Compiling model...


[INFO 23-07-19 08:10:59.8461 UTC kernel.cc:1242] Loading model from path /tmp/tmp59cwg31x/model/ with prefix 6513de7f09c643d8
[INFO 23-07-19 08:10:59.9851 UTC decision_forest.cc:660] Model loaded with 1000 root(s), 34414 node(s), and 113 input feature(s).
[INFO 23-07-19 08:10:59.9851 UTC kernel.cc:1074] Use fast generic engine


Model compiled.
##### Fold 7
Use /tmp/tmpslfgh3o8 as temporary training directory
Reading training dataset...
Training dataset read in 0:00:02.154309. Found 555 examples.
Training model...
Model trained in 0:00:00.835747
Compiling model...


[INFO 23-07-19 08:11:06.3564 UTC kernel.cc:1242] Loading model from path /tmp/tmpslfgh3o8/model/ with prefix b02de2eb4969429c
[INFO 23-07-19 08:11:06.4936 UTC decision_forest.cc:660] Model loaded with 1000 root(s), 34550 node(s), and 114 input feature(s).
[INFO 23-07-19 08:11:06.4939 UTC abstract_model.cc:1311] Engine "RandomForestOptPred" built
[INFO 23-07-19 08:11:06.4940 UTC kernel.cc:1074] Use fast generic engine


Model compiled.
##### Fold 8
Use /tmp/tmplug6om17 as temporary training directory
Reading training dataset...
Training dataset read in 0:00:02.074117. Found 556 examples.
Training model...
Model trained in 0:00:00.735596
Compiling model...


[INFO 23-07-19 08:11:13.1525 UTC kernel.cc:1242] Loading model from path /tmp/tmplug6om17/model/ with prefix 3cde98aa69c5433f
[INFO 23-07-19 08:11:13.2857 UTC decision_forest.cc:660] Model loaded with 1000 root(s), 34432 node(s), and 113 input feature(s).
[INFO 23-07-19 08:11:13.2858 UTC kernel.cc:1074] Use fast generic engine


Model compiled.
##### Fold 9
Use /tmp/tmp8b8m0467 as temporary training directory
Reading training dataset...
Training dataset read in 0:00:02.129241. Found 556 examples.
Training model...
Model trained in 0:00:00.734199
Compiling model...


[INFO 23-07-19 08:11:19.4519 UTC kernel.cc:1242] Loading model from path /tmp/tmp8b8m0467/model/ with prefix 46b7782e73e246f3
[INFO 23-07-19 08:11:19.5855 UTC decision_forest.cc:660] Model loaded with 1000 root(s), 34142 node(s), and 113 input feature(s).
[INFO 23-07-19 08:11:19.5855 UTC abstract_model.cc:1311] Engine "RandomForestOptPred" built
[INFO 23-07-19 08:11:19.5856 UTC kernel.cc:1074] Use fast generic engine


Model compiled.
##### Fold 10
Use /tmp/tmpw1ghfeih as temporary training directory
Reading training dataset...
Training dataset read in 0:00:02.103139. Found 556 examples.
Training model...
Model trained in 0:00:00.790585
Compiling model...


[INFO 23-07-19 08:11:26.2310 UTC kernel.cc:1242] Loading model from path /tmp/tmpw1ghfeih/model/ with prefix c947d241b6e64bce
[INFO 23-07-19 08:11:26.3682 UTC decision_forest.cc:660] Model loaded with 1000 root(s), 34820 node(s), and 113 input feature(s).
[INFO 23-07-19 08:11:26.3683 UTC kernel.cc:1074] Use fast generic engine


Model compiled.
Type 1
fold_1: acc: 0.0000 loss: 0.2010 balanced loss: 0.2942
fold_2: acc: 0.0000 loss: 0.2160 balanced loss: 0.3145
fold_3: acc: 0.0000 loss: 0.2781 balanced loss: 0.4360
fold_4: acc: 0.0000 loss: 0.2768 balanced loss: 0.3325
fold_5: acc: 0.0000 loss: 0.2335 balanced loss: 0.3191
fold_6: acc: 0.0000 loss: 0.2744 balanced loss: 0.3483
fold_7: acc: 0.0000 loss: 0.2641 balanced loss: 0.3973
fold_8: acc: 0.0000 loss: 0.2546 balanced loss: 0.3383
fold_9: acc: 0.0000 loss: 0.2580 balanced loss: 0.3867
fold_10: acc: 0.0000 loss: 0.2148 balanced loss: 0.2814

Average accuracy: 0.0000  Average loss: 0.2471 Average balanced loss: 0.3448


## Experiment with TabPFN

In [10]:
def balanced_logloss_np(y_true: np.array, y_pred: np.array) -> float:
    # y_true is prob that y is equals to 1, we assume that final probs would be P(class_1) = 1 - P(class_0)
    y_pred_1 = y_pred
    y_pred_0 = 1-y_pred

    log_y_pred_1 = np.reshape(np.log(y_pred_1),[-1,1])
    log_y_pred_0 = np.reshape(np.log(y_pred_0),[-1,1])

    y_1 = np.reshape(y_true,[1,-1])
    y_0 = (y_1-1)*(-1)

    logloss_1 = -np.dot(y_1,log_y_pred_1)[0][0]/np.sum(y_1)
    logloss_0 = -np.dot(y_0,log_y_pred_0)[0][0]/np.sum(y_0)

    av_logloss = (logloss_1+logloss_0)/2
    
    return av_logloss

from tabpfn import TabPFNClassifier
from sklearn.metrics import accuracy_score

In [11]:
def train_model_tabpfn_one(train: pd.DataFrame, submition: pd.DataFrame, features: list, label = "Class") -> (pd.DataFrame, dict, dict):

    # Create a dataframe of required size with zero values.
    submition_df = pd.DataFrame(data=np.zeros((len(submition.index),1)),index=submition.index)
    train_df_out = pd.DataFrame(data=np.zeros((len(train.index),1)),index=train.index)
    # Create an empty dictionary to store the models trained for each fold.
    models = {}
    metrics = {}

    # Select only feature columns for training.
    train_df = train[features+[label]]

    # Define & Train the model and metrics
    model = TabPFNClassifier(N_ensemble_configurations=64)
    model.fit(train_df[features],train_df[label])

    # Store the model
    models = model

    # Make predictions
    p_train = model.predict_proba(train_df[features])[:,1]
    p_sub = model.predict_proba(submition[features])[:,1]

    # Predict value for validation/Submition data
    submition_df[0] = p_sub.flatten() 
    train_df_out[0] = p_train.flatten() 
    
    # Evaluate and store the metrics in respective dicts
            
    # metrics['accuracy'] = accuracy_score(train_df[label].values, p_train)
    metrics['balanced_logloss'] = balanced_logloss_np(y_true=train_df[label].values,y_pred=p_train)
            
    return train_df_out,submition_df,models,metrics

In [12]:
# TabPFN
train_2, submition_2, model_2,metrics_2 = train_model_tabpfn_one(
    train=train_out,submition=test_out, features=[i for i in features if len(i) > 2])

Loading model that can be used for inference only
Using a Transformer with 25.82 M parameters


# Submission

In [13]:
# Aggragate Subs
RF_m = pd.Series(bal_logloss_1).mean()
TabPFN_m = pd.Series(metrics_2).mean()

print('RF: metric ', RF_m)
print('TabPFN: metric ', TabPFN_m)

RF_w = 1-(RF_m/(RF_m+TabPFN_m))
TabPFN_w = 1-(TabPFN_m/(RF_m+TabPFN_m))

print('RF weight: ', RF_w)
print('TabPFN weight: ', TabPFN_w)

submition_total = pd.concat([submition_1.mean(axis=1).to_frame(),submition_2], axis=1)
submition_total.columns = ['RF','TabPFN']

submition_total['Ensemble'] = RF_w*submition_total['RF'] + TabPFN_w*submition_total['TabPFN']

# submition_total

submition_total = submition_total[['Ensemble']].copy()

submition_total.columns = ['class_1']

submition_total['class_0'] = 1 - submition_total['class_1']

RF: metric  0.34483136236667633
TabPFN: metric  0.05152794842815339
RF weight:  0.13000312349121568
TabPFN weight:  0.8699968765087843


In [14]:
submition_total.to_csv('/kaggle/working/submission.csv', index=True)