This notebook is based upon <a href="https://www.kaggle.com/code/vadimkamaev/icr-identify-age">icr-identify-age</a> by <a href="https://www.kaggle.com/vadimkamaev">Vadim Kamaev</a>

I added cross-validation and changed the CV strategy so that it no longer leaks future information.

In [None]:
!mkdir oof
!mkdir models

In [None]:
# Install TabPFN from the local package directory under /kaggle/input (--no-index skips the PyPI lookup).
!pip install tabpfn --no-index --find-links=file:///kaggle/input/pip-packages-icr/pip-packages

In [None]:
#!pip install tabpfn

In [None]:
# Copy the TabPFN checkpoint from the local package directory into the installed package's models_diff folder.
# NOTE(review): presumably this is where TabPFN looks for its weights when it cannot download them — confirm.
!mkdir -p /opt/conda/lib/python3.10/site-packages/tabpfn/models_diff
!cp /kaggle/input/pip-packages-icr/pip-packages/prior_diff_real_checkpoint_n_0_epoch_100.cpkt /opt/conda/lib/python3.10/site-packages/tabpfn/models_diff/

In [None]:
import numpy as np
import pandas as pd
import json
from sklearn.base import BaseEstimator
from sklearn.ensemble import RandomForestClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.impute import SimpleImputer
from catboost import Pool, CatBoostClassifier
import xgboost
import torch
from tabpfn import TabPFNClassifier

from sklearn.model_selection import KFold, StratifiedKFold, train_test_split, GridSearchCV
import pickle
import gc
import lightgbm as lgb
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

import warnings
warnings.filterwarnings("ignore")

In [None]:
# LOAD THE DATA

BASE_DIR = '/kaggle/input/icr-identify-age-related-conditions'
# Read the three competition CSV files into pandas DataFrames
maindf = pd.read_csv(f'{BASE_DIR}/train.csv')
greeksdf = pd.read_csv(f'{BASE_DIR}/greeks.csv')
testdf = pd.read_csv(f'{BASE_DIR}/test.csv')

# EJ is encoded as 1 where it equals the first value seen in the training
# data and 0 otherwise; the same reference value is reused for the test set.
ej_values = maindf.EJ.unique()
print(ej_values)
first_cat = ej_values[0]
maindf['EJ'] = (maindf['EJ'] == first_cat).astype('int')
testdf['EJ'] = (testdf['EJ'] == first_cat).astype('int')

In [None]:
# greeks.csv stores a date string per row in Epsilon ('Unknown' when it is missing).
# Known dates are converted to integer day ordinals, unknown ones to NaN.

from datetime import date, datetime
has_date = greeksdf.Epsilon != 'Unknown'
times = greeksdf.Epsilon.copy()
times[has_date] = greeksdf.Epsilon[has_date].map(lambda raw: datetime.strptime(raw, '%m/%d/%Y').toordinal())
times[~has_date] = np.nan

In [None]:
# Target column, and every remaining training column except the row identifier as a predictor
target = 'Class'
predictors = [column for column in maindf.columns if column not in (target, 'Id')]

In [None]:
# LightGBM hyper-parameters (objective and metric are intentionally left disabled).
lgb_params = dict(
    # objective='binary',
    # metric='binary_logloss',
    boosting='goss',
    learning_rate=0.09110460114828077,
    num_leaves=8,
    feature_fraction=0.4989639912997521,
    bagging_fraction=0.54872439795985,
    lambda_l1=1.4522184914523175,
    lambda_l2=1.7873553090132748e-08,
    n_jobs=-1,
    is_unbalance=True,
    verbose=-1,
    seed=42,
)

In [None]:
class WeightedEns(BaseEstimator):
    """Equal-weight ensemble of XGBoost and TabPFN with class-rebalanced probabilities.

    Missing values are median-imputed before the data reaches either
    classifier. ``predict_proba`` averages the probabilities of both models
    and then rescales them so that class 0 and all remaining classes (taken
    together) carry the same total mass over the predicted batch.
    """

    def __init__(self):
        # TabPFN runs on the first CUDA device when one is available, otherwise on CPU.
        self.classifiers = [xgboost.XGBClassifier(), TabPFNClassifier(N_ensemble_configurations=64,device="cuda:0" if torch.cuda.is_available() else "cpu")]
        self.imp = SimpleImputer(missing_values=np.nan, strategy='median')

    def fit(self, X, y):
        """Fit the imputer and every base classifier.

        Args:
            X: feature matrix; NaN entries are replaced by the column median.
            y: class labels. They are integer-encoded with ``np.unique`` and
                the sorted original labels are stored in ``classes_``.

        Returns:
            self, following the scikit-learn estimator convention.
        """
        cls, y = np.unique(y, return_inverse=True)
        self.classes_ = cls
        X = self.imp.fit_transform(X)
        for cl in self.classifiers:
            cl.fit(X,y)
        return self

    def predict_proba(self, X):
        """Return rebalanced class probabilities, one row per sample.

        The rescaling factors are computed from the batch passed in, so the
        output for a given row depends on the other rows of the same call.
        Columns follow the order of ``classes_``.
        """
        X = self.imp.transform(X)
        ps = np.stack([cl.predict_proba(X) for cl in self.classifiers])
        p = np.mean(ps,axis=0)
        class_0_est_instances = p[:,0].sum()
        others_est_instances = p[:,1:].sum()
        # we reweight the probs, since the loss is also balanced like this
        # our models out of the box optimize CE
        # with these changes they optimize balanced CE
        new_p = p * np.array([[1/(class_0_est_instances if i==0 else others_est_instances) for i in range(p.shape[1])]])
        # Renormalize each row so the probabilities sum to 1 again.
        return new_p / np.sum(new_p,axis=1,keepdims=True)

In [None]:
# Append the parsed Epsilon ordinals to the training frame as an extra column.
# `axis` is passed by keyword: pandas >= 2.0 rejects positional arguments after `objs`.
pred_and_time = pd.concat((maindf, times), axis=1)

In [None]:
# Test rows have no Epsilon, so they all receive the day after the latest
# date found in the training data as their time feature.
test_predictors = np.array(testdf[predictors])
latest_train_time = pred_and_time.Epsilon.max()
test_time_column = np.zeros((len(test_predictors), 1)) + latest_train_time + 1
test_pred_and_time = np.concatenate((test_predictors, test_time_column), 1)

In [None]:
# Epsilon is a model feature from here on. The guard keeps `predictors` free of
# duplicates when this cell is executed more than once.
if 'Epsilon' not in predictors:
    predictors.append('Epsilon')

In [None]:
def balanced_log_loss(y_true, y_pred):
    y_pred = np.clip(y_pred, 1e-15, 1-1e-15)
    nc = np.bincount(y_true)
    w0, w1 = 1/(nc[0]/y_true.shape[0]), 1/(nc[1]/y_true.shape[0])
    balanced_log_loss_score = (-w0/nc[0]*(np.sum(np.where(y_true==0,1,0) * np.log(1-y_pred))) - w1/nc[1]*(np.sum(np.where(y_true!=0,1,0) * np.log(y_pred)))) / (w0+w1)
    return balanced_log_loss_score

In [None]:
from pathlib import Path

class CFG:
    """Notebook-wide configuration: run identifiers, paths, CV setup and boosting settings."""

    # --- run identification ---
    VER = 1
    AUTHOR = 'maverick'
    COMPETITION = 'icr-identify-age-related-conditions'

    # --- input / output locations ---
    DATA_PATH = Path('/kaggle/input') / COMPETITION
    OOF_DATA_PATH = Path('./oof')
    MODEL_DATA_PATH = Path('./models')

    # --- modelling and cross-validation ---
    METHOD_LIST = ['tpm']
    seed = 3407  # 52
    n_folds = 10  # replaced 20
    target_col = 'Class'
    metric = 'balanced_log_loss'
    metric_maximize_flag = False

    # --- gradient boosting settings ---
    num_boost_round = 50500
    early_stopping_round = 500
    verbose = 2000
    boosting_type = 'dart'
    

In [None]:
# Attach the Alpha label from greeks.csv; the CV loop trains on it instead of the binary Class.
pred_and_time = pred_and_time.assign(Alpha=greeksdf['Alpha'])

In [None]:
def tpm_training(x_train: pd.DataFrame, y_train: pd.DataFrame, x_valid: pd.DataFrame, y_valid: pd.DataFrame, features: list, categorical_features: list, threshold: bool):
    """Fit a WeightedEns on one training fold and score the validation fold.

    Args:
        x_train, y_train: training features and labels; the first label in
            sorted order must be 'A'.
        x_valid: validation features to predict.
        y_valid, features, categorical_features: accepted for signature
            parity with the other ``*_training`` helpers; not used here.
        threshold: when True, predictions above 0.8 are snapped to 1 and
            predictions below 0.1 to 0.

    Returns:
        (fitted model, 1-D array with the summed probability of every class
        other than 'A' for each validation row).
    """
    ensemble = WeightedEns()
    ensemble.fit(x_train, y_train)
    class_probs = ensemble.predict_proba(x_valid)
    assert (ensemble.classes_[0] == 'A')
    # Collapse to a binary problem: every class after the first counts as positive.
    positive_prob = np.sum(class_probs[:, 1:], axis=1)
    # NOTE(review): the cut-offs act on the positive-class probability here, while
    # tpm_inference applies the same cut-offs to the class-'A' probability — confirm intended.
    if threshold:
        positive_prob[positive_prob > 0.8] = 1
        positive_prob[positive_prob < 0.1] = 0
    return ensemble, positive_prob

def gradient_boosting_model_cv_training(method: str, train_df: pd.DataFrame, features: list, categorical_features: list, threshold = False):
    """Run stratified K-fold cross-validation for `method` and persist the results.

    For every fold the model is trained on the 'Alpha' labels, pickled under
    CFG.MODEL_DATA_PATH, and its validation predictions are stored as
    out-of-fold (OOF) predictions. The OOF score is printed and the OOF
    predictions are written to a CSV file.

    Args:
        method: 'lightgbm', 'xgboost', 'catboost' or 'tpm'. The matching
            ``<method>_training`` function must exist in the notebook.
        train_df: frame containing `features`, CFG.target_col, 'Alpha' and 'Id'.
            NOTE(review): rows are matched to the global `greeksdf` through
            their positional fold indices, which assumes both frames share a
            default integer index — confirm.
        features: feature column names; must include 'Epsilon'.
        categorical_features: forwarded to the training function.
        threshold: forwarded to ``tpm_training`` only.

    Raises:
        ValueError: if `method` is not one of the supported names.
    """
    # Create a numpy array to store out of folds predictions
    oof_predictions = np.zeros(len(train_df))
    oof_fold = np.zeros(len(train_df))
    kfold = StratifiedKFold(n_splits = CFG.n_folds, shuffle = True, random_state = CFG.seed)
    for fold, (train_index, valid_index) in enumerate(kfold.split(train_df, train_df[CFG.target_col])):
        print('-'*50)
        print(f'{method} training fold {fold + 1}')

        x_train = train_df[features].iloc[train_index]
        y_train = train_df['Alpha'].iloc[train_index]
        x_valid = train_df[features].iloc[valid_index]
        y_valid = train_df['Alpha'].iloc[valid_index]

        # Original notebook contains information from future, so doing epsilon thing here as well:
        # Epsilon is rebuilt from the training rows only, and every validation row
        # gets "latest training date + 1", the same way the test set is handled.
        raw_epsilon = greeksdf.loc[train_index, 'Epsilon']
        known = raw_epsilon != 'Unknown'
        times = raw_epsilon.copy()
        times[known] = raw_epsilon[known].map(lambda x: datetime.strptime(x,'%m/%d/%Y').toordinal())
        times[~known] = np.nan
        # `axis` must be a keyword: pandas >= 2.0 rejects positional arguments after `objs`.
        x_train = pd.concat((x_train.drop('Epsilon', axis=1), times), axis=1)
        x_valid = x_valid.drop('Epsilon', axis=1)
        x_valid = np.concatenate((x_valid, np.zeros((len(x_valid),1)) + x_train.Epsilon.max()+1),1)

        if method == 'lightgbm':
            model, valid_pred = lightgbm_training(x_train, y_train, x_valid, y_valid, features, categorical_features)
        elif method == 'xgboost':
            model, valid_pred = xgboost_training(x_train, y_train, x_valid, y_valid, features, categorical_features)
        elif method == 'catboost':
            model, valid_pred = catboost_training(x_train, y_train, x_valid, y_valid, features, categorical_features)
        elif method == 'tpm':
            model, valid_pred = tpm_training(x_train, y_train, x_valid, y_valid, features, categorical_features, threshold)
        else:
            raise ValueError(f'Unknown method: {method!r}')

        # Save best model; the context manager guarantees the file is flushed and closed.
        model_path = CFG.MODEL_DATA_PATH / f'{method}_fold{fold + 1}_seed{CFG.seed}_ver{CFG.VER}.pkl'
        with open(model_path, 'wb') as model_file:
            pickle.dump(model, model_file)
        # Add to out of folds array
        oof_predictions[valid_index] = valid_pred
        oof_fold[valid_index] = fold + 1
        del x_train, x_valid, y_train, y_valid, model, valid_pred
        gc.collect()

    # Compute out of folds metric
    score = balanced_log_loss(train_df[CFG.target_col], oof_predictions)
    print(f'{method} our out of folds CV score is {score}')
    # Create a dataframe to store out of folds predictions
    oof_df = pd.DataFrame({'Id': train_df['Id'], CFG.target_col: train_df[CFG.target_col], f'{method}_prediction': oof_predictions, 'fold': oof_fold})
    # NOTE(review): the OOF file goes to MODEL_DATA_PATH although CFG.OOF_DATA_PATH exists — confirm intended.
    oof_df.to_csv(CFG.MODEL_DATA_PATH / f'oof_{method}_seed{CFG.seed}_ver{CFG.VER}.csv', index = False)

Cross-validation without thresholding the predictions

In [None]:
# Baseline CV run: the raw ensemble probabilities are scored as they are.
gradient_boosting_model_cv_training(
    method='tpm',
    train_df=pred_and_time,
    features=predictors,
    categorical_features=['EJ'],
    threshold=False,
)

Cross-validation with thresholding (predictions above 0.8 are set to 1, predictions below 0.1 are set to 0)

In [None]:
# Same CV run, but confident predictions are snapped to 0 / 1 before scoring.
gradient_boosting_model_cv_training(
    method='tpm',
    train_df=pred_and_time,
    features=predictors,
    categorical_features=['EJ'],
    threshold=True,
)

In [None]:
def tpm_inference(x_test: pd.DataFrame, method: str):
    """Average the thresholded class-'A' probability of all fold models.

    Loads the pickled model of every fold from CFG.MODEL_DATA_PATH, predicts
    `x_test`, snaps the class-'A' probability to 1 above 0.8 and to 0 below
    0.1 for each fold, and averages the results over the folds.

    Args:
        x_test: feature matrix laid out like the data the models were fitted on.
        method: model name used as the prefix of the pickle file names.

    Returns:
        Array of shape (n_samples, 1) with the averaged class-'A' probability.
    """
    test_pred = 0
    for fold in range(CFG.n_folds):
        model_path = CFG.MODEL_DATA_PATH / f'{method}_fold{fold + 1}_seed{CFG.seed}_ver{CFG.VER}.pkl'
        # pickle.load can execute arbitrary code: only load files written by this notebook.
        with open(model_path, 'rb') as model_file:
            model = pickle.load(model_file)
        # Predict
        p = model.predict_proba(x_test)
        assert (model.classes_[0] == 'A')
        # Column 0 is class 'A'; copy it so the in-place thresholding never touches `p`.
        p0 = p[:, :1].copy()
        p0[p0 > 0.8] = 1
        p0[p0 < 0.1] = 0
        test_pred += p0
    return test_pred / CFG.n_folds

def gradient_boosting_model_inference(method: str, x_test: pd.DataFrame, features: list, categorical_features: list):
    """Dispatch test-set prediction to the inference function matching `method`.

    Args:
        method: 'tpm', 'lightgbm', 'xgboost' or 'catboost'.
        x_test: test features handed to the selected inference function.
        features, categorical_features: accepted but not used here.

    Returns:
        Whatever the selected ``<method>_inference`` function returns.
    """
    # Only tpm_inference also receives the method name.
    if method == 'tpm':
        test_pred = tpm_inference(x_test, method)
    elif method == 'catboost':
        test_pred = catboost_inference(x_test)
    elif method == 'xgboost':
        test_pred = xgboost_inference(x_test)
    elif method == 'lightgbm':
        test_pred = lightgbm_inference(x_test)
    return test_pred

In [None]:
# Start the submission frame from the test identifiers only.
submit = testdf[["Id"]].copy()

In [None]:
# Fill the submission with the class-0 probability of each configured method.
# Every pass overwrites the same two columns, so the last method in the list wins.
for method in CFG.METHOD_LIST:
    class_0_prob = gradient_boosting_model_inference(
        method=method,
        x_test=test_pred_and_time,
        features=predictors,
        categorical_features=['EJ'],
    )
    submit["class_0"] = class_0_prob
    submit["class_1"] = 1 - class_0_prob

In [None]:
# Preview the submission frame before it is written to disk.
submit

In [None]:
# Write the submission file without the DataFrame index column.
submit.to_csv('submission.csv',index=False)