In [1]:
# UNCOMMENT first with internet option turned on
# Use GPU env

# !pip download tabpfn --no-deps -d pip-packages

# from tabpfn import TabPFNClassifier
# TabPFNClassifier(N_ensemble_configurations = 64, device = 'cuda:0')

# !mv /opt/conda/lib/python3.10/site-packages/tabpfn/models_diff/prior_diff_real_checkpoint_n_0_epoch_100.cpkt pip-packages/
# !zip -r pip-packages.zip pip-packages

# now you need to download the zip and upload it as dataset with the plus in the top left
# then you need to add it to the notebook as data on the right, and name it `pip-packages-icr`

# now you can turn internet off and still install, like below

In [2]:
# !pip install tabpfn --no-index --find-links=file:///kaggle/input/pip-packages-icr/pip-packages

In [3]:
# !mkdir -p /opt/conda/lib/python3.10/site-packages/tabpfn/models_diff
# !cp /kaggle/input/pip-packages-icr/pip-packages/prior_diff_real_checkpoint_n_0_epoch_100.cpkt /opt/conda/lib/python3.10/site-packages/tabpfn/models_diff/

In [67]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from pyprojroot import here

from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.impute import KNNImputer

from sklearn.metrics import accuracy_score, roc_auc_score, make_scorer
from sklearn.model_selection import GridSearchCV, StratifiedKFold
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import ExtraTreesClassifier
from catboost import CatBoostClassifier
from tabpfn import TabPFNClassifier
from xgboost import XGBClassifier

from imblearn.over_sampling import SMOTE, RandomOverSampler

import warnings
warnings.filterwarnings('ignore')

### Data Pull

In [68]:
# load data
local_dir = str(here()) + '/'
kaggle_dir = '/kaggle/input/'
train_df = pd.read_csv(local_dir + 'icr-identify-age-related-conditions/train.csv')
test_df = pd.read_csv(local_dir + 'icr-identify-age-related-conditions/test.csv')
greeks_df = pd.read_csv(local_dir + 'icr-identify-age-related-conditions/greeks.csv')

# join greeks and add Epsilon
train_df = pd.merge(train_df, greeks_df, on = 'Id')
train_df = train_df.drop(['Id', 'Beta', 'Gamma', 'Delta'], axis = 1)
train_df['Epsilon'] = train_df['Epsilon'].replace('Unknown', np.nan)
train_df = train_df[train_df['Epsilon'].notna()]
train_df['Epsilon'] = pd.to_datetime(train_df['Epsilon'])

# change epsilon to days since 1-1-2019 when data started to pick up
train_df['Days Since 1-1-2019'] = (train_df['Epsilon'] - pd.to_datetime('2019-01-01')).dt.days
train_df = train_df.drop('Epsilon', axis = 1)
train_df.reset_index(drop = True, inplace = True)

In [69]:
# plot styles
font_dict_header = {'size': 20, 'weight': 'bold'}
font_dict_axistitle = {'size': 14, 'weight': 'bold'}

# Data Pre-Processing

In [None]:
# set random seed
random_seed = 101010
np.random.seed(random_seed)

# create x and y train
X_train = train_df.drop(['Id', 'Alpha', 'Class'], axis = 1, inplace = False)
y_train_class = train_df['Class']
y_train_alpha = train_df['Alpha']
alpha_encoder = LabelEncoder()
y_train_alpha = alpha_encoder.fit_transform(y_train_alpha)

# clean categorical data
X_train['EJ'].replace({'A': 0, 'B': 1}, inplace = True)

# scale and impute data
X_train_columns = X_train.columns
X_train_index = X_train.index

standard_scaler = StandardScaler()
X_train = standard_scaler.fit_transform(X_train)

knn_imputer = KNNImputer()
X_train = knn_imputer.fit_transform(X_train)

X_train = pd.DataFrame(X_train, columns = X_train_columns, index = X_train_index)

# Alpha Prediction Features

### Model Prediction Features
KNN, Extra Trees, and TabPFN

In [70]:
def fit_model(model_name, model, X_train, y_train, X_test, features):
    X = X_train[features].copy()
    y = y_train.copy()
    test = X_test[features].copy()
    
    model.fit(X, y)
    model_predictions = model.predict_proba(test)
    predictions_df = pd.DataFrame(model_predictions, columns = model.classes_, index=X_test.index)
    predictions_df.columns = [model_name + '_' + str(col) for col in predictions_df.columns]
    
    return predictions_df

In [72]:
skf = StratifiedKFold(n_splits = 5, random_state = random_seed, shuffle = True)
model_prediction_features_df = pd.DataFrame({})
for i, (train_index, test_index) in enumerate(skf.split(X_train, y_train_alpha)):
    fold, oof = X_train.loc[train_index], X_train.loc[test_index]
    fold_y, oof_y = y_train_alpha[train_index], y_train_alpha[test_index]
    
    model_features = [
        'DU', 'CR', 'AB', 'DA', 'DH', 'BC', 'FR', 'EP', 'DI', 'FL', 'EU', 'EH', 'Days Since 1-1-2019'
    ]
    
    # KNN
    knn_features = model_features[0:3]
    knn_features.append(model_features[-1])
    knn = KNeighborsClassifier(n_neighbors = 7)
    knn_predictions = fit_model('KNN 7', knn, fold, fold_y, oof, knn_features)
    
    # Extra Trees
    extra_trees = ExtraTreesClassifier(n_estimators = 250, random_state = random_seed)
    extra_trees_predictions = fit_model('Extra Trees', extra_trees, fold, fold_y, oof, model_features)
    
    # TabPFN
    tabpfn = TabPFNClassifier(N_ensemble_configurations = 64, seed = random_seed)
    tabpfn_predictions = fit_model('TabPFN', tabpfn, fold, fold_y, oof, model_features)
    
    predictions_df = pd.concat([knn_predictions, extra_trees_predictions, tabpfn_predictions], axis = 1)
    model_prediction_features_df = pd.concat([model_prediction_features_df, predictions_df])
    
X_train = pd.concat([X_train, model_prediction_features_df], axis = 1)
for pred_class in y_train_alpha:
    X_train['Alpha_' + str(pred_class)] = X_train['Extra Trees_' + str(pred_class)] + X_train['TabPFN_' + str(pred_class)]

Loading model that can be used for inference only
Using a Transformer with 25.82 M parameters
Loading model that can be used for inference only
Using a Transformer with 25.82 M parameters
Loading model that can be used for inference only
Using a Transformer with 25.82 M parameters
Loading model that can be used for inference only
Using a Transformer with 25.82 M parameters
Loading model that can be used for inference only
Using a Transformer with 25.82 M parameters


In [73]:
pre_fit_models = {
    'KNN 7': knn.fit(X_train[knn_features], y_train_alpha),
    'Extra Trees': extra_trees.fit(X_train[model_features], y_train_alpha),
    'TabPFN': tabpfn.fit(X_train[model_features], y_train_alpha)
}

X_train = pd.concat([X_train, model_prediction_features_df], axis = 1)

In [74]:
for pred_class in y_train_alpha:
    X_train['Alpha_' + str(pred_class)] = (X_train['Extra Trees_' + str(pred_class)] + X_train['TabPFN_' + str(pred_class)]) / 2

In [75]:
print('Shape of X:', X_train.shape)
print('Shape of y:', y_train_alpha.shape)

Shape of X: (473, 75)
Shape of y: (473,)


In [None]:
# print('Shape of X:', X_test.shape)
# print('Shape of y:', y_test.shape)

# Model Pipeline

In [77]:
def competition_log_loss(y_true, y_pred):
    # calculate the predictin probability, clip it to avoid log(0) and calculate the log loss
    proba_1 = np.clip(y_pred, 1e-15, 1 - 1e-15)
    proba_0 = 1 - proba_1
    
    # count each class
    class_0 = np.sum(1 - y_true)
    class_1 = np.sum(y_true)

    # log loss for each class
    log_loss_0 = -np.sum((1 - y_true) * np.log(proba_0)) / class_0
    log_loss_1 = -np.sum(y_true * np.log(proba_1)) / class_1
    
    # return average log loss
    return (log_loss_0 + log_loss_1)/2

# make scorer for sklearn GridSearchCV
balanced_log_loss_scorer = make_scorer(competition_log_loss, greater_is_better = False, needs_proba = True)

In [78]:
X_train['Class'] = y_train_class
X_train['Alpha'] = y_train_alpha

In [85]:
y_true = []
y_pred = []
log_loss = []
skf = StratifiedKFold(n_splits = 5, random_state = random_seed, shuffle = True)
for i, (train_index, test_index) in enumerate(skf.split(X_train, y_train_class)):
    fold, oof = X_train.loc[train_index], X_train.loc[test_index]
    fold_y_class, oof_y_class = y_train_class[train_index], y_train_class[test_index]
    fold_y_alpha, oof_y_alpha = y_train_alpha[train_index], y_train_alpha[test_index]
    
    # over sample
    over_sampler = RandomOverSampler(random_state = random_seed)
    fold, fold_y_class = over_sampler.fit_resample(fold, fold_y_class)
    fold_y_alpha = fold['Alpha']
    fold.drop(['Alpha', 'Class'], axis = 1, inplace = True)
    oof.drop(['Alpha', 'Class'], axis = 1, inplace = True)
    
    # pre-process fold data
    fold['EJ'].replace({'A': 0, 'B': 1}, inplace = True)
    fold_columns = fold.columns
    fold_index = fold.index
    standard_scaler = StandardScaler()
    fold = standard_scaler.fit_transform(fold)
    knn_imputer = KNNImputer()
    fold = knn_imputer.fit_transform(fold)
    fold = pd.DataFrame(fold, columns = fold_columns, index = fold_index)
    
    # pre-process oof data
    oof['EJ'].replace({'A': 0, 'B': 1}, inplace = True)
    oof_columns = oof.columns
    oof_index = oof.index
    oof = standard_scaler.transform(oof)
    oof = knn_imputer.transform(oof)
    oof = pd.DataFrame(oof, columns = oof_columns, index = oof_index)
    
    # train model
    xgb = XGBClassifier(n_estimators = 250, random_state = random_seed)
    # catboost = CatBoostClassifier(iterations = 250, random_state = random_seed, verbose = False)
    # xtree = ExtraTreesClassifier(n_estimators = 250, random_state = random_seed)
    
    xgb.fit(fold, fold_y_alpha)
    # catboost.fit(fold, fold_y_alpha)
    # xtree.fit(fold, fold_y_alpha)
    
    # predict oof
    oof_y_alpha_proba = xgb.predict_proba(oof)
    # oof_y_alpha_proba += catboost.predict_proba(oof)
    # oof_y_alpha_proba += xtree.predict_proba(oof)
    # oof_y_alpha_proba /= 3
    
    class_0 = oof_y_alpha_proba[:, 0].sum()
    class_123 = oof_y_alpha_proba[:, 1:].sum()
    new_probabilities = oof_y_alpha_proba * np.array([[1/(class_0 if i == 0 else class_123) for i in range(oof_y_alpha_proba.shape[1])]])
    oof_y_alpha_proba =  new_probabilities / np.sum(new_probabilities, axis = 1, keepdims = 1)
    
    oof_y_class_proba = oof_y_alpha_proba[:, 1:].sum(axis = 1)
    oof_y_class_proba[oof_y_class_proba > 0.98] = 1
    oof_y_class_proba[oof_y_class_proba < 0.02] = 0
    
    fold_log_loss = competition_log_loss(oof_y_class, oof_y_class_proba)
    log_loss.append(fold_log_loss)
    
    print('Balanced Log Loss:', fold_log_loss)
    y_true.append(oof_y_class)
    y_pred.append(oof_y_class_proba)

Balanced Log Loss: 0.26829270112779136
Balanced Log Loss: 0.11474936790363675
Balanced Log Loss: 0.34167128556765897
Balanced Log Loss: 0.45097281300375397
Balanced Log Loss: 0.3645385067073838


In [90]:

y_pred[3].sort()
y_pred[3]

array([4.11495877e-04, 4.41868460e-04, 5.10996092e-04, 5.52139672e-04,
       5.60528545e-04, 6.28487787e-04, 6.41189418e-04, 6.71372669e-04,
       7.03905585e-04, 7.30860985e-04, 7.53511396e-04, 7.68150380e-04,
       7.91626273e-04, 8.62483973e-04, 8.91497675e-04, 9.38980140e-04,
       1.04054981e-03, 1.05760711e-03, 1.09932233e-03, 1.13049396e-03,
       1.22748564e-03, 1.22985057e-03, 1.27334774e-03, 1.29113501e-03,
       1.32040525e-03, 1.39237173e-03, 1.51814327e-03, 1.61556049e-03,
       1.66966832e-03, 1.68219130e-03, 1.69605384e-03, 1.69902179e-03,
       1.74199616e-03, 1.79859837e-03, 1.82554866e-03, 1.86909805e-03,
       1.87064989e-03, 1.90485472e-03, 1.90901066e-03, 1.96152785e-03,
       2.04275679e-03, 2.11596554e-03, 2.16922557e-03, 2.17306024e-03,
       2.19619452e-03, 2.32017323e-03, 2.42391971e-03, 2.55144566e-03,
       2.61690641e-03, 2.69713629e-03, 2.96355982e-03, 3.15215682e-03,
       3.28452825e-03, 3.37311389e-03, 3.40799947e-03, 4.00884845e-03,
      