This notebook is based on:

- SAMUEL's notebook: https://www.kaggle.com/code/muelsamu/simple-tabpfn-approach-for-score-of-15-in-1-min

- The work of @nomuraryota: https://www.kaggle.com/nomuraryota

In [None]:
# Install tabpfn without contacting a package index: --no-index disables index
# lookups and --find-links points pip at the local directory holding the
# package files.
!pip install tabpfn --no-index --find-links=file:///kaggle/input/pip-packages-icr/pip-packages

In [None]:
# Create the models_diff directory inside the installed tabpfn package and copy
# the checkpoint file into it from the local package directory.
# NOTE(review): the destination hard-codes the python3.10 site-packages path —
# confirm it matches the interpreter this notebook actually runs on.
!mkdir -p /opt/conda/lib/python3.10/site-packages/tabpfn/models_diff
!cp /kaggle/input/pip-packages-icr/pip-packages/prior_diff_real_checkpoint_n_0_epoch_100.cpkt /opt/conda/lib/python3.10/site-packages/tabpfn/models_diff/

In [None]:
import numpy as np
import pandas as pd
import json
from sklearn.base import BaseEstimator
from sklearn.ensemble import RandomForestClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.impute import SimpleImputer
from catboost import Pool, CatBoostClassifier
import xgboost
from tabpfn import TabPFNClassifier

# Lift pandas' display limits so frames are printed without truncation.
pd.options.display.max_columns = None
pd.options.display.max_rows = None

import warnings
# Silence every warning for the rest of the session.
warnings.filterwarnings(action="ignore")

In [None]:
# Root directory of the competition data; the three CSV paths are derived
# from it.
BASE_DIR = '/kaggle/input/icr-identify-age-related-conditions'
train_file, greeks_file, test_file = (
    f'{BASE_DIR}/{stem}.csv' for stem in ('train', 'greeks', 'test')
)

In [None]:
# Load the three input tables in one pass, in the same order as before:
# train, greeks, test.
train_df, greeks_df, test_df = map(
    pd.read_csv, (train_file, greeks_file, test_file)
)

In [None]:
# Show the distinct values of the EJ column before it is encoded.
print(train_df['EJ'].unique())

In [None]:
# Binary-encode EJ: 1 where the value equals the first category seen in the
# training data, 0 otherwise. The same reference category is used for test.
first_category = train_df['EJ'].unique()[0]
train_df['EJ'] = (train_df['EJ'] == first_category).astype('int')
test_df['EJ'] = (test_df['EJ'] == first_category).astype('int')

In [None]:
from datetime import datetime

# Convert the Epsilon date strings ('%m/%d/%Y') to proleptic Gregorian
# ordinals; rows marked 'Unknown' become NaN. Building the Series with a
# single map and casting to float64 keeps the column numeric — the previous
# mask-and-assign approach left it as object dtype, which in turn forced the
# whole training matrix to object dtype.
times = greeks_df.Epsilon.map(
    lambda stamp: np.nan
    if stamp == 'Unknown'
    else datetime.strptime(stamp, '%m/%d/%Y').toordinal()
).astype('float64')

In [None]:
# Every training column except the label and the row identifier is a feature.
target_column = 'Class'
predictor_columns = [
    column for column in train_df.columns
    if column not in (target_column, 'Id')
]

In [None]:
class WeightedEnsemble(BaseEstimator):
    """Average an XGBoost and a TabPFN classifier, then rebalance the classes.

    Missing values are median-imputed before the data reaches the
    classifiers. ``predict_proba`` averages the per-classifier probabilities
    and rescales them so that the first class and the group of all remaining
    classes carry equal total mass over the batch being predicted.
    """

    def __init__(self):
        self.classifiers = [
            xgboost.XGBClassifier(),
            TabPFNClassifier(N_ensemble_configurations=64, device='cuda:0'),
        ]
        self.imputer = SimpleImputer(missing_values=np.nan, strategy='median')

    def fit(self, X, y):
        """Fit the imputer and every classifier on ``X`` and ``y``.

        Labels are replaced by their integer codes before fitting; the sorted
        original labels are stored in ``classes_``.

        Returns:
            self, following the scikit-learn estimator convention so calls
            can be chained.
        """
        unique_classes, y = np.unique(y, return_inverse=True)
        self.classes_ = unique_classes
        X = self.imputer.fit_transform(X)
        for classifier in self.classifiers:
            classifier.fit(X, y)
        return self

    def predict_proba(self, X):
        """Return rebalanced class probabilities for ``X``.

        The result has one row per sample and one column per class, and each
        row sums to 1. Note that the rebalancing weights are computed from
        the batch passed in, so a row's output depends on the other rows.
        """
        X = self.imputer.transform(X)
        probabilities = np.stack(
            [classifier.predict_proba(X) for classifier in self.classifiers]
        )
        averaged_probabilities = np.mean(probabilities, axis=0)

        # Estimated number of instances of class 0 and of all other classes.
        class_0_est_instances = averaged_probabilities[:, 0].sum()
        others_est_instances = averaged_probabilities[:, 1:].sum()

        # A group with zero estimated mass has all-zero columns, so its weight
        # is irrelevant; use 1 there instead of dividing by zero (which would
        # turn those columns into NaN via inf * 0).
        class_0_scale = class_0_est_instances if class_0_est_instances > 0 else 1.0
        others_scale = others_est_instances if others_est_instances > 0 else 1.0

        # Weighted probabilities based on class imbalance: column 0 is divided
        # by the class-0 mass, every other column by the combined other mass.
        class_weights = np.full(
            averaged_probabilities.shape[1],
            1 / others_scale,
            dtype=averaged_probabilities.dtype,
        )
        class_weights[0] = 1 / class_0_scale
        new_probabilities = averaged_probabilities * class_weights
        return new_probabilities / np.sum(new_probabilities, axis=1, keepdims=True)


In [None]:
# Training features: the predictor columns plus the Epsilon time column.
train_pred_and_time = pd.concat([train_df[predictor_columns], times], axis=1)
test_predictors = test_df[predictor_columns].to_numpy()
# Test rows have no Epsilon value, so every test row gets a constant time
# column set to one more than the largest training value.
test_pred_and_time = np.hstack((
    test_predictors,
    np.full(
        (len(test_predictors), 1),
        train_pred_and_time.Epsilon.max() + 1,
        dtype=float,
    ),
))

In [None]:
# Train the ensemble on the Alpha labels from the greeks table.
model = WeightedEnsemble()
model.fit(train_pred_and_time.to_numpy(), greeks_df['Alpha'].to_numpy())

In [None]:
# Per-class probabilities for the test rows, one column per Alpha class.
probabilities = model.predict_proba(test_pred_and_time)
# The collapse below relies on column 0 being Alpha class 'A'.
assert model.classes_[0] == 'A'
# Reduce to two columns: class 'A' versus the sum of all remaining classes.
probabilities = np.hstack(
    (probabilities[:, :1], probabilities[:, 1:].sum(axis=1, keepdims=True))
)

In [None]:
# p0 is a view onto the first column of `probabilities`, so the in-place write
# below updates `probabilities` as well.
p0 = probabilities[:, :1]
# Snap confident predictions to the extremes: above 0.86 becomes 1, below 0.14
# becomes 0, everything in between is left untouched.
p0[...] = np.where(p0 > 0.86, 1, np.where(p0 < 0.14, 0, p0))

In [None]:
# Build the submission table: the test Ids plus the two class probabilities,
# with class_1 being the complement of class_0.
submission = test_df[["Id"]].copy()
submission["class_0"] = p0[:, 0]
submission["class_1"] = 1 - p0[:, 0]
submission.to_csv('submission.csv', index=False)

In [None]:
# Read the written submission back from disk as a sanity check; the trailing
# bare expression is what makes the notebook render the frame as cell output.
submission_df = pd.read_csv('submission.csv')
submission_df