ref https://www.kaggle.com/code/aikhmelnytskyy/public-krni-pdi-with-two-additional-models

In [1]:
!pip install tabpfn --no-index --find-links=file:///kaggle/input/pip-packages-icr/pip-packages
!mkdir -p /opt/conda/lib/python3.10/site-packages/tabpfn/models_diff
!cp /kaggle/input/pip-packages-icr/pip-packages/prior_diff_real_checkpoint_n_0_epoch_100.cpkt /opt/conda/lib/python3.10/site-packages/tabpfn/models_diff/

Looking in links: file:///kaggle/input/pip-packages-icr/pip-packages
Processing /kaggle/input/pip-packages-icr/pip-packages/tabpfn-0.1.9-py3-none-any.whl
Installing collected packages: tabpfn
Successfully installed tabpfn-0.1.9
[0m

In [2]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import LabelEncoder,normalize
from sklearn.ensemble import GradientBoostingClassifier,RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.impute import SimpleImputer
import imblearn
from imblearn.over_sampling import RandomOverSampler
from imblearn.under_sampling import RandomUnderSampler
import xgboost
import inspect
from collections import defaultdict
from tabpfn import TabPFNClassifier
import warnings
from tqdm.notebook import tqdm
from sklearn import feature_selection
from sklearn.metrics import log_loss

warnings.filterwarnings('ignore')

train = pd.read_csv('/kaggle/input/icr-identify-age-related-conditions/train.csv')
test = pd.read_csv('/kaggle/input/icr-identify-age-related-conditions/test.csv')
sample = pd.read_csv('/kaggle/input/icr-identify-age-related-conditions/sample_submission.csv')
greeks = pd.read_csv('/kaggle/input/icr-identify-age-related-conditions/greeks.csv')

first_category = train.EJ.unique()[0]
train.EJ = train.EJ.eq(first_category).astype('int')
test.EJ = test.EJ.eq(first_category).astype('int')
# test['EJ'] = test['EJ'].map({'A': 0, 'B': 1})
# Define balanced log loss function
def balanced_log_loss(y_true, y_pred):
    nc = np.bincount(y_true)
    return log_loss(y_true, y_pred, sample_weight=1/nc[y_true], eps=1e-15)

    
predictor_columns = [n for n in train.columns if n != 'Class' and n != 'Id']
x = train[predictor_columns]
y = train['Class']
from sklearn.model_selection import KFold as KF, GridSearchCV
cv_outer = KF(n_splits = 10, shuffle=True, random_state=42)
cv_inner = KF(n_splits = 5, shuffle=True, random_state=42)
from sklearn.metrics import log_loss

class Ensemble():
    def __init__(self):
        self.imputer = SimpleImputer(missing_values=np.nan, strategy='median')

        self.classifiers =[xgboost.XGBClassifier(n_estimators=100,max_depth=3,learning_rate=0.2,subsample=0.9,colsample_bytree=0.85),
                          
                           xgboost.XGBClassifier(objective="binary:logistic",learning_rate=0.06733232950390658,n_estimators=50000,subsample=0.8,colsample_bytree=0.6055755840633003,scale_pos_weight=1 / np.mean(train['Class']),max_depth=4),
                           
                           TabPFNClassifier(N_ensemble_configurations=24),
                          
                          TabPFNClassifier(N_ensemble_configurations=64)]
    
    def fit(self,X,y):
        y = y.values
        unique_classes, y = np.unique(y, return_inverse=True)
        self.classes_ = unique_classes
        first_category = X.EJ.unique()[0]
        X.EJ = X.EJ.eq(first_category).astype('int')
        X = self.imputer.fit_transform(X)
#         X = normalize(X,axis=0)
        for classifier in self.classifiers:
            if classifier==self.classifiers[2] or classifier==self.classifiers[3]:
                classifier.fit(X,y,overwrite_warning =True)
            else :
                classifier.fit(X, y)
     
    def predict_proba(self, x):
        x = self.imputer.transform(x)
#         x = normalize(x,axis=0)
        probabilities = np.stack([classifier.predict_proba(x) for classifier in self.classifiers])
        averaged_probabilities = np.mean(probabilities, axis=0)
        class_0_est_instances = averaged_probabilities[:, 0].sum()
        others_est_instances = averaged_probabilities[:, 1:].sum()
        # Weighted probabilities based on class imbalance
        new_probabilities = averaged_probabilities * np.array([[1/(class_0_est_instances if i==0 else others_est_instances) for i in range(averaged_probabilities.shape[1])]])
        return new_probabilities / np.sum(new_probabilities, axis=1, keepdims=1) 
    
from datetime import datetime
def training(model, x,y,y_meta):
    outer_results = list()
    best_loss = np.inf
    split = 0
    splits = 5
    models=[]
    for train_idx,val_idx in tqdm(cv_inner.split(x), total = splits):
        split+=1
        x_train, x_val = x.iloc[train_idx],x.iloc[val_idx]
        y_train, y_val = y_meta.iloc[train_idx], y.iloc[val_idx]
        #model = Ensemble()        
        model.fit(x_train, y_train)
        models.append(model)
        y_pred = model.predict_proba(x_val)
        probabilities = np.concatenate((y_pred[:,:1], np.sum(y_pred[:,1:], 1, keepdims=True)), axis=1)
        p0 = probabilities[:,:1]
        p0[p0 > 0.85] = 1
        p0[p0 < 0.15] = 0
        y_p = np.empty((y_pred.shape[0],))
        for i in range(y_pred.shape[0]):
            if p0[i]>=0.5:
                y_p[i]= False
            else :
                y_p[i]=True
        y_p = y_p.astype(int)
        loss = balanced_log_loss(y_val,y_p)

        if loss<best_loss:
            best_model = model
            best_loss = loss
            print('best_model_saved')
        outer_results.append(loss)
        print('>val_loss=%.5f, split = %.1f' % (loss,split))
    print('LOSS: %.5f' % (np.mean(outer_results)))
    return best_model, models
    
times = greeks.Epsilon.copy()
times[greeks.Epsilon != 'Unknown'] = greeks.Epsilon[greeks.Epsilon != 'Unknown'].map(lambda x: datetime.strptime(x,'%m/%d/%Y').toordinal())
times[greeks.Epsilon == 'Unknown'] = np.nan
# times = times.astype(float)

# mean_time = int(times.mean())
# times.fillna(mean_time, inplace=True)
# print(mean_time)


# times = greeks.Epsilon.copy()
# times[greeks.Epsilon != 'Unknown'] = greeks.Epsilon[greeks.Epsilon != 'Unknown'].map(lambda x: datetime.strptime(x, '%m/%d/%Y').toordinal())

# times[greeks.Epsilon == 'Unknown'] = np.nan
# times = times.astype(float)

# mean_time = int(times.mean())
# times.fillna(mean_time, inplace=True)
# print(mean_time)

train_pred_and_time = pd.concat((train, times), axis=1)
test_predictors = test[predictor_columns]
first_category = test_predictors.EJ.unique()[0]
test_predictors.EJ = test_predictors.EJ.eq(first_category).astype('int')

test_pred_and_time = np.concatenate((test_predictors, np.zeros((len(test_predictors), 1)) + train_pred_and_time.Epsilon.max() + 1), axis=1)

ros = RandomOverSampler(random_state=42)

train_ros, y_ros = ros.fit_resample(train_pred_and_time, greeks.Alpha)
print('Original dataset shape')
print(greeks.Alpha.value_counts())
print('Resample dataset shape')
print( y_ros.value_counts())

x_ros = train_ros.drop(['Class', 'Id'],axis=1)
y_ = train_ros.Class

yt = Ensemble()

Original dataset shape
A    509
B     61
G     29
D     18
Name: Alpha, dtype: int64
Resample dataset shape
B    509
A    509
D    509
G    509
Name: Alpha, dtype: int64
Loading model that can be used for inference only
Using a Transformer with 25.82 M parameters
Loading model that can be used for inference only
Using a Transformer with 25.82 M parameters


In [3]:
%%time

m,models = training(yt,x_ros,y_,y_ros)

  0%|          | 0/5 [00:00<?, ?it/s]

Parameters: { "scale_pos_weight" } are not used.

best_model_saved
>val_loss=0.00000, split = 1.0


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X.EJ = X.EJ.eq(first_category).astype('int')


Parameters: { "scale_pos_weight" } are not used.

>val_loss=0.00000, split = 2.0


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X.EJ = X.EJ.eq(first_category).astype('int')


Parameters: { "scale_pos_weight" } are not used.

best_model_saved
>val_loss=0.00000, split = 3.0


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X.EJ = X.EJ.eq(first_category).astype('int')


Parameters: { "scale_pos_weight" } are not used.

>val_loss=0.19189, split = 4.0


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X.EJ = X.EJ.eq(first_category).astype('int')


Parameters: { "scale_pos_weight" } are not used.

>val_loss=0.20081, split = 5.0
LOSS: 0.07854
CPU times: user 1h 5min 58s, sys: 9min 48s, total: 1h 15min 47s
Wall time: 30min 10s


In [4]:
test_pred_and_time.shape == (5, 57)

True

In [5]:
# y_pred = m.predict_proba(test_pred_and_time)

# probabilities = np.concatenate((y_pred[:,:1], np.sum(y_pred[:,1:], 1, keepdims=True)), axis=1)
# p0 = probabilities[:,:1]
# p0[p0 > 0.6] = 1
# p0[p0 < 0.25] = 0
# p0

In [6]:
y_pred = m.predict_proba(test_pred_and_time)

y_pred



array([[0.49999999, 0.24882973, 0.13599485, 0.11517542],
       [0.49999999, 0.24882973, 0.13599485, 0.11517542],
       [0.49999999, 0.24882973, 0.13599485, 0.11517542],
       [0.49999999, 0.24882973, 0.13599485, 0.11517542],
       [0.49999999, 0.24882973, 0.13599485, 0.11517542]])

In [7]:

# # Calculate the initial positive ratio
# positive_ratio = np.mean(y_pred[:, 0] > 0.5)
# print(positive_ratio)
# # Initialize the desired ratio and maximum desired ratio
# desired_ratio = 0.10
# max_desired_ratio = 0.15

# # Keep adjusting the threshold until the positive ratio meets the desired ratio
# threshold = 0.5
# while positive_ratio < desired_ratio:
#     if(threshold>0.8):
#         threshold = 0.59
#         break
#     # Adjust the threshold
#     threshold += 0.01

#     # Recalculate the positive ratio
#     positive_ratio = np.mean(y_pred[:, 0] > threshold)

#     # Check if the positive ratio exceeds the maximum desired ratio
#     if positive_ratio > max_desired_ratio:
#         # Use the last threshold that doesn't exceed the maximum desired ratio
#         threshold -= 0.01
#         break

# # Apply thresholding to obtain final class predictions
# # p0 = np.where(y_pred[:, 0] > threshold, 1, 0)

# probabilities = np.concatenate((y_pred[:,:1], np.sum(y_pred[:,1:], 1, keepdims=True)), axis=1)
# p0 = probabilities[:,:1]
# p0[p0 > threshold] = 1
# p0[p0 < 1-threshold] = 0

# print("Threshold:", threshold)
# print("Class 1 predictions:", np.mean(p0))
# print("Class 0 predictions:", np.mean(1 - p0))
# p0

In [8]:
probabilities = np.concatenate((y_pred[:,:1], np.sum(y_pred[:,1:], 1, keepdims=True)), axis=1)
p0 = probabilities[:,:1]

# Initialize the desired ratio and maximum desired ratio
desired_ratio = 0.15

# Calculate the initial positive ratio
positive_ratio = np.mean(p0 > 0.5)
print(positive_ratio)

# Keep adjusting the threshold until the positive ratio meets the desired ratio
threshold = 0.5
while positive_ratio < desired_ratio:
    if threshold > 0.86:
        threshold = 0.59
        break
    
    # Adjust the threshold
    threshold += 0.01

    # Recalculate the positive ratio
    positive_ratio = np.mean(p0 > threshold)

# Apply thresholding to obtain final class predictions
p0_thresholded = np.where(p0 > threshold, 1, 0)

print("Threshold:", threshold)
print("Class 1 predictions:", np.mean(p0_thresholded))
print("Class 0 predictions:", np.mean(1 - p0_thresholded))


0.0
Threshold: 0.59
Class 1 predictions: 0.0
Class 0 predictions: 1.0


In [9]:
submission = pd.DataFrame(test["Id"], columns=["Id"])
submission["class_0"] = p0
submission["class_1"] = 1 - p0
submission.to_csv('submission.csv', index=False)

In [10]:
submission_df = pd.read_csv('submission.csv')
submission_df

Unnamed: 0,Id,class_0,class_1
0,00eed32682bb,0.5,0.5
1,010ebe33f668,0.5,0.5
2,02fa521e1838,0.5,0.5
3,040e15f562a2,0.5,0.5
4,046e85c7cc7f,0.5,0.5


In [11]:
p0

array([[0.49999999],
       [0.49999999],
       [0.49999999],
       [0.49999999],
       [0.49999999]])