# model 1 -> lightgbm v2 of https://www.kaggle.com/code/mobassir/icr-lightgbm-model-baseline?scriptVersionId=134835182

In [1]:
# Define balanced log loss function
from sklearn.metrics import log_loss
def balanced_log_loss(y_true, y_pred):
    nc = np.bincount(y_true)
    return log_loss(y_true, y_pred, sample_weight=1/nc[y_true], eps=1e-15)


In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from tqdm.notebook import tqdm
from sklearn.metrics import log_loss
from sklearn.model_selection import StratifiedKFold
from sklearn.feature_selection import SelectKBest, f_classif
from sklearn.impute import SimpleImputer
from lightgbm import LGBMClassifier

# Define the number of bags and folds
bag_num = 5
n_fold = 10

# Define the feature selection method
k = 30  # Number of top features to select
selector = SelectKBest(f_classif, k=k)


# Define the competition log loss metric
def competition_log_loss(y_true, y_pred):
    N_0 = np.sum(1 - y_true)
    N_1 = np.sum(y_true)
    p_1 = np.clip(y_pred, 1e-15, 1 - 1e-15)
    p_0 = 1 - p_1
    log_loss_0 = -np.sum((1 - y_true) * np.log(p_0)) / N_0
    log_loss_1 = -np.sum(y_true * np.log(p_1)) / N_1
    return (log_loss_0 + log_loss_1) / 2

# Load the data
COMP_PATH = "/kaggle/input/icr-identify-age-related-conditions"
train = pd.read_csv(f"{COMP_PATH}/train.csv")
test = pd.read_csv(f"{COMP_PATH}/test.csv")

# Perform label encoding
train['EJ'] = train['EJ'].map({'A': 0, 'B': 1})
test['EJ'] = test['EJ'].map({'A': 0, 'B': 1})

# Prepare the data
df = train.copy()
test_df = test.copy()
feas_cols = [col for col in df.columns if col not in ['Id', 'Class']]

# Define the imputer
imputer = SimpleImputer(strategy='mean')

# Apply mean imputation on the data
df[feas_cols] = imputer.fit_transform(df[feas_cols])
test_df[feas_cols] = imputer.transform(test_df[feas_cols])

# Define the LGBM parameters
lgbm_params = {
    "boosting_type": 'goss',
    "learning_rate": 0.06733232950390658,
    "n_estimators": 50000,
    "early_stopping_round": 300,
    "random_state": 118,
    "subsample": 0.8,
    "colsample_bytree": 0.6055755840633003,
    "class_weight": 'balanced',
    "metric": 'logloss',
    "is_unbalance": True,
    "max_depth": 12
}

# Initialize lists to store models and log losses
models = []
bag_log_losses = []
feature_importance_df_total = pd.DataFrame()

# Perform bagging and feature selection
for bag in range(bag_num):
    print(f'########################## bag: {bag} ##########################')
    kf = StratifiedKFold(n_splits=n_fold, random_state=118*bag, shuffle=True)
    fold_losses = []
    for fold, (train_idx, test_idx) in enumerate(kf.split(df, df['Class'])):
        train_df = df.iloc[train_idx]
        valid_df = df.iloc[test_idx]
        valid_ids = valid_df.Id.values.tolist()

        X_train, y_train = train_df[feas_cols], train_df['Class']
        X_valid, y_valid = valid_df[feas_cols], valid_df['Class']

        # Perform feature selection
        X_train_selected = selector.fit_transform(X_train, y_train)
        X_valid_selected = selector.transform(X_valid)
        test_df_selected = selector.transform(test_df[feas_cols])

        # Update feature columns
        feas_cols_selected = [feas_cols[i] for i in selector.get_support(indices=True)]
        feas_cols = feas_cols_selected

        lgb = LGBMClassifier(**lgbm_params)
        lgb.fit(X_train_selected, y_train, eval_set=(X_valid_selected, y_valid), verbose=False,
                eval_metric=lambda y_true, y_pred: ('logloss', competition_log_loss(y_true, y_pred), False))

        models.append(lgb)

        # Calculate feature importances
        feature_importances = lgb.feature_importances_
        feature_importance_df = pd.DataFrame({'Feature': feas_cols, 'Importance': feature_importances})
        feature_importance_df['Bag'] = bag
        feature_importance_df['Fold'] = fold
        feature_importance_df_total = pd.concat([feature_importance_df_total, feature_importance_df], axis=0)

        y_pred = lgb.predict_proba(X_valid_selected)
        fold_loss = log_loss(y_valid, y_pred)
        fold_losses.append(fold_loss)
        
        print(f"Total train: {len(train_df)}, Total valid: {len(valid_df)}, Bags: {bag}, Fold: {fold}, Log Loss: {fold_loss:.4f}")
    
    avg_fold_loss = np.mean(fold_losses)
    bag_log_losses.append(avg_fold_loss)
    print(f"Average Log Loss for Bag {bag}: {avg_fold_loss:.4f}")

avg_loss = np.mean(bag_log_losses)
print(f"Average Log Loss after Full Training: {avg_loss:.4f}")

# Calculate weights based on inverse of log loss
weights = [1 / loss for loss in bag_log_losses]
total_weight = sum(weights)
weights = [weight / total_weight for weight in weights]

# Prepare submission dataframe
lgbm_preds = np.zeros(len(test_df))
for bag, weight in zip(range(bag_num), weights):
    for fold in range(n_fold):
        clf = models[bag * n_fold + fold]
        lgbm_preds += weight * clf.predict_proba(test_df_selected)[:, 1] / n_fold

lgbm = test_df[['Id']].copy()
lgbm['Class_0'] = 1 - lgbm_preds
lgbm['Class_1'] = lgbm_preds
lgbm.to_csv('lgbm0.16.csv', index=False)
lgbm.head()


########################## bag: 0 ##########################




Total train: 555, Total valid: 62, Bags: 0, Fold: 0, Log Loss: 0.3310




Total train: 555, Total valid: 62, Bags: 0, Fold: 1, Log Loss: 0.0966




Total train: 555, Total valid: 62, Bags: 0, Fold: 2, Log Loss: 0.1476




Total train: 555, Total valid: 62, Bags: 0, Fold: 3, Log Loss: 0.2905




Total train: 555, Total valid: 62, Bags: 0, Fold: 4, Log Loss: 0.1963




Total train: 555, Total valid: 62, Bags: 0, Fold: 5, Log Loss: 0.2553




Total train: 555, Total valid: 62, Bags: 0, Fold: 6, Log Loss: 0.2425




Total train: 556, Total valid: 61, Bags: 0, Fold: 7, Log Loss: 0.1347




Total train: 556, Total valid: 61, Bags: 0, Fold: 8, Log Loss: 0.4505




Total train: 556, Total valid: 61, Bags: 0, Fold: 9, Log Loss: 0.2076
Average Log Loss for Bag 0: 0.2353
########################## bag: 1 ##########################




Total train: 555, Total valid: 62, Bags: 1, Fold: 0, Log Loss: 0.4915




Total train: 555, Total valid: 62, Bags: 1, Fold: 1, Log Loss: 0.1780




Total train: 555, Total valid: 62, Bags: 1, Fold: 2, Log Loss: 0.1840




Total train: 555, Total valid: 62, Bags: 1, Fold: 3, Log Loss: 0.1497




Total train: 555, Total valid: 62, Bags: 1, Fold: 4, Log Loss: 0.2314




Total train: 555, Total valid: 62, Bags: 1, Fold: 5, Log Loss: 0.1831




Total train: 555, Total valid: 62, Bags: 1, Fold: 6, Log Loss: 0.2031




Total train: 556, Total valid: 61, Bags: 1, Fold: 7, Log Loss: 0.1907




Total train: 556, Total valid: 61, Bags: 1, Fold: 8, Log Loss: 0.1532




Total train: 556, Total valid: 61, Bags: 1, Fold: 9, Log Loss: 0.1800
Average Log Loss for Bag 1: 0.2145
########################## bag: 2 ##########################




Total train: 555, Total valid: 62, Bags: 2, Fold: 0, Log Loss: 0.2073




Total train: 555, Total valid: 62, Bags: 2, Fold: 1, Log Loss: 0.1449




Total train: 555, Total valid: 62, Bags: 2, Fold: 2, Log Loss: 0.1889




Total train: 555, Total valid: 62, Bags: 2, Fold: 3, Log Loss: 0.1798




Total train: 555, Total valid: 62, Bags: 2, Fold: 4, Log Loss: 0.2819




Total train: 555, Total valid: 62, Bags: 2, Fold: 5, Log Loss: 0.4101




Total train: 555, Total valid: 62, Bags: 2, Fold: 6, Log Loss: 0.4122




Total train: 556, Total valid: 61, Bags: 2, Fold: 7, Log Loss: 0.2325




Total train: 556, Total valid: 61, Bags: 2, Fold: 8, Log Loss: 0.1047




Total train: 556, Total valid: 61, Bags: 2, Fold: 9, Log Loss: 0.3961
Average Log Loss for Bag 2: 0.2558
########################## bag: 3 ##########################




Total train: 555, Total valid: 62, Bags: 3, Fold: 0, Log Loss: 0.1816




Total train: 555, Total valid: 62, Bags: 3, Fold: 1, Log Loss: 0.3556




Total train: 555, Total valid: 62, Bags: 3, Fold: 2, Log Loss: 0.2346




Total train: 555, Total valid: 62, Bags: 3, Fold: 3, Log Loss: 0.2189




Total train: 555, Total valid: 62, Bags: 3, Fold: 4, Log Loss: 0.1535




Total train: 555, Total valid: 62, Bags: 3, Fold: 5, Log Loss: 0.1841




Total train: 555, Total valid: 62, Bags: 3, Fold: 6, Log Loss: 0.1324




Total train: 556, Total valid: 61, Bags: 3, Fold: 7, Log Loss: 0.2666




Total train: 556, Total valid: 61, Bags: 3, Fold: 8, Log Loss: 0.4547




Total train: 556, Total valid: 61, Bags: 3, Fold: 9, Log Loss: 0.1330
Average Log Loss for Bag 3: 0.2315
########################## bag: 4 ##########################




Total train: 555, Total valid: 62, Bags: 4, Fold: 0, Log Loss: 0.1032




Total train: 555, Total valid: 62, Bags: 4, Fold: 1, Log Loss: 0.1277




Total train: 555, Total valid: 62, Bags: 4, Fold: 2, Log Loss: 0.3344




Total train: 555, Total valid: 62, Bags: 4, Fold: 3, Log Loss: 0.0839




Total train: 555, Total valid: 62, Bags: 4, Fold: 4, Log Loss: 0.2082




Total train: 555, Total valid: 62, Bags: 4, Fold: 5, Log Loss: 0.2133




Total train: 555, Total valid: 62, Bags: 4, Fold: 6, Log Loss: 0.4430




Total train: 556, Total valid: 61, Bags: 4, Fold: 7, Log Loss: 0.1259




Total train: 556, Total valid: 61, Bags: 4, Fold: 8, Log Loss: 0.2431




Total train: 556, Total valid: 61, Bags: 4, Fold: 9, Log Loss: 0.3431
Average Log Loss for Bag 4: 0.2226
Average Log Loss after Full Training: 0.2319


Unnamed: 0,Id,Class_0,Class_1
0,00eed32682bb,0.638554,0.361446
1,010ebe33f668,0.638554,0.361446
2,02fa521e1838,0.638554,0.361446
3,040e15f562a2,0.638554,0.361446
4,046e85c7cc7f,0.638554,0.361446


# model 2 -> Stacked XGB Models --> v13 https://www.kaggle.com/code/zhukovoleksiy/icr-stacking-xgb-models?scriptVersionId=136494925

In [3]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import random
import os
from copy import deepcopy
from functools import partial
import random
import gc
from sklearn.metrics import RocCurveDisplay
from sklearn.metrics import confusion_matrix
from sklearn.metrics import ConfusionMatrixDisplay

# Import sklearn classes for model selection, cross validation, and performance evaluation
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.model_selection import train_test_split
from sklearn.model_selection import StratifiedKFold, KFold
from sklearn.metrics import roc_auc_score, accuracy_score, log_loss
from sklearn.metrics import precision_score, recall_score
from sklearn.preprocessing import StandardScaler, MinMaxScaler
import seaborn as sns
from category_encoders import OneHotEncoder, OrdinalEncoder, CountEncoder
from imblearn.under_sampling import RandomUnderSampler

# Import libraries for Hypertuning
import optuna

# Import libraries for gradient boosting
import xgboost as xgb
import lightgbm as lgb
import xgboost as xgb
import lightgbm as lgb 
from sklearn.ensemble import RandomForestClassifier, HistGradientBoostingClassifier, GradientBoostingClassifier, AdaBoostClassifier
from imblearn.ensemble import BalancedRandomForestClassifier
from sklearn.impute import KNNImputer
from sklearn.pipeline import Pipeline
from sklearn.svm import NuSVC, SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegressionCV, LogisticRegression
from sklearn.neural_network import MLPClassifier
from sklearn.gaussian_process import GaussianProcessClassifier
from sklearn.gaussian_process.kernels import RBF
from catboost import CatBoost, CatBoostRegressor, CatBoostClassifier
from catboost import Pool

from sklearn.feature_selection import RFE, RFECV
from sklearn.inspection import permutation_importance

# Useful line of code to set the display option so we could see all the columns in pd dataframe
pd.set_option('display.max_columns', None)

# Suppress warnings
import warnings
warnings.filterwarnings("ignore", category=UserWarning)
warnings.filterwarnings("ignore", category=FutureWarning)
filepath = '/kaggle/input/icr-identify-age-related-conditions/'
df_train = pd.read_csv(os.path.join(filepath, 'train.csv'), index_col=[0])
df_test = pd.read_csv(os.path.join(filepath, 'test.csv'), index_col=[0])
greeks = pd.read_csv(f'{filepath}greeks.csv')

target_col = 'Class'

df_train['EJ'].replace({'A':0, 'B':1}, inplace=True)
df_test['EJ'].replace({'A':0, 'B':1}, inplace=True)

# df_train = df_train.drop(['EJ'], axis=1)
# df_test = df_test.drop(['EJ'], axis=1)

df_train = df_train.rename(columns={'BD ': 'BD', 'CD ': 'CD', 'CW ': 'CW', 'FD ': 'FD'})
df_test = df_test.rename(columns={'BD ': 'BD', 'CD ': 'CD', 'CW ': 'CW', 'FD ': 'FD'})

print(f'df_train shape: {df_train.shape}\n')
print(f'df_test shape: {df_test.shape}\n')
# Only include numerical features
df_train_numerical = df_train.drop(['Class'], axis=1)
# Initialize the KNNImputer with the desired number of neighbors
imputer = KNNImputer(n_neighbors=5)

# Perform KNN imputation
df_train_imputed = pd.DataFrame(imputer.fit_transform(df_train[df_train_numerical.columns]), columns=df_train_numerical.columns)
df_test_imputed =pd.DataFrame(imputer.transform(df_test[df_train_numerical.columns]), columns=df_train_numerical.columns)

# Check if there are still missing values in the train and test data sets
df_train_null = df_train_imputed[df_train_imputed.isnull().any(axis=1)]
df_test_null = df_test_imputed[df_test_imputed.isnull().any(axis=1)]

# Display the rows with null values
print('No. of records with missing value in Train data set after Imputation : {}'.format(df_train_null.shape[0]))
print('No. of records with missing value in Test data set after Imputation : {}'.format(df_test_null.shape[0]))

print('=' * 50)

# Replace the imputed columns in the train data sets
df_train_2 = df_train.drop(df_train_numerical.columns, axis=1).reset_index()
df_train_2 = pd.concat([df_train_2, df_train_imputed], axis=1)

# Replace the imputed columns in the test data sets
df_test_2 = df_test.drop(df_train_numerical.columns, axis=1).reset_index()
df_test_2 = pd.concat([df_test_2, df_test_imputed], axis=1)

X_train = df_train_2.drop([f'{target_col}', 'Id'],axis=1).reset_index(drop=True)
y_train = df_train_2[f'{target_col}'].reset_index(drop=True)
X_test = df_test_2.drop(['Id'],axis=1).reset_index(drop=True)

# Check the shape of the train and test data set 
print('Shape of the Train data set : {}'.format(X_train.shape))
print('Shape of the Test data set : {}'.format(X_test.shape))

numeric_columns = [_ for _ in X_train.columns if _ not in ['EJ']]
log_cols = [_ for _ in X_train.columns if _ not in ['EJ', 'BN', 'CW', 'EL', 'GL']]
X_train.loc[:, log_cols] = np.log1p(X_train.loc[:, log_cols])
X_test.loc[:, log_cols] = np.log1p(X_test.loc[:, log_cols])
sc = StandardScaler() # MinMaxScaler or StandardScaler
X_train[numeric_columns] = sc.fit_transform(X_train[numeric_columns])
X_test[numeric_columns] = sc.transform(X_test[numeric_columns])

print(f"X_train shape :{X_train.shape} , y_train shape :{y_train.shape}")
print(f"X_test shape :{X_test.shape}")

# Delete the train and test dataframes to free up memory
del df_train, df_test, df_train_imputed, df_train_2, df_test_2, df_train_null, df_test_null

scale_pos_weight = len(y_train[y_train == 0]) / len(y_train[y_train == 1])

class_weight_0 = 1.0
class_weight_1 = 1.0 / scale_pos_weight

class_weights_cat = [class_weight_0, class_weight_1]

class_weights_lgb = {0: class_weight_0, 1: class_weight_1}
def calc_log_loss_weight(y_true):
    nc = np.bincount(y_true)
    w0, w1 = 1/(nc[0]/y_true.shape[0]), 1/(nc[1]/y_true.shape[0])
    return w0, w1


class Splitter:
    def __init__(self, kfold=True, n_splits=5, greeks=pd.DataFrame()):
        self.n_splits = n_splits
        self.kfold = kfold
        self.greeks = greeks

    def split_data(self, X, y, random_state_list):
        if self.kfold == 'skf':
            for random_state in random_state_list:
                kf = StratifiedKFold(n_splits=self.n_splits, random_state=random_state, shuffle=True)
                for train_index, val_index in kf.split(X, y):
                    if type(X) is np.ndarray:
                        X_train, X_val = X[train_index], X[val_index]
                        y_train, y_val = y[train_index], y[val_index]
                    else:
                        X_train, X_val = X.iloc[train_index], X.iloc[val_index]
                        y_train, y_val = y.iloc[train_index], y.iloc[val_index]
                    yield X_train, X_val, y_train, y_val
        else:
            raise ValueError(f"Invalid kfold: Must be True")
class Classifier:
    def __init__(self, n_estimators=100, device="cpu", random_state=42):
        self.n_estimators = n_estimators
        self.device = device
        self.random_state = random_state
        self.models = self._define_model()
        self.models_name = list(self._define_model().keys())
        self.len_models = len(self.models)
        
    def _define_model(self):
        
        xgb_optuna1 = {
            'n_estimators': 900,
            'learning_rate': 0.09641232707445854,
            'booster': 'gbtree',
            'lambda': 4.666002223704784,
            'alpha': 3.708175990751336,
            'subsample': 0.6100174145229473,
            'colsample_bytree': 0.5506821152321051,
            'max_depth': 7,
            'min_child_weight': 3,
            'eta': 1.740374368661041,
            'gamma': 0.007427363662926455,
            'grow_policy': 'depthwise',
            'objective': 'binary:logistic',
            'eval_metric': 'logloss',
            'verbosity': 0,
            'random_state': self.random_state,
            'scale_pos_weight': scale_pos_weight
        }
        
        xgb_optuna2 = {
            'n_estimators': 650,
            'learning_rate': 0.012208383405206188,
            'booster': 'gbtree',
            'lambda': 0.009968756668882757,
            'alpha': 0.02666266827121168,
            'subsample': 0.7097814108897231,
            'colsample_bytree': 0.7946945784285216,
            'max_depth': 3,
            'min_child_weight': 4,
            'eta': 0.5480204506554545,
            'gamma': 0.8788654128774149,
            'scale_pos_weight': 4.71,
            'objective': 'binary:logistic',
            'eval_metric': 'logloss',
            'verbosity': 0,
            'random_state': self.random_state,
            'scale_pos_weight': scale_pos_weight
        }

        xgb_params = {
            'n_estimators': self.n_estimators,
            'learning_rate': 0.413327571405248,
            'booster': 'gbtree',
            'lambda': 0.0000263894617720096,
            'alpha': 0.000463768723479341,
            'subsample': 0.237467672874133,
            'colsample_bytree': 0.618829300507829,
            'max_depth': 5,
            'min_child_weight': 9,
            'eta': 2.09477807126539E-06,
            'gamma': 0.000847289463422307,
            'grow_policy': 'depthwise',
            'n_jobs': -1,
            'objective': 'binary:logistic',
            'eval_metric': 'logloss',
            'scale_pos_weight': scale_pos_weight,
            'verbosity': 0,
            'random_state': self.random_state,
            
        }
        
        xgb_params2 = {
            'colsample_bytree': 0.5646751146007976,
            'gamma': 7.788727238356553e-06,
            'learning_rate': 0.1419865761603358,
            'max_bin': 824,
            'min_child_weight': 1,
            'random_state': 811996,
            'reg_alpha': 1.6259583347890365e-07,
            'reg_lambda': 2.110691851528507e-08,
            'subsample': 0.879020578464637,
            'objective': 'binary:logistic',
            'eval_metric': 'logloss',
            'max_depth': 3,
            'n_jobs': -1,
            'verbosity': 0,
            'random_state': self.random_state,
            'scale_pos_weight': scale_pos_weight
        }
        
        xgb_params3 = {
            'random_state': self.random_state,
            'colsample_bytree': 0.4836462317215041,
            'eta': 0.05976752607337169,
            'gamma': 1,
            'lambda': 0.2976432557733288,
            'max_depth': 6,
            'min_child_weight': 1,
            'n_estimators': 550,
            'objective': 'binary:logistic',
            'scale_pos_weight': 4.260162886376033,
            'subsample': 0.7119282378433924,
        }
        
        xgb_params4 = {
            'colsample_bytree': 0.8757972257439255,
            'gamma': 0.11135738771999848,
            'max_depth': 7,
            'min_child_weight': 3,
            'reg_alpha': 0.4833998914998038,
            'reg_lambda': 0.006223568555619563,
            'scale_pos_weight': 8,
            'subsample': 0.7056434340275685,
            'random_state': self.random_state
        }
        
        xgb_params5 = {
            'max_depth': 5, 
            'min_child_weight': 2.934487833919741,
            'learning_rate': 0.11341944575807082, 
            'subsample': 0.9045063514419968,
            'gamma': 0.4329153382843715,
            'colsample_bytree': 0.38872702868412506,
            'colsample_bylevel': 0.8321880031718571,
            'colsample_bynode': 0.802355707802605,
            'random_state': self.random_state
       }
        
        if self.device == 'gpu':
            xgb_params['tree_method'] = 'gpu_hist'
            xgb_params['predictor'] = 'gpu_predictor'
       
        models = {
            'xgb01': xgb.XGBClassifier(**xgb_optuna1),
            'xgb02': xgb.XGBClassifier(**xgb_optuna2),
            'xgb1': xgb.XGBClassifier(**xgb_params),
            'xgb2': xgb.XGBClassifier(**xgb_params2),
            'xgb3': xgb.XGBClassifier(**xgb_params3),
            #'xgb4': xgb.XGBClassifier(**xgb_params4),
            'xgb5': xgb.XGBClassifier(**xgb_params5),
            #add some models with default params to "simplify" ensemble
            'svc': SVC(random_state=self.random_state, probability=True),
            'brf': BalancedRandomForestClassifier(random_state=self.random_state),
            #'lr': LogisticRegression(random_state=self.random_state)
        }
        
        return models

class OptunaWeights:
    def __init__(self, random_state, n_trials=1000):
        self.study = None
        self.weights = None
        self.random_state = random_state
        self.n_trials = n_trials

    def _objective(self, trial, y_true, y_preds):
        # Define the weights for the predictions from each model
        weights = [trial.suggest_float(f"weight{n}", 1e-14, 1) for n in range(len(y_preds))]

        # Calculate the weighted prediction
        weighted_pred = np.average(np.array(y_preds).T, axis=1, weights=weights)

        # Calculate the score for the weighted prediction
        # score = log_loss(y_true, weighted_pred)
        score = balanced_log_loss(y_true, weighted_pred)
        return score

    def fit(self, y_true, y_preds):
        optuna.logging.set_verbosity(optuna.logging.ERROR)
        sampler = optuna.samplers.CmaEsSampler(seed=self.random_state)
        pruner = optuna.pruners.HyperbandPruner()
        self.study = optuna.create_study(sampler=sampler, pruner=pruner, study_name="OptunaWeights", direction='minimize')
        objective_partial = partial(self._objective, y_true=y_true, y_preds=y_preds)
        self.study.optimize(objective_partial, n_trials=self.n_trials)
        self.weights = [self.study.best_params[f"weight{n}"] for n in range(len(y_preds))]

    def predict(self, y_preds):
        assert self.weights is not None, 'OptunaWeights error, must be fitted before predict'
        weighted_pred = np.average(np.array(y_preds).T, axis=1, weights=self.weights)
        return weighted_pred

    def fit_predict(self, y_true, y_preds):
        self.fit(y_true, y_preds)
        return self.predict(y_preds)
    
    def weights(self):
        return self.weights



df_train shape: (617, 57)

df_test shape: (5, 56)

No. of records with missing value in Train data set after Imputation : 0
No. of records with missing value in Test data set after Imputation : 0
Shape of the Train data set : (617, 56)
Shape of the Test data set : (5, 56)
X_train shape :(617, 56) , y_train shape :(617,)
X_test shape :(5, 56)


In [4]:
%%time

kfold = 'skf'
n_splits = 5
n_reapts = 5
random_state = 42
n_estimators = 99999
early_stopping_rounds = 99
verbose = False
device = 'cpu'

# Fix seed
random.seed(random_state)
random_state_list = random.sample(range(9999), n_reapts)
#random_state_list = [42]

# Initialize an array for storing test predictions
classifier = Classifier(n_estimators, device, random_state)
test_predss = np.zeros((X_test.shape[0]))
oof_predss = np.zeros((X_train.shape[0], n_reapts))
ensemble_score, ensemble_score_ = [], []
weights = []
oof_each_predss = []
oof_each_preds = np.zeros((X_train.shape[0], classifier.len_models))
test_each_predss = []
test_each_preds = np.zeros((X_test.shape[0], classifier.len_models))
trained_models = {'xgb':[], 'cat':[]}
score_dict = dict(zip(classifier.models_name, [[] for _ in range(classifier.len_models)]))

splitter = Splitter(kfold=kfold, n_splits=n_splits, greeks=greeks.iloc[:,1:-1])
for i, (X_train_, X_val, y_train_, y_val) in enumerate(splitter.split_data(X_train, y_train, random_state_list=random_state_list)):
    n = i % n_splits
    m = i // n_splits
            
    # Get a set of classifier models
    classifier = Classifier(n_estimators, device, random_state_list[m])
    models = classifier.models
    
    # Initialize lists to store oof and test predictions for each base model
    oof_preds = []
    test_preds = []
    
    # Loop over each base model and fit it to the training data, evaluate on validation data, and store predictions
    for name, model in models.items():
        if ('xgb' in name) or ('lgb' in name) or ('cat' in name):
            train_w0, train_w1 = calc_log_loss_weight(y_train_)
            valid_w0, valid_w1 = calc_log_loss_weight(y_val)
            if 'xgb' in name:
                model.fit(
                    X_train_, y_train_, sample_weight=y_train_.map({0: train_w0, 1: train_w1}), 
                    eval_set=[(X_val, y_val)], sample_weight_eval_set=[y_val.map({0: valid_w0, 1: valid_w1})],
                    early_stopping_rounds=early_stopping_rounds, verbose=verbose)
            elif 'lgb' in name:
                model.fit(
                    X_train_, y_train_, sample_weight=y_train_.map({0: train_w0, 1: train_w1}), 
                    eval_set=[(X_val, y_val)], eval_sample_weight=[y_val.map({0: valid_w0, 1: valid_w1})],
                    early_stopping_rounds=early_stopping_rounds, verbose=verbose)
            elif 'cat' in name:
                model.fit(
                    Pool(X_train_, y_train_, weight=y_train_.map({0: train_w0, 1: train_w1})), 
                    eval_set=Pool(X_val, y_val, weight=y_val.map({0: valid_w0, 1: valid_w1})), 
                    early_stopping_rounds=early_stopping_rounds, verbose=verbose)
        else:
            model.fit(X_train_, y_train_)
            
        if name in trained_models.keys():
            trained_models[f'{name}'].append(deepcopy(model))
        
        test_pred = model.predict_proba(X_test)[:, 1].reshape(-1)
        y_val_pred = model.predict_proba(X_val)[:, 1].reshape(-1)
        
        # Calculate recall and precision scores
        y_val_pred_binary = (y_val_pred > 0.5).astype(int)
        recall = recall_score(y_val, y_val_pred_binary)
        precision = precision_score(y_val, y_val_pred_binary)
        print(f'{name} [FOLD-{n} SEED-{random_state_list[m]}] Recall score: {recall:.5f}')
        print(f'{name} [FOLD-{n} SEED-{random_state_list[m]}] Precision score: {precision:.5f}')

        score = balanced_log_loss(y_val, y_val_pred)
        score_dict[name].append(score)
        print(f'{name} [FOLD-{n} SEED-{random_state_list[m]}] BalancedLogLoss score: {score:.5f}')
        print('-'*50)
        
        oof_preds.append(y_val_pred)
        test_preds.append(test_pred)
    
    # Use Optuna to find the best ensemble weights
    optweights = OptunaWeights(random_state=random_state_list[m])
    y_val_pred = optweights.fit_predict(y_val.values, oof_preds)
    
    score = balanced_log_loss(y_val, y_val_pred)
    score_ = roc_auc_score(y_val, y_val_pred)
    print(f'--> Ensemble [FOLD-{n} SEED-{random_state_list[m]}] BalancedLogLoss score {score:.5f}')
    print('='*50)
    ensemble_score.append(score)
    ensemble_score_.append(score_)
    weights.append(optweights.weights)
    
    # Predict to X_test by the best ensemble weights
    test_predss += optweights.predict(test_preds) / (n_splits * len(random_state_list))
    oof_predss[X_val.index, m] += optweights.predict(oof_preds)
    oof_each_preds[X_val.index] = np.stack(oof_preds).T
    test_each_preds += np.array(test_preds).T / n_splits
    if n == (n_splits - 1):
        oof_each_predss.append(oof_each_preds)
        oof_each_preds = np.zeros((X_train.shape[0], classifier.len_models))
        test_each_predss.append(test_each_preds)
        test_each_preds = np.zeros((X_test.shape[0], classifier.len_models))
    
    gc.collect()
    
oof_each_predss = np.mean(np.array(oof_each_predss), axis=0)
test_each_predss = np.mean(np.array(test_each_predss), axis=0)
oof_each_predss = np.concatenate([oof_each_predss, np.mean(oof_predss, axis=1).reshape(-1, 1)], axis=1)
test_each_predss = np.concatenate([test_each_predss, test_predss.reshape(-1, 1)], axis=1)


xgb01 [FOLD-0 SEED-1824] Recall score: 0.86364
xgb01 [FOLD-0 SEED-1824] Precision score: 0.76000
xgb01 [FOLD-0 SEED-1824] BalancedLogLoss score: 0.21750
--------------------------------------------------
xgb02 [FOLD-0 SEED-1824] Recall score: 0.81818
xgb02 [FOLD-0 SEED-1824] Precision score: 0.66667
xgb02 [FOLD-0 SEED-1824] BalancedLogLoss score: 0.25142
--------------------------------------------------
xgb1 [FOLD-0 SEED-1824] Recall score: 0.86364
xgb1 [FOLD-0 SEED-1824] Precision score: 0.67857
xgb1 [FOLD-0 SEED-1824] BalancedLogLoss score: 0.28585
--------------------------------------------------
xgb2 [FOLD-0 SEED-1824] Recall score: 0.90909
xgb2 [FOLD-0 SEED-1824] Precision score: 0.71429
xgb2 [FOLD-0 SEED-1824] BalancedLogLoss score: 0.23401
--------------------------------------------------
xgb3 [FOLD-0 SEED-1824] Recall score: 0.77273
xgb3 [FOLD-0 SEED-1824] Precision score: 0.80952
xgb3 [FOLD-0 SEED-1824] BalancedLogLoss score: 0.25061
----------------------------------------

In [5]:
# Calculate the mean score of the ensemble
mean_score = np.mean(ensemble_score)
std_score = np.std(ensemble_score)
print(f'Mean Optuna Ensemble {mean_score:.5f} ± {std_score:.5f} \n')

print('--- Optuna Weights---')
mean_weights = np.mean(weights, axis=0)
std_weights = np.std(weights, axis=0)
for name, mean_weight, std_weight in zip(models.keys(), mean_weights, std_weights):
    print(f'{name}: {mean_weight:.5f} ± {std_weight:.5f}')
stack_test_predss = np.zeros((X_test.shape[0]))
stack_scores = []
stack_models = []
splitter = Splitter(kfold=kfold, n_splits=n_splits, greeks=greeks.iloc[:,1:-1])
for i, (X_train_, X_val, y_train_, y_val) in enumerate(splitter.split_data(oof_each_predss, y_train, random_state_list=random_state_list)):
    n = i % n_splits
    m = i // n_splits
    
    classifier = Classifier(n_estimators, device, random_state_list[m])
    models = classifier.models
    model = models['xgb02']
    
    train_w0, train_w1 = calc_log_loss_weight(y_train_)
    valid_w0, valid_w1 = calc_log_loss_weight(y_val)
    
    model.fit(
    X_train_, y_train_, sample_weight=y_train_.map({0: train_w0, 1: train_w1}),
    eval_set=[(X_val, y_val)],
   # eval_metric='logloss',
    sample_weight_eval_set=[y_val.map({0: valid_w0, 1: valid_w1})],
    early_stopping_rounds=early_stopping_rounds,
    verbose=verbose
)
    
    test_pred = model.predict_proba(test_each_predss)[:, 1].reshape(-1)
    y_val_pred = model.predict_proba(X_val)[:, 1].reshape(-1)

    score = balanced_log_loss(y_val, y_val_pred)
    stack_scores.append(score)
    stack_models.append(deepcopy(model))
    
    stack_test_predss += test_pred / (n_splits * len(random_state_list))

# Calculate the mean LogLoss score of the ensemble
mean_score = np.mean(ensemble_score)
std_score = np.std(ensemble_score)
print(f'Ensemble BalancedLogLoss score {mean_score:.5f} ± {std_score:.5f}')
# Print the mean and standard deviation of the ensemble weights for each model
print('--- Model Weights ---')
mean_weights = np.mean(weights, axis=0)
std_weights = np.std(weights, axis=0)
for name, mean_weight, std_weight in zip(models.keys(), mean_weights, std_weights):
    print(f'{name}: {mean_weight:.5f} ± {std_weight:.5f}')
print('')

# Calculate the mean LogLoss score of the ensemble
mean_score = np.mean(stack_scores)
std_score = np.std(stack_scores)
print(f'Stacking BalancedLogLoss score {mean_score:.5f} ± {std_score:.5f}\n')

stackedxgbs = pd.read_csv(os.path.join(filepath, 'sample_submission.csv'))

stackedxgbs['class_1'] = stack_test_predss
stackedxgbs['class_0'] = 1 - stack_test_predss
stackedxgbs.to_csv('stackedxgbs0.15.csv', index=False)
stackedxgbs

Mean Optuna Ensemble 0.22436 ± 0.05852 

--- Optuna Weights---
xgb01: 0.42230 ± 0.42454
xgb02: 0.30196 ± 0.38023
xgb1: 0.24492 ± 0.32594
xgb2: 0.33192 ± 0.36260
xgb3: 0.18813 ± 0.32430
xgb5: 0.00408 ± 0.00369
svc: 0.07584 ± 0.10598
brf: 0.02079 ± 0.05218
Ensemble BalancedLogLoss score 0.22436 ± 0.05852
--- Model Weights ---
xgb01: 0.42230 ± 0.42454
xgb02: 0.30196 ± 0.38023
xgb1: 0.24492 ± 0.32594
xgb2: 0.33192 ± 0.36260
xgb3: 0.18813 ± 0.32430
xgb5: 0.00408 ± 0.00369
svc: 0.07584 ± 0.10598
brf: 0.02079 ± 0.05218

Stacking BalancedLogLoss score 0.20853 ± 0.06418



Unnamed: 0,Id,class_0,class_1
0,00eed32682bb,0.657909,0.342091
1,010ebe33f668,0.657909,0.342091
2,02fa521e1838,0.657909,0.342091
3,040e15f562a2,0.657909,0.342091
4,046e85c7cc7f,0.657909,0.342091


# model 3 -> tabpfn + xgboost

In [6]:
%%capture

!pip install tabpfn --no-index --find-links=file:///kaggle/input/pip-packages-icr/pip-packages
!mkdir -p /opt/conda/lib/python3.10/site-packages/tabpfn/models_diff
!cp /kaggle/input/pip-packages-icr/pip-packages/prior_diff_real_checkpoint_n_0_epoch_100.cpkt /opt/conda/lib/python3.10/site-packages/tabpfn/models_diff/

In [7]:
# Improved version of https://www.kaggle.com/code/maximecapelle/ensemble-xgboost-tabpfn-without-normalization?scriptVersionId=137415102
import os
import pandas as pd
import numpy as np

#Import Preprocessing tools
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import KFold, GridSearchCV
from sklearn.impute import SimpleImputer
from sklearn.model_selection import StratifiedKFold

#Import Classifiers
from sklearn.ensemble import RandomForestClassifier 
import xgboost as xgb
from tabpfn import TabPFNClassifier

#Import training libraries
import tqdm
from sklearn.metrics import log_loss, accuracy_score

device_name = 'cuda:0'

pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)
# Data & Submission
DATA_PATH = '/kaggle/input/icr-identify-age-related-conditions'
SUBMISSION_PATH = '/kaggle/working/tabpfn_fixed0.17.csv'

# Preprocessing

DROP_COLUMNS = ['Id', 'Class']

#Models
IMPUTE_STRAT = 'median'
KFOLD_SPLITS = 5
CLASSIFIERS = [
    xgb.XGBClassifier(n_estimators=100,max_depth=3,learning_rate=0.2,subsample=0.9,colsample_bytree=0.85),
    xgb.XGBClassifier(),
    TabPFNClassifier(N_ensemble_configurations=24,device = device_name),
    TabPFNClassifier(N_ensemble_configurations=64,device = device_name)]

# CLASSIFIERS = [TabPFNClassifier(N_ensemble_configurations=64)]
    
#Load in train and test data
df = pd.read_csv(os.path.join(DATA_PATH, 'train.csv'))
test_df = pd.read_csv(os.path.join(DATA_PATH, 'test.csv'))
greeks_df  = pd.read_csv(os.path.join(DATA_PATH, 'greeks.csv'))

FEATURE_COLUMNS = [i for i in df.columns if i not in DROP_COLUMNS]
FEATURE_COLUMNS_GREEKS = [i for i in greeks_df.columns if i not in DROP_COLUMNS]
BinaryCategory = df.EJ.unique()[0]

# .eq(): Goes through the dataframe and sets a value if they are the same or not. (Can choose astype())
df.EJ = df.EJ.eq(BinaryCategory).astype('int')
test_df.EJ = test_df.EJ.eq(BinaryCategory).astype('int')
from datetime import date, datetime
times = greeks_df.Epsilon.copy()
times[greeks_df.Epsilon != 'Unknown'] = greeks_df.Epsilon[greeks_df.Epsilon != 'Unknown'].map(lambda x: datetime.strptime(x,'%m/%d/%Y').toordinal())
times[greeks_df.Epsilon == 'Unknown'] = np.nan

df = pd.concat((df, times), 1)
test_df = np.array(test_df[FEATURE_COLUMNS])
test_df_extra = np.concatenate((test_df, np.zeros((len(test_df),1)) + df.Epsilon.max()+1),1)
# KF = KFold(n_splits = KFOLD_SPLITS, shuffle=True)
KF = StratifiedKFold(n_splits=KFOLD_SPLITS, random_state=118, shuffle=True)


class Ensemble():
    def __init__(self, IMPUTE_STRAT, CLASSIFIERS):
        self.imputer = SimpleImputer(missing_values=np.nan, strategy=IMPUTE_STRAT)
        self.classifiers = CLASSIFIERS

    #Train all classifiers independantly   
    def fit(self, X, y):
        cls, y = np.unique(y, return_inverse=True)
        self.classes_ = cls
        X = self.imputer.fit_transform(X)
        for cl in self.classifiers:
            cl.fit(X,y)
     
    def predict_proba(self, X):
        X = self.imputer.transform(X)
        model_probabilities = np.stack([classifier.predict_proba(X) for classifier in self.classifiers])
        averaged_probabilities = np.mean(model_probabilities, axis=0)
        
        class_0_est_instances = averaged_probabilities[:, 0].sum()
        Rest_est_instances = averaged_probabilities[:, 1:].sum()
        # Weighted probabilities based on class imbalance
        new_probabilities = averaged_probabilities * np.array([[1/(class_0_est_instances if i==0 else Rest_est_instances) for i in range(averaged_probabilities.shape[1])]])
        return new_probabilities / np.sum(new_probabilities, axis=1, keepdims=1)
# Loop through each fold
Losses = []
MODELS = {}

FEATURE_COLUMNS = [i for i in df.columns if i not in DROP_COLUMNS]

def MakeDatasets(df, train_index, valid_index, FEATURE_COLUMNS):
        # Fetch values corresponding to the index 
        train_df = df.iloc[train_index]
        valid_df = df.iloc[valid_index]
        
        # Select only feature columns for training.
        
        X_train = train_df[FEATURE_COLUMNS]
        X_val = valid_df[FEATURE_COLUMNS]
        y_train = train_df['Class']        
        y_val = valid_df['Class']

        return X_train, X_val, y_train, y_val 


def Run_Training(df, Losses, model):
#         for i, (train_index, valid_index) in enumerate(KF.split(X=df)):
        for i, (train_index, valid_index) in enumerate(KF.split(df, df['Class'])):

                print(f'##### Fold_{i+1}')

                #Make Datasets
                X_train, X_val, y_train, y_val = MakeDatasets(df, train_index, valid_index, FEATURE_COLUMNS)
                
                #Train Model        
                model.fit(X_train, y_train)

                #Store the Model in Dictionary
                MODELS[f"fold_{i+1}"] = model
                
                #Predict classes of validation data
                y_pred = model.predict_proba(X_val)

                loss = balanced_log_loss(y_val, y_pred)
                print(f"Loss: {loss}")
                Losses.append(loss)
        
        # Choose best model
        BestModelIndex = np.argmin(Losses)
        BestModel = MODELS[f"fold_{BestModelIndex+1}"]
        print(f'(Best Performance [Fold_{BestModelIndex+1}]) Loss: {Losses[BestModelIndex]}')

        return BestModel
Ens = Ensemble(IMPUTE_STRAT, CLASSIFIERS)
df['Epsilon'] = df.Epsilon.astype(float)
BestModel = Run_Training(df, Losses, Ens)

tabpfn_xgboost = BestModel.predict_proba(test_df_extra)
print(tabpfn_xgboost)

sample_submission = pd.read_csv(os.path.join(DATA_PATH, "sample_submission.csv"))
sample_submission[['class_0', 'class_1']] = tabpfn_xgboost
sample_submission.to_csv(SUBMISSION_PATH, index=False)
sample_submission

Loading model that can be used for inference only
Using a Transformer with 25.82 M parameters
Loading model that can be used for inference only
Using a Transformer with 25.82 M parameters
##### Fold_1
Loss: 0.44952469194926603
##### Fold_2
Loss: 0.11128156149906934
##### Fold_3
Loss: 0.14789071859328523
##### Fold_4
Loss: 0.14473490064391187
##### Fold_5
Loss: 0.15094455133133186
(Best Performance [Fold_2]) Loss: 0.11128156149906934




[[0.5 0.5]
 [0.5 0.5]
 [0.5 0.5]
 [0.5 0.5]
 [0.5 0.5]]


Unnamed: 0,Id,class_0,class_1
0,00eed32682bb,0.5,0.5
1,010ebe33f668,0.5,0.5
2,02fa521e1838,0.5,0.5
3,040e15f562a2,0.5,0.5
4,046e85c7cc7f,0.5,0.5


# model 4 -> tabpfn+xgboost int 

0.14 lb -> https://www.kaggle.com/code/jerryzheng111/icr-xgb-tabpfn-2kf

In [8]:
%%time

import numpy as np
import pandas as pd
from sklearn.preprocessing import LabelEncoder,normalize
from sklearn.ensemble import GradientBoostingClassifier,RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.impute import SimpleImputer
import imblearn
from imblearn.over_sampling import RandomOverSampler
from imblearn.under_sampling import RandomUnderSampler
import xgboost
import inspect
from collections import defaultdict
from tabpfn import TabPFNClassifier
import warnings

warnings.filterwarnings("ignore")
run_para = 'kaggle'
if run_para == 'kaggle':
    train = pd.read_csv('/kaggle/input/icr-identify-age-related-conditions/train.csv')
    test = pd.read_csv('/kaggle/input/icr-identify-age-related-conditions/test.csv')
    sample = pd.read_csv('/kaggle/input/icr-identify-age-related-conditions/sample_submission.csv')
    greeks = pd.read_csv('/kaggle/input/icr-identify-age-related-conditions/greeks.csv')
elif run_para == 'local':
    train = pd.read_csv('./icr-identify-age-related-conditions/train.csv')
    test = pd.read_csv('./icr-identify-age-related-conditions/test.csv')
    sample = pd.read_csv('./icr-identify-age-related-conditions/sample_submission.csv')
    greeks = pd.read_csv('./icr-identify-age-related-conditions/greeks.csv')
    
first_category = train.EJ.unique()[0]
train.EJ = train.EJ.eq(first_category).astype('int')
test.EJ = test.EJ.eq(first_category).astype('int')
int_denominators = {
    'AB': 0.004273,
    'AF': 0.00242,
    'AH': 0.008709,
    'AM': 0.003097,
    'AR': 0.005244,
    'AX': 0.008859,
    'AY': 0.000609,
    'AZ': 0.006302,
    'BC': 0.007028,
    'BD ': 0.00799,
    'BN': 0.3531,
    'BP': 0.004239,
    'BQ': 0.002605,
    'BR': 0.006049,
    'BZ': 0.004267,
    'CB': 0.009191,
    'CC': 6.12e-06,
    'CD ': 0.007928,
    'CF': 0.003041,
    'CH': 0.000398,
    'CL': 0.006365,
    'CR': 7.5e-05,
    'CS': 0.003487,
    'CU': 0.005517,
    'CW ': 9.2e-05,
    'DA': 0.00388,
    'DE': 0.004435,
    'DF': 0.000351,
    'DH': 0.002733,
    'DI': 0.003765,
    'DL': 0.00212,
    'DN': 0.003412,
    'DU': 0.0013794,
    'DV': 0.00259,
    'DY': 0.004492,
    'EB': 0.007068,
    'EE': 0.004031,
    'EG': 0.006025,
    'EH': 0.006084,
    'EL': 0.000429,
    'EP': 0.009269,
    'EU': 0.005064,
    'FC': 0.005712,
    'FD ': 0.005937,
    'FE': 0.007486,
    'FI': 0.005513,
    'FR': 0.00058,
    'FS': 0.006773,
    'GB': 0.009302,
    'GE': 0.004417,
    'GF': 0.004374,
    'GH': 0.003721,
    'GI': 0.002572
}
for k, v in int_denominators.items():
    train[k] = np.round(train[k] / v, 1)
    test[k] = np.round(test[k] / v, 1)
from sklearn.cluster import KMeans

k = 5
if run_para == 'kaggle':
    BNpd = pd.concat([train['BN'], test['BN']], axis=0, ignore_index=True)
elif run_para == 'local':
    BNpd = train['BN']

BNpd = pd.concat([train['BN'], test['BN']], axis=0, ignore_index=True)
data = BNpd.values.reshape(-1, 1)
kmodel = KMeans(n_clusters=k)           # k为聚成几类
kmodel.fit(data)  # 训练模型
c = pd.DataFrame(kmodel.cluster_centers_, columns=['cc']) #求聚类中心
c0 = pd.DataFrame({'cc': [0.0]})
c = pd.concat([c0, c], axis=0, ignore_index=True)
c = c.sort_values(by='cc').reset_index(drop=True)

for i in range(c.shape[0] - 1):
    c.iloc[i]['cc'] = (c.iloc[i]['cc'] + c.iloc[i+1]['cc']) / 2
c = c.drop(c.index[-1])

c0 = pd.DataFrame({'cc': [0.0]})
cn = pd.DataFrame({'cc': [max(train['BN'].max(), test['BN'].max()) * 5]})
c = pd.concat([c0, c, cn], axis=0, ignore_index=True)
c = c['cc'].round().astype(int)
c = c.unique()
range_num = c.shape[0] - 1
c = c.tolist()

train_BN = train['BN'].values
train_binning = pd.cut(train_BN, c, labels=range(range_num), include_lowest=True)
train['BN_binning'] = train_binning

test_BN = test['BN'].values
test_binning = pd.cut(test_BN, c, labels=range(range_num), include_lowest=True)
test['BN_binning'] = test_binning
predictor_columns = [n for n in train.columns if n != 'Class' and n != 'Id']
from datetime import datetime
times = greeks.Epsilon.copy()
times[greeks.Epsilon != 'Unknown'] = greeks.Epsilon[greeks.Epsilon != 'Unknown'].map(lambda x: datetime.strptime(x,'%m/%d/%Y').toordinal())
times[greeks.Epsilon == 'Unknown'] = np.nan
train_pred_and_time = pd.concat((train, times, greeks.Alpha), axis=1)
train_cate = train_pred_and_time.iloc[:, -1]        # A, B, D, G
train_pred_and_time = train_pred_and_time.drop(train_pred_and_time.columns[-1], axis=1)

test_predictors = test[predictor_columns]
test_time = np.zeros((len(test_predictors), 1)) + train_pred_and_time.Epsilon.max() + 1
test_pred_and_time = pd.concat((test_predictors, pd.DataFrame(test_time, columns=['Epsilon'])), axis=1)
y_true = np.array([1,1,1,0,0,0]).astype('int')
y_pred = np.array([1] * len(y_true)).astype('float64')
bll = balanced_log_loss(y_true, y_pred)
print(bll)
class Ensemble():
    def __init__(self):
        self.imputer = SimpleImputer(missing_values=np.nan, strategy='median')

        self.classifiers =[xgboost.XGBClassifier(n_estimators=100,max_depth=3,learning_rate=0.2,subsample=0.9,colsample_bytree=0.85),
                           xgboost.XGBClassifier(),

                           TabPFNClassifier(N_ensemble_configurations=24,device = device_name),
                           TabPFNClassifier(N_ensemble_configurations=64,device = device_name)]
    
    def fit(self, X, y):
        y = y.values
        unique_classes, y = np.unique(y, return_inverse=True)
        self.classes_ = unique_classes
        # first_category = X.EJ.unique()[0]
        # X.EJ = X.EJ.eq(first_category).astype('int')
        
        X = self.imputer.fit_transform(X)
        for classifier in self.classifiers:
            if classifier == self.classifiers[2] or classifier == self.classifiers[3]:
                classifier.fit(X,y,overwrite_warning =True)
            else :
                classifier.fit(X, y)
     
    def predict_proba(self, x):
        x = self.imputer.transform(x)

        probabilities = np.stack([classifier.predict_proba(x) for classifier in self.classifiers])
        averaged_probabilities = np.mean(probabilities, axis=0)
        class_0_est_instances = averaged_probabilities[:, 0].sum()
        others_est_instances = averaged_probabilities[:, 1:].sum()
        
        # Weighted probabilities based on class imbalance
        new_probabilities = averaged_probabilities * np.array([[1/(class_0_est_instances if i==0 else others_est_instances) for i in range(averaged_probabilities.shape[1])]])
        
        return new_probabilities / np.sum(new_probabilities, axis=1, keepdims=1) 
from sklearn.model_selection import KFold as KF, GridSearchCV

cv_outer = KF(n_splits = 5, shuffle=True, random_state=19)
cv_inner = KF(n_splits = 5, shuffle=True, random_state=19)
# 计算预测准确率

def calc_acc(y_pred, y):
    probabilities = np.concatenate((y_pred[:, :1], np.sum(y_pred[:, 1:], 1, keepdims=True)), axis=1)
    p0 = probabilities[:, :1]       # 计算class=0的概率
    p1 = 1 - p0
    
    y = y.values.astype(int)
    cnt = 0

    for i in range(len(p0)):
        if p0[i] >= p1[i]:
            lab = 0
        else :
            lab = 1

        if lab == y[i]:
            cnt += 1

    return cnt / len(p0)
# 计算balanced log loss

def calc_loss(y_pred, y):
    probabilities = np.concatenate((y_pred[:,:1], np.sum(y_pred[:, 1:], 1, keepdims=True)), axis=1)
    p0 = probabilities[:, :1]       # 计算class=0的概率
#     p0[p0 > 0.80] = 1
#     p0[p0 < 0.20] = 0

    p1 = 1 - p0
    
    y = y.values.astype(int)
    loss = balanced_log_loss(y, p1)

    return loss
from tqdm.notebook import tqdm

ros = RandomOverSampler(random_state=42)

def training(model, x, y, y_meta):
    low_loss = np.inf
    best_models = []
    for out_id, (train_idx, val_idx) in enumerate(cv_outer.split(x), start=1):
        print(f'Now for outer fold {out_id}:')
        x_train_ori, x_val = x.iloc[train_idx], x.iloc[val_idx]
        y_train_ori, y_val = y_meta.iloc[train_idx], y.iloc[val_idx]

        x_train, y_train = ros.fit_resample(x_train_ori, y_train_ori)
        # x_train, y_train = x_train_ori, y_train_ori
        
        train_loss = np.zeros((x_train.shape[0], 4))
    
        out_X, out_y_meta = x_train, y_train
        out_y = out_y_meta.apply(lambda x: 0 if x == 'A' else 1)
        
        models = []

        for in_id, (train_idx1, val_idx1) in enumerate(cv_inner.split(out_X), start=1):
            in_x_train, in_x_val = out_X.iloc[train_idx1], out_X.iloc[val_idx1]
            in_y_train, in_y_val = out_y_meta.iloc[train_idx1], out_y.iloc[val_idx1]

            model.fit(in_x_train, in_y_train)
            models.append(model) 

            y_pred = model.predict_proba(in_x_val)
            train_loss[val_idx1] = y_pred

            metric = calc_loss(y_pred, in_y_val)
            print('Inner_fold = %.1f, val_loss = %.5f' % (in_id, metric))
        
        # 分别用models中的模型计算x_val的loss
        val_y_pred = np.zeros((x_val.shape[0], 4))
        for model in models:
            y_pred = model.predict_proba(x_val)
            val_y_pred += y_pred
        val_y_pred /= len(models)

        metric_train = calc_loss(train_loss, out_y)
        acc_train = calc_acc(train_loss, out_y)
        print(f'80% Train Loss: {metric_train}; Train Acc: {acc_train}')
        metric_val = calc_loss(val_y_pred, y_val)
        acc_val = calc_acc(val_y_pred, y_val)
        print(f'20% Val Loss: {metric_val}; Val Acc: {acc_val}\n')

        if metric_val < low_loss:
            low_loss = metric_val
            best_models = models

        # break       # 先只跑一次，节约时间    
        
    return best_models
x_ = train_pred_and_time.drop(['Class', 'Id'], axis=1)
y_ = train_pred_and_time.Class
y_meta_ = train_cate
yt = Ensemble()

models = training(yt, x_, y_, y_meta_)
# import pickle

# for cnt, model in enumerate(models, start=1):
#     filename = 'model' + str(cnt) + '.pkl'
#     with open(filename, 'wb') as f:
#         pickle.dump(model, f)

y_pred = np.zeros((test_pred_and_time.shape[0], 4))
for model in models:
    y_pred += model.predict_proba(test_pred_and_time)
y_pred = y_pred / len(models)

probabilities = np.concatenate((y_pred[:,:1], np.sum(y_pred[:,1:], 1, keepdims=True)), axis=1)
p0 = probabilities[:,:1]
submission = pd.DataFrame(test["Id"], columns=["Id"])
submission["class_0"] = p0
submission["class_1"] = 1 - p0
submission.to_csv('tabpfn14.csv', index=False)
submission


17.269787996170443
Loading model that can be used for inference only
Using a Transformer with 25.82 M parameters
Loading model that can be used for inference only
Using a Transformer with 25.82 M parameters
Now for outer fold 1:
Inner_fold = 1.0, val_loss = 0.02166
Inner_fold = 2.0, val_loss = 0.02109
Inner_fold = 3.0, val_loss = 0.02444
Inner_fold = 4.0, val_loss = 0.01819
Inner_fold = 5.0, val_loss = 0.02331
80% Train Loss: 0.021675416863542032; Train Acc: 0.9981572481572482
20% Val Loss: 0.1755891333759563; Val Acc: 0.9112903225806451

Now for outer fold 2:
Inner_fold = 1.0, val_loss = 0.01819
Inner_fold = 2.0, val_loss = 0.01579
Inner_fold = 3.0, val_loss = 0.03180
Inner_fold = 4.0, val_loss = 0.01586
Inner_fold = 5.0, val_loss = 0.02813
80% Train Loss: 0.021835462233916275; Train Acc: 0.9993932038834952
20% Val Loss: 0.18057189059271497; Val Acc: 0.9274193548387096

Now for outer fold 3:
Inner_fold = 1.0, val_loss = 0.01606
Inner_fold = 2.0, val_loss = 0.02178
Inner_fold = 3.0, va

Unnamed: 0,Id,class_0,class_1
0,00eed32682bb,0.5,0.5
1,010ebe33f668,0.5,0.5
2,02fa521e1838,0.5,0.5
3,040e15f562a2,0.5,0.5
4,046e85c7cc7f,0.5,0.5


# model 5 -> fixed lightgbm public baseline lb 0.15

In [9]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from tqdm.notebook import tqdm
import math

from sklearn.metrics import log_loss, make_scorer
from sklearn.preprocessing import MinMaxScaler, StandardScaler
from sklearn.model_selection import KFold, StratifiedKFold, StratifiedGroupKFold

from lightgbm import LGBMClassifier

import warnings
warnings.filterwarnings("ignore")

class FeatureCreator():
    def __init__(self, add_attributes=True):
        
        self.add_attributes = add_attributes
        
    def fit(self, X, y=None):
        
        return self
    
    def transform(self, X):
        
        if self.add_attributes:
            X_copy = X.copy()
            
            #X_copy = self.fill_na(X)
            X_copy = self.encoder(X_copy)
            X_copy = self.base(X_copy)
            return X_copy
        else:
            return X_copy
    def fill_na(self, X):
        return X.fillna(X.median())
    
    def base(self , X):
        try:
            X['out_GL'] = 0
            X.loc[X['GL']<1,'out_GL'] = X.loc[X['GL']<1,'GL'].map(lambda x : x-X.loc[X['GL']<1,'GL'].mean())
            X.loc[X['GL']>1.5,'out_GL'] = X.loc[X['GL']>1.5,'GL'].map(lambda x : x-X.loc[X['GL']>1.5,'GL'].mean())
            X.out_GL = X.out_GL.astype('float')
            X['DA*CS'] = np.log(X.DA*2 / X.CS**0.5)

            return X
        except:
            print('have missing columns')
            return X
    def encoder(self, X):
        try:
            X['EJ'] = X['EJ'].map({'A': 0, 'B': 1})
            return X
        except:
            return X
import os
import random
def seed_everything(seed=None):
    '''

    固定seed
    :param seed: int, 随机种子
    '''
    max_seed_value = np.iinfo(np.uint32).max
    min_seed_value = np.iinfo(np.uint32).min

    if (seed is None) or not (min_seed_value <= seed <= max_seed_value):
        seed = random.randint(np.iinfo(np.uint32).min, np.iinfo(np.uint32).max)
    os.environ["PYTHONHASHSEED"] = str(seed)
    random.seed(seed)
    np.random.seed(seed)
    return seed
seed_everything(42)

import pandas as pd

train = pd.read_csv("/kaggle/input/icr-identify-age-related-conditions/train.csv")
test = pd.read_csv("/kaggle/input/icr-identify-age-related-conditions/test.csv")
greeks = pd.read_csv("/kaggle/input/icr-identify-age-related-conditions/greeks.csv")

ft = ['Id','AB','BQ','FL','DU','DA','CR','FI','EE','DY','FD ','GL','DL','EL','CC','AF','BC','FR','DI','FC','EP','DN','CH','CD ','AM','DH','EB','EH','EU','CU','DE','GE','CL','FS','CS','GF','FE','EG','CB']

FT = FeatureCreator()

train = FT.transform(train)
test = FT.transform(test)

drop_ft = ['DI','FD ','CS'] 
train = train.drop(columns = drop_ft)#,'CU','DY','CB'
test = test.drop(columns = drop_ft)
def balanced_log_loss(y_true, y_pred):
    y_pred = np.clip(y_pred, 1e-15, 1-1e-15)
    nc = np.bincount(y_true.astype(int))
    logloss = (-1/nc[0]*(np.sum(np.where(y_true==0,1,0) * np.log(1-y_pred))) - 1/nc[1]*(np.sum(np.where(y_true!=0,1,0) * np.log(y_pred)))) / 2
    return logloss
def lgb_metric(y_true, y_pred):
    return 'balanced_log_loss', balanced_log_loss(y_true, y_pred), False

class BalancedLoglossObjective(object):
    def calc_ders_range(self, approxes, targets):
        prob = 1.0 / (1 + np.exp(-np.array(approxes)))
        nc = np.bincount(np.array(targets).astype(int))
        der1 = np.where(np.array(targets) == 0, -1 / nc[0] * (1 - prob), 1 / nc[1] * prob)
        der2 = np.where(np.array(targets) == 0, -1 / nc[0] * prob, 1 / nc[1] * (prob - 1) * prob)
        return list(zip(der1, der2))

class BalancedLoglossMetric(object):
    def get_final_error(self, error, weight):
        return error / (weight + 1e-38)

    def is_max_optimal(self):
        return False

    def evaluate(self, approxes, targets, weight):
        approxes = np.clip(approxes, 1e-15, 1-1e-15)
        nc = np.bincount(approxes.astype(int))
        logloss = (-1/nc[0]*(np.sum(np.where(targets==0,1,0) * np.log(1-approxes))) - 1/nc[1]*(np.sum(np.where(y_true!=0,1,0) * np.log(approxes)))) / 2
        return logloss
import optuna
import pandas as pd
import numpy as np
from lightgbm import LGBMClassifier, LGBMRegressor
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import roc_auc_score,f1_score,precision_score,recall_score,accuracy_score
from xgboost import XGBClassifier, XGBRegressor
from catboost import CatBoostClassifier, CatBoostRegressor
from sklearn.linear_model import LogisticRegression, Ridge
import lightgbm
from scipy.optimize import minimize
from sklearn.preprocessing import StandardScaler
from lightgbm import plot_metric
import random
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
import time
from functools import partial



In [10]:
%%time
def cv(train, test, target,kfold = None):
    n_reapts = 1
    random_state = 42
    n_estimators = 99999
    n_trials = 3500
    early_stopping_rounds = 2000
    verbose = False
    device = 'cpu'
    ensemble_score = []
    fold_scores = []
    weights = []
    oof_each_predss = []
    test_each_predss = []
    ensemble_test = np.zeros(len(test))
    oof_predss = np.zeros(len(train))
    # Fix seed
    random.seed(random_state)
    random_state_list = random.sample(range(9999), n_reapts)
    
    test_preds = np.zeros((len(test)))
    cv = 0
    kf = StratifiedKFold(n_splits=kfold, shuffle=True, random_state=42)
    train_result = train.Class.copy()
    train_targets = train[target]
    train = train.drop(columns = [target])
    lgb_loss = 0
    ctb_loss = 0
    xgb_loss = 0
    final_result = []
    Ensemble_MAE = 0
    score = []

    
    for fold, (train_idx, valid_idx) in enumerate(kf.split(train, train_targets)):
        print(f"------------> Fold {fold + 1} <-----------------")

        X_train, X_valid = train.iloc[train_idx], train.iloc[valid_idx]
        y_train, y_valid = train_targets.iloc[train_idx], train_targets.iloc[valid_idx]
        
        class_weights = [1, len(y_train[y_train == 0]) / len(y_train[y_train == 1])]
        

        lgb = LGBMClassifier(boosting_type='goss', learning_rate=0.06733232950390658, n_estimators = 50000, 
                         early_stopping_round = 300, random_state=42,
                        subsample=0.6970532011679706,
                        colsample_bytree=0.6055755840633003,
                         class_weight='balanced',
                         metric='none', is_unbalance=True, max_depth=8)#,reg_alpha = 0.1
        lgb.fit(X_train, y_train, eval_set=(X_valid, y_valid), verbose=0,eval_metric=lgb_metric)
        # plot_metric(lgb,'mae')
        lgb_preds = lgb.predict_proba(X_valid)
        lgb_test_preds = lgb.predict_proba(test)
        
        print(f"|------------------------------------|")
        print(f"|                                    |")
        print(f"|            LightGBM                |")
        print(f"|                                    |")
        print(f"|------------------------------------|")
        print(" ")
        print(f"------------- Train -----------------")
        print(f"BLL: {balanced_log_loss(y_valid,lgb_preds[:,1])}")
        lgb_loss += balanced_log_loss(y_valid[:],lgb_preds[:,1])
        
        ctb_preds = np.zeros(len(X_valid))

        ctb_test_preds = np.zeros(len(test))
#         xgb_test_preds = np.zeros(len(test))
    
        meta_train = [lgb_preds]#ctb_preds, , xgb_preds
        meta_test = [lgb_test_preds]#ctb_test_preds, , xgb_test_preds
        
        final_result.append(meta_test)
        train_result[valid_idx] = lgb_preds[:,1]
        # weights.append(2/(lgb_loss+xgb_loss))
        weights.append(1/(balanced_log_loss(y_valid,lgb_preds[:,1])))
        score.append(balanced_log_loss(y_valid,lgb_preds[:,1]))
    print(f'feature is {X_valid.columns.values}')
    print(f"Average ctb is : {ctb_loss/kfold} ; lgb is : {lgb_loss/kfold} ; xgb is : {xgb_loss/kfold}")
    print(f"std is {np.std(score)}")
    print(f"total is {np.std(score) + lgb_loss/kfold}")
    return train_result,final_result,ensemble_test,weights
meta_train,meta_test,test_pred,weight = cv(train = train.drop(columns = ['Id']), test = test.drop(columns = ['Id']), target = 'Class',kfold=10)  


------------> Fold 1 <-----------------
|------------------------------------|
|                                    |
|            LightGBM                |
|                                    |
|------------------------------------|
 
------------- Train -----------------
BLL: 0.23655023126337382
------------> Fold 2 <-----------------
|------------------------------------|
|                                    |
|            LightGBM                |
|                                    |
|------------------------------------|
 
------------- Train -----------------
BLL: 0.08871381298461571
------------> Fold 3 <-----------------
|------------------------------------|
|                                    |
|            LightGBM                |
|                                    |
|------------------------------------|
 
------------- Train -----------------
BLL: 0.28279257535637264
------------> Fold 4 <-----------------
|------------------------------------|
|                    

In [11]:
fit_value = pd.DataFrame({'pred':meta_train,'Class':train.Class})
len(fit_value.loc[(fit_value['pred']>0.9) & (fit_value['Class']==1),:])
print(f"pred>0.9 is {len(fit_value.loc[(fit_value['pred']>0.9) & (fit_value['Class']==1),:])} {round(len(fit_value.loc[(fit_value['pred']>0.9) & (fit_value['Class']==1),:]) / len(fit_value.loc[fit_value.Class==1]),3)}")
print(f"pred<0.1 is {len(fit_value.loc[(fit_value['pred']<0.1) & (fit_value['Class']==0),:])} {round(len(fit_value.loc[(fit_value['pred']<0.1) & (fit_value['Class']==0),:]) / len(fit_value.loc[fit_value.Class==0]),3)}")
test_preds = np.zeros((test.shape[0],2))
for i in range(10):
    test_preds[:, 0] += weight[i] * meta_test[i][0][:, 0]
    test_preds[:, 1] += weight[i] * meta_test[i][0][:, 1]
test_preds /= sum(weight)
submission = pd.concat([test.Id,pd.DataFrame(test_preds,columns = ['class_0', 'class_1'])],axis=1)
submission.to_csv(r"public_lgb15.csv", index=False)


pred>0.9 is 73 0.676
pred<0.1 is 416 0.817


based on the findings on this notebook https://www.kaggle.com/code/raddar/icr-competition-analysis-and-findings/notebook

Let me summarize the key findings from each chapter:

Chapter 1: BN (age) is a crucial feature, and patients under the age of 44 (5% of the training data) are always Class 0 (healthy).

Chapter 2: The column BQ is significant. If BQ is None, Class is always 0. This represents 10% of the population and indicates possible data drift.

Chapter 3: The population is segmented by EJ. When EJ == A, some columns are constant, and EH is always 0.5. This means there is no reason to encode EJ, and it can be dropped from models. The mean Class=1 rate fluctuates less for EJ=A compared to EJ=B, suggesting data stratification.

Chapter 4: Class=1 rate drifts over time. Typical cross-validation may not be valid due to the lack of real weights and varying class_weights for binary logloss components. Optimizing class_weights based on LB feedback may be worth considering.

Chapter 5: Very few hard samples exist in the data, representing about 7% of positive samples. These hard samples have a significant impact on fold scores, making the problem challenging.

Chapter 6: Public LB contains only a few hard cases, which suggests that testing models without overrides in the public LB is practically useless. Public LB's small number of hard cases may not be representative of the private test set.

Chapter 7: Using pseudolabeling on the public LB and adopting a 2-stage modeling approach may be helpful for competition strategy.



In [12]:
import pandas as pd
import numpy as np

# Load the test.csv file
COMP_PATH = "/kaggle/input/icr-identify-age-related-conditions"
test = pd.read_csv(f"{COMP_PATH}/test.csv")

int_denominators = {
 'AB': 0.004273,
 'AF': 0.00242,
 'AH': 0.008709,
 'AM': 0.003097,
 'AR': 0.005244,
 'AX': 0.008859,
 'AY': 0.000609,
 'AZ': 0.006302,
 'BC': 0.007028,
 'BD ': 0.00799,
 'BN': 0.3531,
 'BP': 0.004239,
 'BQ': 0.002605,
 'BR': 0.006049,
 'BZ': 0.004267,
 'CB': 0.009191,
 'CC': 6.12e-06,
 'CD ': 0.007928,
 'CF': 0.003041,
 'CH': 0.000398,
 'CL': 0.006365,
 'CR': 7.5e-05,
 'CS': 0.003487,
 'CU': 0.005517,
 'CW ': 9.2e-05,
 'DA': 0.00388,
 'DE': 0.004435,
 'DF': 0.000351,
 'DH': 0.002733,
 'DI': 0.003765,
 'DL': 0.00212,
 'DN': 0.003412,
 'DU': 0.0013794,
 'DV': 0.00259,
 'DY': 0.004492,
 'EB': 0.007068,
 'EE': 0.004031,
 'EG': 0.006025,
 'EH': 0.006084,
 'EL': 0.000429,
 'EP': 0.009269,
 'EU': 0.005064,
 'FC': 0.005712,
 'FD ': 0.005937,
 'FE': 0.007486,
 'FI': 0.005513,
 'FR': 0.00058,
 'FS': 0.006773,
 'GB': 0.009302,
 'GE': 0.004417,
 'GF': 0.004374,
 'GH': 0.003721,
 'GI': 0.002572
}
for k, v in int_denominators.items():
    test[k] = np.round(test[k]/v,1)

In [13]:
import pandas as pd

# Read the CSV files
lgbm16 = pd.read_csv('/kaggle/working/lgbm0.16.csv')
stackedxgbs15 = pd.read_csv('/kaggle/working/stackedxgbs0.15.csv')
tabpfn_fixed17 = pd.read_csv('/kaggle/working/tabpfn_fixed0.17.csv')
tabpfn14 = pd.read_csv('/kaggle/working/tabpfn14.csv')
public_lgb15 = pd.read_csv('/kaggle/working/public_lgb15.csv')

# Combine the 'Class_0' columns from all dataframes
combined_class_0 = pd.concat([lgbm16['Class_0'], public_lgb15['class_0'], tabpfn14['class_0'], stackedxgbs15['class_0'], tabpfn_fixed17['class_0']], axis=1)

# Calculate the weighted ensemble based on inverse of log losses
log_losses = {
    'lgbm16': 0.16,
    'stackedxgbs15': 0.15,
    'tabpfn_fixed17': 0.17,
    'tabpfn14': 0.14,
    'public_lgb15': 0.15
}

# Calculate weights based on the inverse of log loss
weights = [1 / loss for loss in log_losses.values()]
total_weight = sum(weights)
weights = [weight / total_weight for weight in weights]

weighted_class_0 = (combined_class_0 * weights).sum(axis=1) / len(log_losses)

# Apply post-processing to 'Class_0'
# weighted_class_0[(weighted_class_0 < 0.02)] = 0
# weighted_class_0[(weighted_class_0 > 0.95)] = 1

# Calculate 'Class_1' as 1 - 'Class_0'
weighted_class_1 = 1 - weighted_class_0

# Create a new dataframe for submission
submission_df = pd.DataFrame({'Id': lgbm16['Id'], 'Class_0': weighted_class_0, 'Class_1': weighted_class_1})

# Apply post-processing to 'Class_0' and 'Class_1' predictions based on 'test.csv'
for i, row in test.iterrows():
    if row['BN'] <= 44:  # Chapter 1 finding
        submission_df.loc[i, 'Class_0'] = 1
        submission_df.loc[i, 'Class_1'] = 0
    if pd.isnull(row['BQ']):  # Chapter 2 finding
        submission_df.loc[i, 'Class_0'] = 1
        submission_df.loc[i, 'Class_1'] = 0

# Save the updated dataframe as submission.csv
# submission_df.to_csv('submission.csv', index=False)
weighted_class_0

0    0.120359
1    0.120359
2    0.120359
3    0.120359
4    0.120359
dtype: float64

In [14]:
import os

folder_path = "/kaggle/working/"
files = os.listdir(folder_path)

# Delete CSV files
for file in files:
    if file.endswith(".csv"):
        file_path = os.path.join(folder_path, file)
        os.remove(file_path)

print("CSV files deleted successfully.")


CSV files deleted successfully.


In [15]:
# Save the updated dataframe as submission.csv
submission_df.to_csv('submission.csv', index=False)
submission_df

Unnamed: 0,Id,Class_0,Class_1
0,00eed32682bb,1.0,0.0
1,010ebe33f668,1.0,0.0
2,02fa521e1838,1.0,0.0
3,040e15f562a2,1.0,0.0
4,046e85c7cc7f,1.0,0.0
