# Reference : https://www.kaggle.com/code/aikhmelnytskyy/public-krni-pdi-with-two-additional-models
# Description: Added some comments in the referenced code.
# Please vote if it was helpful to you. :)

In [None]:
# Install TabPFN offline: --no-index stops pip from querying a package index,
# so the wheels are taken from the attached dataset directory only.
!pip install tabpfn --no-index --find-links=file:///kaggle/input/pip-packages-icr/pip-packages
# Create the `models_diff` directory inside the installed tabpfn package and
# copy the checkpoint file from the attached dataset into it
# (presumably where TabPFN looks for its weights -- confirm against the package).
!mkdir -p /opt/conda/lib/python3.10/site-packages/tabpfn/models_diff
!cp /kaggle/input/pip-packages-icr/pip-packages/prior_diff_real_checkpoint_n_0_epoch_100.cpkt /opt/conda/lib/python3.10/site-packages/tabpfn/models_diff/

In [None]:
import numpy as np                       # NumPy for numerical computations
import pandas as pd                      # Pandas for data manipulation and analysis
from sklearn.preprocessing import LabelEncoder, normalize   # LabelEncoder for encoding categorical variables, normalize for feature scaling
from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier   # GradientBoostingClassifier and RandomForestClassifier for classification models
from sklearn.metrics import accuracy_score   # accuracy_score for evaluating model performance
from sklearn.impute import SimpleImputer   # SimpleImputer for handling missing values
import imblearn   # imblearn for imbalanced dataset handling
from imblearn.over_sampling import RandomOverSampler   # RandomOverSampler for oversampling minority class
from imblearn.under_sampling import RandomUnderSampler   # RandomUnderSampler for undersampling majority class
import xgboost   # XGBoost for gradient boosting models
import inspect   # inspect for retrieving information about live objects
from collections import defaultdict   # defaultdict for creating a dictionary with default values
from tabpfn import TabPFNClassifier   # TabPFNClassifier for a specific classification model
import warnings   # warnings for ignoring warnings during runtime

In [None]:
# Load the competition tables from the attached Kaggle input directory.
# `train` / `test` hold the feature rows, `sample` is the sample submission
# file, and `greeks` is the metadata table whose `Alpha` and `Epsilon`
# columns are used further down.
train = pd.read_csv('/kaggle/input/icr-identify-age-related-conditions/train.csv')
test = pd.read_csv('/kaggle/input/icr-identify-age-related-conditions/test.csv')
sample = pd.read_csv('/kaggle/input/icr-identify-age-related-conditions/sample_submission.csv')
greeks = pd.read_csv('/kaggle/input/icr-identify-age-related-conditions/greeks.csv')

In [None]:
# lb = LabelEncoder()
# train['EJ'] = lb.fit_transform(train['EJ']).astype(float)
# test['EJ'] = lb.fit_transform(test['EJ']).astype(float)

In [None]:
# Binarise the categorical 'EJ' column.
# The reference category is whichever value occurs first in the training
# data; rows equal to it become 1 and every other row becomes 0.
first_category = train['EJ'].unique()[0]

# Training frame: 1 where EJ matches the reference category, else 0.
train['EJ'] = (train['EJ'] == first_category).astype('int')

# Test frame: encoded against the SAME training-derived reference category
# so both frames share one encoding.
test['EJ'] = (test['EJ'] == first_category).astype('int')

In [None]:
# Show how many training rows fall in each target class (cell output).
train['Class'].value_counts()

In [None]:
def random_under_sampler(df, random_state=None):
    """Balance a 0/1-labelled frame by down-sampling the class-0 rows.

    Every row whose ``Class`` equals 1 is kept; the same number of rows whose
    ``Class`` equals 0 is drawn without replacement. The combined rows are
    returned in shuffled order.

    Args:
        df: DataFrame with a ``Class`` column containing the labels 0 and 1.
        random_state: Optional seed (anything ``DataFrame.sample`` accepts)
            used for both the draw and the final shuffle. ``None`` keeps the
            original non-deterministic behaviour.

    Returns:
        A new DataFrame with equally many class-0 and class-1 rows.

    Raises:
        ValueError: Propagated from ``DataFrame.sample`` when there are fewer
            class-0 rows than class-1 rows (sampling is without replacement).
    """
    # Rows of the positive (kept in full) and negative (sampled) class.
    one_df = df.loc[df['Class'] == 1]
    zero_df = df.loc[df['Class'] == 0]

    # Draw as many negatives as there are positives. Counting the positive
    # rows directly avoids unpacking np.bincount, which fails unless exactly
    # the labels 0 and 1 are both present.
    zero_df = zero_df.sample(n=len(one_df), random_state=random_state)

    # Join both label groups, then shuffle so the classes are interleaved.
    undersampled_df = pd.concat([zero_df, one_df])
    return undersampled_df.sample(frac=1, random_state=random_state)

In [None]:
# Build a class-balanced copy of `train` by random under-sampling.
# NOTE(review): `train_good` is only inspected in the next cell and is not
# referenced again in the visible part of this notebook.
train_good = random_under_sampler(train)

In [None]:
# Display the (rows, columns) shape of the under-sampled frame as the cell output.
train_good.shape

In [None]:
# Feature columns: every column of `train` except the row identifier ('Id')
# and the target ('Class').
predictor_columns = [col for col in train.columns if col not in ('Class', 'Id')]

# Feature matrix restricted to the predictor columns.
x = train[predictor_columns]

# Binary target series.
y = train['Class']

In [None]:
# x_norm = np.array(x_norm)
# y_ros = np.array(y_ros)

In [None]:
# Cross-validation utilities. `KFold` is aliased to `KF`.
# NOTE(review): `GridSearchCV` is imported but not used in the visible code.
from sklearn.model_selection import KFold as KF, GridSearchCV

# 10-fold splitter; rows are shuffled with a fixed seed for reproducibility.
# NOTE(review): `cv_outer` is not referenced in the visible code.
cv_outer = KF(n_splits=10, shuffle=True, random_state=42)

# 5-fold splitter, shuffled with a fixed seed. This is the splitter that
# `training()` iterates over.
cv_inner = KF(n_splits=5, shuffle=True, random_state=42)

In [None]:
def balanced_log_loss(y_true, y_pred):
    """Class-balanced binary log loss.

    Each class's summed log loss is weighted by the inverse of that class's
    row count, the weighted sum is normalised by the sum of the weights, and
    the result is divided by the total number of rows. The factor of 2 makes
    the value equal to the ordinary log loss when both classes are equally
    frequent.

    Args:
        y_true: Array-like of true labels (0 or 1) supporting arithmetic.
        y_pred: Array-like of predicted probabilities for class 1.

    Returns:
        The balanced log loss as a float.
    """
    # Indicator of the negative class and the per-class row counts.
    negatives = 1 - y_true
    n_neg = np.sum(negatives)
    n_pos = np.sum(y_true)

    # Inverse-frequency class weights.
    inv_neg = 1 / n_neg
    inv_pos = 1 / n_pos

    # Clip so that log() never sees exactly 0 or 1.
    prob_pos = np.clip(y_pred, 1e-15, 1 - 1e-15)
    prob_neg = 1 - prob_pos

    # Summed (not averaged) log loss of each class.
    summed_neg = -np.sum(negatives * np.log(prob_neg))
    summed_pos = -np.sum(y_true * np.log(prob_pos))

    # Weighted combination (factor of 2 gives the same result as the plain
    # log loss on balanced input), then averaged over all rows.
    weighted = 2 * (inv_neg * summed_neg + inv_pos * summed_pos) / (inv_neg + inv_pos)
    return weighted / (n_neg + n_pos)

In [None]:
class Ensemble():
    """Soft-voting ensemble of two XGBoost and two TabPFN classifiers.

    Missing values are median-imputed, every member is fitted on the same
    data, and ``predict_proba`` averages the members' probabilities before
    re-weighting them against the batch's estimated class balance.

    Attributes set by ``fit``:
        classes_: Sorted unique labels seen during fitting.
        ej_category_: Reference category used to binarise a non-numeric
            ``EJ`` column, or ``None`` when ``EJ`` was already numeric.
    """

    def __init__(self):
        self.imputer = SimpleImputer(missing_values=np.nan, strategy='median')

        self.classifiers = [
            xgboost.XGBClassifier(n_estimators=100, max_depth=3, learning_rate=0.2,
                                  subsample=0.9, colsample_bytree=0.85),
            xgboost.XGBClassifier(),
            TabPFNClassifier(N_ensemble_configurations=24),
            TabPFNClassifier(N_ensemble_configurations=64),
        ]

    def _encode_ej(self, X):
        """Return ``X`` with a non-numeric ``EJ`` column binarised.

        Uses the category learned in ``fit``. Arrays, frames without an
        ``EJ`` column and frames whose ``EJ`` is already numeric are returned
        unchanged. The caller's frame is never modified.
        """
        category = getattr(self, 'ej_category_', None)
        if category is None:
            return X
        if not isinstance(X, pd.DataFrame) or 'EJ' not in X.columns:
            return X
        if pd.api.types.is_numeric_dtype(X['EJ']):
            return X
        X = X.copy()
        X['EJ'] = X['EJ'].eq(category).astype('int')
        return X

    def fit(self, X, y):
        """Fit the imputer and every member classifier.

        Args:
            X: Feature matrix (DataFrame or array).
            y: Labels (Series or array-like); they are mapped to integer
                codes and the original values are kept in ``classes_``.

        Returns:
            ``self``.
        """
        y = np.asarray(y)
        self.classes_, y = np.unique(y, return_inverse=True)

        # Learn an EJ reference category only when the column still holds raw
        # (non-numeric) categories. Re-deriving it from an already 0/1 column
        # would invert the encoding whenever the first row is 0, and
        # predict_proba would then see a different encoding than fit did.
        self.ej_category_ = None
        if (isinstance(X, pd.DataFrame) and 'EJ' in X.columns
                and not pd.api.types.is_numeric_dtype(X['EJ'])):
            self.ej_category_ = X['EJ'].unique()[0]

        X = self.imputer.fit_transform(self._encode_ej(X))

        for classifier in self.classifiers:
            # TabPFN's fit() accepts `overwrite_warning`; detect it from the
            # signature rather than from the member's position in the list.
            if 'overwrite_warning' in inspect.signature(classifier.fit).parameters:
                classifier.fit(X, y, overwrite_warning=True)
            else:
                classifier.fit(X, y)
        return self

    def predict_proba(self, x):
        """Return re-weighted, row-normalised ensemble probabilities.

        The members' probabilities are averaged. Column 0 is then divided by
        the total probability mass the batch assigns to class 0, and every
        other column by the total mass assigned to all other classes; finally
        each row is normalised to sum to 1. The result therefore depends on
        the whole batch passed in, not only on the individual row.

        Args:
            x: Feature matrix (DataFrame or array) with the columns used in
                ``fit``.

        Returns:
            Array of shape (n_rows, n_classes).
        """
        x = self.imputer.transform(self._encode_ej(x))
        probabilities = np.stack([classifier.predict_proba(x) for classifier in self.classifiers])
        averaged_probabilities = np.mean(probabilities, axis=0)

        # Estimated number of class-0 rows vs. rows of any other class.
        class_0_est_instances = averaged_probabilities[:, 0].sum()
        others_est_instances = averaged_probabilities[:, 1:].sum()

        # Inverse-mass weight per column: class 0 first, then the rest.
        n_other_columns = averaged_probabilities.shape[1] - 1
        weights = np.array([1 / class_0_est_instances] + [1 / others_est_instances] * n_other_columns)

        new_probabilities = averaged_probabilities * weights
        return new_probabilities / np.sum(new_probabilities, axis=1, keepdims=True)

In [None]:
# Import the 'tqdm' module from the 'tqdm.notebook' package
from tqdm.notebook import tqdm

In [None]:
def training(model, x, y, y_meta):
    """Cross-validate ``model`` over ``cv_inner`` and keep each fold's fit.

    For every fold the model is fitted on ``y_meta`` (the training labels)
    and scored against ``y`` (the validation labels) with
    ``balanced_log_loss`` computed on hard 0/1 predictions.

    Args:
        model: Estimator exposing ``fit(X, y)`` and ``predict_proba(X)``
            whose first probability column is class 0. It is re-fitted in
            place on every fold and ends up fitted on the last fold.
        x: Feature DataFrame (indexed positionally with ``iloc``).
        y: Series of binary validation targets.
        y_meta: Series of labels used for fitting.

    Returns:
        ``(best_model, models)``: the snapshot with the lowest validation
        loss and the list of per-fold snapshots. ``best_model`` is ``None``
        only if the splitter yields no folds.
    """
    import copy  # local import: only needed to snapshot the fitted model

    fold_losses = []     # validation loss of each fold
    models = []          # independent fitted snapshot per fold
    best_model = None
    best_loss = np.inf

    # Take the fold count from the splitter instead of hard-coding it, so the
    # progress bar stays correct if `cv_inner` is reconfigured.
    folds = tqdm(cv_inner.split(x), total=cv_inner.get_n_splits())
    for split, (train_idx, val_idx) in enumerate(folds, start=1):
        x_train, x_val = x.iloc[train_idx], x.iloc[val_idx]
        # Fit on the meta labels, validate against the binary target.
        y_train, y_val = y_meta.iloc[train_idx], y.iloc[val_idx]

        model.fit(x_train, y_train)
        # `fit` retrains the same object every fold, so storing `model`
        # itself would leave `models` holding aliases of the last fit and
        # make `best_model` meaningless. Store an independent copy instead.
        # NOTE(review): assumes the estimator supports copy.deepcopy.
        fitted = copy.deepcopy(model)
        models.append(fitted)

        y_pred = model.predict_proba(x_val)

        # Hard label: 0 when P(class 0) >= 0.5, otherwise 1. The former
        # clamping at 0.86 / 0.14 could not change this outcome and was
        # removed.
        y_p = np.where(y_pred[:, 0] >= 0.5, 0, 1).astype(int)
        loss = balanced_log_loss(y_val, y_p)

        if loss < best_loss:
            best_model = fitted
            best_loss = loss
            print('best_model_saved')

        fold_losses.append(loss)
        print('>val_loss=%.5f, split = %.1f' % (loss, split))

    # No fold produced a finite improving loss (e.g. NaN when a validation
    # fold lacks one class): fall back to the last fit rather than failing
    # with an unbound variable.
    if best_model is None and models:
        best_model = models[-1]

    print('LOSS: %.5f' % (np.mean(fold_losses)))
    return best_model, models

In [None]:
# `datetime` is used below to parse the date strings in `greeks.Epsilon`.
from datetime import datetime

# Work on a copy so that the `greeks` frame itself is left untouched.
times = greeks.Epsilon.copy()

# In the copy, replace every entry other than 'Unknown' by the proleptic
# Gregorian ordinal of its date (dates are parsed as month/day/year).
times[greeks.Epsilon != 'Unknown'] = greeks.Epsilon[greeks.Epsilon != 'Unknown'].map(lambda x: datetime.strptime(x, '%m/%d/%Y').toordinal())

# In the copy, replace the 'Unknown' entries by NaN (missing value).
# NOTE(review): `times` presumably keeps an object dtype after these
# assignments -- confirm before relying on numeric operations.
times[greeks.Epsilon == 'Unknown'] = np.nan

In [None]:
# Append the Epsilon ordinals (`times`) to the training frame as an extra column.
train_pred_and_time = pd.concat((train, times), axis=1)

# Take an explicit copy of the test predictors so that the conditional
# assignment below never writes into a slice of `test`.
test_predictors = test[predictor_columns].copy()

# `test.EJ` was already binarised against the training reference category in
# an earlier cell. Re-deriving the category from the test set's own first
# value would invert the 0/1 column whenever the first test row is 0, so the
# test features would no longer match the encoding the model was trained on.
# Only encode if the column still holds raw categories, and then use the
# training-derived `first_category`.
if not pd.api.types.is_numeric_dtype(test_predictors['EJ']):
    test_predictors['EJ'] = test_predictors['EJ'].eq(first_category).astype('int')

# The test rows have no Epsilon value: give every row a constant that is one
# greater than the largest Epsilon ordinal in the training data, and append
# it as the last column of a plain NumPy array.
test_pred_and_time = np.concatenate((test_predictors, np.zeros((len(test_predictors), 1)) + train_pred_and_time.Epsilon.max() + 1), axis=1)

In [None]:
# Random over-sampler with a fixed seed for reproducibility.
ros = RandomOverSampler(random_state=42)

# Over-sample the training rows using `greeks.Alpha` as the label to balance.
# `train_ros` holds the resampled rows, `y_ros` the matching Alpha labels.
train_ros, y_ros = ros.fit_resample(train_pred_and_time, greeks.Alpha)

# Per-label row counts of Alpha before resampling
# (the printed header says "shape", but counts are what is shown).
print('Original dataset shape')
print(greeks.Alpha.value_counts())

# Per-label row counts of Alpha after resampling.
print('Resample dataset shape')
print(y_ros.value_counts())

In [None]:
# Features of the resampled rows: everything except the target ('Class')
# and the row identifier ('Id').
x_ros = train_ros.drop(columns=['Class', 'Id'])

# Binary target of the resampled rows.
y_ = train_ros['Class']

In [None]:
# Instantiate the (still unfitted) ensemble model.
yt = Ensemble()

In [None]:
# Cross-validate the ensemble: it is fitted on the Alpha labels (`y_ros`)
# and validated against the binary target (`y_`).
# `m` receives the best model and `models` the list of per-fold models.
m, models = training(yt, x_ros, y_, y_ros)

In [None]:
# Display the fraction of resampled rows in each target class
# (value counts divided by the number of rows).
y_.value_counts() / y_.shape[0]

In [None]:
# Predict class probabilities for the test rows with the model that
# `training()` returned as best.
y_pred = m.predict_proba(test_pred_and_time)
# Disabled alternative: collect the predictions of every per-fold model.
#y_pred_list = []
#for m in models:
#    y_pred_list.append(m.predict_proba(test_pred_and_time))

In [None]:
# Display the raw predicted probabilities (cell output).
y_pred

In [None]:
# Collapse the prediction to two columns: column 0 is the first column of
# 'y_pred', column 1 is the row-wise sum of all remaining columns.
probabilities = np.concatenate((y_pred[:, :1], np.sum(y_pred[:, 1:], 1, keepdims=True)), axis=1)

# Single-column slice holding the class-0 probability. This is a view, so
# the assignments below also modify 'probabilities'.
p0 = probabilities[:, :1]

# Snap confident class-0 predictions: values greater than 0.59 become 1
p0[p0 > 0.59] = 1

# Snap confident class-1 predictions: values less than 0.28 become 0
p0[p0 < 0.28] = 0

In [None]:
# Display the post-processed class-0 probabilities (cell output).
p0

In [None]:
# Start the submission frame from the 'Id' column of the test data.
submission = pd.DataFrame(test["Id"], columns=["Id"])

# Class-0 probability, taken from 'p0' (a single-column 2-D array).
submission["class_0"] = p0

# Class-1 probability is the complement of the class-0 probability.
submission["class_1"] = 1 - p0

# Write the submission to 'submission.csv' without the index column.
submission.to_csv('submission.csv', index=False)

In [None]:
# Display the submission frame (cell output).
submission

In [None]:
# Display-only formatting. This cell comes after the `to_csv` call in the
# notebook, so the saved file is not affected when the cells run in order.
# Show floats with 8 decimal places when pandas prints them.
pd.set_option('display.precision', 8)

# Convert 'class_1' to strings with exactly 8 decimal places;
# 'class_0' is left numeric.
submission['class_1'] = submission['class_1'].apply(lambda x: '{:.8f}'.format(x))
print(submission)