In [1]:
# Define balanced log loss function
from sklearn.metrics import log_loss
def balanced_log_loss(y_true, y_pred):
    nc = np.bincount(y_true)
    return log_loss(y_true, y_pred, sample_weight=1/nc[y_true], eps=1e-15)

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from tqdm.notebook import tqdm
from sklearn.metrics import log_loss
from sklearn.model_selection import StratifiedKFold
from sklearn.feature_selection import SelectKBest, f_classif
from sklearn.impute import SimpleImputer
from lightgbm import LGBMClassifier

# Define the number of bags and folds
bag_num = 5
n_fold = 10

# Define the feature selection method
k = 30  # Number of top features to select
selector = SelectKBest(f_classif, k=k)

# Define the competition log loss metric
def competition_log_loss(y_true, y_pred):
    N_0 = np.sum(1 - y_true)
    N_1 = np.sum(y_true)
    p_1 = np.clip(y_pred, 1e-15, 1 - 1e-15)
    p_0 = 1 - p_1
    log_loss_0 = -np.sum((1 - y_true) * np.log(p_0)) / N_0
    log_loss_1 = -np.sum(y_true * np.log(p_1)) / N_1
    return (log_loss_0 + log_loss_1) / 2

# Load the data
COMP_PATH = "/kaggle/input/icr-identify-age-related-conditions"
train = pd.read_csv(f"{COMP_PATH}/train.csv")
test = pd.read_csv(f"{COMP_PATH}/test.csv")

# Perform label encoding
train['EJ'] = train['EJ'].map({'A': 0, 'B': 1})
test['EJ'] = test['EJ'].map({'A': 0, 'B': 1})

# Prepare the data
df = train.copy()
test_df = test.copy()
feas_cols = [col for col in df.columns if col not in ['Id', 'Class']]

# Define the imputer
imputer = SimpleImputer(strategy='mean')

# Apply mean imputation on the data
df[feas_cols] = imputer.fit_transform(df[feas_cols])
test_df[feas_cols] = imputer.transform(test_df[feas_cols])

# Define the LGBM parameters
lgbm_params = {
    "boosting_type": 'goss',
    "learning_rate": 0.06733232950390658,
    "n_estimators": 50000,
    "early_stopping_round": 300,
    "random_state": 118,
    "subsample": 0.8,
    "colsample_bytree": 0.6055755840633003,
    "class_weight": 'balanced',
    "metric": 'logloss',
    "is_unbalance": True,
    "max_depth": 12
}

# Initialize lists to store models and log losses
models = []
bag_log_losses = []
feature_importance_df_total = pd.DataFrame()

# Perform bagging and feature selection
for bag in range(bag_num):
    print(f'########################## bag: {bag} ##########################')
    kf = StratifiedKFold(n_splits=n_fold, random_state=118*bag, shuffle=True)
    fold_losses = []
    for fold, (train_idx, test_idx) in enumerate(kf.split(df, df['Class'])):
        train_df = df.iloc[train_idx]
        valid_df = df.iloc[test_idx]
        valid_ids = valid_df.Id.values.tolist()

        X_train, y_train = train_df[feas_cols], train_df['Class']
        X_valid, y_valid = valid_df[feas_cols], valid_df['Class']

        # Perform feature selection
        X_train_selected = selector.fit_transform(X_train, y_train)
        X_valid_selected = selector.transform(X_valid)
        test_df_selected = selector.transform(test_df[feas_cols])

        # Update feature columns
        feas_cols_selected = [feas_cols[i] for i in selector.get_support(indices=True)]
        feas_cols = feas_cols_selected

        lgb = LGBMClassifier(**lgbm_params)
        lgb.fit(X_train_selected, y_train, eval_set=(X_valid_selected, y_valid), verbose=False,
                eval_metric=lambda y_true, y_pred: ('logloss', competition_log_loss(y_true, y_pred), False))

        models.append(lgb)

        # Calculate feature importances
        feature_importances = lgb.feature_importances_
        feature_importance_df = pd.DataFrame({'Feature': feas_cols, 'Importance': feature_importances})
        feature_importance_df['Bag'] = bag
        feature_importance_df['Fold'] = fold
        feature_importance_df_total = pd.concat([feature_importance_df_total, feature_importance_df], axis=0)

        y_pred = lgb.predict_proba(X_valid_selected)
        fold_loss = log_loss(y_valid, y_pred)
        fold_losses.append(fold_loss)
        
        print(f"Total train: {len(train_df)}, Total valid: {len(valid_df)}, Bags: {bag}, Fold: {fold}, Log Loss: {fold_loss:.4f}")
    
    avg_fold_loss = np.mean(fold_losses)
    bag_log_losses.append(avg_fold_loss)
    print(f"Average Log Loss for Bag {bag}: {avg_fold_loss:.4f}")

avg_loss = np.mean(bag_log_losses)
print(f"Average Log Loss after Full Training: {avg_loss:.4f}")

# Calculate weights based on inverse of log loss
weights = [1 / loss for loss in bag_log_losses]
total_weight = sum(weights)
weights = [weight / total_weight for weight in weights]

# Prepare submission dataframe
lgbm_preds = np.zeros(len(test_df))
for bag, weight in zip(range(bag_num), weights):
    for fold in range(n_fold):
        clf = models[bag * n_fold + fold]
        lgbm_preds += weight * clf.predict_proba(test_df_selected)[:, 1] / n_fold

lgbm = test_df[['Id']].copy()
lgbm['Class_0'] = 1 - lgbm_preds
lgbm['Class_1'] = lgbm_preds
# lgbm.to_csv('lgbm0.16.csv', index=False)
lgbm.head()


########################## bag: 0 ##########################




Total train: 555, Total valid: 62, Bags: 0, Fold: 0, Log Loss: 0.3310




Total train: 555, Total valid: 62, Bags: 0, Fold: 1, Log Loss: 0.0966




Total train: 555, Total valid: 62, Bags: 0, Fold: 2, Log Loss: 0.1476




Total train: 555, Total valid: 62, Bags: 0, Fold: 3, Log Loss: 0.2905




Total train: 555, Total valid: 62, Bags: 0, Fold: 4, Log Loss: 0.1963




Total train: 555, Total valid: 62, Bags: 0, Fold: 5, Log Loss: 0.2553




Total train: 555, Total valid: 62, Bags: 0, Fold: 6, Log Loss: 0.2425




Total train: 556, Total valid: 61, Bags: 0, Fold: 7, Log Loss: 0.1347




Total train: 556, Total valid: 61, Bags: 0, Fold: 8, Log Loss: 0.4505




Total train: 556, Total valid: 61, Bags: 0, Fold: 9, Log Loss: 0.2076
Average Log Loss for Bag 0: 0.2353
########################## bag: 1 ##########################




Total train: 555, Total valid: 62, Bags: 1, Fold: 0, Log Loss: 0.4915




Total train: 555, Total valid: 62, Bags: 1, Fold: 1, Log Loss: 0.1780




Total train: 555, Total valid: 62, Bags: 1, Fold: 2, Log Loss: 0.1840




Total train: 555, Total valid: 62, Bags: 1, Fold: 3, Log Loss: 0.1497




Total train: 555, Total valid: 62, Bags: 1, Fold: 4, Log Loss: 0.2314




Total train: 555, Total valid: 62, Bags: 1, Fold: 5, Log Loss: 0.1831




Total train: 555, Total valid: 62, Bags: 1, Fold: 6, Log Loss: 0.2031




Total train: 556, Total valid: 61, Bags: 1, Fold: 7, Log Loss: 0.1907




Total train: 556, Total valid: 61, Bags: 1, Fold: 8, Log Loss: 0.1532




Total train: 556, Total valid: 61, Bags: 1, Fold: 9, Log Loss: 0.1800
Average Log Loss for Bag 1: 0.2145
########################## bag: 2 ##########################




Total train: 555, Total valid: 62, Bags: 2, Fold: 0, Log Loss: 0.2073




Total train: 555, Total valid: 62, Bags: 2, Fold: 1, Log Loss: 0.1449




Total train: 555, Total valid: 62, Bags: 2, Fold: 2, Log Loss: 0.1889




Total train: 555, Total valid: 62, Bags: 2, Fold: 3, Log Loss: 0.1798




Total train: 555, Total valid: 62, Bags: 2, Fold: 4, Log Loss: 0.2819




Total train: 555, Total valid: 62, Bags: 2, Fold: 5, Log Loss: 0.4101




Total train: 555, Total valid: 62, Bags: 2, Fold: 6, Log Loss: 0.4122




Total train: 556, Total valid: 61, Bags: 2, Fold: 7, Log Loss: 0.2325




Total train: 556, Total valid: 61, Bags: 2, Fold: 8, Log Loss: 0.1047




Total train: 556, Total valid: 61, Bags: 2, Fold: 9, Log Loss: 0.3961
Average Log Loss for Bag 2: 0.2558
########################## bag: 3 ##########################




Total train: 555, Total valid: 62, Bags: 3, Fold: 0, Log Loss: 0.1816




Total train: 555, Total valid: 62, Bags: 3, Fold: 1, Log Loss: 0.3556




Total train: 555, Total valid: 62, Bags: 3, Fold: 2, Log Loss: 0.2346




Total train: 555, Total valid: 62, Bags: 3, Fold: 3, Log Loss: 0.2189




Total train: 555, Total valid: 62, Bags: 3, Fold: 4, Log Loss: 0.1535




Total train: 555, Total valid: 62, Bags: 3, Fold: 5, Log Loss: 0.1841




Total train: 555, Total valid: 62, Bags: 3, Fold: 6, Log Loss: 0.1324




Total train: 556, Total valid: 61, Bags: 3, Fold: 7, Log Loss: 0.2666




Total train: 556, Total valid: 61, Bags: 3, Fold: 8, Log Loss: 0.4547




Total train: 556, Total valid: 61, Bags: 3, Fold: 9, Log Loss: 0.1330
Average Log Loss for Bag 3: 0.2315
########################## bag: 4 ##########################




Total train: 555, Total valid: 62, Bags: 4, Fold: 0, Log Loss: 0.1032




Total train: 555, Total valid: 62, Bags: 4, Fold: 1, Log Loss: 0.1277




Total train: 555, Total valid: 62, Bags: 4, Fold: 2, Log Loss: 0.3344




Total train: 555, Total valid: 62, Bags: 4, Fold: 3, Log Loss: 0.0839




Total train: 555, Total valid: 62, Bags: 4, Fold: 4, Log Loss: 0.2082




Total train: 555, Total valid: 62, Bags: 4, Fold: 5, Log Loss: 0.2133




Total train: 555, Total valid: 62, Bags: 4, Fold: 6, Log Loss: 0.4430




Total train: 556, Total valid: 61, Bags: 4, Fold: 7, Log Loss: 0.1259




Total train: 556, Total valid: 61, Bags: 4, Fold: 8, Log Loss: 0.2431




Total train: 556, Total valid: 61, Bags: 4, Fold: 9, Log Loss: 0.3431
Average Log Loss for Bag 4: 0.2226
Average Log Loss after Full Training: 0.2319


Unnamed: 0,Id,Class_0,Class_1
0,00eed32682bb,0.638554,0.361446
1,010ebe33f668,0.638554,0.361446
2,02fa521e1838,0.638554,0.361446
3,040e15f562a2,0.638554,0.361446
4,046e85c7cc7f,0.638554,0.361446


In [2]:
# Prepare submission dataframe
lgbm_preds = np.zeros(len(test_df))
for bag, weight in zip(range(bag_num), weights):
    for fold in range(n_fold):
        clf = models[bag * n_fold + fold]
        lgbm_preds += weight * clf.predict_proba(test_df_selected)[:, 1] / n_fold

lgbm = test_df[['Id']].copy()
lgbm['Class_0'] = 1 - lgbm_preds
lgbm['Class_1'] = lgbm_preds

# Initialize variables for desired positive samples threshold
desired_positive_samples = 50
best_threshold = 0.5
max_positive_samples = 0

# Calculate the initial number of positive samples for the initial threshold of 0.5
p1 = lgbm['Class_1'].values.reshape(-1, 1)
num_positive_samples = np.sum(p1 > best_threshold)

# Keep adjusting the threshold until we have explored the range [0.5, 0.86] or the number of positive samples exceeds the desired_positive_samples
threshold = 0.5
while threshold <= 0.86 and num_positive_samples <= desired_positive_samples:
    # Check if the current threshold gives more positive samples than the previous best and it is close to 50
    if num_positive_samples >= max_positive_samples:
        max_positive_samples = num_positive_samples
        best_threshold = threshold
    
    # Adjust the threshold
    threshold += 0.01

    # Recalculate the number of positive samples
    num_positive_samples = np.sum(p1 > threshold)

# Apply thresholding to obtain final class predictions
lgbm['Class_0'] = np.where(p1 > best_threshold, 0, 1)
lgbm['Class_1'] = 1 - lgbm['Class_0']

# Save the final submission with thresholding
lgbm[['Id', 'Class_0', 'Class_1']].to_csv('submission.csv', index=False)
best_threshold

0.8500000000000003

In [3]:
lgbm

Unnamed: 0,Id,Class_0,Class_1
0,00eed32682bb,1,0
1,010ebe33f668,1,0
2,02fa521e1838,1,0
3,040e15f562a2,1,0
4,046e85c7cc7f,1,0
