In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/playground-series-s6e2/sample_submission.csv
/kaggle/input/playground-series-s6e2/train.csv
/kaggle/input/playground-series-s6e2/test.csv


In [3]:
import pandas as pd
import numpy as np
import warnings
import matplotlib.pyplot as plt
import seaborn as sns

# Sklearn for preprocessing and evaluation
from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.metrics import log_loss
from sklearn.impute import SimpleImputer

# Models
import xgboost as xgb
import lightgbm as lgb
import catboost as cb

# Configuration
warnings.filterwarnings('ignore')
pd.set_option('display.max_columns', None)

In [4]:
# Load the dataset
df = pd.read_csv('/kaggle/input/playground-series-s6e2/train.csv')

# 1. Cleaning: Drop unnecessary ID column
if 'id' in df.columns:
    df = df.drop('id', axis=1)

# 2. Cleaning: Remove Duplicates
initial_len = len(df)
df = df.drop_duplicates()
print(f"Removed {initial_len - len(df)} duplicate rows.")

# 3. Cleaning: Handle Null Values
# Although your dataset seems clean, this is a robust step for production code.
# We separate features and target first to avoid imputing the target.
target_col = 'Heart Disease'
features = [c for c in df.columns if c != target_col]

# Impute missing values (using mean for simplicity, or median/mode)
imputer = SimpleImputer(strategy='mean')
df[features] = imputer.fit_transform(df[features])

print("Data Shape after cleaning:", df.shape)
df.head()

Removed 0 duplicate rows.
Data Shape after cleaning: (630000, 14)


Unnamed: 0,Age,Sex,Chest pain type,BP,Cholesterol,FBS over 120,EKG results,Max HR,Exercise angina,ST depression,Slope of ST,Number of vessels fluro,Thallium,Heart Disease
0,58.0,1.0,4.0,152.0,239.0,0.0,0.0,158.0,1.0,3.6,2.0,2.0,7.0,Presence
1,52.0,1.0,1.0,125.0,325.0,0.0,2.0,171.0,0.0,0.0,1.0,0.0,3.0,Absence
2,56.0,0.0,2.0,160.0,188.0,0.0,2.0,151.0,0.0,0.0,1.0,0.0,3.0,Absence
3,44.0,0.0,3.0,134.0,229.0,0.0,2.0,150.0,0.0,1.0,2.0,0.0,3.0,Absence
4,58.0,1.0,4.0,140.0,234.0,0.0,2.0,125.0,1.0,3.8,2.0,3.0,3.0,Presence


In [5]:
# 1. Identify Feature Types
# Based on the dataset: Age, BP, Cholesterol, Max HR, ST depression are continuous.
# The rest are categorical/ordinal integers.
num_cols = ['Age', 'BP', 'Cholesterol', 'Max HR', 'ST depression']
cat_cols = [c for c in features if c not in num_cols]

print(f"Numerical Features: {num_cols}")
print(f"Categorical Features: {cat_cols}")

# 2. Feature Scaling
# We scale numerical features so they have mean 0 and variance 1.
scaler = StandardScaler()
df[num_cols] = scaler.fit_transform(df[num_cols])

# 3. Target Encoding
# Convert 'Presence'/'Absence' to 1/0
le = LabelEncoder()
df[target_col] = le.fit_transform(df[target_col])

print(f"Target Classes: {le.classes_}")
# X and y for training
X = df[features]
y = df[target_col]

Numerical Features: ['Age', 'BP', 'Cholesterol', 'Max HR', 'ST depression']
Categorical Features: ['Sex', 'Chest pain type', 'FBS over 120', 'EKG results', 'Exercise angina', 'Slope of ST', 'Number of vessels fluro', 'Thallium']
Target Classes: ['Absence' 'Presence']


In [6]:
# Configuration for Cross-Validation
N_SPLITS = 5
skf = StratifiedKFold(n_splits=N_SPLITS, shuffle=True, random_state=42)

# Arrays to store Out-of-Fold (OOF) predictions for the entire dataset
oof_preds_xgb = np.zeros(len(X))
oof_preds_lgb = np.zeros(len(X))
oof_preds_cb = np.zeros(len(X))
oof_preds_ensemble = np.zeros(len(X))

# Lists to store log loss scores per fold
scores_xgb = []
scores_lgb = []
scores_cb = []
scores_ensemble = []

print(f"Starting Training with {N_SPLITS}-Fold Cross Validation...\n")

for fold, (train_idx, val_idx) in enumerate(skf.split(X, y)):
    X_train, y_train = X.iloc[train_idx], y.iloc[train_idx]
    X_val, y_val = X.iloc[val_idx], y.iloc[val_idx]
    
    # --- Model 1: XGBoost ---
    model_xgb = xgb.XGBClassifier(
        n_estimators=1000,
        learning_rate=0.05,
        max_depth=6,
        eval_metric='logloss',
        early_stopping_rounds=50,
        random_state=42,
        use_label_encoder=False
    )
    model_xgb.fit(X_train, y_train, eval_set=[(X_val, y_val)], verbose=False)
    val_pred_xgb = model_xgb.predict_proba(X_val)[:, 1]
    oof_preds_xgb[val_idx] = val_pred_xgb
    loss_xgb = log_loss(y_val, val_pred_xgb)
    scores_xgb.append(loss_xgb)

    # --- Model 2: LightGBM ---
    model_lgb = lgb.LGBMClassifier(
        n_estimators=1000,
        learning_rate=0.05,
        num_leaves=31,
        random_state=42,
        verbosity=-1
    )
    # LightGBM requires early stopping via callbacks in newer versions or explicit param
    callbacks = [lgb.early_stopping(stopping_rounds=50, verbose=False)]
    model_lgb.fit(X_train, y_train, eval_set=[(X_val, y_val)], callbacks=callbacks)
    val_pred_lgb = model_lgb.predict_proba(X_val)[:, 1]
    oof_preds_lgb[val_idx] = val_pred_lgb
    loss_lgb = log_loss(y_val, val_pred_lgb)
    scores_lgb.append(loss_lgb)

    # --- Model 3: CatBoost ---
    model_cb = cb.CatBoostClassifier(
        iterations=1000,
        learning_rate=0.05,
        depth=6,
        eval_metric='Logloss',
        random_seed=42,
        verbose=0,
        allow_writing_files=False
    )
    model_cb.fit(X_train, y_train, eval_set=(X_val, y_val), early_stopping_rounds=50)
    val_pred_cb = model_cb.predict_proba(X_val)[:, 1]
    oof_preds_cb[val_idx] = val_pred_cb
    loss_cb = log_loss(y_val, val_pred_cb)
    scores_cb.append(loss_cb)

    # --- Ensemble: Simple Probability Averaging ---
    val_pred_ensemble = (val_pred_xgb + val_pred_lgb + val_pred_cb) / 3
    oof_preds_ensemble[val_idx] = val_pred_ensemble
    loss_ensemble = log_loss(y_val, val_pred_ensemble)
    scores_ensemble.append(loss_ensemble)

    print(f"Fold {fold+1} | XGB: {loss_xgb:.5f} | LGB: {loss_lgb:.5f} | CB: {loss_cb:.5f} | Ensemble: {loss_ensemble:.5f}")

print("\n" + "="*50)
print(f"Mean Log-Loss XGBoost:  {np.mean(scores_xgb):.5f}")
print(f"Mean Log-Loss LightGBM: {np.mean(scores_lgb):.5f}")
print(f"Mean Log-Loss CatBoost: {np.mean(scores_cb):.5f}")
print(f"Mean Log-Loss Ensemble: {np.mean(scores_ensemble):.5f}")
print("="*50)

Starting Training with 5-Fold Cross Validation...

Fold 1 | XGB: 0.26777 | LGB: 0.26766 | CB: 0.26665 | Ensemble: 0.26700
Fold 2 | XGB: 0.27073 | LGB: 0.27075 | CB: 0.26976 | Ensemble: 0.27007
Fold 3 | XGB: 0.26813 | LGB: 0.26827 | CB: 0.26726 | Ensemble: 0.26752
Fold 4 | XGB: 0.26974 | LGB: 0.26976 | CB: 0.26870 | Ensemble: 0.26900
Fold 5 | XGB: 0.26718 | LGB: 0.26709 | CB: 0.26607 | Ensemble: 0.26642

Mean Log-Loss XGBoost:  0.26871
Mean Log-Loss LightGBM: 0.26871
Mean Log-Loss CatBoost: 0.26769
Mean Log-Loss Ensemble: 0.26800


In [7]:
# --- 1. Load Data ---
test_df = pd.read_csv('/kaggle/input/playground-series-s6e2/test.csv')
sample_submission = pd.read_csv('/kaggle/input/playground-series-s6e2/sample_submission.csv')

# --- 2. Preprocessing Test Data ---
# We must apply the exact same steps as we did for the training data
# Store IDs for submission
test_ids = test_df['id']

# Drop ID column from features
if 'id' in test_df.columns:
    test_df = test_df.drop('id', axis=1)

# Handle Missing Values (using the same imputer as train)
# Note: Ideally, you should fit the imputer on train and transform test.
# Here, for simplicity in this section, we assume the 'imputer' and 'scaler' 
# from the previous section are available. 
# If not, we re-initialize them for demonstration (in a real pipeline, reuse the fitted objects).
imputer = SimpleImputer(strategy='mean')
# Fit on the full training features (X) to ensure we learn from the whole dataset
imputer.fit(X) 
test_df[features] = imputer.transform(test_df[features])

# Scale Numerical Features (using the scaler fitted on train)
scaler = StandardScaler()
scaler.fit(X[num_cols]) # Re-fitting on full X just to be safe and robust
test_df[num_cols] = scaler.transform(test_df[num_cols])

# --- 3. K-Fold Prediction with CatBoost ---
# We use the same K-Fold split to train 5 models and average their predictions
test_preds_cumulative = np.zeros(len(test_df))

print(f"Starting Prediction with {N_SPLITS}-Fold CatBoost Ensemble...")

for fold, (train_idx, val_idx) in enumerate(skf.split(X, y)):
    X_train, y_train = X.iloc[train_idx], y.iloc[train_idx]
    X_val, y_val = X.iloc[val_idx], y.iloc[val_idx]
    
    # Initialize CatBoost
    model_cb = cb.CatBoostClassifier(
        iterations=1000,
        learning_rate=0.05,
        depth=6,
        eval_metric='Logloss',
        random_seed=42,
        verbose=0,
        allow_writing_files=False
    )
    
    # Train on the fold
    model_cb.fit(X_train, y_train, eval_set=(X_val, y_val), early_stopping_rounds=50)
    
    # Predict on the TEST set (probabilities for class 1)
    fold_preds = model_cb.predict_proba(test_df)[:, 1]
    
    # Add to cumulative predictions
    test_preds_cumulative += fold_preds
    
    print(f"Fold {fold+1} prediction completed.")

# Average the predictions across all folds
test_preds_avg = test_preds_cumulative / N_SPLITS

# --- 4. Create Submission File ---
submission = pd.DataFrame({
    'id': test_ids,
    'Heart Disease': test_preds_avg
})

# Save to CSV
submission_filename = 'submission.csv'
submission.to_csv(submission_filename, index=False)

print(f"\nSubmission file '{submission_filename}' created successfully!")
print(submission.head())

Starting Prediction with 5-Fold CatBoost Ensemble...
Fold 1 prediction completed.
Fold 2 prediction completed.
Fold 3 prediction completed.
Fold 4 prediction completed.
Fold 5 prediction completed.

Submission file 'submission.csv' created successfully!
       id  Heart Disease
0  630000       0.535369
1  630001       0.003979
2  630002       0.789427
3  630003       0.006078
4  630004       0.304001
