In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Lazily build the full path of every file below the input directory, then print them one per line
input_file_paths = (
    os.path.join(dirname, filename)
    for dirname, _, filenames in os.walk('/kaggle/input')
    for filename in filenames
)
for input_file_path in input_file_paths:
    print(input_file_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/playground-series-s5e11/sample_submission.csv
/kaggle/input/playground-series-s5e11/train.csv
/kaggle/input/playground-series-s5e11/test.csv


In [2]:
import numpy as np
import pandas as pd
import gc
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import roc_auc_score
from xgboost import XGBClassifier
import xgboost as xgb
from lightgbm import LGBMClassifier
from catboost import CatBoostClassifier
import warnings
warnings.filterwarnings('ignore')

# --- Load Data ---
train = pd.read_csv('/kaggle/input/playground-series-s5e11/train.csv')
test = pd.read_csv('/kaggle/input/playground-series-s5e11/test.csv')
sample = pd.read_csv('/kaggle/input/playground-series-s5e11/sample_submission.csv')

# The extra "original" dataset is optional. Only a missing file is treated as
# "not available"; any other failure (e.g. a malformed CSV) is allowed to
# surface instead of being silently reported as "not found".
try:
    orig = pd.read_csv('/kaggle/input/loan-prediction-dataset-2025/loan_dataset_20000.csv')
except FileNotFoundError:
    print("Original dataset not found, proceeding without it")
    orig = None  # keep the name defined so later cells can rely on `use_orig` alone
    use_orig = False
else:
    print(f"Original dataset loaded: {orig.shape}")
    use_orig = True

print(f"Train: {train.shape} | Test: {test.shape}")

# Column names shared by the rest of the notebook
target = 'loan_paid_back'
id_col = 'id'

Original dataset not found, proceeding without it
Train: (593994, 13) | Test: (254569, 12)


In [3]:
def create_features(df):
    """Append engineered loan-risk features to ``df`` and return it.

    The target column is never read, so the function can be applied to any
    frame that carries the raw inputs. ``df`` is modified in place (new
    columns are added) and the same object is returned.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``grade_subgrade`` (one letter followed by an integer,
        e.g. ``"C3"``), ``annual_income``, ``loan_amount``, ``credit_score``,
        ``debt_to_income_ratio`` and ``interest_rate``.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object with the feature columns added.

    Notes
    -----
    * Fixed-edge bins are open-ended at the top, so values above the last
      finite edge land in the highest bin. Missing values and values at or
      below the first edge (0) are assigned bin 0.
    * ``credit_per_income`` is NaN where ``annual_income`` is exactly 0
      (the ratio is undefined there) rather than +/-inf.
    * Quantile bins (``*_qbin5`` / ``*_qbin10``) are fitted on the frame that
      is passed in, so two frames processed separately get their own edges.
    """

    # Extract grade components: first character is the letter, the rest the sub-grade number
    df['grade_letter'] = df['grade_subgrade'].str[0]
    df['subgrade_number'] = df['grade_subgrade'].str[1:].astype(int)

    # Grade mapping: larger number = better grade letter
    grade_weight = {'A': 7, 'B': 6, 'C': 5, 'D': 4, 'E': 3, 'F': 2, 'G': 1}
    df['grade_numeric'] = df['grade_letter'].map(grade_weight)
    df['grade_score'] = df['grade_numeric'] * 10 + df['subgrade_number']

    # Financial ratios (core risk indicators); the "+ 1" terms guard against division by zero
    df['income_loan_ratio'] = df['annual_income'] / (df['loan_amount'] + 1)
    df['credit_income_ratio'] = df['credit_score'] / (df['annual_income'] / 10000 + 1)
    df['monthly_income'] = df['annual_income'] / 12
    df['debt_payment'] = df['monthly_income'] * df['debt_to_income_ratio']
    df['disposable_income'] = df['monthly_income'] - df['debt_payment']
    # Despite the name this is a ratio: one month of interest relative to monthly income
    df['monthly_payment_est'] = (df['loan_amount'] * df['interest_rate'] / 1200) / (df['monthly_income'] + 1)

    # Risk metrics: hand-weighted blend of debt ratio, credit-score shortfall and interest rate
    df['risk_score'] = (df['debt_to_income_ratio'] * 0.4 +
                        (850 - df['credit_score']) / 850 * 0.3 +
                        df['interest_rate'] / 30 * 0.3)
    df['credit_per_income'] = df['credit_score'] / (df['annual_income'] / 50000)
    # A zero income makes the ratio above +/-inf; store it as missing instead
    df.loc[df['annual_income'] == 0, 'credit_per_income'] = np.nan
    df['debt_burden'] = df['debt_to_income_ratio'] * df['loan_amount']

    # Log transformations (handle skewness)
    df['log_income'] = np.log1p(df['annual_income'])
    df['log_loan'] = np.log1p(df['loan_amount'])
    df['log_credit'] = np.log1p(df['credit_score'])

    # Interaction terms (rescaled to keep magnitudes moderate)
    df['income_credit_interact'] = df['annual_income'] * df['credit_score'] / 1e8
    df['loan_rate_interact'] = df['loan_amount'] * df['interest_rate'] / 1000

    # Binning (creates non-linear features). The last edge is +inf so that
    # out-of-range high values fall into the top bin instead of becoming NaN
    # and being sent to bin 0 by fillna.
    df['credit_bin'] = pd.cut(df['credit_score'], bins=[0, 600, 700, 750, 800, np.inf], labels=False).fillna(0)
    df['income_bin'] = pd.cut(df['annual_income'], bins=[0, 40000, 70000, 100000, 150000, np.inf], labels=False).fillna(0)
    df['debt_bin'] = pd.cut(df['debt_to_income_ratio'], bins=[0, 0.2, 0.35, 0.5, 0.7, np.inf], labels=False).fillna(0)

    # Quantile features for numerical columns
    for col in ['annual_income', 'loan_amount', 'credit_score']:
        for q in [5, 10]:
            try:
                df[f'{col}_qbin{q}'] = pd.qcut(df[col], q=q, labels=False, duplicates='drop').fillna(0)
            except (ValueError, TypeError, IndexError):
                # Degenerate column that cannot be split into quantiles: fall back
                # to a constant feature, but let unrelated errors propagate.
                df[f'{col}_qbin{q}'] = 0

    return df

# Apply feature engineering
train = create_features(train)
test = create_features(test)
if use_orig:
    orig = create_features(orig)

# create_features fits its quantile bins on whichever frame it receives, so the
# same raw value could end up in different *_qbin buckets in train and in test.
# Re-bin every non-train frame with edges fitted on train only; the train
# columns themselves are left exactly as create_features produced them.
for qbin_col in [name for name in train.columns if '_qbin' in name]:
    source_col, n_quantiles = qbin_col.rsplit('_qbin', 1)
    try:
        _, train_edges = pd.qcut(train[source_col], q=int(n_quantiles), labels=False,
                                 retbins=True, duplicates='drop')
    except ValueError:
        train_edges = None  # train could not be split into quantiles

    if train_edges is None or len(train_edges) < 2:
        # Train has no usable edges for this feature: use a constant 0 elsewhere
        test[qbin_col] = 0
        if use_orig:
            orig[qbin_col] = 0
        continue

    # Open both outer edges so values outside the train range fall into the
    # first / last bucket instead of becoming NaN.
    train_edges = np.asarray(train_edges, dtype=float)
    train_edges[0], train_edges[-1] = -np.inf, np.inf

    test[qbin_col] = pd.cut(test[source_col], bins=train_edges, labels=False).fillna(0)
    if use_orig:
        orig[qbin_col] = pd.cut(orig[source_col], bins=train_edges, labels=False).fillna(0)

In [4]:
def target_encode_oof(train_df, test_df, cols, target_col, n_splits=10):
    """Add out-of-fold target-mean encodings for ``cols`` to both frames.

    For every column ``c`` in ``cols`` a new column ``f"{c}_te"`` is written:

    * in ``train_df`` each row receives the mean target of its category
      computed on the *other* folds only, so a row never sees its own label;
    * in ``test_df`` each row receives the category mean computed on all of
      ``train_df``.

    A category that has no mean available (unseen in the fitting rows) is
    encoded with the overall target mean of those fitting rows, in train and
    test alike.

    Both frames are modified in place and also returned.

    Parameters
    ----------
    train_df, test_df : pandas.DataFrame
        Frames containing every column in ``cols``; ``train_df`` must also
        contain ``target_col``.
    cols : iterable of str
        Columns to encode.
    target_col : str
        Name of the target column in ``train_df``.
    n_splits : int, default 10
        Number of stratified folds (shuffled with a fixed seed of 42).

    Returns
    -------
    tuple of (pandas.DataFrame, pandas.DataFrame)
        ``(train_df, test_df)`` with the ``*_te`` columns added.
    """
    kf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=42)
    # The split depends only on the target and the fixed seed, so compute it
    # once and reuse the same folds for every column.
    folds = list(kf.split(train_df, train_df[target_col]))
    overall_prior = train_df[target_col].mean()

    for col in cols:
        # OOF for train
        train_encoded = np.zeros(len(train_df))
        for tr_idx, val_idx in folds:
            fit_rows = train_df.iloc[tr_idx]
            mean_map = fit_rows.groupby(col)[target_col].mean()
            fold_prior = fit_rows[target_col].mean()
            # Categories absent from the fitting folds map to NaN; fall back to the fold prior
            fold_encoded = train_df.iloc[val_idx][col].map(mean_map).fillna(fold_prior)
            train_encoded[val_idx] = fold_encoded.to_numpy()
        train_df[f'{col}_te'] = train_encoded

        # Per-category means over the full training data for test
        category_means = train_df.groupby(col)[target_col].mean()
        test_df[f'{col}_te'] = test_df[col].map(category_means).fillna(overall_prior)

    return train_df, test_df

# Categorical columns for encoding
cat_cols = ['gender', 'marital_status', 'education_level', 'employment_status',
            'loan_purpose', 'grade_subgrade', 'grade_letter']

train, test = target_encode_oof(train, test, cat_cols, target, n_splits=10)

# The optional original dataset is appended to every training fold later on, so
# it needs the same *_te / *_freq columns as train; without them selecting the
# model features from `orig` raises a KeyError. It is encoded the same way as
# test: with statistics taken from the full training data.
if use_orig:
    train_target_mean = train[target].mean()
    for col in cat_cols:
        category_means = train.groupby(col)[target].mean()
        orig[f'{col}_te'] = orig[col].map(category_means).fillna(train_target_mean)

# Frequency encoding: share of training rows that carry each category
for col in cat_cols:
    freq = train[col].value_counts(normalize=True)
    train[f'{col}_freq'] = train[col].map(freq)
    # Categories never seen in train get the average category frequency
    test[f'{col}_freq'] = test[col].map(freq).fillna(freq.mean())
    if use_orig:
        orig[f'{col}_freq'] = orig[col].map(freq).fillna(freq.mean())

In [5]:
# Raw numeric inputs used as-is
numerical_base = [
    'annual_income',
    'debt_to_income_ratio',
    'credit_score',
    'loan_amount',
    'interest_rate',
]

# Columns produced by create_features (quantile bins are collected separately below)
engineered = [
    # ratios
    'income_loan_ratio', 'credit_income_ratio', 'monthly_income',
    'debt_payment', 'disposable_income', 'monthly_payment_est',
    # risk metrics
    'risk_score', 'credit_per_income', 'debt_burden',
    # log transforms
    'log_income', 'log_loan', 'log_credit',
    # interactions
    'income_credit_interact', 'loan_rate_interact',
    # grade decomposition
    'grade_numeric', 'subgrade_number', 'grade_score',
    # fixed-edge bins
    'credit_bin', 'income_bin', 'debt_bin',
]

# One target-encoded and one frequency-encoded column per categorical
target_encoded = [f'{name}_te' for name in cat_cols]
freq_encoded = [f'{name}_freq' for name in cat_cols]

# Quantile-bin columns are discovered from the training frame by name
qbin_features = [name for name in train.columns if '_qbin' in name]

# Final model input, in a fixed order: raw, engineered, target-enc, freq-enc, quantile bins
all_features = [
    *numerical_base,
    *engineered,
    *target_encoded,
    *freq_encoded,
    *qbin_features,
]

# Model matrices, target vector and the ids needed for the submission file
X = train.loc[:, all_features].copy()
X_test = test.loc[:, all_features].copy()
y = train[target].values
test_ids = test[id_col].copy()


In [6]:
# Cross-validation setup
N_FOLDS = 7
SEED = 42

cv = StratifiedKFold(n_splits=N_FOLDS, shuffle=True, random_state=SEED)

# Accumulators: blended out-of-fold predictions, one fold-averaged test
# prediction vector per model, and the per-fold ensemble AUCs
oof_preds = np.zeros(len(X))
test_preds_xgb, test_preds_lgb, test_preds_cat = (
    np.zeros(len(X_test)) for _model in range(3)
)
fold_scores = []

# XGBoost parameters (optimized)
xgb_params = dict(
    objective='binary:logistic',
    eval_metric='auc',
    learning_rate=0.02,
    max_depth=6,
    subsample=0.8,
    colsample_bytree=0.7,
    colsample_bylevel=0.7,
    min_child_weight=5,
    reg_lambda=3.0,
    reg_alpha=1.5,
    gamma=0.5,
    tree_method='hist',
    random_state=SEED,
    n_estimators=800,
)

# LightGBM parameters
lgb_params = dict(
    objective='binary',
    metric='auc',
    learning_rate=0.02,
    num_leaves=64,
    min_data_in_leaf=100,
    feature_fraction=0.8,
    bagging_fraction=0.8,
    bagging_freq=5,
    reg_lambda=2.0,
    reg_alpha=1.0,
    random_state=SEED,
    n_estimators=800,
    verbose=-1,
)

# CatBoost parameters
cat_params = dict(
    iterations=800,
    learning_rate=0.02,
    depth=6,
    l2_leaf_reg=5,
    loss_function='Logloss',
    eval_metric='AUC',
    random_seed=SEED,
    verbose=False,
)

In [7]:
# Start from clean accumulators: the loop below adds to the test predictions
# with `+=` and appends to fold_scores, so re-running this cell on its own
# would otherwise stack a second run on top of the first.
oof_preds = np.zeros(len(X))
test_preds_xgb = np.zeros(len(X_test))
test_preds_lgb = np.zeros(len(X_test))
test_preds_cat = np.zeros(len(X_test))
fold_scores = []

# The optional original-data rows are the same for every fold, so slice them once
if use_orig:
    X_orig = orig[all_features]
    y_orig = orig[target].values

for fold, (train_idx, val_idx) in enumerate(cv.split(X, y), 1):
    print(f"\n*** Fold {fold}/{N_FOLDS} ***")

    X_train, X_val = X.iloc[train_idx], X.iloc[val_idx]
    y_train, y_val = y[train_idx], y[val_idx]

    # Add original data if available (training part only; validation stays pure competition data)
    if use_orig:
        X_train = pd.concat([X_train, X_orig], axis=0, ignore_index=True)
        y_train = np.concatenate([y_train, y_orig])

    # XGBoost
    xgb_model = XGBClassifier(**xgb_params)
    xgb_model.fit(X_train, y_train, eval_set=[(X_val, y_val)], verbose=False)
    xgb_pred = xgb_model.predict_proba(X_val)[:, 1]
    # Each fold contributes 1/N_FOLDS, so the accumulator ends up as the fold average
    test_preds_xgb += xgb_model.predict_proba(X_test)[:, 1] / N_FOLDS

    # LightGBM
    lgb_model = LGBMClassifier(**lgb_params)
    lgb_model.fit(X_train, y_train, eval_set=[(X_val, y_val)])
    lgb_pred = lgb_model.predict_proba(X_val)[:, 1]
    test_preds_lgb += lgb_model.predict_proba(X_test)[:, 1] / N_FOLDS

    # CatBoost
    cat_model = CatBoostClassifier(**cat_params)
    cat_model.fit(X_train, y_train, eval_set=(X_val, y_val))
    cat_pred = cat_model.predict_proba(X_val)[:, 1]
    test_preds_cat += cat_model.predict_proba(X_test)[:, 1] / N_FOLDS

    # Ensemble OOF predictions (weighted average; same weights as the final test blend)
    fold_pred = 0.4 * xgb_pred + 0.3 * lgb_pred + 0.3 * cat_pred
    oof_preds[val_idx] = fold_pred

    fold_auc = roc_auc_score(y_val, fold_pred)
    fold_scores.append(fold_auc)
    print(f"Fold {fold} AUC: {fold_auc:.5f} (XGB: {roc_auc_score(y_val, xgb_pred):.5f}, LGB: {roc_auc_score(y_val, lgb_pred):.5f}, CAT: {roc_auc_score(y_val, cat_pred):.5f})")

    # Release the per-fold frames before the next iteration
    del X_train, X_val, y_train, y_val
    gc.collect()


*** Fold 1/7 ***
Fold 1 AUC: 0.92148 (XGB: 0.92091, LGB: 0.92292, CAT: 0.91920)

*** Fold 2/7 ***
Fold 2 AUC: 0.92034 (XGB: 0.91952, LGB: 0.92198, CAT: 0.91804)

*** Fold 3/7 ***
Fold 3 AUC: 0.91996 (XGB: 0.91923, LGB: 0.92134, CAT: 0.91782)

*** Fold 4/7 ***
Fold 4 AUC: 0.91896 (XGB: 0.91827, LGB: 0.92038, CAT: 0.91682)

*** Fold 5/7 ***
Fold 5 AUC: 0.91841 (XGB: 0.91768, LGB: 0.91992, CAT: 0.91625)

*** Fold 6/7 ***
Fold 6 AUC: 0.92044 (XGB: 0.91954, LGB: 0.92199, CAT: 0.91839)

*** Fold 7/7 ***
Fold 7 AUC: 0.90837 (XGB: 0.88295, LGB: 0.92120, CAT: 0.91746)


In [8]:
# AUC of the blended out-of-fold predictions over the whole training set,
# alongside the mean and spread of the per-fold ensemble AUCs
overall_auc = roc_auc_score(y, oof_preds)
fold_auc_mean = np.mean(fold_scores)
fold_auc_std = np.std(fold_scores)
print(f"\nOverall OOF AUC: {overall_auc:.5f}")
print(f"Mean Fold AUC: {fold_auc_mean:.5f} ± {fold_auc_std:.5f}")

# Ensemble test predictions (weighted average)
weighted_xgb = 0.4 * test_preds_xgb
weighted_lgb = 0.3 * test_preds_lgb
weighted_cat = 0.3 * test_preds_cat
test_preds_final = weighted_xgb + weighted_lgb + weighted_cat


Overall OOF AUC: 0.91591
Mean Fold AUC: 0.91828 ± 0.00415


In [11]:
# --- Predict on Test ---
# Pair every test id with its blended prediction and write the submission file
submission_columns = {'id': test_ids, target: test_preds_final}
submission = pd.DataFrame(submission_columns)
submission.to_csv('submission.csv', index=False)