In [None]:
import os

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split, StratifiedKFold, cross_val_score,  cross_val_predict
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.metrics import roc_auc_score, roc_curve
from sklearn.preprocessing import LabelEncoder

import xgboost as xgb

# List every file found under the Kaggle input directory so the dataset
# paths used further down can be checked against what is actually mounted.
# (`os` is imported together with the other modules in the first cell.)
print("input files:")
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

## Loading Data

In [None]:
# Read the competition's train and test CSV files from the Kaggle input directory.
train = pd.read_csv("/kaggle/input/playground-series-s5e11/train.csv")
test = pd.read_csv("/kaggle/input/playground-series-s5e11/test.csv")

In [None]:
# Structural overview of both frames: dimensions, column names and the
# total number of missing cells, followed by a preview of the training rows.
frames = (("train: ", train), ("test: ", test))

print("Shape")
for label, frame in frames:
    print(label + str(frame.shape))

print("\nColumns: " + str(train.columns.tolist()))

print("\nMissing Values")
for label, frame in frames:
    # isna() is an alias of isnull(); the double sum collapses to one scalar
    print(label + str(frame.isna().sum().sum()))

print("\nData")
train.head()

## EDA

In [None]:
# Summary statistics from DataFrame.describe(), rounded to two decimals for display.
print("Training Data Overview")
train.describe().round(2)

In [None]:
# Relative frequency of each target value (normalize=True yields proportions, not counts).
print("Target Variable Distribution")
train['loan_paid_back'].value_counts(normalize=True)

In [None]:
# Pairwise correlations between the selected numeric columns and the target,
# drawn as an annotated heatmap on an explicitly created axes.
print("Target Variable Correlation with Numerical Features")
numerical_features = ['annual_income', 'debt_to_income_ratio', 'credit_score', 'loan_amount', 'interest_rate']
corr_matrix = train[[*numerical_features, 'loan_paid_back']].corr()

fig, ax = plt.subplots(figsize=(10, 8))
sns.heatmap(corr_matrix, annot=True, cmap='coolwarm', center=0, ax=ax)
ax.set_title('Correlation Matrix')
plt.show()

## Feature Engineering

In [None]:
def add_features(df):
    """Return a copy of ``df`` with four derived columns appended.

    The new columns are computed from ``loan_amount``, ``annual_income``,
    ``interest_rate`` and ``credit_score``:

    * ``loan_to_income_ratio``        -- loan_amount / annual_income
    * ``monthly_payment_est``         -- (loan_amount * interest_rate) / 12
    * ``credit_to_income``            -- credit_score / annual_income
    * ``credit_interest_interaction`` -- credit_score * interest_rate

    The input frame is not modified; the derived columns are concatenated
    to the right of the existing ones, in the order listed above.
    """
    amount = df["loan_amount"]
    income = df["annual_income"]
    rate = df["interest_rate"]
    score = df["credit_score"]

    derived = pd.DataFrame({
        "loan_to_income_ratio": amount / income,
        "monthly_payment_est": (amount * rate) / 12,
        "credit_to_income": score / income,
        "credit_interest_interaction": score * rate,
    })
    return pd.concat([df, derived], axis=1)

# Apply the same feature engineering to both splits; the raw frames are left untouched.
train_eng, test_eng = map(add_features, (train, test))

## Preprocessing

In [None]:
def preprocess(train, test):
    """Label-encode the categorical columns and split out model inputs.

    The target column ``loan_paid_back`` is separated from ``train``; the
    remaining training columns are stacked on top of ``test`` so that each
    categorical column gets one ``LabelEncoder`` fitted on the values of
    both splits. Values are cast to ``str`` before encoding. Listed
    categorical columns that are absent from the stacked frame are skipped.

    Parameters
    ----------
    train : DataFrame containing ``loan_paid_back`` and ``id``.
    test : DataFrame containing ``id``.

    Returns
    -------
    X_train, y_train, X_test, label_encoders
        Feature frames without the ``id`` column, a copy of the target
        column, and a dict mapping each encoded column name to its fitted
        ``LabelEncoder``.
    """
    target = train['loan_paid_back'].copy()
    features = train.drop(columns=['loan_paid_back'])
    n_train = len(features)

    stacked = pd.concat([features, test], axis=0, ignore_index=True)

    label_encoders = {}
    for col in ('gender', 'marital_status', 'education_level',
                'employment_status', 'loan_purpose', 'grade_subgrade'):
        if col not in stacked.columns:
            continue
        encoder = LabelEncoder()
        stacked[col] = encoder.fit_transform(stacked[col].astype(str))
        label_encoders[col] = encoder

    # The first n_train rows are the training rows, the rest are the test rows.
    X_train = stacked.iloc[:n_train].drop(columns=['id'])
    X_test = stacked.iloc[n_train:].drop(columns=['id'])

    return X_train, target, X_test, label_encoders

# Build the model matrices from the feature-engineered frames.
X_train, y_train, X_test, label_encoders = preprocess(train=train_eng, test=test_eng)

## Model Evaluation

In [None]:
# define models
# Both base learners are seeded so that re-running the notebook reproduces
# the same out-of-fold predictions. The deprecated `use_label_encoder`
# argument is no longer passed to XGBClassifier: current XGBoost releases
# ignore it and emit a warning.
base_models = {
    "Random Forest": RandomForestClassifier(n_estimators=100,
                                            max_depth=12,
                                            n_jobs=-1,
                                            random_state=42
                                           ),
    "XG Boost": xgb.XGBClassifier(eval_metric='logloss',
                                  n_jobs=-1,
                                  random_state=42
                                 )
}

# define cross-validation strategy
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# evaluate base models and collect predictions
# Each model's out-of-fold probability for class 1 is stored under the
# model's name; these columns later form the meta model's training matrix.
meta_features_train = {}
results = []
for name, model in base_models.items():
    oof_pred = cross_val_predict(model, X_train, y_train, cv=cv, method='predict_proba', n_jobs=-1)[:, 1]
    meta_features_train[name] = oof_pred
    auc = roc_auc_score(y_train, oof_pred)
    results.append({'Model': name, 'OOF AUC': auc})

# Display results
results_df = pd.DataFrame(results).sort_values(by='OOF AUC', ascending=False)
results_df.head()

In [None]:
# fit meta model
# One column per base model, holding that model's out-of-fold probabilities.
meta_model = LogisticRegression(C=1.0, random_state=42)
meta_X_train = pd.DataFrame.from_dict(meta_features_train)
meta_model.fit(meta_X_train, y_train)

## Final Predictions and Submission

In [None]:
# retrain base models on full training set
rf_model = base_models["Random Forest"]
xgb_model = base_models["XG Boost"]

rf_model.fit(X_train, y_train)
xgb_model.fit(X_train, y_train)

# make base predictions on test set and collect them as meta features;
# the column names and their order match the keys of `base_models`, which
# are the columns the meta model was fitted on
rf_pred = rf_model.predict_proba(X_test)[:, 1]
xgb_pred = xgb_model.predict_proba(X_test)[:, 1]
meta_X_test = pd.DataFrame({"Random Forest": rf_pred, "XG Boost": xgb_pred})

# the meta model is already fitted; here it only scores the test meta features
final_prob = meta_model.predict_proba(meta_X_test)[:, 1]

# Build the submission as a new frame instead of writing the predictions
# into `test`: mutating the raw frame would let a re-run of the feature
# engineering / preprocessing cells pick up `loan_paid_back` as an input column.
submission = test[["id"]].assign(loan_paid_back=final_prob)
submission.head()

In [None]:
# Write the submission file; index=False keeps the DataFrame index out of the CSV.
submission.to_csv("/kaggle/working/submission.csv", index=False)