In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.model_selection import StratifiedKFold, cross_val_score, cross_val_predict, RandomizedSearchCV
from sklearn.preprocessing import OrdinalEncoder, MinMaxScaler
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer, make_column_selector
from sklearn.ensemble import StackingClassifier

from xgboost import XGBClassifier
from catboost import CatBoostClassifier
from lightgbm import LGBMClassifier

# Disable warnings
# NOTE(review): this silences every warning for the rest of the session,
# including "unused parameter" / deprecation notices emitted by the model
# libraries — consider narrowing the filter while debugging.
import warnings
warnings.filterwarnings("ignore")

# Palette
# Default seaborn colour palette for all plots below.
sns.set_palette("icefire")

# Load and Overview


In [None]:
# Load the competition frames; the first CSV column becomes the index.
train = pd.read_csv("kaggle/input/playground-series-s5e11/train.csv", index_col=0)
test = pd.read_csv("kaggle/input/playground-series-s5e11/test.csv", index_col=0)
# orig = pd.read_csv("/kaggle/input/loan-prediction-dataset-2025/loan_dataset_20000.csv")

# Cast text columns to pandas categoricals.
# Columns that exist as text in BOTH frames get one shared dtype built from the
# union of their observed values: casting each frame independently would give
# the same string different integer codes whenever the two level sets differ,
# which silently corrupts any model that consumes the codes.
for col in train.select_dtypes(include="object").columns:
    if col in test.columns and test[col].dtype == object:
        levels = pd.Index(train[col].dropna().unique()).union(
            pd.Index(test[col].dropna().unique())
        )
        shared_dtype = pd.CategoricalDtype(categories=levels)
        train[col] = train[col].astype(shared_dtype)
        test[col] = test[col].astype(shared_dtype)
    else:
        train[col] = train[col].astype("category")

# Text columns that only exist in test (none were handled by the loop above).
for col in test.select_dtypes(include="object").columns:
    test[col] = test[col].astype("category")

print(
    "Missing Train: ",
    train.isnull().sum().sum(),
    " - Missing Test: ",
    test.isnull().sum().sum(),
    end="\n\n",
)
train.head(3)

In [None]:
# Summary statistics for every column (numeric and categorical alike).
train.describe(include="all")

# Distribution

In [None]:
# Column dtypes, non-null counts and memory footprint of the training frame.
train.info()

In [None]:
# One bar chart of value counts per categorical column, plus the target.
count_columns = list(train.select_dtypes(include="category").columns)
count_columns.append("loan_paid_back")

for col in count_columns:
    ax = sns.countplot(data=train, x=col, palette="icefire")
    ax.set_xlabel(col)
    ax.set_ylabel("Count")
    # Slanted, right-aligned tick labels keep long category names readable.
    plt.xticks(rotation=45, ha="right")
    plt.show()

In [None]:
class Features(BaseEstimator, TransformerMixin):
    """Feature-engineering step for the loan frame.

    ``fit`` learns two statistics from the data it is given:

    * ``risk`` - mean target per ``employment_status`` value (only when a
      target is supplied; ``None`` otherwise).
    * ``debt_to_income_ratio_mean`` - mean of ``debt_to_income_ratio``.

    ``transform`` returns a copy of the frame with ratio / log / interaction
    columns added and ``loan_purpose``, ``gender``, ``education_level`` and
    ``marital_status`` removed.
    """

    def __init__(self):
        self.risk = None

    def fit(self, df, y=None):
        """Learn the per-employment-status target mean and the DTI mean.

        Parameters
        ----------
        df : pandas.DataFrame
            Must contain ``debt_to_income_ratio`` and, when ``y`` is given,
            ``employment_status``.
        y : array-like or pandas.Series, optional
            Target aligned positionally with the rows of ``df``.

        Returns
        -------
        Features
            The fitted transformer (``self``).
        """
        df = df.copy()

        # Start from a clean slate so a re-fit without a target cannot keep a
        # mapping learned by an earlier fit.
        self.risk = None

        # Test for None BEFORE touching y: wrapping None in a Series produces
        # an all-NaN column, which would make this guard always true and yield
        # an all-NaN risk mapping.
        if y is not None:
            # Positional assignment: the index of y is deliberately ignored.
            df["loan_paid_back"] = np.asarray(y)
            # observed=True only affects categorical keys: levels with no rows
            # are left out instead of appearing with a NaN mean (both map to
            # NaN in transform).
            self.risk = df.groupby("employment_status", observed=True)[
                "loan_paid_back"
            ].mean()

        self.debt_to_income_ratio_mean = df["debt_to_income_ratio"].mean()

        return self

    def transform(self, df):
        """Return a copy of ``df`` with engineered columns added.

        Parameters
        ----------
        df : pandas.DataFrame
            Must contain ``annual_income``, ``credit_score``, ``loan_amount``,
            ``interest_rate``, ``debt_to_income_ratio`` and the four columns
            that are dropped; ``employment_status`` is needed when a risk
            mapping was learned.

        Returns
        -------
        pandas.DataFrame
            New frame; the input is not modified.
        """
        df = df.copy()

        df["annual_incomeXcredit_score"] = df["annual_income"] * df["credit_score"]
        # The +1 in the denominators guards against division by zero.
        df["loan_to_income"] = df["loan_amount"] / (df["annual_income"] + 1)
        df["interest_burden"] = (df["loan_amount"] * df["interest_rate"]) / (
            df["annual_income"] + 1
        )
        df["log_income"] = np.log1p(df["annual_income"])
        df["log_loan_amount"] = np.log1p(df["loan_amount"])

        # Position of each row relative to the mean DTI seen during fit.
        df["debt_to_income_ratio_diff"] = (
            df["debt_to_income_ratio"] - self.debt_to_income_ratio_mean
        )
        df["debt_to_income_ratio_norm"] = (
            df["debt_to_income_ratio"] / self.debt_to_income_ratio_mean
        )

        if self.risk is not None:
            # Values with no entry in the learned mapping come out as NaN.
            df["risk_map"] = df["employment_status"].map(self.risk).astype(float)

        df["credit_dti_interaction"] = df["credit_score"] / (
            df["debt_to_income_ratio"] + 1
        )
        df["income_dti_interaction"] = df["annual_income"] / (
            df["debt_to_income_ratio"] + 1
        )

        df = df.drop(
            [
                "loan_purpose",
                "gender",
                "education_level",
                "marital_status",
            ],
            axis=1,
        )

        return df

In [None]:
# Predictors are every column but the last; the last column is the label.
X, y = train.iloc[:, :-1], train.iloc[:, -1]
X_pred = test

# Run the feature step once on the full training data to learn which
# engineered columns are categorical and which are not.
FeatureFrame = Features().fit_transform(X, y)
catFeatures = list(FeatureFrame.select_dtypes(include="category"))
numFeatures = [name for name in FeatureFrame.columns if name not in catFeatures]

In [None]:
# Target as a bare array, then re-wrapped as a Series named "y" on X's index.
yn = y.to_numpy()
ys = pd.Series(yn, index=X.index, name="y")
ys

In [None]:
# Shared 5-fold stratified splitter; the fixed seed makes the folds
# reproducible across every cross-validation call below.
cv = StratifiedKFold(n_splits=5, random_state=42, shuffle=True)

# Each pipeline re-fits the feature step on the data it is trained on, so the
# statistics learned in Features.fit come from the training folds only.
xgb = Pipeline([
    ("feature", Features()),
    ("xgb", XGBClassifier(
        random_state=42,
        n_estimators=1500,
        learning_rate=0.03,
        enable_categorical=True,
        # XGBoost's logging switch is `verbosity` (0 = silent); `verbose` is
        # not a constructor option and was passed through as an unused param.
        verbosity=0,
        device="cuda",
        # Was misspelled `n_job`, which the library does not recognise.
        n_jobs=-1,
    )),
])

cat = Pipeline([
    ("feature", Features()),
    ("cat", CatBoostClassifier(
        random_state=42,
        n_estimators=1500,
        learning_rate=0.03,
        cat_features=catFeatures,
        verbose=False,
        task_type="GPU",
    )),
])

lgb = Pipeline([
    ("feature", Features()),
    ("lgb", LGBMClassifier(
        random_state=42,
        n_estimators=1500,
        learning_rate=0.03,
        verbose=-1,
        device="gpu",
        # Was misspelled `n_job`, which the library does not recognise.
        n_jobs=-1,
    )),
])

In [None]:
# Per-fold ROC AUC for each pipeline on the shared splitter.
score_xgb = cross_val_score(xgb, X, y, cv=cv, scoring="roc_auc")
print(
    f"xgb scores across folds: {score_xgb} - Mean Score: {score_xgb.mean():0.4f}"
)

score_cat = cross_val_score(cat, X, y, cv=cv, scoring="roc_auc")
print(
    f"cat scores across folds: {score_cat} - Mean Score: {score_cat.mean():0.4f}"
)

score_lgb = cross_val_score(lgb, X, y, cv=cv, scoring="roc_auc")
print(
    f"lgb scores across folds: {score_lgb} - Mean Score: {score_lgb.mean():0.4f}",
    end="\n\n",
)

# Fold-wise average of the three models' scores. It gets its own name so that
# score_xgb keeps XGBoost's own scores, which the ensemble weights are derived
# from; previously the average overwrote score_xgb and skewed those weights.
# Note: this is the average of the individual scores, not the score of the
# blended prediction.
score_overall = np.mean([score_xgb, score_cat, score_lgb], axis=0)
print(f"Overall Performance: {score_overall.mean()}")

In [None]:
# Blend weights: each model's mean CV score as a share of the three means.
mean_xgb, mean_cat, mean_lgb = (
    fold_scores.mean() for fold_scores in (score_xgb, score_cat, score_lgb)
)
total = mean_xgb + mean_cat + mean_lgb

xgb_weight = mean_xgb / total
cat_weight = mean_cat / total
lgb_weight = mean_lgb / total
print(xgb_weight, cat_weight, lgb_weight)

In [None]:
# Refit every pipeline on the complete training set for the final predictions.
for full_fit_pipeline in (xgb, cat):
    full_fit_pipeline.fit(X, y)
# Kept as the cell's last expression so the fitted pipeline is displayed.
lgb.fit(X, y)

In [None]:
# Importance of every engineered feature per model, scaled so each model's
# strongest feature equals 1 before the models are averaged.
importantFeatures = pd.DataFrame(
    data={
        step: (
            pipeline.named_steps[step].feature_importances_
            / pipeline.named_steps[step].feature_importances_.max()
        )
        for step, pipeline in (("xgb", xgb), ("cat", cat), ("lgb", lgb))
    },
    # Row labels are the column names produced by the feature step.
    index=Features().fit_transform(X, y).columns,
)
importantFeatures["Overall"] = importantFeatures.mean(axis=1)
# Rank by the averaged importance and express everything as a percentage.
importantFeatures = (
    importantFeatures.sort_values("Overall", ascending=False).mul(100).round(2)
)
importantFeatures

In [None]:
# Out-of-fold class probabilities from the XGBoost pipeline (all probability
# columns are kept here).
# NOTE(review): the next cell runs the same cross_val_predict call again for
# its "xgbPrediction" column, so this full CV pass is repeated.
xgb_pred = cross_val_predict(xgb, X, y, cv=cv, method="predict_proba")

In [None]:
# Out-of-fold probability of the second class ([:, 1]) from each pipeline, one
# column per model, aligned on the training index.
treePredictions = pd.DataFrame(
    data={
        f"{step}Prediction": cross_val_predict(
            pipeline, X, y, cv=cv, method="predict_proba"
        )[:, 1]
        for step, pipeline in (("xgb", xgb), ("cat", cat), ("lgb", lgb))
    },
    index=FeatureFrame.index,
)
treePredictions.head(10)

In [None]:
# Weighted blend of the three out-of-fold probabilities.
# The frame's columns are "xgbPrediction" / "catPrediction" / "lgbPrediction";
# the short keys "xgb" / "cat" / "lgb" used previously do not exist and raised
# a KeyError.
treePredictions["WeightedPrediction"] = (
    treePredictions["xgbPrediction"] * xgb_weight
    + treePredictions["catPrediction"] * cat_weight
    + treePredictions["lgbPrediction"] * lgb_weight
)
treePredictions.head(10)

In [None]:
# Probability of the second class ([:, 1]) on the test set from each fully
# fitted pipeline, aligned on the test index.
ensemblePredictions = pd.DataFrame(
    data={
        "xgbPrediction": xgb.predict_proba(X_pred)[:, 1],
        "catPrediction": cat.predict_proba(X_pred)[:, 1],
        "lgbPrediction": lgb.predict_proba(X_pred)[:, 1],
    },
    index=X_pred.index,
)

# Weighted blend of the three models.
# * The columns are read by their real names; the short keys "xgb" / "cat" /
#   "lgb" used previously do not exist and raised a KeyError.
# * The blend is kept as a probability. The models are validated with ROC AUC,
#   which depends on how predictions are ranked; rounding to 0/1 would discard
#   that ranking.
ensemblePredictions["Loan_paid_back"] = (
    ensemblePredictions["xgbPrediction"] * xgb_weight
    + ensemblePredictions["catPrediction"] * cat_weight
    + ensemblePredictions["lgbPrediction"] * lgb_weight
)

In [None]:
# Write the blended prediction, indexed like the test frame, as the submission.
# * A separate name is used instead of rebinding ensemblePredictions to a
#   Series: the old code then indexed that Series with ["Loan_paid_back"],
#   which raised a KeyError and made the cell fail when re-run.
# * The column is renamed to match the training target's name, loan_paid_back.
# * The output path now has a separator between directory and file name; the
#   old string concatenation produced ".../workingsubmission.csv".
submission = ensemblePredictions["Loan_paid_back"].rename("loan_paid_back")
submission.to_csv("PlayGround/S5E11/kaggle/working/submission.csv")