In [1]:
import pandas as pd
import numpy as np
from sklearn.metrics import roc_auc_score
import joblib
import pandas as pd

# Load the two serialized artifacts this notebook needs: the estimator used
# for predict_proba and the object whose transform() prepares its inputs.
# NOTE(review): joblib.load unpickles the file, which can execute arbitrary
# code -- only load artifacts that come from a trusted source.
lgbm = joblib.load("lgbm_model.pkl")
preprocessor = joblib.load("preprocessor.pkl")

In [6]:
# Read the rows to be scored; the path is relative to the working directory.
test_df = pd.read_csv("test.csv")


In [None]:
def feature_engineering(df, is_train=True):
    """Return a copy of ``df`` with derived loan-risk features appended.

    The input frame is not modified. Six columns are added, in this order:
    ``loan_to_income``, ``loan_per_credit``, ``interest_burden``,
    ``interest_income_ratio``, ``grade_risk`` and ``loan_purpose_group``.

    Parameters
    ----------
    df : pandas.DataFrame
        Must provide the columns ``loan_amount``, ``annual_income``,
        ``credit_score``, ``interest_rate``, ``grade_subgrade`` and
        ``loan_purpose``.
    is_train : bool, default True
        Accepted for interface compatibility; the body does not read it.

    Returns
    -------
    pandas.DataFrame
        The augmented copy. Grades or purposes that are absent from the
        lookup tables come out as NaN.
    """
    out = df.copy()

    # Small offset added to each denominator to guard against division by zero.
    eps = 1e-6
    income_denominator = out["annual_income"] + eps
    credit_denominator = out["credit_score"] + eps

    # --- 1. Ratio / burden features ---
    out["loan_to_income"] = out["loan_amount"] / income_denominator
    out["loan_per_credit"] = out["loan_amount"] / credit_denominator

    # --- 2. Interest burden (interest_rate is divided by 100) ---
    out["interest_burden"] = out["loan_amount"] * out["interest_rate"] / 100
    out["interest_income_ratio"] = out["interest_burden"] / income_denominator

    # --- 3. Grade risk score ---
    # Letters A..F start at 1.0..6.0 and every sub-grade step adds 0.2
    # (A1=1.0, A2=1.2, ..., F5=6.8). The score is computed as integer tenths
    # divided by 10 so each value equals the corresponding decimal literal.
    grade_risk_map = {
        f"{letter}{sub}": (10 * rank + 2 * (sub - 1)) / 10
        for rank, letter in enumerate("ABCDEF", start=1)
        for sub in range(1, 6)
    }
    out["grade_risk"] = (
        out["grade_subgrade"].map(grade_risk_map).astype("float32")
    )

    # --- 4. Loan purpose bucket ---
    purposes_by_group = {
        "low": ("Home", "Business"),
        "medium": ("Car", "Other", "Debt consolidation", "Vacation"),
        "high": ("Education", "Medical"),
    }
    purpose_map = {
        purpose: group
        for group, purposes in purposes_by_group.items()
        for purpose in purposes
    }
    out["loan_purpose_group"] = out["loan_purpose"].map(purpose_map)

    return out


In [8]:
# Build the engineered features for the rows being scored. This is not
# training data, so is_train is passed explicitly rather than left at its
# True default (the function does not currently read the flag, so the
# resulting frame is the same either way).
df_test = feature_engineering(test_df, is_train=False)

In [9]:
# Numeric model inputs: the raw columns first, then the engineered ones.
num_features = [
    "annual_income",
    "debt_to_income_ratio",
    "credit_score",
    "loan_amount",
    "interest_rate",
    "loan_to_income",
    "interest_burden",
    "loan_per_credit",
    "interest_income_ratio",
    "grade_risk",
]

# Categorical model inputs (the name indicates they are one-hot encoded).
cat_onehot_features = [
    "employment_status",
    "loan_purpose_group",
    "education_level",
    "marital_status",
]

# Full column list, numeric columns followed by categorical ones.
features = [*num_features, *cat_onehot_features]

# Name of the target column.
label = "loan_paid_back"

In [10]:
# Keep only the model's input columns, in the order given by `features`,
# then run them through the loaded preprocessor.
X_test = df_test.loc[:, features]
X_test_proc = preprocessor.transform(X_test)


In [11]:
# predict_proba returns one column per class; column index 1 is kept as the score.
test_class_proba = lgbm.predict_proba(X_test_proc)
y_test_proba = test_class_proba[:, 1]




In [12]:
# Two-column submission frame: the row id and its predicted probability.
submission = df_test[["id"]].assign(loan_paid_back=y_test_proba)

# Written without the index so the file contains only the two columns.
submission.to_csv("submission3.csv", index=False)