# In [None]:
import warnings
warnings.filterwarnings('ignore')

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.ensemble import GradientBoostingClassifier, BaggingClassifier
from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.base import clone

from sklearn.metrics import (
    roc_auc_score, roc_curve,
    precision_score, recall_score, f1_score,
    accuracy_score,
    precision_recall_curve, auc
)

import lightgbm as lgb
import xgboost as xgb
from catboost import CatBoostClassifier
import optuna

# Small positive offset that augment_features adds to denominators and to
# sqrt/log arguments so that zero-valued inputs do not divide by zero.
EPS = 1e-3

def signed_log1p(data):
    """Return ``sign(data) * log1p(|data|)``, computed elementwise.

    The magnitude is compressed logarithmically while the sign of each value
    is kept, so the transform is defined for negative inputs as well.
    """
    magnitude = np.log1p(np.abs(data))
    direction = np.sign(data)
    return direction * magnitude


def add_bins(df, column, bins, fmt="num"):
    """Add a ``<column>Band`` string column holding the bin label of each value.

    Does nothing when ``column`` is absent from ``df``. Labels are
    ``"lo-hi"`` for finite bins; a bin whose upper edge is infinite is
    labelled ``"lo+"`` when ``fmt == "num"`` and just ``"lo"`` otherwise.
    The frame is modified in place and nothing is returned.
    """
    if column in df.columns:
        open_suffix = "+" if fmt == "num" else ""
        # Walk consecutive (lower, upper) edge pairs to build one label per bin.
        band_labels = [
            f"{lower}{open_suffix}" if np.isinf(upper) else f"{lower}-{upper}"
            for lower, upper in zip(bins[:-1], bins[1:])
        ]
        banded = pd.cut(
            df[column], bins=bins, labels=band_labels, include_lowest=True
        )
        # Cast the categorical result to plain strings before storing it.
        df[f"{column}Band"] = banded.astype(str)


def augment_features(df):
    """Add the engineered loan-application features to ``df`` in place.

    New columns cover ratios, differences, calendar parts of
    ``ApplicationDate``, power/root/log transforms, pairwise products,
    0.0/1.0 threshold flags and string "Band" columns. The source columns
    referenced below must be present. Returns ``None``.
    """

    def ratio(numerator, denominator, offset=EPS):
        # Column-over-column ratio with an offset added to the denominator.
        return df[numerator] / (df[denominator] + offset)

    # Pricing and affordability.
    df["InterestRateSpread"] = df["InterestRate"] - df["BaseInterestRate"]
    df["LoanToIncome"] = ratio("LoanAmount", "AnnualIncome")

    monthly_obligations = df["MonthlyLoanPayment"] + df["MonthlyDebtPayments"]
    df["DebtServiceRatio"] = monthly_obligations / (df["MonthlyIncome"] + EPS)
    df["DisposableIncome"] = df["MonthlyIncome"] - monthly_obligations
    df["AssetCoverage"] = ratio("TotalAssets", "TotalLiabilities")

    # LiabilityGap is only a stepping stone: stored, consumed, then removed.
    df["LiabilityGap"] = df["TotalLiabilities"] - df["TotalAssets"]
    df["SignedLogLiabilityGap"] = signed_log1p(df.pop("LiabilityGap"))

    # Balance-sheet, credit-line and demographic ratios.
    df["NetWorthToLiabilities"] = ratio("NetWorth", "TotalLiabilities")
    df["NetWorthToIncome"] = ratio("NetWorth", "AnnualIncome")
    df["UtilizationPerLine"] = ratio("CreditCardUtilizationRate", "NumberOfOpenCreditLines", 1)
    df["InquiryPerLine"] = ratio("NumberOfCreditInquiries", "NumberOfOpenCreditLines", 1)
    df["IncomePerDependent"] = ratio("AnnualIncome", "NumberOfDependents", 1)
    df["ExperienceToAge"] = ratio("Experience", "Age")
    df["LoanDurationYears"] = df["LoanDuration"] / 12.0
    df["CreditHistoryToAge"] = ratio("LengthOfCreditHistory", "Age")
    df["IncomeDiscrepancy"] = df["AnnualIncome"] - (df["MonthlyIncome"] * 12.0)
    df["AgeAfterExperience"] = df["Age"] - df["Experience"]

    # Calendar parts; unparseable dates are coerced to NaT.
    application_date = pd.to_datetime(df["ApplicationDate"], errors="coerce").dt
    df["ApplicationDateWeek"] = application_date.isocalendar().week.astype(float)
    df["ApplicationDateDayOfYear"] = application_date.dayofyear
    df["ApplicationDateQuarter"] = application_date.quarter

    # Power, root and log transforms.
    for source in ("CreditScore", "Age"):
        df[f"{source}_2"] = df[source] ** 2
    for source in ("AnnualIncome", "LoanAmount", "MonthlyIncome"):
        df[f"Sqrt{source}"] = np.sqrt(np.abs(df[source]) + EPS)
    for source in ("CreditScore", "Experience", "Age"):
        df[f"Log{source}"] = np.log1p(df[source])
    df["LogAnnualIncome"] = np.log1p(df["AnnualIncome"] + EPS)

    # Pairwise products.
    credit_score = df["CreditScore"]
    debt_ratio = df["TotalDebtToIncomeRatio"]
    df["CreditScore_LoanToIncome"] = credit_score * df["LoanToIncome"]
    df["CreditScore_DebtRatio"] = credit_score * debt_ratio
    df["Age_DebtRatio"] = df["Age"] * debt_ratio

    # Threshold flags stored as 0.0 / 1.0.
    flags = {
        "GoodCreditScore": credit_score >= 700,
        "ExcellentCreditScore": credit_score >= 750,
        "HighDebtRatio": debt_ratio > 0.4,
        "HighUtilization": df["CreditCardUtilizationRate"] > 0.7,
        "YoungAge": df["Age"] < 30,
        "ExperiencedWorker": df["Experience"] >= 10,
    }
    for flag_name, mask in flags.items():
        df[flag_name] = mask.astype(float)

    # Categorical bands.
    band_edges = {
        "CreditScore": [300, 500, 620, 700, 750, 800, 900],
        "AnnualIncome": [0, 30000, 75000, 125000, 200000, np.inf],
        "TotalDebtToIncomeRatio": [0.0, 0.25, 0.4, 0.7, np.inf],
    }
    for banded_column, edges in band_edges.items():
        add_bins(df, banded_column, edges)


def my_accuracy(y_true, y_pred):
    """Return the share of positions where ``y_pred`` equals ``y_true``.

    Both arguments may be any array-like (list, tuple, ndarray, Series);
    they are compared positionally after conversion to NumPy arrays.

    Raises:
        ValueError: if the two inputs do not have the same shape.
    """
    # Coerce first: comparing two plain lists with == yields a single bool,
    # and two pandas Series would be aligned by index instead of by position.
    y_true = np.asarray(y_true)
    y_pred = np.asarray(y_pred)
    if y_true.shape != y_pred.shape:
        raise ValueError(
            f"shape mismatch: y_true {y_true.shape} vs y_pred {y_pred.shape}"
        )
    return float(np.mean(y_true == y_pred))


def my_precision(y_true, y_pred):
    """Return binary precision TP / (TP + FP), with label 1 as positive.

    Inputs may be any array-like of 0/1 labels. When nothing is predicted
    positive the ratio is undefined and 0.0 is returned.
    """
    # Coerce first: ``some_list == 1`` is just False, which would silently
    # zero every count for plain-list inputs.
    y_true = np.asarray(y_true)
    y_pred = np.asarray(y_pred)

    predicted_positive = y_pred == 1
    tp = int(np.sum(predicted_positive & (y_true == 1)))
    fp = int(np.sum(predicted_positive & (y_true == 0)))

    # Guard the empty denominator explicitly instead of adding a fudge term,
    # so that a perfect score is exactly 1.0.
    denominator = tp + fp
    return tp / denominator if denominator else 0.0


def my_recall(y_true, y_pred):
    """Return binary recall TP / (TP + FN), with label 1 as positive.

    Inputs may be any array-like of 0/1 labels. When there are no actual
    positives the ratio is undefined and 0.0 is returned.
    """
    # Coerce first: ``some_list == 1`` is just False, which would silently
    # zero every count for plain-list inputs.
    y_true = np.asarray(y_true)
    y_pred = np.asarray(y_pred)

    actual_positive = y_true == 1
    tp = int(np.sum(actual_positive & (y_pred == 1)))
    fn = int(np.sum(actual_positive & (y_pred == 0)))

    # Guard the empty denominator explicitly instead of adding a fudge term,
    # so that a perfect score is exactly 1.0.
    denominator = tp + fn
    return tp / denominator if denominator else 0.0


def my_f1(y_true, y_pred):
    """Return the binary F1 score, with label 1 as positive.

    Computed straight from the confusion counts as
    ``2*TP / (2*TP + FP + FN)``, which is algebraically the harmonic mean of
    precision and recall but needs no small additive term, so a perfect
    prediction scores exactly 1.0. Returns 0.0 when there are neither actual
    nor predicted positives. Inputs may be any array-like of 0/1 labels.
    """
    # Coerce first so plain lists are compared elementwise.
    y_true = np.asarray(y_true)
    y_pred = np.asarray(y_pred)

    tp = int(np.sum((y_true == 1) & (y_pred == 1)))
    fp = int(np.sum((y_true == 0) & (y_pred == 1)))
    fn = int(np.sum((y_true == 1) & (y_pred == 0)))

    denominator = 2 * tp + fp + fn
    return 2 * tp / denominator if denominator else 0.0


def my_roc_auc(y_true, y_score):
    """Return the ROC AUC of ``y_score`` against ``y_true``.

    Delegates to scikit-learn's ``roc_auc_score``.
    """
    area_under_roc = roc_auc_score(y_true, y_score)
    return area_under_roc


def my_pr_auc(y_true, y_score):
    """Return the area under the precision-recall curve.

    The curve points come from ``precision_recall_curve``; the area is then
    integrated by ``auc`` with recall on the x-axis.
    """
    curve = precision_recall_curve(y_true, y_score)
    precision_points, recall_points = curve[0], curve[1]
    return auc(recall_points, precision_points)


class MyBagging:
    """Bootstrap-aggregated ensemble of clones of one binary classifier.

    Each member is a fresh ``clone`` of ``base_estimator`` fitted on a
    bootstrap sample (drawn with replacement) of the training rows. The
    ensemble probability is the mean of the members' positive-class
    probabilities; label 1 is treated as the positive class.

    Args:
        base_estimator: classifier exposing ``fit`` and ``predict_proba``.
        n_estimators: number of ensemble members.
        max_samples: fraction of the training rows drawn for each member.
        random_state: seed for the bootstrap sampling.
    """

    def __init__(self, base_estimator, n_estimators=10, max_samples=0.8, random_state=42):
        self.base_estimator = base_estimator
        self.n_estimators = n_estimators
        self.max_samples = max_samples
        self.random_state = random_state
        self.models_ = []

    def fit(self, X, y):
        """Fit ``n_estimators`` clones on bootstrap samples and return ``self``."""
        X = np.asarray(X)
        y = np.asarray(y)

        rng = np.random.RandomState(self.random_state)
        n_samples = X.shape[0]
        # int() truncates, so a small fraction of a small dataset would ask
        # for 0 rows; always draw at least one.
        n_draw = max(1, int(self.max_samples * n_samples))
        self.models_ = []

        for _ in range(self.n_estimators):
            idx = rng.choice(n_samples, n_draw, replace=True)
            model = clone(self.base_estimator)
            model.fit(X[idx], y[idx])
            self.models_.append(model)
        return self

    @staticmethod
    def _positive_proba(model, X):
        """Return one member's positive-class probability for every row of ``X``."""
        proba = np.asarray(model.predict_proba(X))
        if proba.shape[1] >= 2:
            return proba[:, 1]
        # A bootstrap sample can contain a single class; the member then
        # reports one column only. It is certain of that class, so the
        # positive probability is 1.0 if that class is 1 and 0.0 otherwise.
        classes = getattr(model, "classes_", ())
        only_positive_seen = len(classes) == 1 and classes[0] == 1
        return np.full(proba.shape[0], 1.0 if only_positive_seen else 0.0)

    def predict_proba(self, X):
        """Return an ``(n_rows, 2)`` array of ``[P(class 0), P(class 1)]``.

        Raises:
            RuntimeError: if the ensemble has no fitted members.
        """
        if not self.models_:
            raise RuntimeError("MyBagging is not fitted yet; call fit() first.")
        probs = np.array([self._positive_proba(m, X) for m in self.models_])
        proba_mean = probs.mean(axis=0)
        return np.vstack([1 - proba_mean, proba_mean]).T

    def predict(self, X, threshold=0.5):
        """Return 0/1 labels: 1 where the mean positive probability >= ``threshold``."""
        return (self.predict_proba(X)[:, 1] >= threshold).astype(int)

class MyGradientBoosting:
    """Minimal binary gradient-boosting classifier built from regression trees.

    The raw score starts at the log-odds of the positive-class rate. On every
    round a ``DecisionTreeRegressor`` is fitted to the residuals
    ``y - sigmoid(raw_score)`` and its prediction, scaled by
    ``learning_rate``, is added to the raw score.

    Args:
        n_estimators: number of boosting rounds (trees).
        learning_rate: shrinkage applied to every tree's contribution.
        max_depth: maximum depth of each regression tree.
        random_state: seed forwarded to every tree; ``None`` (the default)
            leaves the trees unseeded.
    """

    def __init__(self, n_estimators=100, learning_rate=0.1, max_depth=3, random_state=None):
        self.n_estimators = n_estimators
        self.learning_rate = learning_rate
        self.max_depth = max_depth
        self.random_state = random_state
        self.trees_ = []
        self.init_score_ = None

    @staticmethod
    def _sigmoid(raw_score):
        """Map raw scores to probabilities without overflowing ``exp``."""
        # The sigmoid is fully saturated long before +/-500, so clipping does
        # not change the result but keeps exp() inside the float64 range.
        return 1.0 / (1.0 + np.exp(-np.clip(raw_score, -500.0, 500.0)))

    @staticmethod
    def _n_rows(X):
        """Return the number of rows of an array-like or a nested list."""
        return X.shape[0] if hasattr(X, "shape") else len(X)

    def fit(self, X, y):
        """Fit the boosted trees on 0/1 labels ``y`` and return ``self``."""
        # Accept any array-like of labels, not only objects with .astype().
        y = np.asarray(y, dtype=float)
        prior = np.clip(y.mean(), 1e-6, 1 - 1e-6)
        self.init_score_ = np.log(prior / (1 - prior))  # logit of the base rate
        raw_score = np.full(len(y), self.init_score_)
        self.trees_ = []

        for _ in range(self.n_estimators):
            residual = y - self._sigmoid(raw_score)
            tree = DecisionTreeRegressor(
                max_depth=self.max_depth, random_state=self.random_state
            )
            tree.fit(X, residual)
            self.trees_.append(tree)
            raw_score += self.learning_rate * tree.predict(X)

        return self

    def _score(self, X):
        """Return the positive-class probability for every row of ``X``.

        Raises:
            RuntimeError: if called before ``fit``.
        """
        if self.init_score_ is None:
            raise RuntimeError("MyGradientBoosting is not fitted yet; call fit() first.")
        raw_score = np.full(self._n_rows(X), self.init_score_)
        for tree in self.trees_:
            raw_score += self.learning_rate * tree.predict(X)
        return self._sigmoid(raw_score)

    def predict_proba(self, X):
        """Return an ``(n_rows, 2)`` array of ``[P(class 0), P(class 1)]``."""
        proba1 = self._score(X)
        return np.vstack([1 - proba1, proba1]).T

    def predict(self, X, threshold=0.5):
        """Return 0/1 labels: 1 where the positive probability >= ``threshold``."""
        return (self._score(X) >= threshold).astype(int)

def _save_eda_plots(train_raw):
    """Write the three exploratory plots of the raw training frame to PNG files.

    Every figure is closed right after it is saved so that open figures do
    not accumulate in matplotlib's global state.
    """
    # EDA: target distribution
    plt.figure(figsize=(4, 3))
    train_raw["LoanApproved"].value_counts(normalize=True).plot(kind="bar")
    plt.title("Распределение LoanApproved")
    plt.tight_layout()
    plt.savefig("eda_target_distribution.png")
    plt.close()

    # EDA: one pairwise relationship
    plt.figure(figsize=(5, 4))
    plt.scatter(train_raw["CreditScore"], train_raw["LoanAmount"], alpha=0.3)
    plt.xlabel("CreditScore")
    plt.ylabel("LoanAmount")
    plt.title("CreditScore vs LoanAmount")
    plt.tight_layout()
    plt.savefig("eda_credit_vs_loan.png")
    plt.close()

    # EDA: correlation matrix of the numeric columns
    plt.figure(figsize=(10, 8))
    corr = train_raw.corr(numeric_only=True)
    sns.heatmap(corr, cmap="coolwarm", center=0)
    plt.title("Correlation matrix")
    plt.tight_layout()
    plt.savefig("eda_corr_matrix.png")
    plt.close()


def _encode_education(df):
    """Replace the textual ``EducationLevel`` column by an ordinal score, in place.

    Matching ignores letter case and surrounding whitespace; values that are
    not recognised (including missing ones) become 0. Does nothing when the
    column is absent.
    """
    if "EducationLevel" not in df.columns:
        return
    education_rank = {
        "high school": 1,
        "associate": 2,
        "bachelor": 3,
        "master": 4,
        "doctorate": 5,
        "phd": 5,
    }
    normalized = df["EducationLevel"].astype(str).str.strip().str.lower()
    df["EducationLevel"] = normalized.map(education_rank).fillna(0).astype(float)


def _preprocess_features(X, X_test):
    """Impute, log-transform, one-hot encode and standardise both frames.

    All transformers are fitted on ``X`` and then applied to ``X_test``.
    Returns ``(X_scaled, X_test_scaled)`` as NumPy arrays.
    """
    # Numeric / categorical split
    numeric_cols = [c for c in X.columns if np.issubdtype(X[c].dtype, np.number)]
    cat_cols = [c for c in X.columns if c not in numeric_cols]
    print(f"Признаки: {len(numeric_cols)} числовых, {len(cat_cols)} категориальных")

    # Missing values, then signed log transform of the numeric part
    numeric_imputer = SimpleImputer(strategy="median")
    X_numeric = signed_log1p(numeric_imputer.fit_transform(X[numeric_cols]))
    X_test_numeric = signed_log1p(numeric_imputer.transform(X_test[numeric_cols]))

    if cat_cols:
        # The categorical imputer lives inside the guard: fitting it on a
        # frame with zero columns would raise before the fallback is reached.
        categorical_imputer = SimpleImputer(strategy="most_frequent")
        X_categorical = categorical_imputer.fit_transform(X[cat_cols])
        X_test_categorical = categorical_imputer.transform(X_test[cat_cols])

        # One-hot encoding; categories unseen in training encode as all zeros
        ohe = OneHotEncoder(handle_unknown="ignore", sparse_output=False, dtype="float32")
        X_cat_encoded = ohe.fit_transform(X_categorical)
        X_test_cat_encoded = ohe.transform(X_test_categorical)
        X_processed = np.hstack([X_numeric, X_cat_encoded])
        X_test_processed = np.hstack([X_test_numeric, X_test_cat_encoded])
    else:
        X_processed = X_numeric
        X_test_processed = X_test_numeric

    scaler = StandardScaler()
    X_scaled = scaler.fit_transform(X_processed)
    X_test_scaled = scaler.transform(X_test_processed)

    print(f"Финальная размерность: {X_scaled.shape}")
    return X_scaled, X_test_scaled


def _plot_validation_curves(y_val, y_val_proba, roc_auc_value, pr_auc_value):
    """Save the validation ROC and precision-recall curves to ``roc_pr_curves.png``."""
    fpr, tpr, _ = roc_curve(y_val, y_val_proba)
    pr_precision, pr_recall, _ = precision_recall_curve(y_val, y_val_proba)

    plt.figure(figsize=(14, 5))

    plt.subplot(1, 2, 1)
    plt.plot(fpr, tpr, label=f"ROC (AUC = {roc_auc_value:.4f})", linewidth=2.5, color="blue")
    plt.plot([0, 1], [0, 1], "k--", label="Random", linewidth=1)
    plt.xlabel("False Positive Rate")
    plt.ylabel("True Positive Rate")
    plt.title("ROC Curve")
    plt.legend()
    plt.grid(alpha=0.3)

    plt.subplot(1, 2, 2)
    plt.plot(pr_recall, pr_precision, label=f"PR (AUC = {pr_auc_value:.4f})", linewidth=2.5, color="green")
    plt.xlabel("Recall")
    plt.ylabel("Precision")
    plt.title("Precision-Recall Curve")
    plt.legend()
    plt.grid(alpha=0.3)

    plt.tight_layout()
    plt.savefig("roc_pr_curves.png", dpi=300, bbox_inches="tight")
    plt.close()


def main():
    """Run the loan-approval pipeline end to end.

    Reads ``train_c.csv`` and ``test_c.csv``, saves EDA plots, engineers and
    preprocesses features, checks the hand-written metrics and ensembles
    against scikit-learn, compares several boosting libraries, tunes LightGBM
    with Optuna, reports validation and cross-validation metrics, and writes
    ``submission.csv`` and ``roc_pr_curves.png``.

    NOTE(review): the imputers, encoder and scaler are fitted on all labelled
    rows before the train/validation split, and Optuna selects
    hyper-parameters on the same validation split that is reported
    afterwards, so the validation metrics are optimistic.
    """
    seed = 42

    train_raw = pd.read_csv("train_c.csv")
    _save_eda_plots(train_raw)

    train = train_raw.copy()
    test = pd.read_csv("test_c.csv")
    test_ids = test["ID"].copy()

    print(f"\nПропущенные значения в LoanApproved: {train['LoanApproved'].isnull().sum()}")
    print(f"Распределение до обработки: {train['LoanApproved'].value_counts().to_dict()}")

    train = train.dropna(subset=["LoanApproved"]).reset_index(drop=True)
    y = train["LoanApproved"].astype(int)

    print(f"\nПосле удаления NaN: {len(train)} строк")
    print(f"Баланс классов: {y.value_counts(normalize=True).to_dict()}")

    augment_features(train)
    augment_features(test)

    _encode_education(train)
    _encode_education(test)

    # Feature / target separation
    X = train.drop(columns=["LoanApproved"])
    X_test = test.drop(columns=["ID"])

    # Keep only the features present in both frames. The training column order
    # is preserved: a set intersection would reorder the features from run to
    # run and make the fitted models non-reproducible.
    # NOTE(review): the raw ApplicationDate column is presumably read as text
    # and would then be one-hot encoded per distinct date - confirm intended.
    test_columns = set(X_test.columns)
    common_cols = [c for c in X.columns if c in test_columns]
    X = X[common_cols]
    X_test = X_test[common_cols]

    print(f"\nПосле синхронизации: {len(common_cols)} признаков")
    print(f"X shape: {X.shape}, X_test shape: {X_test.shape}")

    X_scaled, X_test_scaled = _preprocess_features(X, X_test)

    X_train, X_val, y_train, y_val = train_test_split(
        X_scaled, y, test_size=0.2, random_state=seed, stratify=y
    )

    y_train = np.asarray(y_train)
    y_val = np.asarray(y_val)

    print("\nПроверка собственных метрик")
    tmp_clf = GradientBoostingClassifier(random_state=seed)
    tmp_clf.fit(X_train, y_train)
    y_tmp = tmp_clf.predict(X_val)

    print("Accuracy:", my_accuracy(y_val, y_tmp), accuracy_score(y_val, y_tmp))
    print("Precision:", my_precision(y_val, y_tmp), precision_score(y_val, y_tmp))
    print("Recall:", my_recall(y_val, y_tmp), recall_score(y_val, y_tmp))
    print("F1:", my_f1(y_val, y_tmp), f1_score(y_val, y_tmp))

    print("\nСравнение бэггинга")
    base = DecisionTreeClassifier(max_depth=5, random_state=seed)
    bag_params = {"n_estimators": 20, "max_samples": 0.8, "random_state": seed}

    my_bag = MyBagging(base_estimator=base, **bag_params)
    my_bag.fit(X_train, y_train)
    my_bag_proba = my_bag.predict_proba(X_val)[:, 1]
    my_bag_auc = roc_auc_score(y_val, my_bag_proba)

    sk_bag = BaggingClassifier(estimator=base, **bag_params)
    sk_bag.fit(X_train, y_train)
    sk_bag_proba = sk_bag.predict_proba(X_val)[:, 1]
    sk_bag_auc = roc_auc_score(y_val, sk_bag_proba)

    print(f"MyBagging ROC-AUC: {my_bag_auc:.4f}")
    print(f"sklearn Bagging ROC-AUC: {sk_bag_auc:.4f}")

    print("\nСравнение градиентного бустинга")
    # Both implementations get identical hyper-parameters; otherwise the
    # printed scores would not be comparable.
    gb_params = {"n_estimators": 50, "learning_rate": 0.1, "max_depth": 2}

    my_gb = MyGradientBoosting(**gb_params)
    my_gb.fit(X_train, y_train)
    my_gb_proba = my_gb.predict_proba(X_val)[:, 1]
    my_gb_auc = roc_auc_score(y_val, my_gb_proba)

    sk_gb = GradientBoostingClassifier(**gb_params, random_state=seed)
    sk_gb.fit(X_train, y_train)
    sk_gb_proba = sk_gb.predict_proba(X_val)[:, 1]
    sk_gb_auc = roc_auc_score(y_val, sk_gb_proba)

    print(f"MyGradientBoosting ROC-AUC: {my_gb_auc:.4f}")
    print(f"sklearn GradientBoosting ROC-AUC: {sk_gb_auc:.4f}")

    print("\nСравнение бустингов (sklearn / LGBM / XGB / CatBoost)")

    models = {
        "sklearn_GB": GradientBoostingClassifier(
            n_estimators=30,
            learning_rate=0.1,
            max_depth=2,
            random_state=seed,
        ),
        "LightGBM": lgb.LGBMClassifier(
            n_estimators=50,
            learning_rate=0.1,
            num_leaves=31,
            max_depth=-1,
            subsample=0.8,
            # LightGBM applies `subsample` only when subsample_freq > 0.
            subsample_freq=1,
            colsample_bytree=0.8,
            random_state=seed,
        ),
        "XGBoost": xgb.XGBClassifier(
            n_estimators=50,
            learning_rate=0.1,
            max_depth=4,
            subsample=0.8,
            colsample_bytree=0.8,
            eval_metric="logloss",
            random_state=seed,
        ),
        "CatBoost": CatBoostClassifier(
            iterations=50,
            learning_rate=0.1,
            depth=4,
            loss_function="Logloss",
            eval_metric="AUC",
            verbose=False,
            random_state=seed,
        ),
    }

    model_scores = {}

    # Quick comparison on a subsample; a seeded generator keeps the chosen
    # rows (and therefore the printed ranking) identical between runs.
    subsample_cmp = min(3000, X_train.shape[0])
    cmp_rng = np.random.RandomState(seed)
    idx_cmp = cmp_rng.choice(X_train.shape[0], subsample_cmp, replace=False)
    X_cmp = X_train[idx_cmp]
    y_cmp = y_train[idx_cmp]

    for name, mdl in models.items():
        mdl.fit(X_cmp, y_cmp)  # the subsample instead of X_train, y_train
        proba = mdl.predict_proba(X_val)[:, 1]
        score = roc_auc_score(y_val, proba)
        print(f"{name}: ROC-AUC = {score:.4f}")
        model_scores[name] = score

    best_model_name = max(model_scores, key=model_scores.get)
    print(f"\nЛучшая модель по ROC-AUC: {best_model_name} ({model_scores[best_model_name]:.4f})")

    # Parameters that are fixed rather than tuned. They are shared by the
    # objective and the final model so that the model trained after the search
    # is the same one Optuna scored (study.best_params holds only the
    # suggested values).
    fixed_lgbm_params = {"random_state": seed, "subsample_freq": 1}

    def objective(trial):
        params = {
            "n_estimators": trial.suggest_int("n_estimators", 200, 600),
            "num_leaves": trial.suggest_int("num_leaves", 16, 64),
            "learning_rate": trial.suggest_float("learning_rate", 0.01, 0.2, log=True),
            "max_depth": trial.suggest_int("max_depth", -1, 10),
            "subsample": trial.suggest_float("subsample", 0.6, 1.0),
            "colsample_bytree": trial.suggest_float("colsample_bytree", 0.6, 1.0),
        }
        clf = lgb.LGBMClassifier(**fixed_lgbm_params, **params)
        clf.fit(X_train, y_train)
        proba = clf.predict_proba(X_val)[:, 1]
        return roc_auc_score(y_val, proba)

    # A seeded sampler makes the search itself repeatable.
    study = optuna.create_study(
        direction="maximize",
        sampler=optuna.samplers.TPESampler(seed=seed),
    )
    study.optimize(objective, n_trials=30)

    print("Best params:", study.best_params)
    print("Best ROC-AUC:", study.best_value)

    best_lgbm = lgb.LGBMClassifier(**fixed_lgbm_params, **study.best_params)
    best_lgbm.fit(X_train, y_train)
    y_val_proba = best_lgbm.predict_proba(X_val)[:, 1]
    y_val_pred = (y_val_proba >= 0.5).astype(int)

    # Validation metrics
    val_roc_auc = roc_auc_score(y_val, y_val_proba)
    val_precision = precision_score(y_val, y_val_pred)
    val_recall = recall_score(y_val, y_val_pred)
    val_f1 = f1_score(y_val, y_val_pred)
    val_pr_auc = my_pr_auc(y_val, y_val_proba)

    print("\nМЕТРИКИ ЛУЧШЕЙ МОДЕЛИ validation:")
    print(f"ROC-AUC: {val_roc_auc:.4f}")
    print(f"Precision: {val_precision:.4f}")
    print(f"Recall:    {val_recall:.4f}")
    print(f"F1-Score:  {val_f1:.4f}")
    print(f"PR-AUC:    {val_pr_auc:.4f}")

    cv_scores = cross_val_score(best_lgbm, X_scaled, y, cv=3, scoring="roc_auc", n_jobs=1)
    print(f"\nCV ROC-AUC: {cv_scores.mean():.4f} (+- {cv_scores.std():.4f})")

    y_test_proba = best_lgbm.predict_proba(X_test_scaled)[:, 1]
    y_test_pred = (y_test_proba >= 0.5).astype(int)

    submission = pd.DataFrame({
        "ID": test_ids,
        "LoanApproved": y_test_pred
    })

    submission.to_csv("submission.csv", index=False)

    _plot_validation_curves(y_val, y_val_proba, val_roc_auc, val_pr_auc)


# Run the pipeline only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()
