### Data and Imports

In [None]:
import warnings
import pandas as pd
from datetime import UTC, datetime
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score, classification_report
from imblearn.pipeline import Pipeline as PL
import numpy as np
from scipy.stats import ttest_1samp
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier

from sklearn.linear_model import LogisticRegression
from sklearn.metrics import (
    accuracy_score,
    confusion_matrix,
    f1_score,
    make_scorer,
    precision_score,
    recall_score,
    roc_auc_score,
    roc_curve,
)
from sklearn.model_selection import cross_val_score
from imblearn.over_sampling import SMOTE

from sklearn.model_selection import GridSearchCV, RandomizedSearchCV, StratifiedKFold
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier
from xgboost import XGBClassifier, XGBRegressor
import numpy as np
import matplotlib.pyplot as plt
from sklearn.utils import shuffle
import seaborn as sns
warnings.filterwarnings("ignore")
warnings.simplefilter("ignore")

In [None]:
#Skew the data to 50/50

data = pd.read_csv("data/Loan_default.csv")
data = shuffle(data)
out = []
for idx, j in enumerate(data["Default"]):
    if j == (len(out) % 2):
        out.append(data.iloc[idx])


data = pd.DataFrame(out)

In [None]:
# The Data will be all of it
data = pd.read_csv("data/Loan_default_50k.csv")


In [4]:
X = data.drop(["LoanID", "Default"], axis=1)
y = data["Default"]

In [5]:
categorical_features = [
    "Education",
    "EmploymentType",
    "MaritalStatus",
    "HasMortgage",
    "HasDependents",
    "LoanPurpose",
    "HasCoSigner",
]
numerical_features = X.select_dtypes(include=["int64", "float64"]).columns.tolist()


In [6]:
preprocessor = ColumnTransformer(
    transformers=[
        ("num", StandardScaler(), numerical_features),
        ("cat", OneHotEncoder(), categorical_features),
    ]
)


In [None]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

In [None]:
models_hyper = {
    "Logistic Regression": {
        "model": LogisticRegression(max_iter=1000, random_state=42),
        "params": {
            "penalty": ["l1", "l2", None],
            "fit_intercept": [True, False],
            "intercept_scaling": [0.1, 1, 10],
            "dual": [True, False],
            "C": [i/10 for i in range(0,50,5)]
        },
    },
    "Gradient Boosting": {
        "model": GradientBoostingClassifier(random_state=42),
        "params": {
            "n_estimators": [100, 200, 300],
            "learning_rate": [0.001, 0.01, 0.1, 0.2],
            "max_depth": [3, 5, 10],
            "min_samples_split": [2, 5, 10],
            "min_samples_leaf": [1, 12, 22, 35, 10],
        },
    },
    "Random Forest": {
        "model": RandomForestClassifier(random_state=42),
        "params": {
            "n_estimators": range(10,500,10),
            "max_depth": [None, 10, 20, 30],
            "min_samples_split": [2, 5, 10],
            "min_samples_leaf": [1, 2, 4],
            "bootstrap": [True, False],
        },
    },
    "XGB Classifier": {
        "model": XGBClassifier(random_state=42),
        "params": {"n_estimators": [100, 200, 300], "learning_rate": [0.001, 0.01, 0.1, 0.2], "max_depth": [3, 5, 10]},
    },
}
strat = StratifiedKFold(n_splits=3, shuffle=True, random_state=42)

### Gridsearch (Takes a long time)

In [None]:
models = {}
scorer = make_scorer(f1_score)
for name, model in models_hyper.items():
    print(f"Starting {name}")
    starting = datetime.now(tz=UTC).timestamp()
    grid_search = GridSearchCV(
        estimator=model["model"],
        param_grid=model.get("params", {}),
        scoring=scorer,
        cv=strat,
        verbose=1,
        n_jobs=-1,
    )

    grid_search.fit(X_train, y_train)
    models.update({name: grid_search.best_estimator_})
    print(f"It took {datetime.now(tz=UTC).timestamp()-starting} seconds")
    print(f"{name} Best Params: {grid_search.best_params_}")
    print(f"{name} Best Score: {grid_search.best_score_}\n")

### Found Params

This is the "Optimal" Models that we found

In [None]:
opt = [
    {
        "Name": "LogisticRegression",
        "Model": LogisticRegression,
        "Best Params": {
            "intercept_scaling": 0.1,
        },
    },
    {
        "Name": "GradientBoostingClassifier",
        "Model": GradientBoostingClassifier,
        "Best Params": {
            "learning_rate": 0.2,
            "max_depth": 3,
            "min_samples_leaf": 10,
        },
    },
    {
        "Name": "RandomForestClassifier",
        "Model": RandomForestClassifier,
        "Best Params": {
            "max_depth": 20,
            "min_samples_split": 5,
            "n_estimators": 200,
        },
    },
    {
        "Name": "XGBClassifier",
        "Model": XGBClassifier,
        "Best Params": {
            "learning_rate": 0.1,
            "max_depth": 3,
            "n_estimators": 200,
        },
    },
]

models = {mod["Name"]:mod["Model"](**mod["Best Params"]) for mod in opt}

In [12]:
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
cv_scores = {}
res = {}
confusion_matrices = {}
roc_data = {}


for name, model in models.items():
    print(f"Working on {name}")
    pipeline = PL([("preprocessor", preprocessor), ("smote", SMOTE(random_state=42)), ("classifier", model)])
    pipeline.fit(X_train, y_train)
    y_pred = pipeline.predict(X_test)
    y_prob = pipeline.predict_proba(X_test)[:, 1] if hasattr(pipeline, "predict_proba") else None
    cv_scores[name] = cross_val_score(pipeline, X_train, y_train, cv=cv, scoring="f1", n_jobs=-1)

    res[model] = {
        "Accuracy": accuracy_score(y_test, y_pred),
        "AUC-ROC": roc_auc_score(y_test, y_prob) if y_prob is not None else None,
        "Precision": precision_score(y_test, y_pred),
        "Recall": recall_score(y_test, y_pred),
        "F1 Score": f1_score(y_test, y_pred),
    }

    fpr, tpr, _ = roc_curve(y_test, y_prob)
    auc_score = roc_auc_score(y_test, y_prob)

    roc_data[model] = {"fpr": fpr, "tpr": tpr, "auc": auc_score}
    confusion_matrices[model] = confusion_matrix(y_test, y_pred)



Working on LogisticRegression


### Figures

Print those cool plots for our presi

In [None]:
try:
  figure_data = pd.read_csv(open("figs/out.csv"))

  data_points = set(figure_data["Model"].values)
  points = []
  for i in data_points:
      points.append(figure_data.where(figure_data["Model"] == i).dropna())

  import plotly.express as px
  for i in points:
    fig = px.line_polar(i, r="Points",
                        theta="Catagory",
                        color="Model",
                        line_close=True,
                        template="plotly_dark",
                        range_r=[0, 100]
                        )

    fig.update_polars(angularaxis_showgrid=False,
                      radialaxis_gridwidth=0,
                      bgcolor="#494b5a",
                      gridshape='linear',
                      radialaxis_showticklabels=False
                      )

    fig.update_layout(paper_bgcolor="#2c2f36")
    fig.show()

  fig = px.line_polar(x, r="Points",
                      theta="Catagory",
                      color="Model",
                      line_close=True,
                      range_r=[0, 100],
                      template="plotly_dark")

  fig.update_polars(angularaxis_showgrid=False,
                    radialaxis_gridwidth=0,
                    bgcolor="#494b5a",
                      gridshape='linear',
                    radialaxis_showticklabels=False
                    )

  fig.update_layout(paper_bgcolor="#2c2f36")
  fig.show()
except ImportError:
  print("You don't have plotly")

Print CV Scores

In [None]:
for idx in cv_scores:
    mean_score = np.mean(cv_scores[idx])
    std_score = np.std(cv_scores[idx])

    t_stat, p_value = ttest_1samp(cv_scores[idx], 0.5)


    print(f"Mean AUC Score: {mean_score:.4f}")
    print(f"Standard Deviation of Scores: {std_score:.3f}")
    print(f"T-Statistic: {t_stat:.2f}")
    print(f"P-Value: {p_value:.10f}\n")


LogisticRegression
Mean AUC Score: 0.3320
Standard Deviation of Scores: 0.002
T-Statistic: -155.60
P-Value: 0.0000000102



Print actual bar graphs

In [None]:
for metric in res:
    plt.figure(figsize=(10, 6))
    bars = plt.barh(res[metric].keys(), res[metric].values(), color="lightblue")
    plt.title(metric)
    plt.xlim(0, 1)
    plt.gca().invert_yaxis()

    for bar, score in zip(bars, res[metric].values(), strict=False):
        plt.text(bar.get_width() + 0.01, bar.get_y() + bar.get_height() / 2, f"{score:.3f}", va="center")

    plt.show()


Print a table with the metrics

In [None]:

metrics = ["Accuracy", "AUC-ROC", "Precision", "Recall", "F1 Score"]

model_names = list(res.keys())
scores = np.array([[res[model][metric] for metric in metrics] for model in model_names])

df_results = pd.DataFrame(scores, index=model_names, columns=metrics)

df_results


Print Con Matrix

In [None]:
for i, (name, cm) in confusion_matrices.items():
    plt.plot(3, 3)
    sns.heatmap(cm, annot=True, fmt="d", cmap="Blues", cbar=False, annot_kws={"size": 30})
    plt.title(f"Confusion Matrix: {name}")
    plt.xlabel("Predicted Label")
    plt.ylabel("True Label")
    plt.show()
