In [None]:
import gzip
import json
import pickle

import ipywidgets as widgets
import pandas as pd
import wqet_grader
from imblearn.over_sampling import RandomOverSampler
from ipywidgets import interact
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.impute import SimpleImputer
from sklearn.metrics import (
    ConfusionMatrixDisplay,
    classification_report,
    confusion_matrix,
)
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.pipeline import make_pipeline
from teaching_tools.widgets import ConfusionMatrixWidget

In [None]:
# Models in this notebook are built sequentially, each one improving on the
# previous one, until we reach the final model.
# Hold out 20% of the rows as a test set; the seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

print("X_train shape:", X_train.shape)
print("y_train shape:", y_train.shape)
print("X_test shape:", X_test.shape)
print("y_test shape:", y_test.shape)

In [None]:
# Balance the training classes by randomly re-sampling the minority class.
# Only the training split is resampled; the test split is left untouched.
over_sampler = RandomOverSampler(random_state=42)
X_train_over, y_train_over = over_sampler.fit_resample(X_train, y_train)
print("X_train_over shape:", X_train_over.shape)
X_train_over.head(n=5)

In [None]:
# Pipeline: fill in missing values, then fit gradient-boosted trees.
# random_state is fixed so the boosted trees (and therefore the grid-search
# ranking and every downstream metric) are reproducible, matching the seed
# already used for the train/test split and the over-sampler.
clf = make_pipeline(
    SimpleImputer(),
    GradientBoostingClassifier(random_state=42),
)

In [None]:
# Hyperparameter grid for the pipeline. Keys follow scikit-learn's
# "<step name>__<parameter>" convention for make_pipeline step names.
params = dict(
    simpleimputer__strategy=["mean", "median"],
    gradientboostingclassifier__n_estimators=range(20, 31, 5),  # 20, 25, 30
    gradientboostingclassifier__max_depth=range(2, 5),  # 2, 3, 4
)
params

In [None]:
# Exhaustive search over `params` with 5-fold cross-validation, using all
# available CPU cores (n_jobs=-1) and printing a short progress summary.
model = GridSearchCV(estimator=clf, param_grid=params, cv=5, n_jobs=-1, verbose=1)

In [None]:
# Run the grid search on the over-sampled (class-balanced) training data.
model.fit(X=X_train_over, y=y_train_over)

In [None]:
# Tabulate the cross-validation results and show the ten best-ranked
# hyperparameter combinations (rank 1 = best mean test score).
results = pd.DataFrame(data=model.cv_results_)
results.sort_values(by="rank_test_score").head(n=10)

In [None]:
# Extract the hyperparameter combination that ranked best in the grid search
model.best_params_

In [None]:
# Score the tuned model on the original (not over-sampled) training split and
# on the held-out test split.
acc_train = model.score(X_train, y_train)
acc_test = model.score(X_test, y_test)

print("Training Accuracy:", round(acc_train, 4))
# This score comes from the held-out test split, so label it as such; the
# validation scores are the cross-validation results inside GridSearchCV.
print("Test Accuracy:", round(acc_test, 4))

In [None]:
# Plot the confusion matrix of the tuned model on the held-out test split.
# The trailing semicolon suppresses the display-object repr so only the
# figure is shown (same convention as the interact() cell in this notebook).
ConfusionMatrixDisplay.from_estimator(model, X_test, y_test);

Accuracy = (TP + TN) / (TP + TN + FN + FP)

Precision = TP /(TP + FP) 

i.e., true positives / all values the model predicted as positive.
It tells you how many of the values that the model predicted as positive were actually positive.

Recall = TP / (TP + FN)

i.e., the positive values that the model predicted correctly, out of all the actual positive values in the data set.

In [None]:
# Per-class precision, recall and F1 for the tuned model on the test split.
print(classification_report(y_true=y_test, y_pred=model.predict(X_test)))

In [None]:
# Build the course-provided confusion-matrix widget from the tuned model and
# the test split, then render it.
# NOTE(review): ConfusionMatrixWidget comes from teaching_tools; its exact
# behaviour is not visible in this notebook — confirm against that package.
c = ConfusionMatrixWidget(model, X_test, y_test)
c.show()

In [None]:
c.show_eu() # widget view for European Union insolvency-proceeding cases

In [None]:
##### your own customized threshold slider
def make_cnf_matrix(threshold):
    """Print profit/loss figures and plot the test-set confusion matrix.

    Reads the notebook-level ``model``, ``X_test`` and ``y_test``.

    Parameters
    ----------
    threshold : float
        Probability cut-off. An observation is predicted positive when the
        model's probability for the last class is strictly greater than
        this value.

    Notes
    -----
    Profit is reported as 100,000,000 per true positive and losses as
    250,000,000 per false positive.
    """
    # Last column of predict_proba is the probability of the last class
    positive_proba = model.predict_proba(X_test)[:, -1]
    y_pred = positive_proba > threshold
    # For a 2x2 matrix, ravel() yields the cells in the order tn, fp, fn, tp
    tn, fp, fn, tp = confusion_matrix(y_test, y_pred).ravel()
    print(f"Profit: €{tp * 100_000_000}")
    print(f"Losses: €{fp * 250_000_000}")
    ConfusionMatrixDisplay.from_predictions(y_test, y_pred, colorbar=False)


# Slider for the probability threshold: 0 to 1 in steps of 0.05, starting at 0.5.
thresh_widget = widgets.FloatSlider(
    value=0.5,
    min=0,
    max=1,
    step=0.05,
)

# Re-draw the confusion matrix whenever the slider moves; the trailing
# semicolon hides the returned object's repr.
interact(make_cnf_matrix, threshold=thresh_widget);