# Model building

In [None]:
import pickle
from pathlib import Path

import pandas as pd
import matplotlib.pyplot as plt
from scipy.stats import uniform, randint
from sklearn.model_selection import cross_val_score, train_test_split, RandomizedSearchCV
from sklearn.ensemble import HistGradientBoostingClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, ConfusionMatrixDisplay
import seaborn as sns

import configuration as config

## 1. Load data

In [None]:
# Read the training and testing tables from the CSV paths named in the
# configuration module.
train_df = pd.read_csv(config.TRAINING_DATAFILE)
test_df = pd.read_csv(config.TESTING_DATAFILE)

## 2. Cross-validation: naive model

In [None]:
def crossval_boosting_model(training_data: pd.DataFrame, target_variable: str):
    """Cross-validate a default HistGradientBoostingClassifier.

    Args:
        training_data: Table holding the feature columns and the target column.
        target_variable: Name of the column in ``training_data`` to predict.

    Returns:
        The per-fold scores produced by ``cross_val_score`` with 5 folds.
    """
    features = training_data.drop(columns=target_variable)
    labels = training_data[target_variable]

    # Only 80% of the rows take part in cross-validation. The 20% hold-out
    # half of the split is not used in this function, so it is discarded
    # explicitly instead of being bound to unused names.
    features_train, _, labels_train, _ = train_test_split(
        features, labels, test_size=0.2, random_state=42
    )

    boosting_model = HistGradientBoostingClassifier(random_state=42)

    # Return the scores: a bare ``scores`` expression inside a function body is
    # evaluated and thrown away, which left the caller with None.
    return cross_val_score(boosting_model, features_train, labels_train, cv=5)

# Cross-validate the untuned boosting model, using the 'incident' column of
# the training table as the target.
crossval_boosting_model(train_df, 'incident')

## 3. Test-set performance: confusion matrix (naive model)

In [None]:
def confusion_plot(y_test, y_pred):
    """Plot a row-normalized confusion matrix to evaluate predictions on unseen data.

    Args:
        y_test: True class labels.
        y_pred: Predicted class labels, in the same order as ``y_test``.
    """
    acc = accuracy_score(y_test, y_pred) * 100

    # Derive the class labels from the data rather than from a module-level
    # model object: the function then works for any pair of label vectors, and
    # the tick labels are guaranteed to match the row/column order of the
    # matrix because the same list is passed to both calls.
    class_labels = sorted(set(y_test) | set(y_pred))
    conf_matrix = confusion_matrix(
        y_test, y_pred, labels=class_labels, normalize='true'
    )

    _, ax = plt.subplots(figsize=(8, 6), dpi=100)
    matrix_display = ConfusionMatrixDisplay(conf_matrix, display_labels=class_labels)

    # NOTE(review): the title says "Diabetes Detection Model" while the notebook
    # trains on an 'incident' target - confirm the wording is intended.
    ax.set(title=f'Confusion Matrix for the Diabetes Detection Model with {acc:.2f}% overall accuracy')
    matrix_display.plot(ax=ax, values_format='.2%')

# Fit the untuned model on the training table and predict the separate test
# table, so that hist_boost_model, y_test and y_pred exist at notebook level
# before they are used.
# NOTE(review): assumes test_df carries the same feature columns as train_df
# plus the 'incident' target column - confirm against the data files.
hist_boost_model = HistGradientBoostingClassifier(random_state=42)
hist_boost_model.fit(train_df.drop(columns='incident'), train_df['incident'])

y_test = test_df['incident']
y_pred = hist_boost_model.predict(test_df.drop(columns='incident'))

confusion_plot(y_test, y_pred)

## 4. Model optimization

In [None]:
# Your code here... Use one of the sklearn hyperparameter optimization functions to optimize the model

## 5. Test-set performance: confusion matrix (optimized model)

In [None]:
# Confusion matrix for the model after optimization.
# NOTE(review): section 4 is still a placeholder, so nothing recomputes y_pred
# from an optimized model before this call - confirm y_test / y_pred are
# (re)assigned from the optimized model's test-set predictions first.
confusion_plot(y_test, y_pred)

## 6. Probability distributions

In [None]:
# Your code here... Plot the distributions of predicted probabilities for
# incidents and non-incidents in the test set. Hint: sklearn classifiers have
# model.predict_proba(X), which returns class probabilities rather than class assignments

def probabilities_plot(model, data=None, target_variable='incident'):
    """Plot the distribution of predicted probabilities, split by true class.

    Args:
        model: Fitted classifier exposing ``predict_proba`` and ``classes_``.
        data: Table holding the feature columns plus the true-label column.
            Defaults to the notebook-level ``test_df`` so the one-argument
            call ``probabilities_plot(model)`` keeps working.
        target_variable: Name of the true-label column in ``data``.
    """
    if data is None:
        data = test_df

    # predict_proba needs the feature matrix; calling it with no arguments
    # raises a TypeError before anything is plotted.
    probabilities = model.predict_proba(data.drop(columns=target_variable))

    # predict_proba returns one column per class, ordered like model.classes_.
    # Plot the last column, i.e. the probability of the last class.
    # NOTE(review): assumes a binary target (incident vs. non-incident), where
    # the last class is the positive one - confirm the label encoding.
    plot_data = pd.DataFrame({
        'probability': probabilities[:, -1],
        'actual': data[target_variable].to_numpy(),
    })

    # Normalize each class separately so the smaller class is not flattened by
    # the larger one when the two distributions are overlaid.
    ax = sns.histplot(
        data=plot_data,
        x='probability',
        hue='actual',
        stat='density',
        common_norm=False,
    )
    ax.set(xlabel=f'Predicted probability of class {model.classes_[-1]}')

# Plot the predicted-probability distributions for the fitted model.
# NOTE(review): expects hist_boost_model to be a fitted classifier assigned by
# an earlier cell - confirm it is defined before running this cell.
probabilities_plot(hist_boost_model)

## 7. Save the optimized model

In [None]:
# Create the output directory (and any missing parent directories) before
# writing; exist_ok keeps re-runs of this cell from failing.
Path(config.MODEL_DIRECTORY).mkdir(parents=True, exist_ok=True)

# Open in binary *write* mode: pickle.dump needs a writable binary file, and
# 'rb' either fails on a missing file or yields a read-only handle.
# NOTE(review): the previous code dumped a name `model` that no cell assigns;
# hist_boost_model is the model name used by the rest of the notebook. Confirm
# it refers to the optimized model at this point before shipping the file.
with open(config.MODEL, 'wb') as output_file:
    pickle.dump(hist_boost_model, output_file)