# Get the data

Get the data processed in the previous notebook (Exploratory Data Analysis)

In [None]:
import pandas as pd
import matplotlib.pyplot as plt

In [None]:
from src.constants import X_TRAIN_PATH, X_TEST_PATH, Y_TRAIN_PATH, Y_TEST_PATH

# load the processed splits back from their corresponding CSV files;
# the paths are read in the same order as the names being unpacked
X_train, X_test, y_train, y_test = (
    pd.read_csv(filepath_or_buffer=csv_path, sep=',')
    for csv_path in (X_TRAIN_PATH, X_TEST_PATH, Y_TRAIN_PATH, Y_TEST_PATH)
)

In [None]:
# verify X_train: display its first rows to check that the file was loaded as expected
X_train.head()

In [None]:
# verify X_test: display its first rows to check that the file was loaded as expected
X_test.head()

In [None]:
# verify y_train: display its first rows to check that the file was loaded as expected
y_train.head()

In [None]:
# verify y_test: display its first rows to check that the file was loaded as expected
y_test.head()

# Option 1. Default model

## Step 1: Initialization and training of the model

In [None]:
# TODO: `ModelType` is a placeholder. Replace it with the concrete scikit-learn
# classifier to evaluate and import that class here; until then this cell raises
# a NameError.
#from sklearn import ModelType

model = ModelType(random_state = 42)
# y_train is read with pd.read_csv and is therefore a DataFrame; scikit-learn
# classifiers expect a 1-D target and emit a DataConversionWarning for a column
# vector, so flatten it first (assumes y_train holds a single target column).
model.fit(X_train, y_train.values.ravel())

In [None]:
# get_params() returns the hyperparameters the default model was created with
print(f"Hyperparameters of the default model: {model.get_params()}")

## Step 2: Model prediction

In [None]:
# make the prediction: predicted class label for every row of X_test
y_pred = model.predict(X_test)
# bare expression so the notebook displays the predicted labels
y_pred

In [None]:
# make the prediction of the probabilities of being one class or another;
# the result has one column per class (column 1 is used later for the AUC-ROC)
y_prob = model.predict_proba(X_test)

## Step 3: Metrics

Let's print the full report of the model

In [None]:
from sklearn.metrics import (accuracy_score, precision_score, recall_score,
                            f1_score, roc_auc_score, confusion_matrix,
                            log_loss, classification_report)

# metrics computed from the predicted class labels of the default model
default_model_accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)
default_model_precision = precision_score(y_true=y_test, y_pred=y_pred)
default_model_recall = recall_score(y_true=y_test, y_pred=y_pred)
default_model_f1 = f1_score(y_true=y_test, y_pred=y_pred)
default_model_confusion = confusion_matrix(y_true=y_test, y_pred=y_pred)
default_model_report = classification_report(y_true=y_test, y_pred=y_pred)

# metrics computed from the predicted probabilities; column 1 of y_prob is
# used as the score for the AUC-ROC
default_model_auc_roc = roc_auc_score(y_true=y_test, y_score=y_prob[:, 1])
default_model_log_loss = log_loss(y_true=y_test, y_pred=y_prob)

# a single print with a newline separator emits the same lines, in the same
# order, as one print call per metric
print(
    f'Accuracy: {default_model_accuracy}',
    f'Precision: {default_model_precision}',
    f'Recall: {default_model_recall}',
    f'F1-Score: {default_model_f1}',
    f'AUC-ROC: {default_model_auc_roc}',
    f'Confusion Matrix:\n{default_model_confusion}',
    f'Log Loss: {default_model_log_loss}',
    f'Classification Report:\n{default_model_report}',
    sep='\n',
)

**TODO:** write the analysis of the metrics reported above.

Now let's draw the confusion matrix

In [None]:
from src.draw_utils import draw_confusion_matrix

# plot the confusion matrix computed for the default model
draw_confusion_matrix(confusion=default_model_confusion)

The interpretation of a confusion matrix is as follows:

- **True positive (TP)**: corresponds to the number 40; these are the cases where the model predicted positive **(the person has diabetes)** and the actual class is also positive.
- **True negative (TN)**: corresponds to the number 75; these are the cases where the model predicted negative **(the person does not have diabetes)** and the actual class is also negative.
- **False positive (FP)**: corresponds to the number 24; these are the cases where the model predicted positive, but the actual class is negative.
- **False negative (FN)**: corresponds to the number 15; these are the cases where the model predicted negative, but the actual class is positive.

# Option 2. Model with optimization

In [None]:
# create another model: a fresh, unfitted instance with the same seed as the default
# model, to be used as the base estimator of the hyperparameter search
# NOTE(review): `ModelType` is a placeholder and must be replaced by the concrete
# classifier class before this cell can run
opt_model = ModelType(random_state=42)

## Step 1: Create the hyperparameter optimization model

In [None]:
from sklearn.model_selection import GridSearchCV

# search space: fill in by hand the hyperparameters (name -> list of candidate values)
# to tune; which ones apply depends on the model being used
hyperparams = {}

# grid search over `hyperparams`, scored by accuracy with 5-fold cross-validation;
# n_jobs=-1 runs the fits in parallel on all available processors
grid = GridSearchCV(
    estimator=opt_model,
    param_grid=hyperparams,
    scoring='accuracy',
    cv=5,
    n_jobs=-1,
    verbose=1,
)

## Step 2: Fit the model

In [None]:
# y_train is a DataFrame, so y_train.values is a 2-D array; scikit-learn expects a
# 1-D target and would emit a DataConversionWarning on every cross-validation fit,
# so flatten it first (assumes y_train holds a single target column)
grid.fit(X_train, y_train.values.ravel())

# report the hyperparameter combination that obtained the best cross-validated score
print(f"Best hyperparameters: {grid.best_params_}")

## Step 3: Get the best model and predict

In [None]:
# get the best parameters and model
best_params = grid.best_params_
best_model: ModelType = grid.best_estimator_

In [None]:
# predict the values: class labels of X_test according to the optimized model
# (this overwrites the y_pred computed for the default model)
y_pred = best_model.predict(X_test)

In [None]:
# make the prediction of the probabilities of being one class or another
# (this overwrites the y_prob computed for the default model)
y_prob = best_model.predict_proba(X_test)

## Step 4: Metrics of the model

In [None]:
from sklearn.metrics import (accuracy_score, precision_score, recall_score,
                            f1_score, roc_auc_score, confusion_matrix,
                            log_loss, classification_report)

# metrics computed from the predicted class labels of the optimized model
optimized_model_accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)
optimized_model_precision = precision_score(y_true=y_test, y_pred=y_pred)
optimized_model_recall = recall_score(y_true=y_test, y_pred=y_pred)
optimized_model_f1 = f1_score(y_true=y_test, y_pred=y_pred)
optimized_model_confusion = confusion_matrix(y_true=y_test, y_pred=y_pred)
optimized_model_report = classification_report(y_true=y_test, y_pred=y_pred)

# metrics computed from the predicted probabilities; column 1 of y_prob is
# used as the score for the AUC-ROC
optimized_model_auc_roc = roc_auc_score(y_true=y_test, y_score=y_prob[:, 1])
optimized_model_log_loss = log_loss(y_true=y_test, y_pred=y_prob)

# a single print with a newline separator emits the same lines, in the same
# order, as one print call per metric
print(
    f'Accuracy: {optimized_model_accuracy}',
    f'Precision: {optimized_model_precision}',
    f'Recall: {optimized_model_recall}',
    f'F1-Score: {optimized_model_f1}',
    f'AUC-ROC: {optimized_model_auc_roc}',
    f'Confusion Matrix:\n{optimized_model_confusion}',
    f'Log Loss: {optimized_model_log_loss}',
    f'Classification Report:\n{optimized_model_report}',
    sep='\n',
)

In [None]:
# plot the confusion matrix computed for the optimized model
# (draw_confusion_matrix is imported in an earlier cell, from src.draw_utils)
draw_confusion_matrix(confusion=optimized_model_confusion)

The interpretation of a confusion matrix is as follows:

- **True positive (TP)**: corresponds to the number 42; these are the cases where the model predicted positive **(the person has diabetes)** and the actual class is also positive.
- **True negative (TN)**: corresponds to the number 72; these are the cases where the model predicted negative **(the person does not have diabetes)** and the actual class is also negative.
- **False positive (FP)**: corresponds to the number 27; these are the cases where the model predicted positive, but the actual class is negative.
- **False negative (FN)**: corresponds to the number 13; these are the cases where the model predicted negative, but the actual class is positive.

# Conclusion

In [None]:

from src.markdown_utils import show_comparison_table

# names of the metrics to compare, in the same order as the value lists below
metrics: list[str] = [
    'Accuracy',
    'Precision',
    'Recall',
    'F1-Score',
    'AUC-ROC',
]

# values obtained by the default model, one per metric name
default_model_metrics: list[float] = [
    default_model_accuracy,
    default_model_precision,
    default_model_recall,
    default_model_f1,
    default_model_auc_roc,
]

# values obtained by the optimized model, one per metric name
optimized_model_metrics: list[float] = [
    optimized_model_accuracy,
    optimized_model_precision,
    optimized_model_recall,
    optimized_model_f1,
    optimized_model_auc_roc,
]

# build and show the Markdown table comparing both models
show_comparison_table(
    metric_names=metrics,
    default_metrics=default_model_metrics,
    optimized_metrics=optimized_model_metrics,
)

In [None]:
from src.draw_utils import draw_comparison_confusion_matrices

# draw the confusion matrices of both models in one comparison figure,
# each labelled with the name of its model
draw_comparison_confusion_matrices(
    confusion_1=default_model_confusion,
    confusion_2=optimized_model_confusion,
    confusion_matrix_1_name='Default model',
    confusion_matrix_2_name='Optimized model',
)

We can see that the default model and the optimized one are really close: their metrics are not too different. One model performs better on the negative label 0 (does not have diabetes), while the other model performs better on the positive label 1 (has diabetes).

The default model performs better on label 0: it produces more true negatives and fewer false positives, which is why it has better accuracy and precision. The optimized model performs better on label 1: it produces more true positives and fewer false negatives, and these results are backed up by the metrics, because the optimized model has better recall, F1-score and AUC-ROC.