# Get the data

Load the data that was processed in the previous notebook (Exploratory Data Analysis).

In [None]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt 


In [None]:
from src.constants import X_TRAIN_PATH, X_TEST_PATH, Y_TRAIN_PATH, Y_TEST_PATH

# Load the processed train/test splits produced by the previous notebook.
# Each path constant comes from src.constants and points to a comma-separated file.
X_train = pd.read_csv(filepath_or_buffer=X_TRAIN_PATH, sep=',')
X_test = pd.read_csv(filepath_or_buffer=X_TEST_PATH, sep=',')

y_train = pd.read_csv(filepath_or_buffer=Y_TRAIN_PATH, sep=',')
y_test = pd.read_csv(filepath_or_buffer=Y_TEST_PATH, sep=',')

# Fail fast with a readable message if the splits are misaligned, instead of
# hitting a less obvious error later inside scikit-learn.
assert len(X_train) == len(y_train), (
    f"X_train has {len(X_train)} rows but y_train has {len(y_train)}"
)
assert len(X_test) == len(y_test), (
    f"X_test has {len(X_test)} rows but y_test has {len(y_test)}"
)
assert list(X_train.columns) == list(X_test.columns), (
    "X_train and X_test must contain the same feature columns in the same order"
)

In [None]:
# Preview the first five rows of the training features to confirm they loaded
X_train.head(n=5)

In [None]:
# Preview the first five rows of the test features to confirm they loaded
X_test.head(n=5)

In [None]:
# Preview the first five rows of the training target to confirm it loaded
y_train.head(n=5)

In [None]:
# Preview the first five rows of the test target to confirm it loaded
y_test.head(n=5)

# Option 1. Model without any optimization

## Step 1: Initialization and training of the model

In [None]:
from sklearn.linear_model import LogisticRegression

# Baseline classifier: only max_iter is overridden. It is raised to 1000 because,
# per the original author's note, training otherwise emits a ConvergenceWarning.
model = LogisticRegression(max_iter=1000)
# The target is read as a one-column DataFrame; flatten it to 1-D before fitting.
model.fit(X=X_train, y=np.ravel(y_train.to_numpy()))

## Step 2: Model prediction

In [None]:
# Predict a class label for every row of the test set, then display the array
y_pred = model.predict(X=X_test)
y_pred

In [None]:
# Predicted class-membership probabilities for every row of the test set
# (column 1 is used below as the positive-class score for AUC-ROC)
y_prob = model.predict_proba(X=X_test)

## Step 3: Metrics

Let's print the full report of the model

In [None]:
from sklearn.metrics import (accuracy_score, precision_score, recall_score,
                            f1_score, roc_auc_score, confusion_matrix,
                            log_loss, classification_report)

# Threshold-based metrics computed from the predicted labels.
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)
confusion = confusion_matrix(y_test, y_pred)
report = classification_report(y_test, y_pred)

# Probability-based metrics. AUC-ROC uses column 1 of the probability matrix
# as the positive-class score.
auc_roc = roc_auc_score(y_test, y_prob[:, 1])
# Stored under a distinct name so the imported log_loss function is not
# overwritten by its own float result.
log_loss_value = log_loss(y_test, y_prob)

print(f'Accuracy: {accuracy}')
print(f'Precision: {precision}')
print(f'Recall: {recall}')
print(f'F1-Score: {f1}')
print(f'AUC-ROC: {auc_roc}')
print(f'Confusion Matrix:\n{confusion}')
print(f'Log Loss: {log_loss_value}')
print(f'Classification Report:\n{report}')

**TODO:** Add the analysis of the baseline model's metrics here.

Now let's draw the confusion matrix

In [None]:
from src.utils import draw_confusion_matrix

# Pass the confusion matrix computed in the metrics cell to the project helper
# from src.utils (presumably renders it as a plot; see the helper for details).
draw_confusion_matrix(confusion=confusion)

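The confusion matrix is interpreted as follows. For a binary problem, scikit-learn's `confusion_matrix` places the actual classes on the rows and the predicted classes on the columns, so the array printed above is laid out as `[[TN, FP], [FN, TP]]`: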

- **True positive (TP)**: the number of cases where the model predicted the positive class **(what the label means)** and the actual class is also positive.
- **True negative (TN)**: the number of cases where the model predicted the negative class **(what the label means)** and the actual class is also negative.
- **False positive (FP)**: the number of cases where the model predicted the positive class, but the actual class is negative.
- **False negative (FN)**: the number of cases where the model predicted the negative class, but the actual class is positive.

# Option 2. Model with optimization

In [None]:
# Base estimator for the hyperparameter search. max_iter matches the baseline
# model (1000), which was raised there to avoid a ConvergenceWarning, so the
# two options differ only in the hyperparameters being tuned.
# random_state is fixed for reproducibility.
opt_model = LogisticRegression(max_iter=1000, random_state=42)

## Step 1: Create the hyperparameter optimization model

In [None]:
from sklearn.model_selection import GridSearchCV

# Candidate values for the exhaustive search. An empty grid would evaluate only
# the estimator's current configuration, so nothing would be optimized.
# This is a conservative starting grid for LogisticRegression; widen or narrow
# it as needed.
hyperparams = {
    # inverse regularization strength: smaller values regularize more strongly
    'C': [0.001, 0.01, 0.1, 1, 10, 100],
    # optimizers that support the default L2 penalty on a binary target
    'solver': ['lbfgs', 'liblinear'],
    # optionally re-weight classes inversely to their frequency
    'class_weight': [None, 'balanced'],
}

# initialize the grid: 5-fold cross-validation scored on accuracy, using all CPU cores
grid = GridSearchCV(opt_model, hyperparams, scoring='accuracy', cv=5, n_jobs=-1, verbose=1)

## Step 2: Fit the model

In [None]:
# Run the cross-validated search on the training split; the one-column target
# frame is flattened to 1-D first.
grid.fit(X=X_train, y=np.ravel(y_train.to_numpy()))

# Report the hyperparameter combination selected by the search
print(f"Best hyperparameters: {grid.best_params_}")

## Step 3: Get the best model and predict

In [None]:
# Keep the best estimator found by the grid search, together with the
# hyperparameter combination that produced it
best_model: LogisticRegression = grid.best_estimator_
best_params = grid.best_params_

In [None]:
# Predict a class label for every row of the test set with the tuned model
y_pred = best_model.predict(X=X_test)

In [None]:
# Predicted class-membership probabilities from the tuned model
# (column 1 is used below as the positive-class score for AUC-ROC)
y_prob = best_model.predict_proba(X=X_test)

## Step 4: Metrics of the model

In [None]:
from sklearn.metrics import (accuracy_score, precision_score, recall_score,
                            f1_score, roc_auc_score, confusion_matrix,
                            log_loss, classification_report)

# Threshold-based metrics computed from the tuned model's predicted labels.
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)
confusion = confusion_matrix(y_test, y_pred)
report = classification_report(y_test, y_pred)

# Probability-based metrics. AUC-ROC uses column 1 of the probability matrix
# as the positive-class score.
auc_roc = roc_auc_score(y_test, y_prob[:, 1])
# Stored under a distinct name so the imported log_loss function is not
# overwritten by its own float result.
log_loss_value = log_loss(y_test, y_prob)

print(f'Accuracy: {accuracy}')
print(f'Precision: {precision}')
print(f'Recall: {recall}')
print(f'F1-Score: {f1}')
print(f'AUC-ROC: {auc_roc}')
print(f'Confusion Matrix:\n{confusion}')
print(f'Log Loss: {log_loss_value}')
print(f'Classification Report:\n{report}')

In [None]:
# Pass the tuned model's confusion matrix to the project helper. The helper is
# imported from src.utils in an earlier cell, so that cell must be run first.
draw_confusion_matrix(confusion=confusion)

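The confusion matrix is interpreted as follows. For a binary problem, scikit-learn's `confusion_matrix` places the actual classes on the rows and the predicted classes on the columns, so the array printed above is laid out as `[[TN, FP], [FN, TP]]`: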

- **True positive (TP)**: the number of cases where the model predicted the positive class **(what the label means)** and the actual class is also positive.
- **True negative (TN)**: the number of cases where the model predicted the negative class **(what the label means)** and the actual class is also negative.
- **False positive (FP)**: the number of cases where the model predicted the positive class, but the actual class is negative.
- **False negative (FN)**: the number of cases where the model predicted the negative class, but the actual class is positive.

**TODO:** Add the analysis of the optimized model's metrics here.

# Conclusion

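**TODO:** Add a table comparing the baseline model (Option 1) and the optimized model (Option 2) on the metrics reported above (accuracy, precision, recall, F1-score, AUC-ROC and log loss), followed by a short conclusion on which model to keep.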