# Programming Assignment 2
## Logistic Regression and Stochastic Methods

Aluno: Francisco Edyvalberty Alenquer Cordeiro \
Matrícula: 518659


# Imports

In [1]:
import numpy as np
import matplotlib.pyplot as plt

# Utility Functions

## Metrics

In [2]:
def accuracy(y_true, y_pred):
    """Return the fraction of predictions that equal the true label.

    Both inputs are reshaped to column vectors before the element-wise
    comparison, so 1-D and (n, 1) arrays can be mixed.
    """
    truth_col = y_true.reshape(-1, 1)
    pred_col = y_pred.reshape(-1, 1)

    n_hits = (truth_col == pred_col).sum()
    return n_hits / len(truth_col)

def recall(y_true, y_pred):
    """Return the recall of the positive class (label ``1``).

    Recall is the share of samples whose true label is 1 that were also
    predicted as 1.

    Parameters
    ----------
    y_true, y_pred : numpy arrays with the same number of elements
        True and predicted labels; each is reshaped to a column vector.

    Returns
    -------
    float
        TP / (TP + FN). When ``y_true`` contains no positive sample the
        ratio is undefined, and ``0.0`` is returned instead of the NaN
        (plus RuntimeWarning) that a 0/0 division would produce.
    """
    y_true = y_true.reshape(-1, 1)
    y_pred = y_pred.reshape(-1, 1)
    pairs = np.hstack([y_true, y_pred])

    # Keep only the rows whose true label is the positive class.
    actual_positives = pairs[pairs[:, 0] == 1]
    if len(actual_positives) == 0:
        return 0.0

    right_prediction = actual_positives[:, 0] == actual_positives[:, 1]
    return right_prediction.sum() / len(actual_positives)

def precision(y_true, y_pred):
    """Return the precision of the positive class (label ``1``).

    Precision is the share of samples predicted as 1 whose true label is
    also 1.

    Parameters
    ----------
    y_true, y_pred : numpy arrays with the same number of elements
        True and predicted labels; each is reshaped to a column vector.

    Returns
    -------
    float
        TP / (TP + FP). When nothing is predicted positive the ratio is
        undefined, and ``0.0`` is returned instead of the NaN (plus
        RuntimeWarning) that a 0/0 division would produce.
    """
    y_true = y_true.reshape(-1, 1)
    y_pred = y_pred.reshape(-1, 1)
    pairs = np.hstack([y_true, y_pred])

    # Keep only the rows that were predicted as the positive class.
    predicted_positives = pairs[pairs[:, 1] == 1]
    if len(predicted_positives) == 0:
        return 0.0

    right_prediction = predicted_positives[:, 0] == predicted_positives[:, 1]
    return right_prediction.sum() / len(predicted_positives)

def f1_score(y_true, y_pred):
    """Return the F1 score (harmonic mean of precision and recall) for label ``1``.

    Parameters
    ----------
    y_true, y_pred : numpy arrays with the same number of elements
        True and predicted labels; each is reshaped to a column vector.

    Returns
    -------
    float
        ``2 * P * R / (P + R)``. When ``P + R`` is zero or NaN the score is
        undefined and ``0.0`` is returned rather than propagating NaN into
        downstream averages.
    """
    y_true = y_true.reshape(-1, 1)
    y_pred = y_pred.reshape(-1, 1)
    precision_score = precision(y_true, y_pred)
    recall_score = recall(y_true, y_pred)

    denominator = precision_score + recall_score
    # `not (d > 0)` is true for both 0 and NaN, so one guard covers both
    # undefined cases.
    if not denominator > 0:
        return 0.0

    return 2 * (precision_score * recall_score) / denominator

## Standardization

In [3]:
class StandardScaler:
    """Column-wise standardization: ``(x - mean) / std``.

    Statistics are learned along axis 0 of the data passed to ``fit`` /
    ``fit_transform``. Columns with zero standard deviation (constant
    features) are only centered: their divisor is replaced by 1 so the
    result is 0 instead of NaN/inf.

    Attributes set by fitting
    -------------------------
    mean : per-column mean of the fitted data.
    std : per-column (population) standard deviation of the fitted data.
    fitted : bool flag checked by ``transform`` / ``inverse_transform``.
    """

    def __init__(self):
        self.fitted = False

    def fit(self, data):
        """Learn per-column mean and standard deviation; return ``self``."""
        self.mean = data.mean(axis=0)
        self.std = data.std(axis=0)
        # Divisor actually used for scaling: identical to `std` except that
        # zeros are replaced by 1 to avoid dividing by zero.
        self._scale = np.where(self.std == 0, 1.0, self.std)
        self.fitted = True
        return self

    def fit_transform(self, data):
        """Fit on ``data`` and return its standardized version."""
        return self.fit(data).transform(data)

    def transform(self, data):
        """Standardize ``data`` with the statistics learned during fitting.

        Raises
        ------
        Exception
            If the scaler has not been fitted yet.
        """
        if not self.fitted:
            raise Exception('Scaler not fitted!')

        scaled_data = (data - self.mean) / self._scale
        return scaled_data

    def inverse_transform(self, scaled_data):
        """Map standardized values back to the original feature scale.

        Raises
        ------
        Exception
            If the scaler has not been fitted yet.
        """
        if not self.fitted:
            raise Exception('Scaler not fitted!')

        original_data = (scaled_data * self._scale) + self.mean
        return original_data


## Cross Validation

In [4]:
def kfolds_cross_validation(data, n_folds=10, shuffle=True, random_state=12894):
    """Build K-fold (train indices, test indices) pairs over the rows of ``data``.

    Parameters
    ----------
    data : array-like with a ``shape`` attribute
        Only ``data.shape[0]`` (the number of rows) is used.
    n_folds : int
        Number of folds; rows are divided with ``np.array_split``, so fold
        sizes differ by at most one.
    shuffle : bool
        Whether to permute the row indices before splitting.
    random_state : int
        Seed for the permutation. A private ``RandomState`` is used, so the
        global NumPy RNG is left untouched while the permutation is the same
        one that ``np.random.seed(random_state); np.random.shuffle(...)``
        would produce.

    Returns
    -------
    list of (train_idx, test_idx) tuples, one per fold.
    """
    indexes = np.arange(data.shape[0])
    if shuffle:
        np.random.RandomState(random_state).shuffle(indexes)

    slices = np.array_split(indexes, n_folds)

    splits = []
    for test_idx in slices:
        # Every index that is not in the current test fold, in shuffled order.
        train_idx = indexes[~np.isin(indexes, test_idx)]
        splits.append((train_idx, test_idx))

    return splits

## Train Test Split

In [5]:
def train_test_split(data, train_size_perc, random_seed=264852):
    """Randomly split a 2-D table into train/test features and targets.

    The last column of ``data`` is treated as the target; all other columns
    are features.

    Parameters
    ----------
    data : 2-D numpy array
        Rows are samples; the last column is the target.
    train_size_perc : float
        Fraction of rows assigned to the training set (truncated with
        ``int``).
    random_seed : int
        Seed for the row sampling. A private ``RandomState`` is used, so the
        global NumPy RNG is left untouched while the sampled rows are the
        same ones ``np.random.seed(random_seed); np.random.choice(...)``
        would select.

    Returns
    -------
    X_train, X_test, y_train, y_test
        Targets are returned as column vectors of shape (n, 1).
    """
    N = data.shape[0]
    train_size = int(train_size_perc * N)

    indexes = np.arange(N)

    rng = np.random.RandomState(random_seed)
    train_idx = rng.choice(indexes, train_size, replace=False)
    # `indexes` is 0..N-1, so deleting by position removes exactly the
    # sampled training rows.
    test_idx = np.delete(indexes, train_idx)

    train_data = data[train_idx]
    test_data = data[test_idx]

    X_train = train_data[:, :-1]
    y_train = train_data[:, [-1]]

    X_test = test_data[:, :-1]
    y_test = test_data[:, [-1]]

    return X_train, X_test, y_train, y_test

## Do Cross Validation and Get Metrics

In [6]:
def _print_cv_summary(fold_metrics):
    """Print mean and standard deviation over folds for each metric list."""
    line_templates = (
        ('accuracy', 'Accuracy Mean:     \t{0:.4f} | Accuracy Std:   \t{1:.4f}'),
        ('recall', 'Recall Mean:     \t{0:.4f} | Recall Std:       \t{1:.4f}'),
        ('precision', 'Precision Mean:     \t{0:.4f} | Precision Std:   \t{1:.4f}'),
        ('f1_score', 'F1 Score Mean:     \t{0:.4f} | F1 Score Std:   \t{1:.4f}'),
    )
    for key, template in line_templates:
        print(template.format(np.mean(fold_metrics[key]), np.std(fold_metrics[key])))


def do_cv_and_get_metrics(classifier, cv_splits, X_train, y_train, X_test, title='Classifier', scaler=None, y_test=None):
    """Cross-validate a classifier, then refit on all training data and score the test set.

    For every (train_idx, val_idx) pair in ``cv_splits`` the classifier is
    fitted on the training part and accuracy, recall, precision and F1 are
    computed on both parts. Mean/std over folds are printed, followed by the
    metrics on the held-out test set after refitting on the full training set.

    Parameters
    ----------
    classifier : object with ``fit(X, y)`` and ``predict(X)``
    cv_splits : iterable of (train_idx, val_idx) index arrays into ``X_train``
    X_train, y_train : training features and (n, 1) targets
    X_test : held-out features
    title : str
        Header printed above the report.
    scaler : object with ``fit_transform`` / ``transform``, optional
        When given, it is refitted on each training part (and finally on the
        full training set) and applied to the matching validation/test part.
    y_test : array, optional
        Held-out targets. For backward compatibility with callers that do not
        pass it, the notebook-level global ``y_test`` is used when omitted.

    Raises
    ------
    ValueError
        If ``y_test`` is neither passed nor defined at module level.
    """
    if y_test is None:
        # Older call sites relied on a global `y_test`; keep them working but
        # fail early with a clear message if it does not exist.
        y_test = globals().get('y_test')
    if y_test is None:
        raise ValueError('y_test was not provided and no global `y_test` is defined.')

    X_train = X_train.copy()
    y_train = y_train.copy()
    X_test = X_test.copy()

    metric_functions = {
        'accuracy': accuracy,
        'recall': recall,
        'precision': precision,
        'f1_score': f1_score,
    }
    train_metrics = {name: [] for name in metric_functions}
    valid_metrics = {name: [] for name in metric_functions}

    for train_idx, val_idx in cv_splits:
        # Splitting data
        X_train_cv = X_train[train_idx, :]
        y_train_cv = y_train[train_idx, :]
        X_val_cv = X_train[val_idx, :]
        y_val_cv = y_train[val_idx, :]

        # Scale with statistics learned on the training part only, so no
        # information from the validation part leaks into the fit.
        if scaler is not None:
            X_train_cv = scaler.fit_transform(X_train_cv)
            X_val_cv = scaler.transform(X_val_cv)

        # Training model
        classifier.fit(X_train_cv, y_train_cv.ravel())

        # Predictions
        y_train_cv_pred = classifier.predict(X_train_cv)
        y_val_cv_pred = classifier.predict(X_val_cv)

        # Storing metrics
        for name, metric in metric_functions.items():
            train_metrics[name].append(metric(y_train_cv, y_train_cv_pred))
            valid_metrics[name].append(metric(y_val_cv, y_val_cv_pred))

    # Reporting results
    print('#' + f'{title}'.center(60, '-') + '#')

    print('\n--->\tTraining Metrics')
    _print_cv_summary(train_metrics)

    print('\n--->\tValidation Metrics')
    _print_cv_summary(valid_metrics)

    print('\n--->\tTest Metrics')

    # Final model: refit (scaler and classifier) on the whole training set.
    if scaler is not None:
        X_train = scaler.fit_transform(X_train)
        X_test = scaler.transform(X_test)

    classifier.fit(X_train, y_train.ravel())
    y_test_pred = classifier.predict(X_test)

    print('Accuracy:     \t{0:.4f}'.format(accuracy(y_test, y_test_pred)))
    print('Recall:     \t{0:.4f}'.format(recall(y_test, y_test_pred)))
    print('Precision:     \t{0:.4f}'.format(precision(y_test, y_test_pred)))
    print('F1 Score:     \t{0:.4f}'.format(f1_score(y_test, y_test_pred)))


# Task 1 - Logistic Regression

In [7]:
# Read the comma-separated dataset into a float array, report its shape and
# preview the first two rows.
data = np.genfromtxt(fname='../data/breastcancer.csv', delimiter=',')
print('Shape:', data.shape)
data[:2]

Shape: (569, 31)


array([[1.799e+01, 1.038e+01, 1.228e+02, 1.001e+03, 1.184e-01, 2.776e-01,
        3.001e-01, 1.471e-01, 2.419e-01, 7.871e-02, 1.095e+00, 9.053e-01,
        8.589e+00, 1.534e+02, 6.399e-03, 4.904e-02, 5.373e-02, 1.587e-02,
        3.003e-02, 6.193e-03, 2.538e+01, 1.733e+01, 1.846e+02, 2.019e+03,
        1.622e-01, 6.656e-01, 7.119e-01, 2.654e-01, 4.601e-01, 1.189e-01,
        0.000e+00],
       [2.057e+01, 1.777e+01, 1.329e+02, 1.326e+03, 8.474e-02, 7.864e-02,
        8.690e-02, 7.017e-02, 1.812e-01, 5.667e-02, 5.435e-01, 7.339e-01,
        3.398e+00, 7.408e+01, 5.225e-03, 1.308e-02, 1.860e-02, 1.340e-02,
        1.389e-02, 3.532e-03, 2.499e+01, 2.341e+01, 1.588e+02, 1.956e+03,
        1.238e-01, 1.866e-01, 2.416e-01, 1.860e-01, 2.750e-01, 8.902e-02,
        0.000e+00]])

In [8]:
# 80/20 train/test split with a fixed seed, then 10 shuffled CV folds built
# over the training rows only.
X_train, X_test, y_train, y_test = train_test_split(data, 0.8, random_seed=64825)

cv_splits = kfolds_cross_validation(data=X_train, n_folds=10, shuffle=True)

print('X_train shape:', X_train.shape)
print('y_train shape:', y_train.shape)
print('X_test shape:', X_test.shape)
print('y_test shape:', y_test.shape)

X_train shape: (455, 30)
y_train shape: (455, 1)
X_test shape: (114, 30)
y_test shape: (114, 1)


In [9]:
def sigmoid(x):
    """Numerically stable logistic function ``1 / (1 + exp(-x))``.

    The naive formula overflows in ``exp(-x)`` for large negative ``x``.
    Here the exponential is always taken of a non-positive number:

    * ``x >= 0``: ``1 / (1 + exp(-x))``
    * ``x <  0``: ``exp(x) / (1 + exp(x))``

    Parameters
    ----------
    x : scalar or array-like

    Returns
    -------
    Same shape as ``x`` (a NumPy scalar for scalar input), values in [0, 1].
    """
    x = np.asarray(x)
    if not np.issubdtype(x.dtype, np.floating):
        x = x.astype(float)

    z = np.exp(-np.abs(x))  # always in (0, 1], never overflows
    result = np.where(x >= 0, 1 / (1 + z), z / (1 + z))
    # `[()]` turns a 0-d array back into a scalar and is a no-op otherwise.
    return result[()]

def cross_entropy_loss(y, y_pred_proba, eps=1e-15):
    """Mean binary cross-entropy between labels and predicted probabilities.

    Parameters
    ----------
    y : numpy array of 0/1 labels
    y_pred_proba : numpy array of predicted probabilities for label 1,
        same shape as ``y``
    eps : float
        Probabilities are clipped to ``[eps, 1 - eps]`` before taking logs.
        Without clipping, a saturated prediction (exactly 0 or 1) produces
        ``log(0) = -inf`` and then ``0 * -inf = NaN`` in the dot product.

    Returns
    -------
    float
        ``-(1/n) * sum(y*log(p) + (1-y)*log(1-p))``.
    """
    y_pred_proba = np.clip(y_pred_proba, eps, 1 - eps)

    cost_1 = y.T @ np.log(y_pred_proba)
    cost_0 = (1 - y).T @ np.log(1 - y_pred_proba)
    j = -(1 / len(y)) * (cost_1 + cost_0)
    return j.ravel()[0]

In [10]:
class MyLogisticRegression():
    """Binary logistic regression fitted with full-batch gradient steps.

    Parameters
    ----------
    alpha : float
        Learning rate applied to every update.
    n_iterations : int
        Number of update steps performed by ``fit``.

    Attributes set by fitting
    -------------------------
    X : training features with a leading column of ones (bias term).
    y : training labels as a column vector.
    w : weight column vector, bias first.
    loss_by_iteration : list with the cross-entropy loss after each update.
    """

    def __init__(
        self,
        alpha,
        n_iterations
    ):
        self.alpha = alpha
        self.n_iterations = n_iterations

    @staticmethod
    def _add_bias_column(X):
        """Return ``X`` with a column of ones prepended."""
        return np.hstack([np.ones((X.shape[0], 1)), X])

    def initialize(self, X, y, random_state=654812):
        """Store the training data and draw initial weights uniformly in [0, 1)."""
        rnd_state = np.random.RandomState(random_state)
        self.X = self._add_bias_column(X)
        self.y = y

        self.w = rnd_state.uniform(0, 1, self.X.shape[1]).reshape(-1, 1)

    def fit(self, X, y, random_state=654812):
        """Fit the weights on ``X`` / ``y``; 1-D ``y`` is turned into a column."""
        if len(y.shape) == 1:
            y = y.reshape(-1, 1)

        self.initialize(X, y, random_state)
        self.gradient_descent()

    def gradient_descent(self):
        """Run ``n_iterations`` updates ``w <- w + (alpha / n) * X^T (y - sigmoid(Xw))``.

        The probabilities computed for the loss after an update are the same
        ones the next update needs, so they are carried over instead of
        being recomputed.
        """
        self.loss_by_iteration = []
        step_scale = (1 / len(self.y)) * self.alpha

        y_pred_proba = sigmoid(self.X @ self.w)
        for _ in range(self.n_iterations):
            e = self.y - y_pred_proba

            step = (step_scale * (e.T @ self.X)).reshape(-1, 1)
            self.w = self.w + step

            y_pred_proba = sigmoid(self.X @ self.w)
            self.loss_by_iteration.append(
                cross_entropy_loss(self.y, y_pred_proba)
            )

    def predict_proba(self, X):
        """Return the predicted probability of label 1 as an (n, 1) array."""
        return sigmoid(self._add_bias_column(X) @ self.w)

    def predict(self, X, threshold=0.5):
        """Return 0/1 labels: 1 where the probability is strictly above ``threshold``."""
        return np.where(self.predict_proba(X) > threshold, 1, 0)



In [11]:
# Evaluate the hand-written logistic regression on standardized features.
do_cv_and_get_metrics(
    title='My Logistic Regression',
    classifier=MyLogisticRegression(alpha=0.1, n_iterations=2000),
    scaler=StandardScaler(),
    cv_splits=cv_splits,
    X_train=X_train,
    y_train=y_train,
    X_test=X_test
)

#-------------------My Logistic Regression-------------------#

--->	Training Metrics
Accuracy Mean:     	0.9858 | Accuracy Std:   	0.0024
Recall Mean:     	0.9948 | Recall Std:       	0.0031
Precision Mean:     	0.9822 | Precision Std:   	0.0020
F1 Score Mean:     	0.9885 | F1 Score Std:   	0.0020

--->	Validation Metrics
Accuracy Mean:     	0.9649 | Accuracy Std:   	0.0199
Recall Mean:     	0.9811 | Recall Std:       	0.0267
Precision Mean:     	0.9606 | Precision Std:   	0.0312
F1 Score Mean:     	0.9703 | F1 Score Std:   	0.0199

--->	Test Metrics
Accuracy:     	0.9912
Recall:     	0.9873
Precision:     	1.0000
F1 Score:     	0.9936


In [12]:
# Reference run: scikit-learn's LogisticRegression evaluated with the same
# folds, scaler and reporting function, for comparison.
from sklearn.linear_model import LogisticRegression

do_cv_and_get_metrics(
    title='Sklearn Logistic Regression',
    classifier=LogisticRegression(),
    scaler=StandardScaler(),
    cv_splits=cv_splits,
    X_train=X_train,
    y_train=y_train,
    X_test=X_test
)

#----------------Sklearn Logistic Regression-----------------#

--->	Training Metrics
Accuracy Mean:     	0.9878 | Accuracy Std:   	0.0019
Recall Mean:     	0.9980 | Recall Std:       	0.0020
Precision Mean:     	0.9823 | Precision Std:   	0.0020
F1 Score Mean:     	0.9901 | F1 Score Std:   	0.0015

--->	Validation Metrics
Accuracy Mean:     	0.9738 | Accuracy Std:   	0.0189
Recall Mean:     	0.9889 | Recall Std:       	0.0172
Precision Mean:     	0.9668 | Precision Std:   	0.0265
F1 Score Mean:     	0.9775 | F1 Score Std:   	0.0176

--->	Test Metrics
Accuracy:     	0.9825
Recall:     	0.9873
Precision:     	0.9873
F1 Score:     	0.9873


# Task 2 - Gaussian Discriminant Analysis

In [13]:
# Re-read the same CSV used in Task 1, report its shape and preview the
# first two rows.
data = np.genfromtxt(fname='../data/breastcancer.csv', delimiter=',')
print('Shape:', data.shape)
data[:2]

Shape: (569, 31)


array([[1.799e+01, 1.038e+01, 1.228e+02, 1.001e+03, 1.184e-01, 2.776e-01,
        3.001e-01, 1.471e-01, 2.419e-01, 7.871e-02, 1.095e+00, 9.053e-01,
        8.589e+00, 1.534e+02, 6.399e-03, 4.904e-02, 5.373e-02, 1.587e-02,
        3.003e-02, 6.193e-03, 2.538e+01, 1.733e+01, 1.846e+02, 2.019e+03,
        1.622e-01, 6.656e-01, 7.119e-01, 2.654e-01, 4.601e-01, 1.189e-01,
        0.000e+00],
       [2.057e+01, 1.777e+01, 1.329e+02, 1.326e+03, 8.474e-02, 7.864e-02,
        8.690e-02, 7.017e-02, 1.812e-01, 5.667e-02, 5.435e-01, 7.339e-01,
        3.398e+00, 7.408e+01, 5.225e-03, 1.308e-02, 1.860e-02, 1.340e-02,
        1.389e-02, 3.532e-03, 2.499e+01, 2.341e+01, 1.588e+02, 1.956e+03,
        1.238e-01, 1.866e-01, 2.416e-01, 1.860e-01, 2.750e-01, 8.902e-02,
        0.000e+00]])

In [14]:
# Same seed and fold settings as the Task 1 split cell: 80/20 train/test
# split, then 10 shuffled CV folds over the training rows.
X_train, X_test, y_train, y_test = train_test_split(data, 0.8, random_seed=64825)

cv_splits = kfolds_cross_validation(data=X_train, n_folds=10, shuffle=True)

print('X_train shape:', X_train.shape)
print('y_train shape:', y_train.shape)
print('X_test shape:', X_test.shape)
print('y_test shape:', y_test.shape)

X_train shape: (455, 30)
y_train shape: (455, 1)
X_test shape: (114, 30)
y_test shape: (114, 1)


In [15]:
class MyGaussianDiscriminantAnalysis():
    """Gaussian discriminant analysis with one full covariance matrix per class.

    Attributes set by ``fit``
    -------------------------
    class_dict : maps each class label to its row index ``k``.
    phi : (n_classes, 1) class priors.
    mu : (n_classes, n_features) class means.
    sigma : (n_classes, n_features, n_features) class covariance matrices.
    """

    def __init__(self):
        pass

    def calculate_sigma(self, X, mu):
        """Return the unbiased sample covariance of the rows of ``X`` around ``mu``.

        Parameters
        ----------
        X : (n_rows, n_features) array
        mu : mean vector with ``n_features`` elements (any shape)

        Returns
        -------
        (n_features, n_features) array: ``sum_i (x_i - mu)(x_i - mu)^T / (n_rows - 1)``.
        """
        n_rows = X.shape[0]
        centered = X - mu.reshape(1, -1)
        return (centered.T @ centered) / (n_rows - 1)

    def fit(self, X, y):
        """Estimate the prior, mean and covariance of every class in ``y``."""
        if len(y.shape) == 1:
            y = y.reshape(-1, 1)

        classes = np.unique(y)
        self.class_dict = {classes[i]: i for i in range(len(classes))}

        n_features = X.shape[1]

        # n_classes x 1
        self.phi = np.zeros((len(classes), 1))
        # n_classes x n_features
        self.mu = np.zeros((len(classes), n_features))
        # n_classes x n_features x n_features
        self.sigma = np.zeros((len(classes), n_features, n_features))

        for label in classes:
            k = self.class_dict[label]

            # Select rows by the class *label*, not by its index `k`; the two
            # only coincide when the labels happen to be 0..K-1.
            class_rows = np.where(y == label)[0]
            X_class = X[class_rows, :]

            self.phi[k] = len(class_rows) / len(y)
            self.mu[k] = np.mean(X_class, axis=0)
            self.sigma[k] = self.calculate_sigma(X_class, self.mu[k].reshape(-1, 1))

    def predict(self, X):
        """Return the most probable class label for each row of ``X`` as an (n, 1) array.

        Each class is scored with the log-posterior up to a shared constant:
        ``-0.5*log|Sigma_k| - 0.5*(x-mu_k)^T Sigma_k^{-1} (x-mu_k) + log(phi_k)``.
        """
        classes = list(self.class_dict.keys())
        log_scores = np.zeros((X.shape[0], len(classes)))

        for i, label in enumerate(classes):
            k = self.class_dict[label]

            # slogdet returns log|det| directly, avoiding the underflow or
            # overflow that computing det() and then log() can hit.
            _, log_det = np.linalg.slogdet(self.sigma[k])
            sigma_inv = np.linalg.pinv(self.sigma[k])
            diff = X - self.mu[[k]]

            first_part = -(1 / 2) * log_det
            second_part = -(1 / 2) * np.sum((diff @ sigma_inv) * diff, axis=1)
            third_part = np.log(self.phi[k])

            log_scores[:, i] = first_part + second_part + third_part

        best = np.argmax(log_scores, axis=1)
        return np.array(classes)[best].reshape(-1, 1)


In [16]:
# Evaluate the hand-written Gaussian discriminant analysis (no feature scaling).
do_cv_and_get_metrics(
    title='My Gaussian Discriminant Analysis',
    classifier=MyGaussianDiscriminantAnalysis(),
    cv_splits=cv_splits,
    X_train=X_train,
    y_train=y_train,
    X_test=X_test
)

#-------------My Gaussian Discriminant Analysis--------------#

--->	Training Metrics
Accuracy Mean:     	0.9753 | Accuracy Std:   	0.0023
Recall Mean:     	0.9920 | Recall Std:       	0.0018
Precision Mean:     	0.9684 | Precision Std:   	0.0034
F1 Score Mean:     	0.9800 | F1 Score Std:   	0.0020

--->	Validation Metrics
Accuracy Mean:     	0.9516 | Accuracy Std:   	0.0213
Recall Mean:     	0.9636 | Recall Std:       	0.0378
Precision Mean:     	0.9571 | Precision Std:   	0.0308
F1 Score Mean:     	0.9595 | F1 Score Std:   	0.0199

--->	Test Metrics
Accuracy:     	0.9912
Recall:     	0.9873
Precision:     	1.0000
F1 Score:     	0.9936


In [17]:
# Reference run: scikit-learn's QuadraticDiscriminantAnalysis evaluated with
# the same folds and reporting function, for comparison.
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis

do_cv_and_get_metrics(
    title='Sklearn - Gaussian Discriminant Analysis',
    classifier=QuadraticDiscriminantAnalysis(),
    cv_splits=cv_splits,
    X_train=X_train,
    y_train=y_train,
    X_test=X_test
)

#----------Sklearn - Gaussian Discriminant Analysis----------#

--->	Training Metrics
Accuracy Mean:     	0.9753 | Accuracy Std:   	0.0023
Recall Mean:     	0.9920 | Recall Std:       	0.0018
Precision Mean:     	0.9684 | Precision Std:   	0.0034
F1 Score Mean:     	0.9800 | F1 Score Std:   	0.0020

--->	Validation Metrics
Accuracy Mean:     	0.9516 | Accuracy Std:   	0.0213
Recall Mean:     	0.9636 | Recall Std:       	0.0378
Precision Mean:     	0.9571 | Precision Std:   	0.0308
F1 Score Mean:     	0.9595 | F1 Score Std:   	0.0199

--->	Test Metrics
Accuracy:     	0.9912
Recall:     	0.9873
Precision:     	1.0000
F1 Score:     	0.9936


# Task 3 - Gaussian Naive Bayes

In [18]:
# Re-read the same CSV used in the previous tasks, report its shape and
# preview the first two rows.
data = np.genfromtxt(fname='../data/breastcancer.csv', delimiter=',')
print('Shape:', data.shape)
data[:2]

Shape: (569, 31)


array([[1.799e+01, 1.038e+01, 1.228e+02, 1.001e+03, 1.184e-01, 2.776e-01,
        3.001e-01, 1.471e-01, 2.419e-01, 7.871e-02, 1.095e+00, 9.053e-01,
        8.589e+00, 1.534e+02, 6.399e-03, 4.904e-02, 5.373e-02, 1.587e-02,
        3.003e-02, 6.193e-03, 2.538e+01, 1.733e+01, 1.846e+02, 2.019e+03,
        1.622e-01, 6.656e-01, 7.119e-01, 2.654e-01, 4.601e-01, 1.189e-01,
        0.000e+00],
       [2.057e+01, 1.777e+01, 1.329e+02, 1.326e+03, 8.474e-02, 7.864e-02,
        8.690e-02, 7.017e-02, 1.812e-01, 5.667e-02, 5.435e-01, 7.339e-01,
        3.398e+00, 7.408e+01, 5.225e-03, 1.308e-02, 1.860e-02, 1.340e-02,
        1.389e-02, 3.532e-03, 2.499e+01, 2.341e+01, 1.588e+02, 1.956e+03,
        1.238e-01, 1.866e-01, 2.416e-01, 1.860e-01, 2.750e-01, 8.902e-02,
        0.000e+00]])

In [19]:
# Same seed and fold settings as the earlier split cells: 80/20 train/test
# split, then 10 shuffled CV folds over the training rows.
X_train, X_test, y_train, y_test = train_test_split(data, 0.8, random_seed=64825)

cv_splits = kfolds_cross_validation(data=X_train, n_folds=10, shuffle=True)

print('X_train shape:', X_train.shape)
print('y_train shape:', y_train.shape)
print('X_test shape:', X_test.shape)
print('y_test shape:', y_test.shape)

X_train shape: (455, 30)
y_train shape: (455, 1)
X_test shape: (114, 30)
y_test shape: (114, 1)


In [20]:
class MyGaussianNaiveBayes():
    """Gaussian naive Bayes: per-class, per-feature independent normal densities.

    Attributes set by ``fit``
    -------------------------
    class_to_idx_dict : maps each class label to its row index ``k``.
    prob_class : (n_classes, 1) class priors.
    mu : (n_classes, n_features) per-class feature means.
    std : (n_classes, n_features) per-class feature standard deviations
        (population form, as returned by ``np.std``).
    """

    def __init__(self):
        pass

    def fit(self, X, y):
        """Estimate the prior and the per-feature mean/std of every class in ``y``."""
        if len(y.shape) == 1:
            y = y.reshape(-1, 1)

        classes = np.unique(y)
        self.class_to_idx_dict = {classes[i]: i for i in range(len(classes))}

        n_features = X.shape[1]

        self.prob_class = np.zeros((len(classes), 1))  # n_classes x 1
        self.mu = np.zeros((len(classes), n_features))  # n_classes x n_features
        self.std = np.zeros((len(classes), n_features))  # n_classes x n_features

        for label in classes:
            k = self.class_to_idx_dict[label]

            class_rows = np.where(y == label)[0]
            X_class = X[class_rows, :]

            self.prob_class[k] = len(class_rows) / len(y)
            self.mu[k] = np.mean(X_class, axis=0)
            self.std[k] = np.std(X_class, axis=0)

    def predict(self, X):
        """Return the most probable class label for each row of ``X`` as an (n, 1) array.

        Each class is scored with the log-posterior up to a shared constant:
        ``log(prior) - 0.5*sum(log(2*pi*var)) - 0.5*sum((x - mu)^2 / var)``.
        All rows are scored at once; the terms that do not depend on the
        sample are computed once per class.

        Note: a feature with zero standard deviation inside a class makes the
        division by ``std**2`` divide by zero.
        """
        idx_to_class = {v: k for k, v in self.class_to_idx_dict.items()}
        log_scores = np.zeros((X.shape[0], len(idx_to_class)))

        for i, label in enumerate(idx_to_class.values()):
            k = self.class_to_idx_dict[label]
            mu = self.mu[[k]]
            std = self.std[[k]]
            prior = self.prob_class[k]

            first_part = np.log(prior)
            second_part = -(1 / 2) * np.sum(np.log(2 * np.pi * (std ** 2)), axis=1)
            third_part = -(1 / 2) * np.sum(((X - mu) ** 2) / (std ** 2), axis=1)

            log_scores[:, i] = first_part + second_part + third_part

        best = np.argmax(log_scores, axis=1)
        preds = [idx_to_class[j] for j in best]

        return np.array(preds).reshape(-1, 1)

In [21]:
# Evaluate the hand-written Gaussian naive Bayes (no feature scaling).
do_cv_and_get_metrics(
    title='My Gaussian Naive Bayes',
    classifier=MyGaussianNaiveBayes(),
    cv_splits=cv_splits,
    X_train=X_train,
    y_train=y_train,
    X_test=X_test
)

#------------------My Gaussian Naive Bayes-------------------#

--->	Training Metrics
Accuracy Mean:     	0.9355 | Accuracy Std:   	0.0058
Recall Mean:     	0.9664 | Recall Std:       	0.0065
Precision Mean:     	0.9307 | Precision Std:   	0.0046
F1 Score Mean:     	0.9482 | F1 Score Std:   	0.0046

--->	Validation Metrics
Accuracy Mean:     	0.9275 | Accuracy Std:   	0.0446
Recall Mean:     	0.9531 | Recall Std:       	0.0574
Precision Mean:     	0.9295 | Precision Std:   	0.0428
F1 Score Mean:     	0.9402 | F1 Score Std:   	0.0408

--->	Test Metrics
Accuracy:     	0.9649
Recall:     	0.9747
Precision:     	0.9747
F1 Score:     	0.9747


In [22]:
# Reference run: scikit-learn's GaussianNB (with var_smoothing=1e-13)
# evaluated with the same folds and reporting function, for comparison.
from sklearn.naive_bayes import GaussianNB

do_cv_and_get_metrics(
    title='Sklearn - Gaussian Naive Bayes',
    classifier=GaussianNB(var_smoothing=1e-13),
    cv_splits=cv_splits,
    X_train=X_train,
    y_train=y_train,
    X_test=X_test
)

#---------------Sklearn - Gaussian Naive Bayes---------------#

--->	Training Metrics
Accuracy Mean:     	0.9355 | Accuracy Std:   	0.0058
Recall Mean:     	0.9664 | Recall Std:       	0.0065
Precision Mean:     	0.9307 | Precision Std:   	0.0046
F1 Score Mean:     	0.9482 | F1 Score Std:   	0.0046

--->	Validation Metrics
Accuracy Mean:     	0.9275 | Accuracy Std:   	0.0446
Recall Mean:     	0.9531 | Recall Std:       	0.0574
Precision Mean:     	0.9295 | Precision Std:   	0.0428
F1 Score Mean:     	0.9402 | F1 Score Std:   	0.0408

--->	Test Metrics
Accuracy:     	0.9649
Recall:     	0.9747
Precision:     	0.9747
F1 Score:     	0.9747
