# Gaussian Discriminant Analysis

# Imports

In [1]:
import sys
import numpy as np
import matplotlib.pyplot as plt

sys.path.append('../')

from utils.preprocessing import train_test_split, kfolds_cross_validation
from utils.output_utils import do_cv_and_get_metrics_classification

# Process

In [2]:
# Parse the comma-separated dataset into a single 2-D array.
data = np.genfromtxt('./data/breastcancer.csv', delimiter=',')
print('Shape:', data.shape)
# Note: this slice is not the last expression of the cell, so it is evaluated but not displayed.
data[:2,:]
# Every column except the last is a feature; the last column is the label.
# Indexing with the list [-1] keeps y two-dimensional (n_rows x 1).
X, y = data[:, :-1], data[:, [-1]]

Shape: (569, 31)


In [3]:
# Hold out part of the data for the final test, with a fixed seed for the split.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, 0.8, random_seed=64825
)

# Fold definitions shared by every classifier evaluated later in the notebook.
cv_splits = kfolds_cross_validation(data=X_train, n_folds=10, shuffle=True)

# Report the shape of each partition as a quick sanity check.
for caption, partition in (
    ('X_train shape:', X_train),
    ('y_train shape:', y_train),
    ('X_test shape:', X_test),
    ('y_test shape:', y_test),
):
    print(caption, partition.shape)

X_train shape: (455, 30)
y_train shape: (455, 1)
X_test shape: (114, 30)
y_test shape: (114, 1)


In [4]:
class MyGaussianDiscriminantAnalysis():
    """Gaussian discriminant analysis with a separate full covariance per class.

    Each class is modelled as a multivariate normal with its own mean and
    covariance matrix, plus a prior equal to the class frequency in the
    training data. Prediction picks the class with the largest log-posterior
    (up to a constant shared by all classes).

    Attributes set by ``fit``:
        class_dict: maps each original class label to its row index ``k``.
        phi:   (n_classes, 1) class priors.
        mu:    (n_classes, n_features) class means.
        sigma: (n_classes, n_features, n_features) class covariance matrices.
    """

    def __init__(self):
        pass

    def calculate_sigma(self, X, mu):
        """Return the unbiased sample covariance of ``X`` around ``mu``.

        Args:
            X: (n_rows, n_features) samples belonging to a single class.
            mu: mean vector with ``n_features`` entries; any shape that can be
                flattened to that length is accepted (column, row or 1-D).

        Returns:
            (n_features, n_features) matrix: the sum of the outer products of
            the centred rows divided by ``n_rows - 1``. With a single row the
            division is 0/0 and the result is NaN, as there is no spread to
            estimate.
        """
        n_rows = X.shape[0]
        centered = X - np.asarray(mu).reshape(1, -1)
        # centered.T @ centered equals the sum over rows of (x_i - mu)(x_i - mu)^T.
        return (centered.T @ centered) / (n_rows - 1)

    def fit(self, X, y):
        """Estimate the prior, mean and covariance of every class.

        Args:
            X: (n_samples, n_features) training features.
            y: (n_samples,) or (n_samples, 1) class labels. Labels may be any
               values supported by ``np.unique``; they do not need to be
               0..n_classes-1.

        Returns:
            self, so that calls can be chained.
        """
        X = np.asarray(X)
        y = np.asarray(y)
        if y.ndim == 1:
            y = y.reshape(-1, 1)

        classes = np.unique(y)
        self.class_dict = {label: index for index, label in enumerate(classes)}

        n_classes = len(classes)
        n_features = X.shape[1]

        # n_classes x 1
        self.phi = np.zeros((n_classes, 1))
        # n_classes x n_features
        self.mu = np.zeros((n_classes, n_features))
        # n_classes x n_features x n_features
        self.sigma = np.zeros((n_classes, n_features, n_features))

        for label, k in self.class_dict.items():
            # Select rows by the actual label value; k is only the storage index.
            rows = np.where(y == label)[0]
            X_class = X[rows, :]

            self.phi[k] = len(rows) / len(y)
            self.mu[k] = X_class.mean(axis=0)
            self.sigma[k] = self.calculate_sigma(X_class, self.mu[k])

        return self

    def predict(self, X):
        """Predict the most probable class label for every row of ``X``.

        For each class the score is
        ``-0.5 * log|Sigma_k| - 0.5 * (x - mu_k)^T Sigma_k^+ (x - mu_k) + log(phi_k)``
        where ``Sigma_k^+`` is the pseudo-inverse of the class covariance.

        Args:
            X: (n_samples, n_features) feature matrix.

        Returns:
            (n_samples, 1) array holding the original class labels.
        """
        X = np.asarray(X)
        classes = list(self.class_dict.keys())
        scores = np.zeros((X.shape[0], len(classes)))

        for column, label in enumerate(classes):
            k = self.class_dict[label]
            # slogdet returns log|det| directly, avoiding the underflow/overflow
            # that det() followed by log() suffers with many features.
            _, log_det = np.linalg.slogdet(self.sigma[k])
            sigma_inv = np.linalg.pinv(self.sigma[k])
            centered = X - self.mu[[k]]

            log_det_term = -0.5 * log_det
            # Row-wise quadratic form (x - mu)^T Sigma^+ (x - mu).
            mahalanobis_term = -0.5 * np.sum((centered @ sigma_inv) * centered, axis=1)
            log_prior = np.log(self.phi[k, 0])

            scores[:, column] = log_det_term + mahalanobis_term + log_prior

        # Map the winning column of each row back to its original label.
        winners = np.argmax(scores, axis=1)
        return np.array(classes)[winners].reshape(-1, 1)


In [5]:
# Evaluate the from-scratch implementation on the shared CV folds and the held-out test set.
my_gda = MyGaussianDiscriminantAnalysis()
do_cv_and_get_metrics_classification(
    classifier=my_gda,
    cv_splits=cv_splits,
    X_train=X_train, y_train=y_train,
    X_test=X_test, y_test=y_test,
    title='My Gaussian Discriminant Analysis',
)

#-------------My Gaussian Discriminant Analysis--------------#

--->	Training Metrics
Accuracy Mean:     	0.9753 | Accuracy Std:   	0.0023
Recall Mean:     	0.9920 | Recall Std:       	0.0018
Precision Mean:     	0.9684 | Precision Std:   	0.0034
F1 Score Mean:     	0.9800 | F1 Score Std:   	0.0020

--->	Validation Metrics
Accuracy Mean:     	0.9516 | Accuracy Std:   	0.0213
Recall Mean:     	0.9636 | Recall Std:       	0.0378
Precision Mean:     	0.9571 | Precision Std:   	0.0308
F1 Score Mean:     	0.9595 | F1 Score Std:   	0.0199

--->	Test Metrics
Accuracy:     	0.9912
Recall:     	0.9873
Precision:     	1.0000
F1 Score:     	0.9936


In [6]:
# Reference run: the same folds and test set, evaluated with scikit-learn's
# QuadraticDiscriminantAnalysis, to compare against the implementation above.
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis

sklearn_qda = QuadraticDiscriminantAnalysis()
do_cv_and_get_metrics_classification(
    classifier=sklearn_qda,
    cv_splits=cv_splits,
    X_train=X_train, y_train=y_train,
    X_test=X_test, y_test=y_test,
    title='Sklearn - Gaussian Discriminant Analysis',
)

#----------Sklearn - Gaussian Discriminant Analysis----------#

--->	Training Metrics
Accuracy Mean:     	0.9753 | Accuracy Std:   	0.0023
Recall Mean:     	0.9920 | Recall Std:       	0.0018
Precision Mean:     	0.9684 | Precision Std:   	0.0034
F1 Score Mean:     	0.9800 | F1 Score Std:   	0.0020

--->	Validation Metrics
Accuracy Mean:     	0.9516 | Accuracy Std:   	0.0213
Recall Mean:     	0.9636 | Recall Std:       	0.0378
Precision Mean:     	0.9571 | Precision Std:   	0.0308
F1 Score Mean:     	0.9595 | F1 Score Std:   	0.0199

--->	Test Metrics
Accuracy:     	0.9912
Recall:     	0.9873
Precision:     	1.0000
F1 Score:     	0.9936
