#Imports

In [315]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelBinarizer
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import roc_curve, auc
from sklearn.metrics import roc_auc_score
from sklearn.metrics import confusion_matrix, classification_report
from sklearn import datasets

#Dataset

In [311]:
class Dataset:

    def __init__(self, link):
        self.data = pd.read_csv(link)
        self.data.drop(self.data.columns[len(self.data.columns)-1], axis=1, inplace=True)
        print(self.data.head())

    def prepare(self, target):
        self.target = target
        # lb = LabelBinarizer()
        # lb.fit(self.data[target])
        # self.data[target] = lb.transform(self.data[target])

    def split(self, size):
        y = self.data[self.target].copy()
        X = self.data.copy()
        X = X.drop(columns=[self.target, 'id'])
        (self.X_train, self.X_test, self.y_train, self.y_test) = train_test_split(X, y, test_size=size)
    
    def visualize(self):
        countplot = plt.figure(1)
        ax_cp = sns.countplot(y=self.data[self.target])
        countplot.suptitle('Samples count')
        ax_cp.set(xlabel='Class', ylabel='Count')
        countplot.show()

        cor_m = self.X_train.corr()
        heatmap = plt.figure(2)
        heatmap.suptitle('Correlation matrix')
        ax_hm = sns.heatmap(cor_m, annot=True, annot_kws={'size': 6})
        heatmap.show()

        # pairplot = plt.figure(3)
        # pairplot.suptitle("Pairplot")
        # sns.pairplot(self.data, hue = self.target, corner = True, palette = "bright")
        # pairplot.show()

    def classify(self):
        (self.pred_sklearn, self.acc_sklearn) = sklearn_GaussianNB(self.X_train, self.y_train, self.X_test, self.y_test)
        x = GaussianNaiveBayes()
        x.fit(self.X_train, self.y_train)
        (self.pred, self.acc) = x.predict(self.X_test, self.y_test)
        self.scores = x.posteriors
        print('Both classifiers trained and tested!\n')

    def show_accuracy(self):
        print("\nAccuracy of pre-build GaussianNB classifier is {:2.2%}".format(self.acc_sklearn))
        print("Accuracy of our GaussianNB classifier is {:2.2%}".format(self.acc))
        print("\nConfusion matrix")
        print(confusion_matrix(self.y_test, self.pred))
        print("\nClassification Report")
        print(classification_report(self.y_test, self.pred))

        y_test_labels = LabelBinarizer().fit_transform(self.y_test)
        fpr, tpr, _ = roc_curve(y_test_labels, np.array(df_data.scores)[:,1])
        roc_auc = auc(fpr, tpr)
        plt.title('AUC = %0.2f' % roc_auc)
        plt.plot(fpr, tpr, 'b')
        plt.legend(loc='lower right')
        plt.plot([0, 1], [0, 1], 'r--')
        plt.xlim([0, 1])
        plt.ylim([0, 1])
        plt.ylabel('True Positive Rate')
        plt.xlabel('False Positive Rate')
        plt.show()

        # # calculate scores
        # # auc = roc_auc_score(self.y_test, self.pred)
        # # print('ROC AUC=%.3f' % (auc))
        # fpr, tpr, _ = roc_curve(self.y_test, self.pred)

        # # plot the roc curve for the model
        # plt.plot(fpr, tpr, linestyle='--', label='ROC Curve')
        # plt.xlabel('False Positive Rate')
        # plt.ylabel('True Positive Rate')
        # plt.legend()
        # plt.show()

#Gaussian Naive Bayes Implementation

In [300]:
class GaussianNaiveBayes:

    def calc_prior(self, features, target):
        self.prior = (features.groupby(target).apply(lambda x: len(x)) / self.rows).to_numpy()
        return self.prior

    def calc_statistics(self, features, target):
        self.mean = features.groupby(target).apply(np.mean).to_numpy()
        self.var = features.groupby(target).apply(np.var).to_numpy()
        return (self.mean, self.var)

    def gaussian_density(self, class_idx, x):
        mean = self.mean[class_idx]
        var = self.var[class_idx]
        numerator = np.exp(-1 / 2 * (x - mean) ** 2 / (2 * var))
        denominator = np.sqrt(2 * np.pi * var)
        prob = numerator / denominator
        return prob

    def calc_posterior(self, x):
        posteriors_for_one = []
        for i in range(self.count):
            prior = np.log(self.prior[i])
            conditional = np.sum(np.log(self.gaussian_density(i, x)))
            posterior = prior + conditional
            posteriors_for_one.append(posterior)
        self.posteriors.append(posteriors_for_one)
        return self.classes[np.argmax(posteriors_for_one)]

    def fit(self, features, target):
        self.features = features
        self.target = target
        self.classes = np.unique(target)
        self.count = len(self.classes)
        self.feature_nums = features.shape[1]
        self.rows = features.shape[0]
        self.calc_statistics(features, target)
        self.calc_prior(features, target)
        self.posteriors = []

    def predict(self, X_test, y_test):
        y_prediction = [self.calc_posterior(f) for f in X_test.to_numpy()]
        accuracy = np.sum(y_test == y_prediction) / len(y_test)
        return (y_prediction, accuracy)


In [316]:
def sklearn_GaussianNB(X_train, y_train, X_test, y_test):
    gnb = GaussianNB();
    model = gnb.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    accuracy = np.sum(y_test == y_pred) / len(y_test)
    return y_pred, accuracy

#Misc

In [313]:
df_data = Dataset("https://raw.githubusercontent.com/Fraisel/ML-Labs/main/breast-cancer-wisconsin.csv")
df_data.prepare("diagnosis")
df_data.split(0.2)
# df_data.data

df_data.classify()

         id diagnosis  ...  symmetry_worst  fractal_dimension_worst
0    842302         M  ...          0.4601                  0.11890
1    842517         M  ...          0.2750                  0.08902
2  84300903         M  ...          0.3613                  0.08758
3  84348301         M  ...          0.6638                  0.17300
4  84358402         M  ...          0.2364                  0.07678

[5 rows x 32 columns]
Both classifiers trained and tested!



In [314]:
df_data.show_accuracy()

No handles with labels found to put in legend.



Accuracy of pre-build GaussianNB classifier is 96.49%
Accuracy of our GaussianNB classifier is 94.74%

Confusion matrix
[[75  1]
 [ 5 33]]

Classification Report
              precision    recall  f1-score   support

           B       0.94      0.99      0.96        76
           M       0.97      0.87      0.92        38

    accuracy                           0.95       114
   macro avg       0.95      0.93      0.94       114
weighted avg       0.95      0.95      0.95       114



KeyError: ignored

<Figure size 1224x576 with 1 Axes>