In [None]:
import pandas as pd
import numpy as np

from scipy.stats import multivariate_normal
import math

from sklearn import preprocessing
from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.metrics import accuracy_score, precision_score, recall_score
import matplotlib.pyplot as plt
import matplotlib.cm as cm

from scipy.stats import sem, t
from scipy import mean

import warnings
warnings.filterwarnings('ignore')

In [None]:
#Based on: https://towardsdatascience.com/how-to-impliment-a-gaussian-naive-bayes-classifier-in-python-from-scratch-11e0b80faf5a
class gaussClf:
    """Multi-view Gaussian naive Bayes classifier.

    For every view (feature set) and every class, a Gaussian with a diagonal
    covariance matrix is estimated. A sample is scored per class by adding, in
    log-space, the clamped per-view densities and the class prior weighted by
    ``1 - L``, where ``L`` is the number of views.

    NOTE(review): the ``1 - L`` prior exponent is the one used by the product
    rule for combining per-view *posteriors*; here it is applied to per-view
    *likelihoods*. Confirm this is the intended decision rule.
    """

    # Bounds applied to a per-view density before its logarithm is taken, so a
    # density that underflowed to 0.0 (or is extremely large) stays finite.
    _ALPHA_MIN = 1e-200
    _ALPHA_MAX = 1e+200

    def separate_by_classes(self, X, y):
        """Split the rows of ``X`` by class label and store the class priors.

        Sets ``self.classes`` (sorted unique labels) and ``self.class_freq``
        (label -> relative frequency of that label in ``y``).

        Returns:
            dict mapping each label to the rows of ``X`` belonging to it. Each
            value has shape ``(n_c, 1, n_features)`` because the rows are
            selected with the column-shaped index returned by ``np.argwhere``.

        Raises:
            ValueError: if ``X`` and ``y`` do not have the same number of samples.
        """
        X = np.asarray(X)
        y = np.asarray(y).ravel()
        if X.shape[0] != y.shape[0]:
            raise ValueError(
                'X has %d samples but y has %d labels' % (X.shape[0], y.shape[0]))

        self.classes, counts = np.unique(y, return_counts=True)
        # Divide every count by the fixed total. Normalizing entry by entry
        # against a running sum of the dict being updated would skew every
        # prior after the first one.
        total = counts.sum()
        self.class_freq = {cls: count / total
                           for cls, count in zip(self.classes, counts)}

        return {cls: X[np.argwhere(y == cls), :] for cls in self.classes}

    def meanCov(self, X, y):
        """Estimate the per-class feature means and diagonal covariance matrices.

        Returns:
            ``(mean_c, sigma_c)``: two dicts keyed by class label.
            ``mean_c[label]`` is a 1-D array of feature means and
            ``sigma_c[label]`` a square matrix holding the per-feature
            population variances (``ddof=0``) on its diagonal, zeros elsewhere.
        """
        separated_X = self.separate_by_classes(X, y)
        mean_c = {}
        sigma_c = {}
        for class_type in self.classes:
            rows = np.asarray(separated_X[class_type])
            # Flatten to (n_c, n_features) explicitly; this also keeps the
            # sample/feature axes apart when a view has a single feature.
            Xc = rows.reshape(-1, rows.shape[-1])
            mean_c[class_type] = Xc.mean(axis=0)
            sigma_c[class_type] = np.diag(Xc.var(axis=0))
        return mean_c, sigma_c

    def fit(self, X, y):
        """Fit the per-class Gaussians of every view.

        Args:
            X: sequence of views; every view holds one row per sample, aligned
               with ``y``.
            y: class label of every sample.

        The parameters are stored in ``self.view_params_`` (one
        ``(means, sigmas)`` pair per view) and, for compatibility, in the
        attributes ``meansView<k>`` / ``sigmaView<k>`` (k = 1, 2, 3, ...).

        Raises:
            ValueError: if ``X`` contains no view.
        """
        if len(X) == 0:
            raise ValueError('X must contain at least one view')

        self.view_params_ = [self.meanCov(view, y) for view in X]
        for number, (means, sigmas) in enumerate(self.view_params_, start=1):
            setattr(self, 'meansView%d' % number, means)
            setattr(self, 'sigmaView%d' % number, sigmas)

    def calculate_probability(self, x, mean, sigmad):
        """Return the multivariate normal density of ``x``.

        ``allow_singular=True`` makes scipy accept a singular covariance
        matrix, e.g. one with a zero variance on its diagonal.
        """
        return multivariate_normal.pdf(x, mean=mean, cov=sigmad, allow_singular=True)

    def testeP(self, x):
        """Clamp a density so that its logarithm is finite.

        Values above ``_ALPHA_MAX`` are capped and an exact ``0.0`` is replaced
        by ``_ALPHA_MIN``; every other value is returned unchanged.
        """
        if x > self._ALPHA_MAX:
            return self._ALPHA_MAX
        if x == 0.0:
            return self._ALPHA_MIN
        return x

    def _log_scores(self, X):
        """Return ``{label: log-score}`` for one sample given as a list of views."""
        n_views = len(X)
        if n_views != len(self.view_params_):
            raise ValueError('expected %d views, got %d'
                             % (len(self.view_params_), n_views))

        log_scores = {}
        for cls in self.classes:
            prior_term = (1 - n_views) * math.log(self.class_freq[cls])
            view_term = sum(
                math.log(self.testeP(
                    self.calculate_probability(x_view, means[cls], sigmas[cls])))
                for x_view, (means, sigmas) in zip(X, self.view_params_))
            log_scores[cls] = prior_term + view_term
        return log_scores

    def predict_proba_soma(self, X):
        """Return the unnormalized score of every class for one sample.

        Args:
            X: list with the feature vector of the sample in every view.

        Returns:
            dict ``{label: e ** log_score}``, also stored in ``self.class_prob``.
            A score too large for a float is reported as ``inf`` instead of
            raising ``OverflowError``; very negative scores underflow to 0.0,
            so use ``predict`` (which compares log-scores) to pick a class.
        """
        self.class_prob = {}
        for cls, score in self._log_scores(X).items():
            try:
                self.class_prob[cls] = math.e ** score
            except OverflowError:
                self.class_prob[cls] = float('inf')
        return self.class_prob

    def predict(self, X):
        """Predict the class label of every sample.

        Args:
            X: sequence of views, each indexable by sample position.

        Returns:
            list with one predicted label per sample. Classes are compared by
            log-score, so a label is still chosen when every exponentiated
            score would underflow to 0.0; ties go to the first class in
            ``self.classes``.
        """
        pred = []
        for i in range(len(X[0])):
            sample = [view[i] for view in X]
            log_scores = self._log_scores(sample)
            pred.append(max(log_scores, key=log_scores.get, default=None))
        return pred

In [None]:
# Dataset: three feature tables (views) plus one label table, all header-less
# and whitespace-separated, downloaded from the same repository folder.
DATA_URL = 'https://raw.githubusercontent.com/Francimaria/Machine-Learning-Python/master/'


def _read_table(file_name):
    """Read a header-less, whitespace-separated table located under DATA_URL."""
    # sep=r'\s+' is the supported replacement for the deprecated
    # delim_whitespace=True keyword of pandas.read_csv.
    return pd.read_csv(DATA_URL + file_name, header=None, sep=r'\s+')


# Normalization: with norm='max' every sample (row) is divided by its largest
# absolute value; the result is a NumPy array, not a DataFrame.
view1 = preprocessing.normalize(_read_table('mfeat-fac.csv'), 'max')
view2 = preprocessing.normalize(_read_table('mfeat-fou.csv'), 'max')
view3 = preprocessing.normalize(_read_table('mfeat-kar.csv'), 'max')

# Class labels: the first column of the label table, as a 1-D array.
classes = _read_table('cluster_membership_tres_m.csv')

y = np.array(classes.values).T[0]



In [None]:
# 10-fold stratified cross-validation repeated 30 times (300 folds in total).
# The fixed seed makes the fold assignment, and therefore the scores,
# reproducible after a kernel restart.
RANDOM_STATE = 42
rkf = RepeatedStratifiedKFold(n_splits=10, n_repeats=30, random_state=RANDOM_STATE)

# Accuracy obtained on the test part of every fold.
scores = []

bg = gaussClf()

for train_index, test_index in rkf.split(view1, y):
    # The same indices slice every view so that the samples stay aligned.
    v1_train, v1_test = view1[train_index], view1[test_index]
    v2_train, v2_test = view2[train_index], view2[test_index]
    v3_train, v3_test = view3[train_index], view3[test_index]
    y_train, y_test = y[train_index], y[test_index]

    X_train = [v1_train, v2_train, v3_train]
    X_test = [v1_test, v2_test, v3_test]

    # fit() re-estimates every parameter, so reusing `bg` across folds is safe.
    bg.fit(X_train, y_train)

    y_pred = bg.predict(X_test)
    acc = accuracy_score(y_test, y_pred)
    scores.append(acc)

In [None]:
# Show the accuracy collected for every cross-validation fold.
print(scores)

[0.765, 0.69, 0.685, 0.65, 0.695, 0.685, 0.67, 0.72, 0.685, 0.685, 0.7, 0.61, 0.675, 0.695, 0.73, 0.645, 0.725, 0.705, 0.705, 0.71, 0.71, 0.695, 0.695, 0.74, 0.675, 0.67, 0.67, 0.655, 0.68, 0.7, 0.735, 0.745, 0.7, 0.65, 0.68, 0.64, 0.695, 0.705, 0.65, 0.7, 0.73, 0.74, 0.67, 0.665, 0.69, 0.7, 0.615, 0.67, 0.715, 0.68, 0.68, 0.69, 0.68, 0.73, 0.685, 0.72, 0.7, 0.71, 0.645, 0.655, 0.73, 0.665, 0.725, 0.685, 0.705, 0.67, 0.73, 0.68, 0.63, 0.715, 0.7, 0.655, 0.69, 0.665, 0.715, 0.72, 0.65, 0.725, 0.715, 0.675, 0.68, 0.705, 0.735, 0.655, 0.74, 0.68, 0.63, 0.685, 0.715, 0.68, 0.66, 0.68, 0.72, 0.725, 0.65, 0.71, 0.68, 0.72, 0.66, 0.67, 0.64, 0.69, 0.68, 0.705, 0.69, 0.745, 0.635, 0.75, 0.695, 0.675, 0.64, 0.705, 0.67, 0.68, 0.74, 0.68, 0.685, 0.68, 0.72, 0.685, 0.695, 0.645, 0.715, 0.705, 0.63, 0.71, 0.645, 0.76, 0.71, 0.7, 0.665, 0.67, 0.745, 0.675, 0.615, 0.71, 0.71, 0.69, 0.745, 0.65, 0.7, 0.71, 0.695, 0.715, 0.71, 0.665, 0.67, 0.68, 0.67, 0.705, 0.695, 0.725, 0.64, 0.655, 0.705, 0.695, 0.

In [None]:
# Two-sided Student-t confidence interval for the mean fold accuracy.
# NOTE(review): folds of a repeated cross-validation share training samples,
# so the scores are not independent and this interval is probably too narrow.
confidence = 0.95

n = len(scores)
# np.mean replaces the NumPy alias that used to be importable from the
# top-level scipy namespace (deprecated there).
m = np.mean(scores)
print(m)
std_err = sem(scores)
print(std_err)
# Half-width: standard error times the t critical value with n - 1 degrees
# of freedom.
h = std_err * t.ppf((1 + confidence) / 2, n - 1)

start = m - h
print(start)
end = m + h
print(end)

0.6898166666666666
0.0018178322588029271
0.6862393006311842
0.693394032702149
