# Trabalho 03: Naive Bayes Bayesiano
Andreza Fernandes de Oliveira


In [11]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from scipy.stats import multivariate_normal
%matplotlib inline

# Base directory that the data files of this assignment are read from.
# NOTE(review): the "/content/drive/MyDrive" prefix looks like a Google Colab
# Drive mount — confirm/adjust before running in another environment.
path = "/content/drive/MyDrive/Acadêmico/UFC/Mestrado/Disciplinas/2020.2/Tópicos Especiais em Lógica II/TRABALHOS/Trabalho 03/"

## 01. Implementação do modelo de Naive Bayes Bayesiano

### a. Leitura do dado e tratamento

In [3]:
# Load the votes dataset from the assignment directory. header=None tells
# pandas not to treat any row as a header, so columns get integer labels.
data = pd.read_csv(path+"votesDataset.csv", header=None)
# Display the first 10 rows as a quick sanity check of the parsed columns.
data.head(10)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16
0,,,,,,,,,,,,,,,,,
1,republican,0.0,1.0,0.0,1.0,1.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,1.0,0.0,0.0,1.0
2,republican,0.0,1.0,0.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0
3,democrat,0.0,1.0,1.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0
4,democrat,0.0,1.0,1.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0
5,democrat,1.0,1.0,1.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,1.0
6,democrat,0.0,1.0,1.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,1.0
7,democrat,0.0,1.0,0.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,1.0
8,republican,0.0,1.0,0.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0
9,republican,0.0,1.0,0.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0


In [9]:
# Show the (rows, columns) dimensions of the loaded DataFrame.
data.shape

(436, 17)

In [10]:
# plt.figure(figsize=(10,10))
# plt.scatter(data[0], data[1], alpha=0.2, s=120, c=data[2], cmap='viridis')
# plt.xlabel('X0')
# plt.ylabel('X1');
# plt.title(f'Distribuição dos dados')

### b. Funções

In [None]:
class GaussianDiscriminant():
    """Gaussian discriminant classifier with a covariance shared by all classes.

    ``fitAGD`` estimates, from labelled data, the class priors, the per-class
    feature means and a single pooled covariance matrix. ``predict`` assigns
    each sample to the class that maximises prior * Gaussian density.
    """

    def __init__(self):
        pass

    def fitAGD(self, X, y):
        """Estimate priors, class means and the pooled covariance.

        Parameters:
            X: 2-D array-like of shape (n_samples, n_features) with numeric
                values.
            y: array-like with one class label per row of ``X``.

        Returns:
            dict with the keys 'media' (class means), 'covar' (pooled
            covariance), 'classes', 'numRows', 'numClasses', 'numFeatures'
            and 'probabilidadeClasses' (class priors, shape (numClasses, 1)).
        """
        # Coerce to plain arrays so that row indexing below works for lists
        # and pandas objects as well as ndarrays.
        X = np.asarray(X, dtype=float)
        y = np.asarray(y).ravel()

        self.__classes = np.unique(y)  # sorted distinct labels
        numClasses = len(self.__classes)
        n, numFeatures = X.shape  # number of rows and columns of the dataset

        self.__sigma = 0
        self.__probabilidadeClasses = np.zeros((numClasses, 1))
        # One row of feature means per class.
        self.__media = np.zeros((numClasses, numFeatures))

        for i, classe in enumerate(self.__classes):
            # Row positions of the samples that belong to the current class.
            indices = np.flatnonzero(y == classe)
            amostras = X[indices]

            self.__probabilidadeClasses[i] = len(indices) / n
            self.__media[i] = np.mean(amostras, axis=0)
            # Accumulate the class scatter matrix. The biased covariance times
            # the class size equals the unbiased covariance times (size - 1),
            # but stays finite (all zeros) when a class has a single sample
            # instead of turning the whole pooled covariance into NaN.
            self.__sigma += np.cov(amostras.T, bias=True) * len(indices)

        # Pooled covariance: total within-class scatter over the sample count.
        self.__sigma /= n

        return {'media': self.__media, 'covar': self.__sigma, 'classes': self.__classes, 'numRows': n, 'numClasses': numClasses,
                'numFeatures': numFeatures, 'probabilidadeClasses': self.__probabilidadeClasses}

    def predict(self, X):
        """Return the most probable class label for every sample in ``X``.

        Parameters:
            X: array-like of shape (n_samples, n_features); a single sample
                may also be given as a 1-D vector of n_features values.

        Returns:
            1-D array of labels with one entry per sample.
        """
        X = np.asarray(X, dtype=float)
        if X.ndim < 2:
            # A 1-D input is a single sample (or, with one feature, a column
            # of samples); give it an explicit (n_samples, n_features) shape.
            X = X.reshape(-1, self.__media.shape[1])

        # Scores are computed in log space: the argmax is the same as for
        # prior * pdf, without the underflow of multiplying tiny densities.
        log_priors = np.log(self.__probabilidadeClasses.ravel())
        scores = np.empty((len(self.__classes), X.shape[0]))
        for i, mean in enumerate(self.__media):
            # logpdf yields a scalar for a single row; assigning it to the
            # score row keeps the (numClasses, n_samples) layout, which is what
            # makes the argmax below correct for one-sample inputs.
            scores[i] = multivariate_normal.logpdf(X, mean=mean, cov=self.__sigma) + log_priors[i]

        return self.__classes[np.argmax(scores, axis=0)]

## 02. Visualizações

In [None]:
from scipy import stats

def plot_data(x, t, label=None):
    """Scatter the points (x, t) as black circle markers of size 50.

    ``label`` is forwarded to matplotlib as the legend entry.
    """
    marker_style = {'marker': 'o', 'c': "k", 's': 50}
    plt.scatter(x, t, label=label, **marker_style)

def plot_valor_esperado(x, y, label=None):
    """Plot the points (x, y) in red using the 'x' format string.

    ``label`` is forwarded to matplotlib as the legend entry.
    """
    marker_format = 'x'
    plt.plot(x, y, marker_format, label=label, color='r')

def plot_predictive(x, y, En, std_times=1):
    """Plot the curve y over x with a shaded band around it.

    The half-width of the band is ``sqrt(En) * std_times``, computed
    element-wise after flattening ``En``; ``y`` is flattened as well.
    """
    prediction = y.ravel()
    half_width = np.sqrt(En.ravel()) * std_times
    upper = prediction + half_width
    lower = prediction - half_width

    plt.plot(x, prediction, label="Predição")
    plt.fill_between(x.ravel(), upper, lower, alpha=.5, label="Incerteza")

def plot_posterior_samples(x, ys, plot_xy_labels=True):
    """Draw every column of ``ys`` against ``x`` as a dashed red line.

    Only the first column carries the legend label, so the legend shows a
    single entry for all curves. ``plot_xy_labels`` is accepted but not used.
    """
    line_style = {'alpha': 0.5}
    # The first curve is drawn separately because it is the only labelled one.
    plt.plot(x, ys[:, 0], 'r--', label='Modelos', **line_style)
    for curve in ys[:, 1:].T:
        plt.plot(x, curve, 'r--', **line_style)

def plot_posterior(mean, cov, resolution = 100):
    """Show a 2-D Gaussian density as an image over the square [-1, 1] x [-1, 1].

    The density with the given ``mean`` (flattened before use) and ``cov`` is
    evaluated on a ``resolution`` x ``resolution`` grid and drawn with imshow.
    """
    axis_values = np.linspace(-1, 1, resolution)
    xx, yy = np.meshgrid(axis_values, axis_values)
    # One (x, y) pair per row, in the row-major order of the grid.
    points = np.column_stack((xx.ravel(), yy.ravel()))

    flat_density = stats.multivariate_normal.pdf(points, mean=mean.ravel(), cov=cov)
    image = flat_density.reshape(resolution, resolution)
    plt.imshow(image, origin='lower', extent=(-1, 1, -1, 1))