In [1]:
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np

import math

In [2]:
# Load the raw dataset. header=None tells pandas the file has no header row,
# so the columns are labelled with integers starting at 0.
data = pd.read_csv('wdbc.txt', sep=",", header=None)

In [3]:
# Sanity check: dataset dimensions and a preview of the first rows.
print(data.shape)
data.head()

(569, 32)


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,22,23,24,25,26,27,28,29,30,31
0,842302,0,17.99,10.38,122.8,1001.0,0.1184,0.2776,0.3001,0.1471,...,25.38,17.33,184.6,2019.0,0.1622,0.6656,0.7119,0.2654,0.4601,0.1189
1,842517,0,20.57,17.77,132.9,1326.0,0.08474,0.07864,0.0869,0.07017,...,24.99,23.41,158.8,1956.0,0.1238,0.1866,0.2416,0.186,0.275,0.08902
2,84300903,0,19.69,21.25,130.0,1203.0,0.1096,0.1599,0.1974,0.1279,...,23.57,25.53,152.5,1709.0,0.1444,0.4245,0.4504,0.243,0.3613,0.08758
3,84348301,0,11.42,20.38,77.58,386.1,0.1425,0.2839,0.2414,0.1052,...,14.91,26.5,98.87,567.7,0.2098,0.8663,0.6869,0.2575,0.6638,0.173
4,84358402,0,20.29,14.34,135.1,1297.0,0.1003,0.1328,0.198,0.1043,...,22.54,16.67,152.2,1575.0,0.1374,0.205,0.4,0.1625,0.2364,0.07678


In [4]:
# Extract the class labels (column 1) as a NumPy array
y = data[1].to_numpy()

In [5]:
# Inspect the label vector.
# NOTE(review): the slice starts at index 1, so the label at index 0 is not
# shown — confirm this is intended.
print(y.shape)
print(y[1:20])

(569,)
[0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1]


In [6]:
# The features are the first 10 feature columns of the table (column labels 2 to 11;
# .loc slices by label and includes both endpoints)
X = data.loc[:,2:11].to_numpy()

In [7]:
# Inspect the feature matrix.
# NOTE(review): the slice starts at index 1, so row 0 is not shown — confirm
# this is intended.
print(X.shape)
print(X[1:5])

(569, 10)
[[2.057e+01 1.777e+01 1.329e+02 1.326e+03 8.474e-02 7.864e-02 8.690e-02
  7.017e-02 1.812e-01 5.667e-02]
 [1.969e+01 2.125e+01 1.300e+02 1.203e+03 1.096e-01 1.599e-01 1.974e-01
  1.279e-01 2.069e-01 5.999e-02]
 [1.142e+01 2.038e+01 7.758e+01 3.861e+02 1.425e-01 2.839e-01 2.414e-01
  1.052e-01 2.597e-01 9.744e-02]
 [2.029e+01 1.434e+01 1.351e+02 1.297e+03 1.003e-01 1.328e-01 1.980e-01
  1.043e-01 1.809e-01 5.883e-02]]


<center>To compute the mean matrix</center>
<center>j = feature</center>
<center>k = label</center>

![title](img/moyenne.png)

<br>

<center>To compute the standard deviation matrix</center>
<center>j = feature</center>
<center>k = label</center>

![title](img/ecarttype.png)

In [8]:
# Training function

def train(X, y, min_sigma=1e-9):
    """Fit the parameters of a Gaussian naive Bayes classifier.

    Parameters
    ----------
    X : ndarray of shape (N, D)
        Feature matrix, one sample per row.
    y : ndarray of shape (N,)
        Class label of each sample. Labels may be any values accepted by
        ``np.unique``; they do not have to be the integers ``0..K-1``.
    min_sigma : float, optional
        Lower bound applied to every standard deviation. It keeps a feature
        that is constant within a class from producing ``Sigma == 0``, which
        would lead to ``log(0)`` and a division by zero at prediction time.
        Pass ``0`` to disable the floor.

    Returns
    -------
    Theta : ndarray of shape (K,)
        Prior probability of each class, ``P(y = k) = count_k / N``.
    Mu : ndarray of shape (K, D)
        Per-class mean of every feature.
    Sigma : ndarray of shape (K, D)
        Per-class population standard deviation (``ddof=0``) of every feature.

    Row ``i`` of each returned array corresponds to the ``i``-th value of
    ``np.unique(y)`` (sorted order). For labels ``0..K-1`` the row index is
    therefore the label itself.
    """
    # N = number of samples, D = number of features
    N, D = X.shape

    # Iterate over the labels that actually occur rather than assuming 0..K-1;
    # the counts give the class priors directly.
    classes, counts = np.unique(y, return_counts=True)
    K = classes.size

    # Prior distribution
    Theta = counts / N

    # Mean and standard deviation matrices
    Mu = np.zeros((K, D))
    Sigma = np.zeros((K, D))

    for row, label in enumerate(classes):
        X_k = X[y == label, :]  # all rows whose label is `label`
        Mu[row, :] = X_k.mean(axis=0)
        Sigma[row, :] = X_k.std(axis=0)

    # Guard against zero-variance features
    np.maximum(Sigma, min_sigma, out=Sigma)

    return Theta, Mu, Sigma

In [9]:
# Fit the model parameters (class priors, per-class means and standard
# deviations) on the full dataset
Theta, Mu, Sigma = train(X, y)

<center>Classification</center>

![title](img/classification.png)

In [10]:
# Prediction function

def classify(x, Theta, Mu, Sigma):
    """Predict the class of one sample with Gaussian naive Bayes.

    Parameters
    ----------
    x : array-like of D values
        Feature vector of the sample to classify.
    Theta : ndarray of shape (K,)
        Class prior probabilities.
    Mu : ndarray of shape (K, D)
        Per-class feature means.
    Sigma : ndarray of shape (K, D)
        Per-class feature standard deviations (must be non-zero).

    Returns
    -------
    int
        Row index ``k`` (into ``Theta`` / ``Mu`` / ``Sigma``) with the highest
        score ``log P(x | y = k) + log P(y = k)``.
    """
    # Standardised residual of the sample under every class at once: (K, D)
    u = (np.ravel(x) - Mu) / Sigma

    # Unnormalised log posterior = log likelihood + log prior. The constant
    # -D/2 * log(2*pi) is the same for every class and is dropped.
    # Summing over axis 1 yields true scalars per class; the previous
    # (1, D) @ (D, 1) product produced a (1, 1) array whose implicit
    # conversion to a scalar is deprecated in NumPy >= 1.25.
    log_joint = -np.log(Sigma).sum(axis=1) - (u * u).sum(axis=1) / 2 + np.log(Theta)

    return np.argmax(log_joint)

In [11]:
# Classify a single sample (row 100 of X) as a quick smoke test
prediction_100 = classify(X[100, :], Theta, Mu, Sigma)
prediction_100

1

In [12]:
# Evaluation function

def evaluate(X, y, Theta, Mu, Sigma):
    """Return the fraction of samples whose predicted class equals its label.

    Parameters
    ----------
    X : ndarray of shape (N, D)
        Feature matrix, one sample per row.
    y : ndarray holding N labels
        Ground-truth label of each sample. Predictions are the row indices
        returned by ``classify``, so the result is meaningful only when the
        labels coincide with those indices (e.g. labels ``0..K-1``).
    Theta, Mu, Sigma : ndarray
        Model parameters, passed through unchanged to ``classify``.

    Returns
    -------
    float
        Accuracy in ``[0, 1]``.

    Raises
    ------
    ZeroDivisionError
        If ``y`` is empty.
    """
    N = y.size

    # Integer prediction vector (class indices), one entry per sample.
    predictions = np.array(
        [classify(X[i, :], Theta, Mu, Sigma) for i in range(N)],
        dtype=int,
    )

    # Flatten y so a column-shaped label array cannot broadcast the comparison
    # to (N, N); count matches with NumPy instead of the Python builtin sum.
    return np.count_nonzero(predictions == np.ravel(y)) / N

In [13]:
# NOTE: this scores the model on the same X, y it was trained on, so the value
# is training accuracy, not an estimate of performance on unseen data.
evaluate(X, y, Theta, Mu, Sigma)

0.9138840070298769