# TP - Introduction aux méthodes de classification d’images

## Manipulation d’un jeu de données

In [17]:
import sklearn.datasets
from sklearn.model_selection import train_test_split

iris = sklearn.datasets.load_iris()
X, y = iris.data, iris.target
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25)

print(X.shape, y.shape)
print(X_train.shape, y_train.shape)
print(X_test.shape, y_test.shape)

(150, 4) (150,)
(112, 4) (112,)
(38, 4) (38,)


### Validations croisées

#### Leave-One-Out

Successivement, l'algorithme va choisir un élément à retirer de la liste de tout les éléments du jeu de donnée, qui va être utiliser pour le test. Les autres seront utilisés pour l'entraînement.
On applique cet algorithme pour chaque élément.

In [18]:
from sklearn.model_selection import LeaveOneOut

X = [1, 2, 3, 4, 5]
loo = LeaveOneOut()
for train, test in loo.split(X):
    print("%s %s" % (train, test))

[1 2 3 4] [0]
[0 2 3 4] [1]
[0 1 3 4] [2]
[0 1 2 4] [3]
[0 1 2 3] [4]


#### K-Fold

Les éléments de jeu de données sont répartis dans k groupes. 
L'algorithme va choisir successivement un groupe qui sera utilisé pour le test,tandis que les autres seront utilisés pour l'entraînement.

In [19]:
 from sklearn.model_selection import GroupKFold

X = [0.1, 0.2, 2.2, 2.4, 2.3, 4.55, 5.8, 8.8, 9, 10]
y = ["a", "b", "b", "b", "c", "c", "c", "d", "d", "d"]
groups = [1, 1, 1, 2, 2, 2, 3, 3, 3, 3]

gkf = GroupKFold(n_splits=3)
for train, test in gkf.split(X, y, groups=groups):
     print("%s %s" % (train, test))

[0 1 2 3 4 5] [6 7 8 9]
[0 1 2 6 7 8 9] [3 4 5]
[3 4 5 6 7 8 9] [0 1 2]


#### Stratifield-K-Fold

Le but est d'appliquer le l'algorithme k-fold tout en respectant la répartition (proportion) du nombres d'éléments de chaque classe
Par exemple pour 2 classes de respectivement 10 éléments et 4 éléments, les groupes de k-folds auront une répartition de 10/14 et 4/14

In [27]:
from sklearn.model_selection import StratifiedKFold, KFold
import numpy as np
X, y = np.ones((50, 1)), np.hstack(([0] * 45, [1] * 5))
skf = StratifiedKFold(n_splits=3)

print("Method Stratifield-K-Fold :\n")
for train, test in skf.split(X, y):
    print('train -  {}   |   test -  {}'.format(
        np.bincount(y[train]), np.bincount(y[test])))
print("\nMethod K fold :\n")    
kf = KFold(n_splits=3)
for train, test in kf.split(X, y):
    print('train -  {}   |   test -  {}'.format(
        np.bincount(y[train]), np.bincount(y[test])))

Method Stratifield-K-Fold :

train -  [30  3]   |   test -  [15  2]
train -  [30  3]   |   test -  [15  2]
train -  [30  4]   |   test -  [15  1]

Method K fold :

train -  [28  5]   |   test -  [17]
train -  [28  5]   |   test -  [17]
train -  [34]   |   test -  [11  5]


## Méthode des k plus proches voisins

In [63]:
import sklearn.datasets
import sklearn.metrics as skm
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier


iris = sklearn.datasets.load_iris()
X, y = iris.data, iris.target
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.50)
clf = KNeighborsClassifier(n_neighbors = 3)
clf.fit(X_train, y_train)
preds = clf.predict(X_test)
print(preds)
print(y_test, "\n")

print(skm.accuracy_score(y_test, preds))
print(skm.confusion_matrix(y_test, preds))
print(skm.precision_score(y_test, preds, average=None))
print(skm.recall_score(y_test, preds, average=None))

[0 0 2 0 2 1 2 1 0 2 2 2 0 0 1 2 2 1 0 0 1 2 0 2 2 1 1 0 2 1 2 1 2 0 1 1 2
 2 1 2 0 0 1 0 2 0 2 0 1 0 1 2 0 1 2 2 0 0 2 1 2 0 0 1 1 0 1 2 2 1 1 0 1 1
 0]
[0 0 2 0 2 1 2 1 0 2 1 2 0 0 1 2 2 1 0 0 1 2 0 2 2 1 1 0 2 1 2 1 2 0 1 1 2
 2 1 2 0 0 1 0 1 0 2 0 1 0 1 2 0 1 2 2 0 0 2 1 2 0 0 1 1 0 1 2 1 1 1 0 1 1
 0] 

0.96
[[25  0  0]
 [ 0 24  3]
 [ 0  0 23]]
[1.         1.         0.88461538]
[1.         0.88888889 1.        ]


## Classifieur bayésien naïf

In [22]:
import sklearn.datasets
from sklearn.model_selection import LeaveOneOut
from sklearn.naive_bayes import GaussianNB
import sklearn.metrics as skm

from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier

wine = sklearn.datasets.load_wine()
wineX, wineY = wine.data, wine.target

loo = LeaveOneOut()
total = 0
for train, test in loo.split(wineX):
    gnb = GaussianNB()
    trainWine = []
    targetWine = []
    for i in range(len(train)):
        trainWine.append(wineX[train[i]])
        targetWine.append(wineY[train[i]])
    
    gnb.fit(trainWine, targetWine)
    preds = gnb.predict([wineX[test[0]]])
    total = total + skm.accuracy_score([wineY[test[0]]], preds)

print(total)
print(total / len(wineX))


X_train, X_test, y_train, y_test = train_test_split(wineX, wineY, test_size=0.50)
clf = KNeighborsClassifier(n_neighbors = 3)
clf.fit(X_train, y_train)
preds = clf.predict(X_test)

print(skm.accuracy_score(y_test, preds))

174.0
0.9775280898876404
0.7191011235955056
