# Introduction au Machine Learning 

Ce notebook permet d'introduire les principales méthodes de regroupement (clustering) et de classification utilisées en Machine Learning.
Pour cela, nous allons utiliser la bibliothèque Scikit-learn de Python.

Pour en savoir plus et consulter la documentation très détaillée de cette bibliothèque : https://scikit-learn.org/stable/index.html#

## Méthodes de clustering

Ces méthodes sont non supervisées, c'est-à-dire qu'il n'existe pas a priori d'étiquettes associées à chaque valeur.



### KMEANS

In [None]:
import matplotlib.pyplot as plt 
from sklearn.datasets import make_blobs
from sklearn.cluster import KMeans

In [None]:
# Synthetic 2-D dataset: 250 samples spread around 3 centres with a standard
# deviation of 0.5. The seed is fixed so that the figures are reproducible.
X, y = make_blobs(
    n_samples=250,
    n_features=2,
    centers=3,
    cluster_std=0.5,
    shuffle=True,
    random_state=0,
)

In [None]:
# Draw the raw samples with a single neutral colour: the labels `y` are
# deliberately not used here.
plt.scatter(
    X[:, 0],
    X[:, 1],
    s=50,
    c='white',
    marker='o',
    edgecolor='black',
)
plt.grid()
plt.show()

In [None]:
# K-means with 3 clusters and randomly chosen initial centroids; the seed
# is fixed so the resulting partition is reproducible.
km = KMeans(
    n_clusters=3,
    init='random',
    max_iter=300,
    tol=1e-04,
    random_state=0,
)
# Fit the model on X and retrieve the cluster index assigned to each sample.
y_km = km.fit_predict(X)

In [None]:
# One (colour, marker) pair per k-means cluster index, in index order.
cluster_styles = [('lightgreen', 's'), ('orange', 'o'), ('lightblue', 'v')]
for cluster_id, (cluster_color, cluster_marker) in enumerate(cluster_styles):
    in_cluster = y_km == cluster_id
    plt.scatter(
        X[in_cluster, 0],
        X[in_cluster, 1],
        s=50,
        c=cluster_color,
        marker=cluster_marker,
        edgecolor='black',
        label='cluster {}'.format(cluster_id + 1),
    )

# Overlay the centroids found by k-means.
plt.scatter(
    km.cluster_centers_[:, 0],
    km.cluster_centers_[:, 1],
    s=250,
    marker='*',
    c='red',
    edgecolor='black',
    label='centroids',
)
plt.legend(scatterpoints=1)
plt.grid()
# Close the figure explicitly, like the other plotting cells, so that later
# plots never draw onto these axes when the code runs outside a notebook.
plt.show()
    

In [None]:
from sklearn.datasets import make_moons

# Two-moons dataset: 200 samples with a noise level of 0.05 and a fixed seed.
# It replaces the blobs stored in X and y by the earlier cells.
X, y = make_moons(
    n_samples=200,
    noise=0.05,
    random_state=0,
)

# Show the raw samples without any colouring.
plt.scatter(
    X[:, 0],
    X[:, 1],
    s=50,
    c='white',
    marker='o',
    edgecolor='black',
)
plt.grid()
plt.show()

In [None]:
# K-means with 2 clusters on the current X, using randomly chosen initial
# centroids and a fixed seed for reproducibility.
km = KMeans(
    n_clusters=2,
    init='random',
    max_iter=300,
    tol=1e-04,
    random_state=0,
)
y_km = km.fit_predict(X)

# One (colour, marker) pair per k-means cluster index, in index order.
cluster_styles = [('lightgreen', 's'), ('orange', 'o')]
for cluster_id, (cluster_color, cluster_marker) in enumerate(cluster_styles):
    in_cluster = y_km == cluster_id
    plt.scatter(
        X[in_cluster, 0],
        X[in_cluster, 1],
        s=50,
        c=cluster_color,
        marker=cluster_marker,
        edgecolor='black',
        label='cluster {}'.format(cluster_id + 1),
    )

# Overlay the centroids found by k-means.
plt.scatter(
    km.cluster_centers_[:, 0],
    km.cluster_centers_[:, 1],
    s=250,
    marker='*',
    c='red',
    edgecolor='black',
    label='centroids',
)
plt.legend(scatterpoints=1)
plt.grid()
# Close the figure explicitly, like the other plotting cells, so that later
# plots never draw onto these axes when the code runs outside a notebook.
plt.show()

### DBSCAN 

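Article de référence : M. Ester, H.-P. Kriegel, J. Sander et X. Xu, « A Density-Based Algorithm for Discovering Clusters in Large Spatial Databases with Noise », KDD-96, 1996, p. 226-231.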

In [None]:
from sklearn.cluster import DBSCAN

# Density-based clustering of the current X: neighbourhood radius 0.2,
# at least 5 samples per neighbourhood, Euclidean distance.
db = DBSCAN(eps=0.2, min_samples=5, metric='euclidean')
y_db = db.fit_predict(X)

plt.scatter(X[y_db == 0, 0], X[y_db == 0, 1], c='lightgreen', edgecolor='black', marker='s', s=50, label='cluster 1')
plt.scatter(X[y_db == 1, 0], X[y_db == 1, 1], c='orange', edgecolor='black', marker='o', s=50, label='cluster 2')

# DBSCAN gives the label -1 to samples it assigns to no cluster. Draw them
# as well, when there are any, so that they do not vanish from the figure.
is_noise = y_db == -1
if is_noise.any():
    plt.scatter(X[is_noise, 0], X[is_noise, 1], c='black', marker='x', s=50, label='noise')

plt.legend(scatterpoints=1)
plt.grid()
plt.show()



## Méthode de régression 

In [None]:
import numpy as np
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import PolynomialFeatures
from sklearn.metrics import mean_squared_error

# Ten training samples, reshaped into column vectors of shape (10, 1).
X = np.array([258.0, 270.0, 294.0, 320.0, 342.0, 368.0, 396.0, 446.0, 480.0, 586.0]).reshape(-1, 1)
y = np.array([236.4, 234.4, 252.8, 298.6, 314.2, 342.2, 360.8, 368.0, 391.2, 390.8]).reshape(-1, 1)

# Two ordinary least-squares models: `lr` is fitted on the raw feature,
# `pr` on its polynomial expansion.
lr = LinearRegression()
pr = LinearRegression()

# Degree 2, in agreement with the "quadratic" names and plot label used below.
# The previous degree of 12 is more than ten samples can determine and, with
# x values of a few hundred, leads to a numerically ill-conditioned fit.
quadratic = PolynomialFeatures(degree=2)
X_quad = quadratic.fit_transform(X)

# Regular grid on which both fitted curves are evaluated for plotting.
X_fit = np.arange(250, 600, 2).reshape(-1, 1)

lr.fit(X, y)
y_lin_fit = lr.predict(X_fit)

pr.fit(X_quad, y)
# Use transform, not fit_transform: the expansion was already fitted on the
# training data and must only be applied to the evaluation grid.
y_quad_fit = pr.predict(quadratic.transform(X_fit))

plt.scatter(X, y, label='training points')
plt.plot(X_fit, y_lin_fit, label='linear fit', linestyle='--')
plt.plot(X_fit, y_quad_fit, label='quadratic fit')
plt.legend(loc='upper left')
plt.show()

# Predictions of both models on the training samples themselves.
y_lin_pred = lr.predict(X)
y_quad_pred = pr.predict(X_quad)

# NOTE: despite their names, R1 and R2 are mean squared errors measured on
# the training set (lower is better), not R^2 scores.
R1 = mean_squared_error(y, y_lin_pred)
R2 = mean_squared_error(y, y_quad_pred)

print(R1)
print(R2)

## Les méthodes supervisées

Technique d'apprentissage automatique consistant à apprendre une fonction de prédiction à partir d'exemples annotés.

### Machines à vecteurs de support (SVM)

Trouver une transformation permettant de linéariser le problème.
L'hyperplan séparateur est choisi de sorte que sa distance aux vecteurs de support (la marge) soit la plus grande possible.

In [None]:
from sklearn import svm

# Two-class toy problem: 250 samples around 2 centres, fixed seed.
X, y = make_blobs(n_samples=250, n_features=2, centers=2, cluster_std=0.5, shuffle=True, random_state=0)

# K-means partition of the same samples, used only to colour the points.
# Every sample falls in cluster 0 or 1, so the former uncoloured scatter of
# all points was entirely overdrawn and has been dropped.
km = KMeans(n_clusters=2, init='random', max_iter=300, tol=1e-04, random_state=0)
y_km = km.fit_predict(X)
plt.scatter(X[y_km == 0, 0], X[y_km == 0, 1], s=50, c='lightgreen', marker='s', edgecolor='black', label='cluster 1')
plt.scatter(X[y_km == 1, 0], X[y_km == 1, 1], s=50, c='orange', marker='o', edgecolor='black', label='cluster 2')
plt.legend(scatterpoints=1)
# Switch the grid on explicitly and only once: plt.grid() without arguments
# toggles the grid, so calling it twice on the same axes turned it off again.
plt.grid(True)

# Linear SVM trained on the true labels y (supervised learning).
model = svm.SVC(kernel='linear')
model.fit(X, y)

# Decision boundary w[0]*x + w[1]*y + intercept = 0, written as y = a*x + c.
w = model.coef_[0]
a = -w[0] / w[1]
xx = np.linspace(-1, 4)
yy = a * xx - (model.intercept_[0]) / w[1]

# The margin boundaries are the lines where the decision function equals -1
# and +1, i.e. the boundary shifted vertically by 1/|w[1]|. Lines drawn through
# individual support vectors are not reliable here, because with a soft-margin
# SVC a support vector may lie inside the margin.
margin_shift = 1.0 / abs(w[1])
yy_down = yy - margin_shift
yy_up = yy + margin_shift

plt.plot(xx, yy, linewidth=2, color='black')
plt.plot(xx, yy_down, 'k--')
plt.plot(xx, yy_up, 'k--')
# Circle the support vectors. An explicit edge colour is required: with
# facecolors='none' alone the edge follows the (empty) face colour and the
# markers are invisible.
plt.scatter(model.support_vectors_[:, 0], model.support_vectors_[:, 1], s=80, facecolors='none', edgecolors='black')
plt.show()



In [None]:
from sklearn.datasets import make_circles

# Two concentric circles: 600 samples, noise level 0.03, inner/outer scale
# factor 0.6. The seed is fixed to 0, like every other dataset in this
# notebook, so that the figure is identical from one run to the next.
X, y = make_circles(n_samples=600, shuffle=True, noise=0.03, random_state=0, factor=0.6)

# Show the raw samples without any colouring.
plt.scatter(X[:, 0], X[:, 1], c='white', marker='o', edgecolor='black', s=50)
plt.grid()
plt.show()

In [None]:
#model = svm.SVC(kernel='rbf', C= 1, gamma = 2**-5)
#model.fit(X,y)


### k plus proches voisins (kNN)

## Extraction de caractéristiques   

### Réduction de la dimension 

Analyse en composantes principales (ACP)

