# Description
# Jeux de données :
- `wine` pour la classification
- `diabetes` pour la régression

## Comparer

- Sans normalisation
- Avec `StandarScaler`
- Avec `MinMaxScaler`

## Les modèles

- Gaussian Processes
- Generalized Linear Models : Rigde et logistique
- Naive Bayes : Gaussian
- KNeighbors
- SVM
- Decision Trees

## Déterminer les hypeparamètres

via

In [1]:
from sklearn.linear_model import LinearRegression, Lasso, Ridge, ElasticNet

In [2]:
from sklearn.model_selection import train_test_split

In [3]:
import numpy as np
import matplotlib.pyplot as plt

In [4]:
from sklearn.datasets import load_wine

In [5]:
wine = load_wine()

In [6]:
dir(wine)

['DESCR', 'data', 'feature_names', 'target', 'target_names']

In [7]:
wine.data

array([[1.423e+01, 1.710e+00, 2.430e+00, ..., 1.040e+00, 3.920e+00,
        1.065e+03],
       [1.320e+01, 1.780e+00, 2.140e+00, ..., 1.050e+00, 3.400e+00,
        1.050e+03],
       [1.316e+01, 2.360e+00, 2.670e+00, ..., 1.030e+00, 3.170e+00,
        1.185e+03],
       ...,
       [1.327e+01, 4.280e+00, 2.260e+00, ..., 5.900e-01, 1.560e+00,
        8.350e+02],
       [1.317e+01, 2.590e+00, 2.370e+00, ..., 6.000e-01, 1.620e+00,
        8.400e+02],
       [1.413e+01, 4.100e+00, 2.740e+00, ..., 6.100e-01, 1.600e+00,
        5.600e+02]])

In [13]:
wine.target

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 2])

In [151]:
from sklearn.datasets import load_diabetes

In [152]:
diabetes = load_diabetes()

In [153]:
dir(diabetes)

['DESCR',
 'data',
 'data_filename',
 'feature_names',
 'target',
 'target_filename']

In [154]:
diabetes.data

array([[ 0.03807591,  0.05068012,  0.06169621, ..., -0.00259226,
         0.01990842, -0.01764613],
       [-0.00188202, -0.04464164, -0.05147406, ..., -0.03949338,
        -0.06832974, -0.09220405],
       [ 0.08529891,  0.05068012,  0.04445121, ..., -0.00259226,
         0.00286377, -0.02593034],
       ...,
       [ 0.04170844,  0.05068012, -0.01590626, ..., -0.01107952,
        -0.04687948,  0.01549073],
       [-0.04547248, -0.04464164,  0.03906215, ...,  0.02655962,
         0.04452837, -0.02593034],
       [-0.04547248, -0.04464164, -0.0730303 , ..., -0.03949338,
        -0.00421986,  0.00306441]])

## Classification

## Chargement des données, test/train

In [155]:
X_e, X_t, y_e, y_t = train_test_split(wine.data, wine.target)
print(X_e.shape, y_e.shape)

(133, 13) (133,)


## Normalisation

In [156]:
from sklearn.preprocessing import StandardScaler, MinMaxScaler

In [157]:
ss = StandardScaler()
ss.fit(X_e)
X_ess = ss.transform(X_e)
X_tss = ss.transform(X_t)

In [158]:
mm = MinMaxScaler()
mm.fit(X_e)
X_emm = mm.transform(X_e)
X_tmm = mm.transform(X_t)

## Sélection des modèles

In [159]:
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV

## Régression Logistique

In [160]:
from sklearn.linear_model import LogisticRegression

In [161]:
LogisticRegression

sklearn.linear_model.logistic.LogisticRegression

In [162]:
gs_lr = GridSearchCV(LogisticRegression(),
            {
                "C" : [2 ** n for n in range(-5, 10)],
                "penalty" : ["l1", "l2"],
                "max_iter" : [1000],
                "solver" : ["liblinear"],
                "multi_class" : ["auto"],
            },
                    cv=5)

In [163]:
gs_lr.fit(X_e, y_e)



GridSearchCV(cv=5, error_score='raise-deprecating',
             estimator=LogisticRegression(C=1.0, class_weight=None, dual=False,
                                          fit_intercept=True,
                                          intercept_scaling=1, l1_ratio=None,
                                          max_iter=100, multi_class='warn',
                                          n_jobs=None, penalty='l2',
                                          random_state=None, solver='warn',
                                          tol=0.0001, verbose=0,
                                          warm_start=False),
             iid='warn', n_jobs=None,
             param_grid={'C': [0.03125, 0.0625, 0.125, 0.25, 0.5, 1, 2, 4, 8,
                               16, 32, 64, 128, 256, 512],
                         'max_iter': [1000], 'multi_class': ['auto'],
                         'penalty': ['l1', 'l2'], 'solver': ['liblinear']},
             pre_dispatch='2*n_jobs', refit=True, return_tra

In [164]:
print(gs_lr.best_params_)

{'C': 256, 'max_iter': 1000, 'multi_class': 'auto', 'penalty': 'l1', 'solver': 'liblinear'}


In [165]:
lr_brut = LogisticRegression(C=32, penalty="l1",
                            solver="liblinear",
                            multi_class="auto")

lr_brut.fit(X_e, y_e)

LogisticRegression(C=32, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='auto', n_jobs=None, penalty='l1',
                   random_state=None, solver='liblinear', tol=0.0001, verbose=0,
                   warm_start=False)

In [166]:
gs_lr = GridSearchCV(LogisticRegression(),
            {
                "C" : [2 ** n for n in range(-5, 10)],
                "penalty" : ["l1", "l2"],
                "max_iter" : [1000],
                "solver" : ["liblinear"],
                "multi_class" : ["auto"],
            },
                    cv=5)
gs_lr.fit(X_ess, y_e)
print(gs_lr.best_params_)

{'C': 256, 'max_iter': 1000, 'multi_class': 'auto', 'penalty': 'l1', 'solver': 'liblinear'}




In [167]:
gs_lr = GridSearchCV(LogisticRegression(),
            {
                "C" : [2 ** n for n in range(-5, 10)],
                "penalty" : ["l1", "l2"],
                "max_iter" : [1000],
                "solver" : ["liblinear"],
                "multi_class" : ["auto"],
            },
                    cv=5)
gs_lr.fit(X_emm, y_e)
print(gs_lr.best_params_)

{'C': 128, 'max_iter': 1000, 'multi_class': 'auto', 'penalty': 'l2', 'solver': 'liblinear'}




In [168]:
lr_mm = LogisticRegression(**gs_lr.best_params_)
lr_mm.fit(X_emm, y_e)

LogisticRegression(C=128, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=1000,
                   multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=None, solver='liblinear', tol=0.0001, verbose=0,
                   warm_start=False)

## Comparaison des trois

In [169]:
from sklearn.metrics import confusion_matrix

##  Matrice de confusion sur les tests

In [170]:
confusion_matrix(y_t, lr_brut.predict(X_t))

array([[14,  2,  0],
       [ 0, 17,  1],
       [ 0,  0, 11]], dtype=int64)

In [171]:
confusion_matrix(y_t, lr_ss.predict(X_tss))

NameError: name 'lr_ss' is not defined

In [None]:
confusion_matrix(y_t, lr_mm.predict(X_tmm))

## Matrice de confusion sur tout

In [None]:
confusion_matrix(wine.target, lr_brut.predict(wine.data))

In [None]:
confusion_matrix(wine.target, lr_ss.predict(ss.transform(wine.data)))

In [None]:
confusion_matrix(wine.target, lr_mm.predict(mm.transform(wine.data)))

### SVM

In [None]:
from sklearn.svm import SVC

In [None]:
gs_svm = GridSearchCV(
    SVC(),
    {
        "kernel" : ["poly", "rbf", "sigmoid"],
        "C" : [2 ** n for n in range(-10, 5)],
        "gamma" : ["auto"]
    },
    cv = 5,
)

gs_svm.fit(X_e, y_e)
print(gs_svm.best_params_)

In [None]:
svm_brut = SVC(**gs_svm.best_params_)
svm_brut

In [None]:
gs_svm = GridSearchCV(
    SVC(),
    {
        "kernel" : ["poly", "rbf", "sigmoid"],
        "C" : [2 ** n for n in range(-10, 5)],
        "gamma" : ["auto"]
    },
    cv = 5,
)

gs_svm.fit(X_ss, y_e)
print(gs_svm.best_params_)
svm_ss = SVC(**gs_svm.best_params_)
svm_ss

In [None]:
gs_svm = GridSearchCV(
    SVC(),
    {
        "kernel" : ["poly", "rbf", "sigmoid"],
        "C" : [2 ** n for n in range(-10, 5)],
        "gamma" : ["auto"]
    },
    cv = 5,
)

gs_svm.fit(X_mm, y_e)
print(gs_svm.best_params_)
svm_mm = SVC(**gs_svm.best_params_)
svm_mm

### Matrice de confusion

In [None]:
confusion_matrix(wine.target, svm_brut.predict(wine.data))

In [None]:
confusion_matrix(wine.target, svm_ss.predict(ss.transform(wine.data)))

In [None]:
confusion_matrix(wine.target, svm_mm.predict(mm.transform(wine.data)))