In [None]:
from IPython.display import *

# Setup Tensorflow
import tensorflow as tf
from tensorflow.keras.layers import *
from tensorflow.keras.activations import *
from tensorflow.keras import Model, Sequential
from tensorflow.keras.optimizers import *

# Setup numpy, pandas
import pandas as pd
import numpy as np

# Setup matplotlib
import matplotlib
import matplotlib.pyplot as plt
# Default figure width and height shared by every plot of the notebook.
DEFAULT_W, DEFAULT_H = (16, 9)
# Apply all global matplotlib settings in one call.
matplotlib.rcParams.update({
    "figure.figsize": [DEFAULT_W, DEFAULT_H],
    "font.size": 15,
    "figure.dpi": 100,
})

# Load data
FILE_NAME_RED = "winequality-red.csv"
FILE_NAME_WHITE = "winequality_white.csv"
Y_COL_NAME = "quality"
# Fixed seed so that the shuffle below yields the same row order on every run;
# without it the seeded train/test split further down is not reproducible.
SHUFFLE_SEED = 42

# Merge both datasets (red wines, white wines), tagging each row with its origin.
# NOTE(review): the two files are read with different separators ("," vs ";") and
# their names use different styles ("-" vs "_") -- confirm this matches the files on disk.
data_red = pd.read_csv(FILE_NAME_RED, sep=",")
data_red['type']='red'
data_white = pd.read_csv(FILE_NAME_WHITE, sep=";")
data_white['type']='white'
# Shuffle the rows. reset_index() keeps the pre-shuffle row labels in an "index"
# column, which later cells drop explicitly.
data = pd.concat([data_red, data_white]).sample(frac=1, random_state=SHUFFLE_SEED).reset_index()


# I. Stats

### A. Distribution et informations statistiques de base

In [None]:
# Basic descriptive statistics of the numeric columns (rich display instead of print)
display(data.describe())
reds = data[data['type']=='red']
whites = data[data['type']=='white']

# Number of wines per quality grade, for each wine type
x_red, y_red = np.unique(reds['quality'], return_counts=True)
x_white, y_white = np.unique(whites['quality'], return_counts=True)

width=0.4
fig, ax = plt.subplots(facecolor='#EAEEF5')
ax.set_facecolor('#EAEEF5')
# Shift each series by half a bar width so both types sit side by side for a grade
ax.bar(x_red-0.2, y_red, width, color='red', label='Vins rouges')
ax.bar(x_white+0.2, y_white, width, color='white', label='Vins blancs')
ax.legend()
ax.set_title('Distribution de la qualité des vins en fonction du type (rouge ou blanc)')
plt.show()

# Boxplot for each variable grouped by the quality values, removing qualitative variable 'type' and the dataframe index
columns = data.columns.drop(['quality', 'type', 'index'])

for column in columns:
    # Hand an explicit Axes to pandas: with `by=...` it otherwise opens its own
    # figure, which left the figure created here empty.
    fig, box_ax = plt.subplots()
    data.boxplot(column=column, by=['quality', 'type'], grid=False, ax=box_ax)
    box_ax.set_title('Boîte à moustache de la variable "{}" en fonction du type de vin et de la note obtenue'.format(column))
    plt.show()


### B. Pearson Correlation

In [None]:
# TODO
# - distribution
# - (DONE) correlation matrix (Pearson correlation)
# - std, mean, box plots, etc.
# - distribution per grade

# Correlate numeric features only: 'type' holds strings, and 'index' is merely the
# pre-shuffle row label left by reset_index(), so neither belongs in the matrix.
numeric_data = data.select_dtypes(include='number').drop(columns='index', errors='ignore')
corr = numeric_data.corr(method='pearson')
display(corr.style.background_gradient(cmap='coolwarm').format("{:.2f}"))
# Correlation of every feature with the target (the target itself is excluded)
top = corr.loc[Y_COL_NAME][corr.index != Y_COL_NAME]
# Rank on the absolute value: a correlation close to -1 is as strong as one close to +1
ranked = top.abs().sort_values(ascending=False)
display(top)
fig, ax = plt.subplots()
ax.bar(x=ranked.index, height=top[ranked.index], width=1, edgecolor='black')
ax.set_title("Classement des correlations des variables par rapport à la variable \"quality\"", fontdict={"size":25})
# Write the signed value next to the tip of each bar
for i, v in enumerate(top[ranked.index].values):
    ax.text(i - 0.25, (v + np.sign(v) * 0.015) - 0.01, f"{round(v, 2):.2f}", color='black')
plt.ylabel("Pearson correlation", fontweight='light', fontsize='x-large')
plt.xticks(rotation=67.5, horizontalalignment='right', fontweight='light', fontsize='large')
plt.yticks(fontweight='light', fontsize='small')
plt.show()

# II. Préparation des données

In [None]:
from sklearn.model_selection import train_test_split


# Build the feature matrix. The qualitative 'type' column is left out for now.
# 'index' is only the pre-shuffle row label added by reset_index(): it is not a
# property of the wine, so it must not be fed to the models either.
X = data.drop([Y_COL_NAME, 'type', 'index'], axis=1, errors='ignore')
# Alternative to min-max scaling: column-wise standardization, (x - µ) / σ
# X = ((X - X.mean()) / X.std()).values
# Min-max normalization
def min_max_scaling(X):
    """Rescale ``X`` linearly so that its minimum maps to 0 and its maximum to 1.

    ``X.min()`` and ``X.max()`` are used as-is, so the reduction axis is whatever
    the type of ``X`` defines (per column for a pandas DataFrame).

    A constant column has a zero range; its divisor is replaced by 1 so that the
    column becomes all zeros instead of NaN (0 / 0).

    Args:
        X: object supporting ``min()``, ``max()`` and element-wise arithmetic
            (pandas DataFrame/Series or NumPy array).

    Returns:
        The rescaled data, same container type as ``X``.
    """
    low = X.min()
    span = X.max() - low
    # Guard against a division by zero on constant inputs.
    safe_span = np.where(span == 0, 1, span)
    return (X - low) / safe_span

# Centering on the mean and scaling by the standard deviation
def norm_scaling(X):
    """Standardize ``X``: subtract its mean and divide by its standard deviation.

    ``X.mean()`` and ``X.std()`` are used as-is, so the reduction axis (and the
    degrees of freedom of the deviation) are whatever the type of ``X`` defines.

    A constant column has a zero deviation; its divisor is replaced by 1 so that
    the column becomes all zeros instead of NaN (0 / 0).

    Args:
        X: object supporting ``mean()``, ``std()`` and element-wise arithmetic
            (pandas DataFrame/Series or NumPy array).

    Returns:
        The standardized data, same container type as ``X``.
    """
    center = X.mean()
    spread = X.std()
    # Guard against a division by zero on constant inputs.
    safe_spread = np.where(spread == 0, 1, spread)
    return (X - center) / safe_spread

Y = data.loc[:, Y_COL_NAME].values
NB_CLASS = len(np.unique(Y))
Y = Y.reshape(-1, 1) # Y is rescaled to [0, 1] only when doing regression (grades stay integers for the classification models)

NB_INPUT = X.shape[1] # number of input variables of the models
NB_DATA = X.shape[0]

# Split train/test BEFORE scaling, so that no statistic of the test rows leaks
# into the training data.
X_train_raw, X_test_raw, Y, Y_test = train_test_split(X, Y, test_size=0.2, random_state=42)

# Min-max scaling fitted on the training rows only, then applied unchanged to the
# test rows (test values may therefore fall slightly outside [0, 1]).
train_min = X_train_raw.min()
train_span = (X_train_raw.max() - train_min).replace(0, 1) # constant column: avoid 0 / 0
X = ((X_train_raw - train_min) / train_span).values
X_test = ((X_test_raw - train_min) / train_span).values

# Shift the grades by -3 to get the class ids used for training; the confusion
# matrix cells add the 3 back. Assumes the lowest grade in the data is 3 -- TODO confirm.
Y, Y_test = Y - 3, Y_test - 3

# III. Modèles & Entraînements

### 1. Définition du réseau de neurones

In [None]:
from tensorflow.keras.activations import *

def classification_model(DROP_RATE = 0.1, RELU_ALPHA = 0.2, N_NEURONES = 32, N_LAYERS = 1):
    """Build (without compiling) the dense softmax classifier.

    Architecture: input -> LayerNormalization -> N_LAYERS x
    [Dense -> BatchNormalization -> LeakyReLU -> Dropout] -> Dense(softmax).

    Reads the module-level ``NB_INPUT`` (input width) and ``NB_CLASS`` (number of
    output units), which must be defined before this function is called.

    Args:
        DROP_RATE: rate given to every Dropout layer.
        RELU_ALPHA: value given to every LeakyReLU layer.
        N_NEURONES: number of units of each hidden Dense layer.
        N_LAYERS: number of hidden blocks.

    Returns:
        An uncompiled Keras ``Model`` named "classification_model".
    """
    # `inputs` rather than `input`, to avoid shadowing the Python builtin.
    inputs = Input(shape=(NB_INPUT,), name="input")
    x = LayerNormalization()(inputs)

    for _ in range(N_LAYERS):
        x = Dense(N_NEURONES)(x)
        x = BatchNormalization()(x)
        x = LeakyReLU(RELU_ALPHA)(x)
        x = Dropout(DROP_RATE)(x)

    # Use the activation identifier instead of a Softmax layer instance.
    output = Dense(NB_CLASS, activation="softmax", name="prediction")(x)

    return Model(inputs, output, name="classification_model")

# NOTE: Entrainement fait dans la CV plus bas
# epochs = 50
# lr = 1e-2
# batch_size = 32

# classModel = classification_model(DROP_RATE = 0.2, RELU_ALPHA = 0.2, N_NEURONES = 32, N_LAYERS = 3)
# classModel.compile(loss="sparse_categorical_crossentropy", metrics=["accuracy"], optimizer=Adam(lr))
# classModel.summary()
#
# hclass = classModel.fit(X, Y, validation_data=[X_test, Y_test], batch_size=batch_size, epochs=epochs)

# lr_schedule = tf.keras.callbacks.LearningRateScheduler(lambda epoch: 1e-3 / 10**(epoch/50) ) # 
# hclass = classModel.fit(X_train, Y_train, validation_data=[X_test, Y_test], batch_size=32, epochs=200, callbacks=[lr_schedule])

### 2. Définition des modèles de régression/classification et de la fonction d'entraînement par validation croisée K-Fold

In [None]:
from sklearn.linear_model import LogisticRegression, RidgeClassifier, Lasso
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
from sklearn.svm import SVC
from sklearn.model_selection import KFold, cross_val_score, StratifiedKFold
from sklearn.metrics import confusion_matrix
import seaborn as sn

def logistic_regression_model(random_state=0, multi_class='ovr', solver='liblinear', max_iter=100):
    """Return an unfitted ``LogisticRegression`` configured with the given settings."""
    settings = dict(
        random_state=random_state,
        multi_class=multi_class,
        solver=solver,
        max_iter=max_iter,
    )
    return LogisticRegression(**settings)

def lasso_regression_model(random_state=0):
    """Return an unfitted ``Lasso`` estimator seeded with ``random_state``."""
    estimator = Lasso(random_state=random_state)
    return estimator

def ridge_classification_model():
    """Return an unfitted ``RidgeClassifier`` built with its default settings."""
    classifier = RidgeClassifier()
    return classifier

def QDA_model():
    """Return an unfitted ``QuadraticDiscriminantAnalysis`` built with its default settings."""
    classifier = QuadraticDiscriminantAnalysis()
    return classifier

def SVM_model(kernel='rbf', C=1, random_state=0, probability=False):
    """Return an unfitted ``SVC`` configured with the given settings."""
    settings = dict(
        kernel=kernel,
        C=C,
        random_state=random_state,
        probability=probability,
    )
    return SVC(**settings)

def apply_model(model, X_train, X_test, Y_train, Y_test):
    """Fit ``model`` on the training data, print its test score and return it.

    Args:
        model: object exposing ``fit(X, y)`` and ``score(X, y)``.
        X_train, Y_train: data passed to ``model.fit``.
        X_test, Y_test: data passed to ``model.score``.

    Returns:
        The same ``model`` object, after fitting.
    """
    model.fit(X_train, Y_train)
    test_score = model.score(X_test, Y_test)
    print('Précision du modèle : %0.2f' % test_score)
    return model
    
def apply_model_cv(model, X_train, Y_train, cv):
    """Cross-validate ``model`` on accuracy, print a summary and return the scores.

    Args:
        model: estimator handed to ``cross_val_score``.
        X_train, Y_train: data handed to ``cross_val_score``.
        cv: cross-validation splitter (or any value accepted by ``cross_val_score``).

    Returns:
        The array of per-fold accuracy scores, so that callers can compare
        models programmatically instead of reading the printed text.
    """
    scores = cross_val_score(model, X_train, Y_train, cv=cv, scoring='accuracy')
    # Printed text is unchanged: per-fold accuracy, mean and standard deviation, in percent.
    print(
        f"\n\taccuracy: {np.round(scores*100, 2)}\n"
        f"\tmean: {np.mean(scores*100):.2f}%\n"
        f"\tecart-type: {np.std(scores*100):.2f}%\n\t"
    )
    return scores
        
         
# ######
# import sklearn
# sklearn.metrics.SCORERS.keys()

# Shared 5-fold splitter (shuffled, fixed seed), passed as `cv` to apply_model_cv in the cells below.
cv = KFold(n_splits=5, shuffle=True, random_state=1)


### 3. Application des modèles et résultats

#### a. Modèle de régression logistique multinomiale

In [None]:
print('\n*** Test du modèle de régression logistique multinomiale avec CV ***\n')
logRegModel = logistic_regression_model(
    random_state=0, multi_class='multinomial', solver='lbfgs', max_iter=2000
)
# Cross-validated accuracy on the training split
apply_model_cv(logRegModel, X, Y.ravel(), cv)
# Final fit on the whole training split
logRegModel.fit(X, Y.ravel())
# Class probabilities, one column per class
Y_pred = logRegModel.predict_proba(X_test)
# Mean sparse categorical cross-entropy, to compare this model's error with the neural network's
cce = np.mean(tf.keras.metrics.sparse_categorical_crossentropy(Y_test, Y_pred))
print(f"Categorical Crossentropy Error: {cce}")

#### b. Modèle de classification Ridge

In [None]:

print('\n*** Test du modèle ridge avec CV ***\n')
ridgeClf = ridge_classification_model()
apply_model_cv(ridgeClf, X, Y.reshape(-1,), cv) # cross-validated score
ridgeClf.fit(X, Y.reshape(-1,)) # final fit on the whole training split
Y_pred = ridgeClf.predict(X_test)

# Confusion matrix on the original grade scale (targets were shifted by -3 for training)
grades = np.arange(3, 10)
cm = confusion_matrix(Y_test.reshape(-1,) + 3, Y_pred + 3, labels=grades)
plt.figure(figsize = (10,7))
sn.heatmap(cm, annot=True, annot_kws={"size": 14}, fmt='g', xticklabels=grades, yticklabels=grades)
plt.xlabel("Note prédite")
plt.ylabel("Note réelle")
plt.show()

# Row-normalize. A grade absent from the test split has an all-zero row: leave it
# at 0 instead of computing 0 / 0 (NaN plus a RuntimeWarning).
row_totals = cm.sum(axis=1, keepdims=True)
cm = np.divide(cm, row_totals, out=np.zeros(cm.shape, dtype=float), where=row_totals != 0)
plt.figure(figsize = (10,7))
sn.heatmap(cm, annot=True, annot_kws={"size": 14}, fmt='.2f', xticklabels=grades, yticklabels=grades)
plt.xlabel("Note prédite")
plt.ylabel("Note réelle")
plt.show()

In [None]:
# print('\n*** Test du modèle de régression logistique multinomiale avec CV ***\n')
# # lassoRegModel = lasso_regression_model()
# lassoRegModel = Lasso(alpha=0.6, selection="random", positive = True)
# lassoRegModel.fit(X, Y.reshape(-1,)) # train sur toute la base
# # lassoRegModel.fit([[0,0], [1, 1], [2, 2]], [0, 1, 2]) # train sur toute la base
# apply_model_cv(lassoRegModel, X, Y, cv) # score cv
# Y_pred = lassoRegModel.predict(X_test)
# # Y_pred = lassoRegModel.predict_proba(X_test) # shape => (-1, NB_CLASS)
# # cce = np.mean(tf.keras.metrics.sparse_categorical_crossentropy(Y_test, Y_pred)) # Permet de comparer l'erreur du model à celui du deep
# # print(f"""Categorical Crossentropy Error: {cce}""")

#### c. Modèle de classification SVM

In [None]:
# The header previously announced the logistic regression (copy-paste leftover).
print('\n*** Test du modèle SVM avec CV ***\n')
svmModel = SVM_model(probability=True)
apply_model_cv(svmModel, X, Y.reshape(-1,), cv) # cross-validated score
svmModel.fit(X, Y.reshape(-1,)) # final fit on the whole training split
# Probabilities must come from the SVM fitted above; the cell used to query
# logRegModel, so it reported the logistic regression's error a second time.
Y_pred = svmModel.predict_proba(X_test) # one column per class
cce = np.mean(tf.keras.metrics.sparse_categorical_crossentropy(Y_test, Y_pred)) # same loss as the neural network, for comparison
print(f"""Categorical Crossentropy Error: {cce}""")

#### d. Réseau de neurones

In [None]:
epochs = 10
lr = 1e-2
batch_size = 32

# Stratified folds for the neural network. A dedicated name is used so that the
# KFold splitter `cv` shared by the scikit-learn cells is not overwritten.
cv_nn = StratifiedKFold(n_splits=5, shuffle=True, random_state=1)

# One Keras History object per fold
hclass = []
for train_i, val_i in cv_nn.split(X, Y.reshape(-1,)):
    x_train, y_train = X[train_i], Y[train_i]
    x_val, y_val = X[val_i], Y[val_i]

    # A fresh model per fold, so that no weights carry over between folds
    classModel = classification_model(DROP_RATE = 0.2, RELU_ALPHA = 0.2, N_NEURONES = 32, N_LAYERS = 3)
    classModel.compile(loss="sparse_categorical_crossentropy", metrics=["accuracy"], optimizer=Adam(lr))
    # validation_data is passed as the documented (x, y) tuple rather than a list
    hclass.append(classModel.fit(x_train, y_train, validation_data=(x_val, y_val), batch_size=batch_size, epochs=epochs))


In [None]:
def get_metrics(hist, name="loss"):
    """Collect the values stored under ``name`` in each element's ``history`` mapping.

    Args:
        hist: iterable of objects exposing a ``history`` mapping.
        name: key to read in every ``history`` mapping.

    Returns:
        A NumPy array with one row per element of ``hist``.
    """
    rows = []
    for record in hist:
        rows.append(record.history[name])
    return np.array(rows)
loss = get_metrics(hclass, "loss")
val_loss = get_metrics(hclass, "val_loss")
# One curve per metric: mean over the folds, error bars = standard deviation over the folds
for curve_label, curve in (("loss", loss), ("val_loss", val_loss)):
    plt.errorbar(
        x=range(curve.shape[1]),
        y=curve.mean(axis=0),
        yerr=curve.std(axis=0),
        label=curve_label,
        fmt='o-',
    )
plt.title("loss vs val_loss, CV: 5 folds")
plt.legend()
plt.show()

In [None]:

print('\n*** Test de l\'analyse discriminante quadratique ***\n')
qdaClf = QDA_model()
# Y and Y_test are column vectors; flatten them as the other scikit-learn cells do.
apply_model(qdaClf, X, X_test, Y.reshape(-1,), Y_test.reshape(-1,))

In [None]:
# plt.plot(np.array(hclass.history["val_loss"]) / np.array(hclass.history["loss"]), label="ratio test/train loss")
# plt.axhline(y=1, color='b', linestyle='--', label="Equilibre")
# plt.legend()
# # Pour evaluer le surapprentissage (plus on s'eloigne de 1 en positif plus on est en surapprentisssage)

In [None]:
# Confusion matrix of the last cross-validation model on the training split
Y_pred = np.argmax(classModel.predict(X), axis=1)
Y_true = Y.reshape(-1,)
# The model has NB_CLASS outputs, so class ids are 0..NB_CLASS-1 (grades shifted by -3).
# Asking for labels 0..9 added empty classes whose all-zero rows became NaN below.
class_ids = np.arange(NB_CLASS)
grades = class_ids + 3
cm = confusion_matrix(Y_true, Y_pred, labels=class_ids)
plt.figure(figsize = (10,7))
sn.heatmap(cm, annot=True, annot_kws={"size": 14}, fmt='g', xticklabels=grades, yticklabels=grades)
plt.xlabel("Note prédite")
plt.ylabel("Note réelle")
plt.show()

# Row-normalize; rows without any sample stay at 0 instead of 0 / 0 = NaN.
row_totals = cm.sum(axis=1, keepdims=True)
cm = np.divide(cm, row_totals, out=np.zeros(cm.shape, dtype=float), where=row_totals != 0)
plt.figure(figsize = (10,7))
sn.heatmap(cm, annot=True, annot_kws={"size": 14}, fmt='.2f', xticklabels=grades, yticklabels=grades)
plt.xlabel("Note prédite")
plt.ylabel("Note réelle")
plt.show()

In [None]:
# Confusion matrix of the last cross-validation model on the held-out test split
Y_pred = np.argmax(classModel.predict(X_test), axis=1)
Y_true = Y_test.reshape(-1,)
# The model has NB_CLASS outputs, so class ids are 0..NB_CLASS-1 (grades shifted by -3).
# Asking for labels 0..9 added empty classes whose all-zero rows became NaN below.
class_ids = np.arange(NB_CLASS)
grades = class_ids + 3
cm = confusion_matrix(Y_true, Y_pred, labels=class_ids)
plt.figure(figsize = (10,7))
sn.heatmap(cm, annot=True, annot_kws={"size": 14}, fmt='g', xticklabels=grades, yticklabels=grades)
plt.xlabel("Note prédite")
plt.ylabel("Note réelle")
plt.show()

# Row-normalize; rows without any sample stay at 0 instead of 0 / 0 = NaN.
row_totals = cm.sum(axis=1, keepdims=True)
cm = np.divide(cm, row_totals, out=np.zeros(cm.shape, dtype=float), where=row_totals != 0)
plt.figure(figsize = (10,7))
sn.heatmap(cm, annot=True, annot_kws={"size": 14}, fmt='.2f', xticklabels=grades, yticklabels=grades)
plt.xlabel("Note prédite")
plt.ylabel("Note réelle")
plt.show()

# IV. Évaluations

In [None]:
# TODO Cross Val pour test different meta parametre
# plot les resultats + leur std par epochs

# TODO tester l'accuracy de la classif en faisant apparaitre aussi le second choix du model
# possible aussi de regarder si les deux premiers choix sont "proches" (c.-à-d. si une prédiction à 5 en choix 1 est suivie par un 4 ou un 6 en choix 2 ; faisable en faisant mean(abs(choix1 - choix2)))

# V. Conclusion

In [None]:
# TODO Conclure
# TODO souligner les limites du dataset : seulement 3 votants, manque d'éléments en entrée comme le prix, les labels de qualité, etc., pour que le modèle soit pertinent dans le cadre d'une classification de vins en situation de vente réelle