In [None]:
from IPython.display import *

# Setup Tensorflow
import tensorflow as tf
from tensorflow.keras.layers import *
from tensorflow.keras.activations import *
from tensorflow.keras import Model, Sequential
from tensorflow.keras.optimizers import *

# Setup numpy, pandas
import pandas as pd
import numpy as np

# Setup matplotlib
import matplotlib
import matplotlib.pyplot as plt
# Default figure geometry (in inches) and global matplotlib styling.
DEFAULT_W, DEFAULT_H = (16, 9)
matplotlib.rcParams["figure.figsize"] = [DEFAULT_W, DEFAULT_H]
matplotlib.rcParams["font.size"] = 15
matplotlib.rcParams['figure.dpi'] = 100

# Reproducibility: the train/test/validation split further down is purely
# positional, so it depends on the shuffle performed here. Seeding makes a
# "Restart & Run All" produce the same split (and comparable trainings).
RANDOM_SEED = 42
np.random.seed(RANDOM_SEED)
tf.random.set_seed(RANDOM_SEED)

# Load data
FILE_NAME_RED = "winequality-red.csv"
FILE_NAME_WHITE = "winequality_white.csv"
Y_COL_NAME = "quality"

# Merge both datasets (red wines, white wines), tagging each row with its type.
# NOTE(review): the two files are read with different separators ("," vs ";")
# and follow different naming styles ("-" vs "_") -- confirm this matches the
# files actually on disk.
data_red = pd.read_csv(FILE_NAME_RED, sep=",")
data_red['type'] = 'red'
data_white = pd.read_csv(FILE_NAME_WHITE, sep=";")
data_white['type'] = 'white'

# Shuffle the rows with a fixed seed. reset_index() (without drop=True) keeps
# the pre-shuffle row labels as an extra "index" column in `data`.
data = (
    pd.concat([data_red, data_white])
    .sample(frac=1, random_state=RANDOM_SEED)
    .reset_index()
)


# I. Stats

### A. Distribution et informations statistiques de base

In [None]:
# Basic descriptive statistics (count, mean, standard deviation, quartiles...)
print(data.describe())

# Split the rows by wine type
reds = data[data['type'] == 'red']
whites = data[data['type'] == 'white']

# Number of wines per quality grade, for each wine type
x_red, y_red = np.unique(reds['quality'], return_counts=True)
x_white, y_white = np.unique(whites['quality'], return_counts=True)

# Side-by-side bars: red bars shifted left, white bars shifted right.
# A tinted background keeps the white bars visible.
width = 0.4
fig, ax = plt.subplots(facecolor='#EAEEF5')
ax.set_facecolor('#EAEEF5')
ax.bar(x_red - 0.2, y_red, width, color='red')
ax.bar(x_white + 0.2, y_white, width, color='white')
ax.legend(['Vins rouges', 'Vins blancs'])
ax.set_title('Distribution de la qualité des vins en fonction du type (rouge ou blanc)')
plt.show()

# Boxplot of each feature, grouped by (quality, type)
columns = data.columns.drop(['quality', 'type', 'index'])

for column in columns:
    # Hand an explicit Axes to pandas: with `by=...` and no `ax`, pandas opens
    # its own figure, so a preceding bare plt.figure() is left behind as a
    # blank figure for every column.
    box_fig, box_ax = plt.subplots()
    data.boxplot(column=column, by=['quality', 'type'], grid=False, ax=box_ax)
    box_ax.set_title('Boîte à moustache de la variable "{}" en fonction du type de vin et de la note obtenue'.format(column))
    plt.show()


### B. Pearson Correlation

In [None]:
# TODO
# - distribution
# - (DONE) correlation matrix (Pearson correlation)
# std, mean, box plots, etc.
# distribution per quality grade

# Pearson correlation is only defined for numeric columns: the string column
# "type" must be excluded explicitly (recent pandas raises instead of silently
# dropping it). The "index" column is a leftover of reset_index() and carries
# no information about the wine, so it is excluded as well.
numeric_data = data.select_dtypes(include='number').drop(columns=['index'], errors='ignore')
corr = numeric_data.corr(method='pearson')
# Styler.set_precision() no longer exists in recent pandas; an explicit format
# string renders the same two decimals on every pandas version.
display(corr.style.background_gradient(cmap='coolwarm').format("{:.2f}"))

# Ranking:
# top = abs(corr.loc[Y_COL_NAME]).sort_values(ascending=False)
# Correlation of every feature with the target (the target itself is removed).
top = corr.loc[Y_COL_NAME][corr.index != Y_COL_NAME]
# Rank by absolute value because a correlation close to -1 is as informative as
# one close to +1 (negative correlation). Named `ranked` to avoid shadowing the
# builtin `sorted`.
ranked = abs(top).sort_values(ascending=False)
display(top)

fig, ax = plt.subplots()
ax.bar(x=ranked.index, height=top[ranked.index], width=1, edgecolor='black')
ax.set_title("Classement des correlations des variables par rapport à la variable \"quality\"", fontdict={"size":25})
# Write the signed value next to the tip of each bar
for i, v in enumerate(top[ranked.index].values):
    ax.text(i - 0.25, (v + np.sign(v) * 0.015) - 0.01, f"{round(v, 2):.2f}", color='black')
# plt.axhline(1, linestyle='--')
# plt.axhline(-1, linestyle='--')
plt.ylabel("Pearson correlation", fontweight='light', fontsize='x-large')
plt.xticks(rotation=67.5, horizontalalignment='right', fontweight='light', fontsize='large')
plt.yticks(fontweight='light', fontsize='small')
plt.show()

# II. Data preparation

In [None]:
# Number of output units of the classifier; the integer quality grades are used
# directly as class ids.
# NOTE(review): assumes every grade lies in 0..9 (no wine graded 10) -- confirm.
NB_CLASS = 10

# Feature matrix: everything except the target and the wine type. The "index"
# column created by reset_index() when the data was shuffled is only the row
# position in the original CSV files; it is not a property of the wine and must
# not be fed to the models. errors='ignore' keeps this line valid if that
# column is ever removed upstream.
X = data.drop([Y_COL_NAME, 'type', 'index'], axis=1, errors='ignore')
# Alternative: X = ((X - X.mean()) / X.std()).values  # column-wise standardization, (x - µ) / σ
# or
# Min-max normalization
def min_max_scaling(X):
    """Rescale ``X`` to the [0, 1] range: ``(X - min) / (max - min)``.

    The minimum and maximum come from ``X.min()`` / ``X.max()``: per column
    for a pandas DataFrame, over all elements for a NumPy array.

    A constant column (or constant array) has ``max == min``; dividing by that
    zero range would yield NaN, so the range is replaced by 1 and the constant
    values are mapped to 0.

    Parameters
    ----------
    X : pandas.DataFrame, pandas.Series or numpy.ndarray
        Numeric data to rescale.

    Returns
    -------
    Same kind of object as ``X``, holding the rescaled values.
    """
    low = X.min()
    span = X.max() - low
    if isinstance(span, (pd.Series, pd.DataFrame)):
        # One range per column: neutralise only the zero ranges
        span = span.mask(span == 0, 1)
    elif span == 0:
        span = 1
    return (X - low) / span

# Center the values around the mean and scale them to unit standard deviation
def norm_scaling(X):
    """Standardize ``X``: ``(X - mean) / std``.

    Mean and standard deviation come from ``X.mean()`` / ``X.std()`` with the
    defaults of the object passed in (per column for a pandas DataFrame, over
    all elements for a NumPy array).

    A constant column (or constant array) has a zero standard deviation;
    dividing by it would yield NaN, so it is replaced by 1 and the constant
    values are mapped to 0.

    Parameters
    ----------
    X : pandas.DataFrame, pandas.Series or numpy.ndarray
        Numeric data to standardize.

    Returns
    -------
    Same kind of object as ``X``, holding the standardized values.
    """
    spread = X.std()
    if isinstance(spread, (pd.Series, pd.DataFrame)):
        # One deviation per column: neutralise only the zero deviations
        spread = spread.mask(spread == 0, 1)
    elif spread == 0:
        spread = 1
    return (X - X.mean()) / spread

# Rescale every feature column to [0, 1].
# NOTE(review): min/max are computed on the whole dataset before the split, so
# test/validation rows influence the scaling of the training features --
# consider computing them on the training rows only.
X = min_max_scaling(X)

Y = data.loc[:, Y_COL_NAME].values
# Column vector of grades. Y is scaled to [0, 1] only when doing regression;
# the integer grades are kept as-is for the classification model.
Y = Y.reshape(-1, 1)

NB_INPUT = X.shape[1]  # number of input variables of the models
NB_DATA = X.shape[0]   # number of samples
# X, Y, X.shape, Y.shape

# Split train/test/validation (positional split; rows were shuffled at load time)
pTrain, pTest, pValidation = (0.80, 0.10, 0.10)
# Tolerant comparison: a sum of binary floats is not guaranteed to be exactly 1.0
assert np.isclose(pTrain + pTest + pValidation, 1.0), f"La somme doit etre equel à 1, {pTrain, pTest, pValidation}"
SPLIT_TRAIN = int(NB_DATA * pTrain)
SPLIT_TEST = int(NB_DATA * pTest)
# The validation set takes every remaining row. Truncating a third int() would
# silently drop the rows lost to rounding, and a negative slice `X[-0:]` would
# return the whole dataset if the computed size were 0.
SPLIT_VAL = NB_DATA - SPLIT_TRAIN - SPLIT_TEST
VAL_START = SPLIT_TRAIN + SPLIT_TEST
print(SPLIT_TRAIN, SPLIT_TEST, SPLIT_VAL)
X_train, X_test, X_val = X[:SPLIT_TRAIN], X[SPLIT_TRAIN:VAL_START], X[VAL_START:]
Y_train, Y_test, Y_val = Y[:SPLIT_TRAIN], Y[SPLIT_TRAIN:VAL_START], Y[VAL_START:]
print(X_train.shape, X_test.shape, X_val.shape)
print(Y_train.shape, Y_test.shape, Y_val.shape)

# III. Modèles & Entraînements

In [None]:
from tensorflow.keras.activations import *

def classification_model(DROP_RATE = 0.1, RELU_ALPHA = 0.2, N_NEURONES = 32, N_LAYERS = 1):
    """Build (without compiling) the dense classifier.

    Architecture: input of width ``NB_INPUT`` -> LayerNormalization ->
    ``N_LAYERS`` blocks of [Dense -> BatchNormalization -> LeakyReLU -> Dropout]
    -> Dense of width ``NB_CLASS`` with a softmax activation.

    Parameters
    ----------
    DROP_RATE : float
        Rate given to each Dropout layer.
    RELU_ALPHA : float
        First positional argument given to each LeakyReLU layer.
    N_NEURONES : int
        Number of units of each hidden Dense layer.
    N_LAYERS : int
        Number of hidden blocks.

    Returns
    -------
    A Keras ``Model`` named "classification_model".
    """
    features = Input(shape=(NB_INPUT,), name="input")
    hidden = LayerNormalization()(features)

    for _block in range(N_LAYERS):
        # One hidden block, applied step by step
        hidden = Dense(N_NEURONES)(hidden)
        hidden = BatchNormalization()(hidden)
        hidden = LeakyReLU(RELU_ALPHA)(hidden)
        hidden = Dropout(DROP_RATE)(hidden)

    prediction = Dense(NB_CLASS, activation=Softmax(), name="prediction")(hidden)

    return Model(features, prediction, name="classification_model")

# Training hyperparameters
epochs = 50
lr = 1e-2
batch_size = 32

classModel = classification_model(DROP_RATE = 0.2, RELU_ALPHA = 0.2, N_NEURONES = 32, N_LAYERS = 3)
# Sparse loss: the targets are integer class ids (the grades), not one-hot vectors
classModel.compile(loss="sparse_categorical_crossentropy", metrics=["accuracy"], optimizer=Adam(lr))
classModel.summary()

# validation_data is passed as a tuple (x_val, y_val), the form documented by
# Keras; a list is not unpacked as (x, y) by every TF 2.x release.
# The test split is used for monitoring here; the validation split is kept for
# the final scores.
hclass = classModel.fit(X_train, Y_train, validation_data=(X_test, Y_test), batch_size=batch_size, epochs=epochs)

# Alternative: exponentially decaying learning-rate schedule
# lr_schedule = tf.keras.callbacks.LearningRateScheduler(lambda epoch: 1e-3 / 10**(epoch/50) ) #
# hclass = classModel.fit(X_train, Y_train, validation_data=(X_test, Y_test), batch_size=32, epochs=200, callbacks=[lr_schedule])

In [None]:
from sklearn.linear_model import LogisticRegression, RidgeClassifier

def logistic_regression_model(random_state=0,
                              multi_class='ovr',
                              solver='liblinear',
                              max_iter=100):
    """Create an unfitted scikit-learn ``LogisticRegression``.

    Every argument is forwarded unchanged to the ``LogisticRegression``
    constructor under the same keyword name.

    Returns
    -------
    The new, unfitted estimator.
    """
    settings = {
        "random_state": random_state,
        "multi_class": multi_class,
        "solver": solver,
        "max_iter": max_iter,
    }
    return LogisticRegression(**settings)

def ridge_classification_model():
    """Create an unfitted scikit-learn ``RidgeClassifier`` with default settings."""
    classifier = RidgeClassifier()
    return classifier


# scikit-learn estimators are fitted on a flat label vector rather than the
# (n, 1) column used for Keras.
train_labels = Y_train.reshape(-1)

# Baseline 1: multinomial logistic regression (lbfgs solver, up to 1000 iterations)
logRegModel = logistic_regression_model(
    random_state=0,
    multi_class='multinomial',
    solver='lbfgs',
    max_iter=1000,
)
logRegModel.fit(X_train, train_labels)
test_log_proba = logRegModel.predict_log_proba(X_test)
print(test_log_proba)
val_score = logRegModel.score(X_val, Y_val)
print(val_score)

# Baseline 2: ridge classifier; its validation score is the cell's output
ridgeClf = ridge_classification_model()
ridgeClf.fit(X_train, train_labels)
ridgeClf.score(X_val, Y_val)

In [None]:
# One figure per tracked metric, comparing the training curve with the curve
# measured on the data passed as validation_data during fit (the test split).
for metric, train_label, test_label in (
    ("loss", "train loss", "test loss"),
    ("accuracy", "train accuracy", "test accuracy"),
):
    plt.figure(figsize=(16,9))
    plt.plot(hclass.history[metric], label=train_label)
    plt.plot(hclass.history["val_" + metric], label=test_label)
    plt.legend()
    plt.show()

In [None]:
# Overfitting indicator: ratio of the test loss to the training loss per epoch.
# The further the ratio rises above 1, the more the model is overfitting.
train_loss = np.array(hclass.history["loss"])
test_loss = np.array(hclass.history["val_loss"])
loss_ratio = test_loss / train_loss
plt.plot(loss_ratio, label="ratio test/train loss")
plt.axhline(y=1, color='b', linestyle='--', label="Equilibre")
plt.legend()

In [None]:
from sklearn.metrics import confusion_matrix
import seaborn as sn
# Confusion matrix on the TRAINING set (rows: true grade, columns: predicted grade)
Y_pred = np.argmax(classModel.predict(X_train), axis=1)
Y_true = Y_train.reshape(-1,)
# One row/column per output class of the model, including grades with no sample
cm = confusion_matrix(Y_true, Y_pred, labels=np.arange(NB_CLASS))
plt.figure(figsize = (10,7))
sn.heatmap(cm, annot=True, annot_kws={"size": 14}, fmt='g')
plt.show()

# Row-normalised version: share of each true grade assigned to each prediction.
# Grades that never occur have an all-zero row; dividing only where the row
# total is non-zero keeps those rows at 0 instead of producing NaN and a
# divide-by-zero RuntimeWarning.
row_totals = cm.sum(axis=1, keepdims=True)
cm = np.divide(cm.astype('float'), row_totals, out=np.zeros(cm.shape, dtype=float), where=row_totals != 0)
plt.figure(figsize = (10,7))
sn.heatmap(cm, annot=True, annot_kws={"size": 14}, fmt='.2f')
plt.show()

In [None]:
# Confusion matrix on the TEST set (rows: true grade, columns: predicted grade)
Y_pred = np.argmax(classModel.predict(X_test), axis=1)
Y_true = Y_test.reshape(-1,)
# One row/column per output class of the model, including grades with no sample
cm = confusion_matrix(Y_true, Y_pred, labels=np.arange(NB_CLASS))
plt.figure(figsize = (10,7))
sn.heatmap(cm, annot=True, annot_kws={"size": 14}, fmt='g')
plt.show()

# Row-normalised version: share of each true grade assigned to each prediction.
# Grades that never occur have an all-zero row; dividing only where the row
# total is non-zero keeps those rows at 0 instead of producing NaN and a
# divide-by-zero RuntimeWarning.
row_totals = cm.sum(axis=1, keepdims=True)
cm = np.divide(cm.astype('float'), row_totals, out=np.zeros(cm.shape, dtype=float), where=row_totals != 0)
plt.figure(figsize = (10,7))
sn.heatmap(cm, annot=True, annot_kws={"size": 14}, fmt='.2f')
plt.show()

# IV. Evaluations

In [None]:
# TODO cross-validation to test different hyperparameters
# plot the results + their std per epoch

# TODO test the classification accuracy while also taking the model's second choice into account
# it is also possible to check whether the top two choices are "close" (i.e. whether a prediction of 5 as first choice is followed by a 4 or a 6 as second choice; doable with mean(abs(choice1 - choice2)))

# V. Conclusion

In [None]:
# TODO write the conclusion
# TODO highlight the limits of the dataset: only 3 raters, missing inputs such as the price, quality labels, etc., which the model would need to be relevant for wine classification in a real sales setting