# Projet Machine Learning – Wine Quality (Vin Rouge)

Auteur : **Brahim Semlali**

Ce notebook implémente **tout le cahier des charges** sur le dataset Wine Quality (vin rouge) :

- Chargement & exploration des données (EDA)
- Pré‑traitement & préparation des features
- Réduction de dimension : **PCA**, **t‑SNE**, **NMF**
- Clustering : **K‑Means**, **Agglomerative Clustering**, **DBSCAN**
- Classification : **Logistic Regression, KNN, Decision Tree, SVM, Random Forest, Gradient Boosting**
- Suivi des expériences avec **MLflow**

> Tous les algorithmes sont implémentés **dans ce seul notebook `.ipynb`**, comme demandé.

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, MinMaxScaler

from sklearn.decomposition import PCA, NMF
from sklearn.manifold import TSNE

from sklearn.cluster import KMeans, AgglomerativeClustering, DBSCAN
from sklearn.metrics import silhouette_score

from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.metrics import accuracy_score, f1_score, classification_report, confusion_matrix

import mlflow
import mlflow.sklearn

# Make "wine-quality-project" the active MLflow experiment for the runs started in this notebook.
mlflow.set_experiment("wine-quality-project")

## 1. Chargement du dataset & EDA

In [None]:
# Load the dataset from the dataset/ folder (path is relative to the working directory).
# NOTE(review): some distributions of winequality-red.csv are semicolon-separated;
# confirm this copy is comma-separated, otherwise `sep=";"` is needed.
df = pd.read_csv("dataset/winequality-red.csv")
# Last expression of the cell: shows the first rows as the cell output.
df.head()

In [None]:
# Print the column names, dtypes and non-null counts to spot missing values.
df.info()

In [None]:
# Summary statistics of the numeric columns, shown as the cell output.
df.describe()

In [None]:
# One histogram per column. DataFrame.hist opens its own figure sized by
# `figsize`, so no plt.figure(...) call is made beforehand: it would only
# leave an additional, empty figure in the cell output.
df.hist(bins=20, figsize=(14, 10))
plt.tight_layout()
plt.show()

In [None]:
# Pairwise correlations between the numeric columns.
corr = df.corr(numeric_only=True)

# Draw the matrix as a heatmap on an explicitly created figure/axes pair.
fig, ax = plt.subplots(figsize=(10, 8))
sns.heatmap(corr, annot=False, cmap="coolwarm", linewidths=0.5, ax=ax)
ax.set_title("Matrice de corrélation")
fig.tight_layout()
plt.show()

## 2. Pré‑traitement & préparation des données

In [None]:
# Binary target: 1 when quality >= 6, otherwise 0.
df["quality_binary"] = (df["quality"] >= 6).astype(int)

# Features are every column except the raw score and the derived binary target.
X = df.drop(columns=["quality", "quality_binary"])
y = df["quality_binary"]

# Standardised copy of the whole dataset, kept for the later cells that read
# `X_scaled` directly.
X_scaled = StandardScaler().fit_transform(X)

# Supervised split: split the raw features FIRST, then fit the scaler on the
# training rows only, so that no test-set statistic (mean / std) leaks into
# what the classifiers are trained on. The split parameters are unchanged,
# so the same rows land in train and test as before.
X_train_raw, X_test_raw, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train_raw)
X_test = scaler.transform(X_test_raw)

## 3. Réduction de dimension : PCA, t‑SNE, NMF

In [None]:
# Project the standardised features onto two principal components.
pca = PCA(n_components=2)
X_pca = pca.fit_transform(X_scaled)

# Scatter of the projection, coloured by the raw quality score.
fig, ax = plt.subplots(figsize=(6, 5))
scatter = ax.scatter(X_pca[:, 0], X_pca[:, 1], c=df["quality"], cmap="viridis", s=10)
fig.colorbar(scatter, ax=ax, label="quality")
ax.set_title("PCA (2 composantes) – Wine Quality")
ax.set_xlabel("PC1")
ax.set_ylabel("PC2")
fig.tight_layout()
plt.show()

In [None]:
# 2-D t-SNE embedding of the standardised features (seeded for repeatability).
tsne = TSNE(
    n_components=2,
    random_state=42,
    perplexity=30,
    init="random",
    learning_rate="auto",
)
X_tsne = tsne.fit_transform(X_scaled)

# Scatter of the embedding, coloured by the raw quality score.
fig, ax = plt.subplots(figsize=(6, 5))
scatter = ax.scatter(X_tsne[:, 0], X_tsne[:, 1], c=df["quality"], cmap="viridis", s=10)
fig.colorbar(scatter, ax=ax, label="quality")
ax.set_title("t-SNE – Wine Quality")
fig.tight_layout()
plt.show()

In [None]:
# NMF needs non-negative input, so the raw features are min-max scaled
# instead of standardised.
mm_scaler = MinMaxScaler()
X_pos = mm_scaler.fit_transform(X)

# Two-component factorisation (seeded for repeatability).
nmf = NMF(n_components=2, random_state=42)
X_nmf = nmf.fit_transform(X_pos)

# Scatter of the two components, coloured by the raw quality score.
fig, ax = plt.subplots(figsize=(6, 5))
scatter = ax.scatter(X_nmf[:, 0], X_nmf[:, 1], c=df["quality"], cmap="viridis", s=10)
fig.colorbar(scatter, ax=ax, label="quality")
ax.set_title("NMF (2 composantes) – Wine Quality")
fig.tight_layout()
plt.show()

## 4. Clustering : K‑Means, Agglomerative, DBSCAN

In [None]:
def run_clustering_and_silhouette(model, X, name):
    """Fit a clustering model on ``X`` and report its silhouette score.

    Points labelled ``-1`` are treated as noise: they are neither counted as
    a cluster nor included in the silhouette computation, so the score
    describes the actual clusters only.

    Args:
        model: Clustering estimator exposing ``fit_predict(X)``.
        X: Samples to cluster; converted with ``np.asarray`` for row masking.
        name: Label used in the printed report.

    Returns:
        A ``(labels, score)`` tuple. ``labels`` is the object returned by
        ``fit_predict``. ``score`` is the silhouette score of the non-noise
        points, or ``-1.0`` when it is undefined (fewer than two clusters, or
        every non-noise point alone in its own cluster).
    """
    labels = model.fit_predict(X)
    label_array = np.asarray(labels)

    # Keep only the points that were actually assigned to a cluster.
    is_clustered = label_array != -1
    n_clustered = int(is_clustered.sum())
    n_clusters = len(set(label_array[is_clustered].tolist()))

    # The silhouette needs 2 <= n_clusters <= n_samples - 1; outside that
    # range fall back to the sentinel instead of letting the metric raise.
    if n_clusters < 2 or n_clusters >= n_clustered:
        score = -1.0
    else:
        score = silhouette_score(
            np.asarray(X)[is_clustered], label_array[is_clustered]
        )
    print(f"{name} - silhouette score: {score:.3f}")
    return labels, score

# Cluster in the 2-D PCA space so the result can be plotted directly.
X_cluster = X_pca

# Build the three estimators first...
kmeans = KMeans(n_clusters=4, random_state=42)
agg = AgglomerativeClustering(n_clusters=4)
dbscan = DBSCAN(eps=0.7, min_samples=5)

# ...then fit and score them one after the other.
labels_kmeans, sil_kmeans = run_clustering_and_silhouette(
    model=kmeans, X=X_cluster, name="KMeans"
)
labels_agg, sil_agg = run_clustering_and_silhouette(
    model=agg, X=X_cluster, name="Agglomerative"
)
labels_dbscan, sil_dbscan = run_clustering_and_silhouette(
    model=dbscan, X=X_cluster, name="DBSCAN"
)

In [None]:
# One panel per clustering method, side by side.
fig, axes = plt.subplots(1, 3, figsize=(15, 4))

panels = [
    (labels_kmeans, "K-Means"),
    (labels_agg, "Agglomerative"),
    (labels_dbscan, "DBSCAN"),
]

for ax, (labels, title) in zip(axes, panels):
    # Same 2-D coordinates in every panel; only the colouring changes.
    ax.scatter(X_cluster[:, 0], X_cluster[:, 1], c=labels, cmap="tab10", s=10)
    ax.set_title(title)
    ax.set_xlabel("Component 1")
    ax.set_ylabel("Component 2")

fig.tight_layout()
plt.show()

## 5. Classification (binaire) + suivi MLflow

In [None]:
# Candidate classifiers, keyed by the name passed to the training helper.
# Every estimator that accepts a seed is given random_state=42.
models = dict(
    LogisticRegression=LogisticRegression(max_iter=1000),
    KNN=KNeighborsClassifier(n_neighbors=5),
    DecisionTree=DecisionTreeClassifier(random_state=42),
    SVM=SVC(kernel="rbf", probability=True, random_state=42),
    RandomForest=RandomForestClassifier(n_estimators=200, random_state=42),
    GradientBoosting=GradientBoostingClassifier(random_state=42),
)

In [None]:
def train_and_log_model(name, model, X_train, X_test, y_train, y_test):
    """Fit ``model``, evaluate it on the test split and record the run in MLflow.

    Everything happens inside one MLflow run named ``name``: the model is
    fitted on the training data, accuracy and binary F1 are computed on the
    test data and printed with a classification report, then the
    hyper-parameters, both metrics and the fitted model are logged.

    Args:
        name: Run name, also used as prefix of the printed summary.
        model: Estimator exposing ``fit`` and ``predict``.
        X_train: Training features.
        X_test: Test features.
        y_train: Training labels.
        y_test: Test labels.

    Returns:
        None.
    """
    with mlflow.start_run(run_name=name):
        # Fit, then predict on the held-out split.
        model.fit(X_train, y_train)
        predictions = model.predict(X_test)

        # Metrics are kept under the exact keys they are logged with.
        metrics = {
            "accuracy": accuracy_score(y_test, predictions),
            "f1_score": f1_score(y_test, predictions, average="binary"),
        }

        print(f"{name} - Accuracy: {metrics['accuracy']:.3f}, F1: {metrics['f1_score']:.3f}")
        print(classification_report(y_test, predictions))

        # Hyper-parameters are logged only for objects that expose them.
        if hasattr(model, "get_params"):
            mlflow.log_params(model.get_params())

        for metric_name, metric_value in metrics.items():
            mlflow.log_metric(metric_name, metric_value)

        # Store the fitted model as a run artifact.
        mlflow.sklearn.log_model(model, artifact_path="model")

In [None]:
# Train, evaluate and log every candidate on the binary task, one run each.
for name, model in models.items():
    train_and_log_model(
        name=name,
        model=model,
        X_train=X_train,
        X_test=X_test,
        y_train=y_train,
        y_test=y_test,
    )

## 6. Conclusion

- Nous avons appliqué la réduction de dimension (PCA, t‑SNE, NMF) sur le dataset Wine Quality.
- Nous avons exploré plusieurs méthodes de clustering (K‑Means, Agglomerative, DBSCAN) et comparé leurs scores de silhouette.
- Nous avons entraîné et évalué plusieurs modèles de classification : Logistic Regression, KNN, Decision Tree, SVM, Random Forest, Gradient Boosting.
- Toutes les expériences sont suivies avec **MLflow** (`mlflow ui`) pour comparer les modèles.

## 5 bis. Classification multi‑classe (quality 3–8)

In [None]:
# Multi-class target: the raw quality score.
y_multi = df["quality"]

# Split the unscaled features first, then fit a dedicated scaler on the
# training rows only. Splitting the already-standardised matrix would let the
# test rows contribute to the mean / std used to scale the training data.
X_train_mc_raw, X_test_mc_raw, y_train_mc, y_test_mc = train_test_split(
    X, y_multi, test_size=0.2, random_state=42, stratify=y_multi
)
scaler_mc = StandardScaler()
X_train_mc = scaler_mc.fit_transform(X_train_mc_raw)
X_test_mc = scaler_mc.transform(X_test_mc_raw)

# Class balance of the training split, ordered by quality score.
print("Répartition des classes (train) :")
print(y_train_mc.value_counts().sort_index())

In [None]:
# Same model configurations as for the binary task, but as fresh unfitted
# copies. Binding `models` itself would make the multi-class training refit
# those very estimator objects and overwrite the fitted binary classifiers
# still referenced by `models`.
from sklearn.base import clone

models_multiclass = {name: clone(model) for name, model in models.items()}

In [None]:
def train_and_log_model_multiclass(name, model, X_train, X_test, y_train, y_test):
    """Fit ``model`` on a multi-class target and record the run in MLflow.

    The MLflow run is named ``name + "_multiclass"``. Accuracy and
    macro-averaged F1 are computed on the test split, printed together with a
    classification report, and logged along with the hyper-parameters and the
    fitted model (artifact path ``model_multiclass``).

    Args:
        name: Base run name, also used as prefix of the printed summary.
        model: Estimator exposing ``fit`` and ``predict``.
        X_train: Training features.
        X_test: Test features.
        y_train: Training labels.
        y_test: Test labels.

    Returns:
        None.
    """
    run_label = name + "_multiclass"
    with mlflow.start_run(run_name=run_label):
        model.fit(X_train, y_train)
        predicted = model.predict(X_test)

        accuracy = accuracy_score(y_test, predicted)
        # Macro average: every class weighs the same, whatever its frequency.
        macro_f1 = f1_score(y_test, predicted, average="macro")

        print(f"{name} (multi‑classe) - Accuracy: {accuracy:.3f}, F1-macro: {macro_f1:.3f}")
        print(classification_report(y_test, predicted))

        # Hyper-parameters are logged only for objects that expose them.
        if hasattr(model, "get_params"):
            mlflow.log_params(model.get_params())

        for metric_key, metric_value in (("accuracy", accuracy), ("f1_macro", macro_f1)):
            mlflow.log_metric(metric_key, metric_value)

        mlflow.sklearn.log_model(model, artifact_path="model_multiclass")

In [None]:
# Train, evaluate and log every candidate on the multi-class task, one run each.
for name, model in models_multiclass.items():
    train_and_log_model_multiclass(
        name=name,
        model=model,
        X_train=X_train_mc,
        X_test=X_test_mc,
        y_train=y_train_mc,
        y_test=y_test_mc,
    )

## 4 bis. Dendrogramme pour l'Agglomerative Clustering

In [None]:
from scipy.cluster.hierarchy import dendrogram, linkage

# For readability only the first 200 standardised rows are clustered.
X_sample = X_scaled[:200]
# Hierarchical merge tree built with Ward linkage.
Z = linkage(X_sample, method="ward")

# The dendrogram is drawn on the current axes, i.e. the one created here;
# the tree is truncated to its last 30 merged groups.
fig, ax = plt.subplots(figsize=(12, 5))
dendrogram(Z, truncate_mode="lastp", p=30, leaf_rotation=90.0)
ax.set_title("Dendrogramme (Agglomerative, échantillon de 200 vins)")
ax.set_xlabel("Groupes")
ax.set_ylabel("Distance")
fig.tight_layout()
plt.show()