In [1]:
import os
import logging
from datetime import datetime
from pymongo import MongoClient
import pandas as pd
from sklearn.cluster import MiniBatchKMeans
from sklearn.metrics import silhouette_score
import matplotlib.pyplot as plt
import seaborn as sns

# 1. Environnement
def setup_environment():
    """Create the output folders and configure logging for the pipeline.

    Ensures the ``logs`` and ``images`` directories exist, then configures
    the root logger at INFO level with two handlers: a UTF-8 file handler
    writing to a timestamped file under ``logs/`` and a stream handler.

    ``force=True`` replaces any handlers already attached to the root logger.
    Without it, ``logging.basicConfig`` silently does nothing when the root
    logger is already configured (e.g. when this is run a second time in the
    same interpreter session): the freshly opened log file would then never
    be attached to any logger and its handle would be leaked.

    Returns:
        logging.Logger: The logger named after the current module.
    """
    for folder in ("logs", "images"):
        os.makedirs(folder, exist_ok=True)

    # One log file per call, named after the current local time.
    log_path = f"logs/clustering_{datetime.now().strftime('%Y%m%d_%H%M%S')}.log"

    logging.basicConfig(
        level=logging.INFO,
        format="%(asctime)s - %(levelname)s - %(message)s",
        handlers=[
            logging.FileHandler(log_path, encoding="utf-8"),
            logging.StreamHandler(),
        ],
        force=True,  # Python 3.8+: drop and close previously installed handlers
    )
    return logging.getLogger(__name__)

# Pipeline overview:
#   load the PC1..PC5 columns from MongoDB -> pick k by silhouette score on a
#   sample -> fit MiniBatchKMeans on all rows -> export the labelled rows to
#   MongoDB -> save a PC1/PC2 scatter plot.
logger = setup_environment()
logger.info("Début du pipeline de clustering MiniBatch K-Means")

# Bound before the ``try`` so the ``finally`` clause can test it directly
# instead of probing ``locals()``; at module level that is the global
# namespace, which may still hold a stale ``client`` from an earlier run.
client = None

try:
    # 2. MongoDB connection & loading of the PCA data
    client = MongoClient("localhost", 27017)
    db = client["IF29_twitter_db"]
    collection = db["acp_db"]

    # Load only the document id and the PC1..PC5 columns
    fields = {"_id": 1, "PC1": 1, "PC2": 1, "PC3": 1, "PC4": 1, "PC5": 1}
    data = pd.DataFrame(list(collection.find({}, fields)))

    if data.empty:
        raise ValueError("Aucune donnée trouvée dans 'acp_db'")

    # Set the ids aside so that only the PC columns are fed to the clustering;
    # they are re-attached before the export.
    id_list = data.pop("_id")
    logger.info(f"{len(data)} vecteurs ACP chargés")

    # 3. Sub-sampling for the search of the best k: a 20% sample is used when
    #    there are more than 10,000 rows, otherwise every row is used.
    sample_frac = 0.2 if len(data) > 10000 else 1.0
    data_sample = data.sample(frac=sample_frac, random_state=42)
    logger.info(f"Échantillon pour évaluation silhouette : {len(data_sample)} lignes")

    # 4. Selection of the best k (silhouette score) over k = 2..10
    silhouette_scores = []
    k_range = range(2, 11)

    for k in k_range:
        model = MiniBatchKMeans(
            n_clusters=k,
            random_state=42,
            batch_size=2048,
            n_init="auto",
            max_no_improvement=20
        )
        labels = model.fit_predict(data_sample)
        score = silhouette_score(data_sample, labels)
        silhouette_scores.append(score)
        logger.info(f"   k={k} | silhouette = {score:.4f}")

    # ``index`` returns the first maximum, so ties resolve to the smallest k.
    best_k = k_range[silhouette_scores.index(max(silhouette_scores))]
    logger.info(f"Meilleur k trouvé : {best_k}")

    # 5. Final clustering on all the rows, with the same hyper-parameters
    final_model = MiniBatchKMeans(
        n_clusters=best_k,
        random_state=42,
        batch_size=2048,
        n_init="auto",
        max_no_improvement=20
    )
    # The right-hand side is evaluated first, so the model is fitted on the PC
    # columns only, before the "cluster" column exists.
    data["cluster"] = final_model.fit_predict(data)
    data.insert(0, "_id", id_list)

    # 6. MongoDB export: the target collection is replaced, not appended to
    db.drop_collection("acp_clusters")
    db["acp_clusters"].insert_many(data.to_dict("records"))
    logger.info("Données clusterisées exportées dans 'acp_clusters'")

    # 7. Visualisation (PC1 vs PC2)
    fig = plt.figure(figsize=(10, 7))
    try:
        sns.scatterplot(data=data, x="PC1", y="PC2", hue="cluster", palette="tab10", s=50, alpha=0.8)
        plt.title(f"Clustering MiniBatch K-Means (k={best_k})")
        plt.xlabel("Composante Principale 1")
        plt.ylabel("Composante Principale 2")
        plt.legend(title="Cluster")
        plt.tight_layout()
        plt.savefig("images/clusters_acp_pc1_pc2.png")
    finally:
        # Release the figure even when plotting or saving fails.
        plt.close(fig)
    logger.info("Visualisation sauvegardée : clusters_acp_pc1_pc2.png")

except Exception as e:
    # ``logger.exception`` logs at ERROR level and appends the traceback, so
    # the log file keeps the full failure context; the error is then re-raised.
    logger.exception(f"Erreur dans le pipeline : {e}")
    raise

finally:
    if client is not None:
        client.close()
        logger.info("Connexion MongoDB fermée")

# Only reached on success: any exception is re-raised by the handler above.
logger.info("Pipeline MiniBatch K-Means terminé avec succès")


2025-06-09 19:48:44,422 - INFO - Début du pipeline de clustering MiniBatch K-Means
2025-06-09 19:48:45,996 - INFO - 267468 vecteurs ACP chargés
2025-06-09 19:48:46,006 - INFO - Échantillon pour évaluation silhouette : 53494 lignes
2025-06-09 19:49:13,730 - INFO -    k=2 | silhouette = 0.4588
2025-06-09 19:49:40,193 - INFO -    k=3 | silhouette = 0.3731
2025-06-09 19:50:05,947 - INFO -    k=4 | silhouette = 0.3222
2025-06-09 19:50:31,234 - INFO -    k=5 | silhouette = 0.3074
2025-06-09 19:50:56,346 - INFO -    k=6 | silhouette = 0.2722
2025-06-09 19:51:22,396 - INFO -    k=7 | silhouette = 0.3016
2025-06-09 19:51:47,518 - INFO -    k=8 | silhouette = 0.2688
2025-06-09 19:52:12,571 - INFO -    k=9 | silhouette = 0.2588
2025-06-09 19:52:37,541 - INFO -    k=10 | silhouette = 0.2617
2025-06-09 19:52:37,542 - INFO - Meilleur k trouvé : 2
2025-06-09 19:52:40,000 - INFO - Données clusterisées exportées dans 'acp_clusters'
2025-06-09 19:52:50,261 - INFO - Visualisation sauvegardée : clusters_a