In [None]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score
from sklearn.decomposition import PCA
import plotly.express as px


def generate_synthetic_data(n_samples=10000, random_state=20):
    """Build a reproducible table of synthetic HVAC readings.

    Setpoints are drawn from a normal distribution (mean 24, std 2) and the
    current temperature is the setpoint plus zero-mean Gaussian noise
    (std 0.5).  Each categorical column is sampled with its own call to
    ``Generator.choice`` using a fixed probability table.

    Parameters
    ----------
    n_samples : int
        Number of rows to generate.
    random_state : int
        Seed handed to ``np.random.default_rng``.

    Returns
    -------
    pd.DataFrame
        Columns ``temp_set``, ``temp_current``, ``mode``, ``fan_speed`` and
        ``size``, in that order.
    """
    generator = np.random.default_rng(random_state)

    # Every draw advances the generator, so the order of the calls below
    # determines the values produced for a given seed.
    setpoint = generator.normal(loc=24, scale=2, size=n_samples)
    noise = generator.normal(loc=0, scale=0.5, size=n_samples)

    # column name -> (levels, probabilities); dict order is the draw order.
    categorical_spec = {
        "mode": (["cool", "heat", "dry", "fan"], [0.5, 0.3, 0.1, 0.1]),
        "fan_speed": (["low", "medium", "high"], [0.4, 0.4, 0.2]),
        "size": (["small", "medium", "large"], [0.2, 0.5, 0.3]),
    }

    columns = {"temp_set": setpoint, "temp_current": setpoint + noise}
    for column, (levels, probabilities) in categorical_spec.items():
        columns[column] = generator.choice(
            levels, size=n_samples, p=probabilities
        )

    return pd.DataFrame(columns)


def preprocess_features(df):
    """Convert the raw frame into a numeric feature matrix for clustering.

    The two temperature columns are standardised with ``StandardScaler`` and
    the three categorical columns are one-hot encoded into a dense array.

    Parameters
    ----------
    df : pd.DataFrame
        Frame containing ``temp_set``, ``temp_current``, ``mode``,
        ``fan_speed`` and ``size``.

    Returns
    -------
    tuple
        ``(features, preprocessor)``: the transformed matrix and the fitted
        ``ColumnTransformer`` that produced it.
    """
    scaled_columns = ["temp_set", "temp_current"]
    encoded_columns = ["mode", "fan_speed", "size"]

    steps = [
        ("num", StandardScaler(), scaled_columns),
        ("cat", OneHotEncoder(sparse_output=False), encoded_columns),
    ]
    transformer = ColumnTransformer(transformers=steps)

    return transformer.fit_transform(df), transformer


def compute_silhouette(features, labels):
    """Compute the silhouette score, or -1.0 when it is undefined.

    ``silhouette_score`` only accepts labelings with between 2 and
    ``n_samples - 1`` distinct clusters and raises ``ValueError`` outside
    that range.  Both out-of-range cases are mapped to the sentinel -1.0
    (the lowest possible silhouette value) so callers can compare scores
    without wrapping every call in ``try``/``except``.

    Parameters
    ----------
    features : array-like
        Feature matrix, one row per sample.
    labels : array-like
        Cluster label for each sample.

    Returns
    -------
    float
        The silhouette score, or -1.0 if the labeling has fewer than two
        clusters or as many clusters as samples.
    """
    n_samples = np.asarray(labels).size
    n_clusters = len(np.unique(labels))
    # Too few clusters, or every sample in its own cluster: undefined score.
    if n_clusters < 2 or n_clusters >= n_samples:
        return -1.0
    return silhouette_score(features, labels)


def kmeans_clustering(features, min_k=2, max_k=160, random_state=42):
    """Run KMeans across a range of ``k`` and return the best clustering.

    Each candidate ``k`` is fitted once and scored with the silhouette
    coefficient; the labeling with the highest score wins.  The swept range
    is clamped to ``[2, n_samples - 1]`` because the silhouette score is only
    defined there and KMeans cannot fit more clusters than samples, so the
    default ``max_k`` no longer crashes on small inputs.

    Parameters
    ----------
    features : array-like of shape (n_samples, n_features)
        Preprocessed feature matrix.
    min_k : int
        Smallest number of clusters to try.
    max_k : int
        Largest number of clusters to try (inclusive).
    random_state : int
        Seed forwarded to ``KMeans``.

    Returns
    -------
    tuple
        ``(best_labels, best_model, best_k, best_score)``.  When
        ``max_k < min_k`` nothing is fitted and
        ``(None, None, min_k, -1.0)`` is returned.

    Raises
    ------
    ValueError
        If the requested range is non-empty but contains no ``k`` with
        ``2 <= k <= n_samples - 1``.
    """
    best_score = -1.0
    best_labels = None
    best_model = None
    best_k = min_k

    # Empty requested range: keep the historical "nothing evaluated" result.
    if max_k < min_k:
        return best_labels, best_model, best_k, best_score

    n_samples = np.shape(features)[0]
    first_k = max(min_k, 2)
    last_k = min(max_k, n_samples - 1)
    if last_k < first_k:
        raise ValueError(
            "No valid number of clusters to evaluate: need "
            f"2 <= k <= n_samples - 1, got min_k={min_k}, max_k={max_k}, "
            f"n_samples={n_samples}."
        )

    for k in range(first_k, last_k + 1):
        model = KMeans(n_clusters=k, n_init=10, random_state=random_state)
        labels = model.fit_predict(features)
        # The helper returns -1.0 instead of raising when the fitted labeling
        # is degenerate (e.g. duplicate points collapse into fewer clusters).
        score = compute_silhouette(features, labels)
        if score > best_score:
            best_score = score
            best_labels = labels
            best_model = model
            best_k = k

    return best_labels, best_model, best_k, best_score



def visualize_clusters(features, labels, title, output_file):
    """Project the features onto two principal components and save a plot.

    Parameters
    ----------
    features : array-like
        Feature matrix that was clustered.
    labels : np.ndarray
        Cluster label per row; must support ``.astype``.
    title : str
        Figure title.
    output_file : str
        Path of the HTML file to write.
    """
    projection = PCA(n_components=2, random_state=42).fit_transform(features)
    first_component = projection[:, 0]
    second_component = projection[:, 1]

    # Cast to str so plotly express colours the clusters as discrete
    # categories rather than on a continuous scale.
    cluster_names = labels.astype(str)
    axis_names = {"x": "PC1", "y": "PC2"}

    figure = px.scatter(
        x=first_component,
        y=second_component,
        color=cluster_names,
        labels=axis_names,
        title=title,
    )
    figure.write_html(output_file, auto_open=False)
if __name__ == "__main__":
    # Pipeline: synthetic data -> feature matrix -> KMeans sweep -> report.
    df = generate_synthetic_data()

    # Only the transformed matrix is needed here; the fitted preprocessor
    # is discarded.
    features, _ = preprocess_features(df)

    # Sweep k and keep the labeling with the best silhouette score.
    kmeans_labels, kmeans_model, best_k, kmeans_score = kmeans_clustering(
        features
    )

    # Attach the winning cluster assignment to the original rows.
    df["kmeans_cluster"] = kmeans_labels

    print(f"Best k for KMeans: {best_k}")
    print(f"Silhouette score: {kmeans_score:.3f}")
    print(df.head())

    plot_title = f"KMeans clusters (k={best_k}, score={kmeans_score:.3f})"
    visualize_clusters(features, kmeans_labels, plot_title, "kmeans_clusters.html")
    

Best k for KMeans: 151
Silhouette score: 0.457
    temp_set  temp_current  mode fan_speed    size  kmeans_cluster
0  23.280663     22.780377   dry      high   small             106
1  26.407350     26.169535  heat    medium   large             107
2  26.793736     26.825880  heat       low   large             108
3  24.634472     24.805452  cool      high   small              47
4  24.828268     24.691426  cool      high  medium              31
