# In [None]:
import os
import matplotlib

# Headless runs (CI/servers): switch to the non-interactive Agg backend, but only
# when no display is advertised and no backend was chosen through MPLBACKEND.
# NOTE(review): DISPLAY looks like the X11 convention - confirm this is the
# desired behaviour on platforms that never set it.
if not (os.environ.get("DISPLAY", "") or os.environ.get("MPLBACKEND", "")):
    matplotlib.use("Agg")  # Must be set before importing pyplot
import joblib
import logging
from dataclasses import dataclass, field 
from pathlib import Path
from typing import Any, Dict, List, Optional, Tuple, cast

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from matplotlib.figure import Figure
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score, davies_bouldin_score, calinski_harabasz_score
from kneed import KneeLocator

# Set up logging - like a diary
# The LogRecord attribute holding the logger's name is ``name``; an unknown
# attribute in the format string makes every emitted record fail to format.
LOG_FORMAT = '%(asctime)s - %(name)s - %(levelname)s - %(message)s'
logging.basicConfig(
    level=logging.INFO,
    format=LOG_FORMAT
)
logger = logging.getLogger(__name__)

@dataclass
class ClusteringConfig:
    """
    Settings bundle for a clustering run.

    Think of this like: A settings menu in a video game where you choose
    difficulty level and other options before playing.

    Creating an instance also creates the output directory and the parent
    directories of the model and scaler paths when they do not exist yet.
    """
    # Where results are written
    output_dir: str = 'plots'
    model_save_path: str = 'ml_models/clustering_model.pkl'
    scaler_save_path: str = 'ml_models/clustering_scaler.pkl'

    # Clustering search settings
    min_clusters: int = 2  # Smallest number of groups to try
    max_clusters: int = 8  # Largest number of groups to try
    random_state: int = 42  # Fixed seed so results are reproducible
    n_init: int = 10  # How many times each clustering is attempted
    max_iter: int = 300  # Iteration cap per attempt

    # Columns used as clustering features
    base_features: List[str] = field(default_factory=lambda: [
        'Latitude',
        'Longitude',
        'Avg_Annual_Spills',
    ])

    optional_features: List[str] = field(default_factory=lambda: [
        'Spill Trend',
        'Predicted Annual Spill Frequence Post Scheme',
        'Ecological High Priority Site Flag',
        'Non-bathing Priority Site Flag',
        'Bathing Water Discharge Flag',
        'Shellfish Water Discharge Flag',
    ])

    # Plot appearance
    figure_size: Tuple[int, int] = (16, 12)
    dpi: int = 300

    def __post_init__(self):
        """Create the required directories after initialization."""
        # (location, is_file): for file paths only the parent folder is created.
        wanted = (
            (self.output_dir, False),
            (self.model_save_path, True),
            (self.scaler_save_path, True),
        )
        for location, is_file in wanted:
            folder = Path(location).parent if is_file else Path(location)
            folder.mkdir(parents=True, exist_ok=True)

    def get_feature_list(self) -> List[str]:
        """Return the base features followed by the optional features."""
        required, extras = self.base_features, self.optional_features
        return required + extras
    
class ClusterValidator:
    """
    Validates clustering results to ensure they're good quality.

    Think of this like: A quality inspector who checks if your toy sorting
    actually makes sense - are similar toys really grouped together?

    """
    @staticmethod
    def calcuate_cluster_metrics(
        X: np.ndarray,
        labels: np.ndarray
    ) -> Dict[str, float]:
        """
        Calcuate how good the clustering is.

        Args:
            X: The feature data
            labels: Cluster assignments
        
        Returns:
            Dictionary of quality scores
        
        Think of this like: Getting a report card for your sorting job!
        """
        n_clusters = len(np.unique(labels))

        # Skip metrics if only 1 cluster (no comparsion possible)
        if n_clusters < 2:
            return {
                'silhouette_score': 0.0,
                'davies_bouldin_score': 0.0,
                'calinski_harabasz_score': 0.0
            }
        try:
            # Silhouette Score: -1 to 1 (higher is better)
            # Measures how similar objects are to their own cluster compared to others own
            silhouette = silhouette_score(X, labels)

            # Davies-Bouldin Score: 0 to infinity (lower is better)
            # Measures how much overlap there is between clusters
            db_score = davies_bouldin_score(X, labels)

            # Calinski-Harabasz Score: 0 to infinity (higher is better)
            # Measures how much clusters are well-separated from each other
            ch_score = calinski_harabasz_score(X, labels)

            return {
                'silhouette_score': silhouette,
                'davies_bouldin_score': db_score,
                'calinski_harabasz_score': ch_score
            }
            
        except Exception as e: 
            logger.warning(f"Error calcuating cluster metrics: {e}")
            return {
                'silhouette_score': 0.0,
                'davies_bouldin_score': 0.0,
                'calinski_harabasz_score': 0.0
            }

    @staticmethod
    def interpret_metrics(metrics: Dict[str, float]) -> str:
        """

        Explain what the metrics mean in plain English.

        Think of this like: Your teacher explaining what your grades mean.
        """
        silhouette = metrics['silhouette_score']

        if silhouette > 0.7:
            quality = "Excellent - Clusters are very distinct and well-seperated"
        elif silhouette > 0.5:
            quality = "Good - Clusters are reasonably well-defined"
        elif silhouette > 0.3:
            quality = "Fair - Clusters have some overlap"
        else:
            quality = "Poor - Clusters are not well-defined"
        
        return quality
    
    class OptimalClusterFinder:
        """
        Finds the best number of clusters automatically.

        Think of this like: A smart helper that figures out exactly how many
        toy bins you need - not too many, not too few, just right!
        """

        def __init__(self, config: ClusteringConfig):
            """Keep the config and derive the candidate cluster counts from it."""
            self.config = config
            # Filled by find_optimal_k with one inertia value per candidate k.
            self.inertias = []
            # Candidate k values, inclusive of both configured bounds.
            lowest_k = config.min_clusters
            highest_k = config.max_clusters
            self.k_range = range(lowest_k, highest_k + 1)
        
        def find_optimal_k(
            self,
            X: np.ndarray
        ) -> Tuple[int, List[float]]:
            """
            Use the elbow method to find optimal number of clusters.

            Fits one KMeans model per k in ``self.k_range``, records each
            model's inertia in ``self.inertias``, draws the inertia-vs-k curve
            on a new pyplot figure, and asks KneeLocator for the elbow.

            Args:
                X: Scaled feature data

            Returns:
                optimal_k: Best number of clusters; the midpoint of the
                    configured range when no elbow is found or knee
                    detection raises
                inertias: List of inertia values for each k

            Think of this like: Trying different numbers of toy bins and finding
            the sweet spot where organization is best!
            """
            logger.info("Finding optimal number of clusters using elbow method")

            self.inertias = []

            # Try different number of clusters
            for k in self.k_range:
                kmeans = KMeans(
                    n_clusters=k,
                    random_state=self.config.random_state,
                    n_init=self.config.n_init,  # type: ignore
                    max_iter=self.config.max_iter
                )
                kmeans.fit(X)
                self.inertias.append(kmeans.inertia_)

            # Plot elbow curve
            # NOTE(review): the figure is neither saved nor closed here -
            # presumably a caller finishes it via pyplot's current figure;
            # confirm, otherwise each call leaves an open figure behind.
            plt.figure(figsize=(10, 5))
            plt.plot(self.k_range, self.inertias, marker='o', linestyle='--')

            # Middle of the configured range, used whenever no elbow is available.
            fallback_k = (self.config.min_clusters + self.config.max_clusters) // 2

            # Use KneeLocator to find the "elbow" point automatically
            try:
                knee_locator = KneeLocator(
                    list(self.k_range),
                    self.inertias,
                    curve='convex',
                    direction='decreasing'
                )
                elbow = knee_locator.elbow
            except Exception as e:
                # Best effort: knee detection problems must not abort the run.
                logger.warning(f"Knee detection failed: {e}. Using k={fallback_k}")
                return fallback_k, self.inertias

            if elbow is None:
                logger.warning(f"No clear elbow found, using k={fallback_k}")
                return fallback_k, self.inertias

            # Normalise to a plain int so the result matches the annotation.
            optimal_k = int(elbow)
            logger.info(f"Optimal k found: {optimal_k}")
            return optimal_k, self.inertias
        
        class SiteClusterer:
            """
            Main class for clustering water quality monitoring sites.

            Think of this like: The master organizer who takes all your toys
            and sorts them into neat, logical groups.
            """

            def __init__(self, config: Optional[ClusteringConfig] = None):
                """
                Initialize the clusterer.

                Args:
                    config: Settings to use; a default ClusteringConfig is
                        created when omitted.
                """
                self.config = config or ClusteringConfig()
                self.model = None
                self.scaler = None
                # Same attribute that prepare_features assigns, so it is
                # readable (as an empty list) before features are prepared.
                self.feature_names: List[str] = []
                self.cluster_analysis = {}
                self.optimal_k = None
            
            def prepare_features(self, df: pd.DataFrame) -> Optional[pd.DataFrame]:
                """
                Prepare and select features for clustering.

                Think of this like: Picking out which characteristics of your toys
                to use for sorting (color, size, type, etc.)

                Every base feature from the config must be a column of ``df``;
                optional features are used only when their column exists.
                Columns whose name contains 'Flag' are cast to integers and
                remaining missing values are replaced by the column median.
                The resulting column list is stored in ``self.feature_names``.

                Args:
                    df: Input dataframe

                Returns:
                    DataFrame with selected features, or None if validation fails
                """
                # Check base required columns
                missing_base = [col for col in self.config.base_features if col not in df.columns]
                if missing_base:
                    logger.warning(f"Missing required base features: {missing_base}. Exiting.")
                    return None

                # Start with base features
                selected_features = self.config.base_features.copy()

                # Add optional features if available
                for feature in self.config.optional_features:
                    if feature in df.columns:
                        selected_features.append(feature)
                        logger.info(f"Added optinal feature: {feature}")

                # Extract features - ensure it's a DataFrame (not Series)
                # Using cast to tell type checker this is always a DataFrame
                feature_df = cast(pd.DataFrame, df[selected_features].copy())

                # Convert flag columns to integers if they exist
                flag_columns = [col for col in feature_df.columns if 'Flag' in col]
                for col in flag_columns:
                    flags = feature_df[col]
                    # astype(int) raises on NaN and the general median fill
                    # below runs too late for these columns, so impute missing
                    # flags first. A fractional median (e.g. 0.5) is truncated
                    # by the integer cast.
                    if flags.isna().any():
                        flags = flags.fillna(flags.median())
                    feature_df[col] = flags.astype(int)

                # Handle missing values with median (more robust than mean)
                feature_df = feature_df.fillna(feature_df.median())

                # Store feature names
                self.feature_names = feature_df.columns.tolist()

                # Log feature selection
                logger.info(f"Selected features: {len(self.feature_names)} features: {self.feature_names}")

                return feature_df
                
                