In [None]:
import numpy as np
import pandas as pd
from sklearn.cluster import AgglomerativeClustering, DBSCAN, KMeans
from sklearn.manifold import MDS, TSNE
from sklearn.metrics import silhouette_score, calinski_harabasz_score, davies_bouldin_score
from sklearn.decomposition import PCA
import matplotlib.pyplot as plt
import seaborn as sns
from scipy.cluster.hierarchy import dendrogram, linkage, fcluster
from scipy.spatial.distance import squareform
# Plotly removido por compatibilidad con NumPy
# import plotly.express as px
# import plotly.graph_objects as go
# from plotly.subplots import make_subplots
import warnings
warnings.filterwarnings('ignore')

def verify_and_sync_data(distance_matrix, clustering_results, series_matrix):
    """
    Report how the product ids of the three inputs overlap and, when some
    clustered products are missing elsewhere, return a filtered copy of the
    clustering table.

    Ids are compared through their ``str()`` form, so ``1`` and ``'1'`` count
    as the same product.

    Parameters:
        distance_matrix: DataFrame whose index holds product ids.
        clustering_results: DataFrame with 'product_id' and 'cluster' columns.
        series_matrix: DataFrame whose index holds product ids.

    Returns:
        ``clustering_results`` itself when every clustered product appears in
        both matrices; otherwise a copy restricted to the products present in
        all three inputs.
    """
    print("=== VERIFICACIÓN Y SINCRONIZACIÓN DE DATOS ===")

    # Show the Python type of the first label of each source
    print(f"Tipos de datos:")
    print(f"  distance_matrix.index: {type(distance_matrix.index[0])}")
    print(f"  series_matrix.index: {type(series_matrix.index[0])}")
    print(f"  clustering_results.product_id: {type(clustering_results['product_id'].iloc[0])}")

    # Normalise every id to text so the sets are comparable
    ids_in_distance = {str(label) for label in distance_matrix.index}
    ids_in_series = {str(label) for label in series_matrix.index}
    ids_in_clustering = {str(label) for label in clustering_results['product_id']}

    print(f"\nTamaños de conjuntos:")
    for source_name, id_set in (
        ("distance_matrix", ids_in_distance),
        ("series_matrix", ids_in_series),
        ("clustering_results", ids_in_clustering),
    ):
        print(f"  {source_name}: {len(id_set)} productos")

    shared_by_all = ids_in_distance & ids_in_series & ids_in_clustering

    print(f"\nIntersecciones:")
    print(f"  Común en los 3: {len(shared_by_all)} productos")
    print(f"  Común distance + cluster: {len(ids_in_distance & ids_in_clustering)} productos")
    print(f"  Común series + cluster: {len(ids_in_series & ids_in_clustering)} productos")

    # Clustered products that one of the matrices does not know about
    absent_from_distance = ids_in_clustering - ids_in_distance
    absent_from_series = ids_in_clustering - ids_in_series

    # Nothing missing: hand back the caller's table untouched
    if not absent_from_distance and not absent_from_series:
        print("\n✓ Todos los datos están sincronizados")
        return clustering_results

    for source_name, absent in (
        ("distance_matrix", absent_from_distance),
        ("series_matrix", absent_from_series),
    ):
        if absent:
            print(f"\nProductos en clustering pero NO en {source_name}: {len(absent)}")
            print(f"  Ejemplos: {list(absent)[:5]}")

    print(f"\n=== SINCRONIZANDO DATOS ===")

    # Keep only the rows whose product exists in all three sources
    keep_mask = clustering_results['product_id'].astype(str).isin(shared_by_all)
    synced = clustering_results[keep_mask].copy()

    print(f"Clustering sincronizado: {len(synced)} productos (era {len(clustering_results)})")

    # Show how many products each cluster retains after filtering
    print(f"Distribución de clusters después de sincronización:")
    for cluster, count in synced['cluster'].value_counts().items():
        print(f"  Cluster {cluster}: {count} productos")

    return synced

def load_dtw_results(distance_matrix_path='dtw_matrix_full.csv'):
    """
    Load a previously computed DTW distance matrix from a CSV file.

    The first CSV column is used as the row index. Row and column labels are
    converted to integers only when BOTH conversions succeed, so the two axes
    never end up with different label types.

    Parameters:
        distance_matrix_path: path of the CSV file to read.

    Returns:
        DataFrame with the distance matrix as read (labels possibly cast to int).
    """
    print("Cargando matriz de distancias DTW...")
    distance_df = pd.read_csv(distance_matrix_path, index_col=0)

    # Convert both axes atomically; only conversion errors are tolerated
    # (a bare ``except`` would also hide KeyboardInterrupt and real bugs).
    try:
        int_index = distance_df.index.astype(int)
        int_columns = distance_df.columns.astype(int)
    except (ValueError, TypeError):
        print("Manteniendo índices como están")
    else:
        distance_df.index = int_index
        distance_df.columns = int_columns
        print("Índices convertidos a enteros")

    print(f"Matriz cargada: {distance_df.shape}")
    print(f"Productos: {len(distance_df)}")
    print(f"Primeros índices: {list(distance_df.index[:5])}")

    # A non-square matrix cannot be symmetric; report False instead of letting
    # np.allclose fail on incompatible shapes.
    values = distance_df.values
    is_symmetric = bool(
        values.shape[0] == values.shape[1]
        and np.allclose(values, values.T, equal_nan=True)
    )
    print(f"Matriz simétrica: {is_symmetric}")

    return distance_df

def find_optimal_clusters(distance_matrix, max_clusters=20, methods=('hierarchical', 'kmeans')):
    """
    Score candidate numbers of clusters with several clustering methods.

    For every method and every candidate ``n_clusters`` the function records
    the silhouette score (on the precomputed distances) plus the
    Calinski-Harabasz and Davies-Bouldin scores (on a 10-dimensional MDS
    embedding of the distances). K-means additionally records its inertia.

    Parameters:
        distance_matrix: square DataFrame of pairwise distances.
        max_clusters: largest number of clusters to try.
        methods: iterable with 'hierarchical' and/or 'kmeans'.

    Returns:
        dict mapping each method to a dict with the keys 'n_clusters',
        'silhouette', 'calinski_harabasz', 'davies_bouldin' (and 'inertia' for
        'kmeans'); every list has exactly one entry per tried ``n_clusters``.
        A failed or degenerate fit is recorded as -1 / 0 / inf respectively.
    """
    print("=== BÚSQUEDA DE NÚMERO ÓPTIMO DE CLUSTERS ===")

    # Plain array of distances for sklearn
    distance_array = distance_matrix.values
    cluster_range = range(2, min(max_clusters + 1, len(distance_matrix) - 1))

    # The MDS embedding depends only on the distances (fixed random_state), so
    # it is computed once, lazily, and shared by every method and every
    # n_clusters instead of being refit inside the loop.
    embeddings = None

    results = {}

    for method in methods:
        print(f"\nAnalizando método: {method}")

        silhouette_scores = []
        calinski_scores = []
        davies_bouldin_scores = []
        inertias = []

        for n_clusters in cluster_range:
            # Fallback values used when the fit fails or yields a single cluster
            sil_score, cal_score, db_score = -1, 0, float('inf')
            inertia = float('inf')

            try:
                if embeddings is None:
                    mds = MDS(n_components=10, dissimilarity='precomputed', random_state=42)
                    embeddings = mds.fit_transform(distance_array)

                if method == 'hierarchical':
                    clustering = AgglomerativeClustering(
                        n_clusters=n_clusters,
                        metric='precomputed',
                        linkage='average'
                    )
                    labels = clustering.fit_predict(distance_array)
                elif method == 'kmeans':
                    # K-means needs coordinates, so it runs on the embedding
                    kmeans = KMeans(n_clusters=n_clusters, random_state=42)
                    labels = kmeans.fit_predict(embeddings)
                    inertia = kmeans.inertia_
                else:
                    # Without this, labels from a previous method would be reused
                    raise ValueError(f"Método de clustering desconocido: {method!r}")

                if len(np.unique(labels)) > 1:
                    # Build the tuple first so a failure in any metric leaves
                    # all three fallbacks in place (no partial results).
                    sil_score, cal_score, db_score = (
                        silhouette_score(distance_array, labels, metric='precomputed'),
                        calinski_harabasz_score(embeddings, labels),
                        davies_bouldin_score(embeddings, labels),
                    )
            except Exception as e:
                print(f"Error con {n_clusters} clusters: {e}")

            # Exactly one append per candidate keeps every list aligned with
            # 'n_clusters', whatever happened above.
            silhouette_scores.append(sil_score)
            calinski_scores.append(cal_score)
            davies_bouldin_scores.append(db_score)
            if method == 'kmeans':
                inertias.append(inertia)

        results[method] = {
            'n_clusters': list(cluster_range),
            'silhouette': silhouette_scores,
            'calinski_harabasz': calinski_scores,
            'davies_bouldin': davies_bouldin_scores
        }

        if method == 'kmeans':
            results[method]['inertia'] = inertias

        # Report the best candidate according to each metric
        if silhouette_scores:
            best_sil = cluster_range[int(np.argmax(silhouette_scores))]
            best_cal = cluster_range[int(np.argmax(calinski_scores))]
            best_db = cluster_range[int(np.argmin(davies_bouldin_scores))]

            print(f"  Mejor Silhouette: {best_sil} clusters (score: {max(silhouette_scores):.3f})")
            print(f"  Mejor Calinski-Harabasz: {best_cal} clusters (score: {max(calinski_scores):.1f})")
            print(f"  Mejor Davies-Bouldin: {best_db} clusters (score: {min(davies_bouldin_scores):.3f})")

    return results

def plot_cluster_metrics(results):
    """
    Draw a 2x2 grid with one panel per clustering metric.

    Parameters:
        results: dict mapping a method name to a dict with the keys
            'n_clusters', 'silhouette', 'calinski_harabasz', 'davies_bouldin'
            and optionally 'inertia' (the structure built by
            find_optimal_clusters).

    The figure is shown with plt.show(); nothing is returned.
    """
    fig, axes = plt.subplots(2, 2, figsize=(15, 10))
    sil_ax, cal_ax, db_ax, inertia_ax = axes.flatten()

    palette = ['blue', 'red', 'green', 'purple']

    # One line per method on each metric panel; colours cycle through the palette
    for position, (method, data) in enumerate(results.items()):
        color = palette[position % len(palette)]

        sil_ax.plot(data['n_clusters'], data['silhouette'],
                    marker='o', label=f'{method}', color=color)
        cal_ax.plot(data['n_clusters'], data['calinski_harabasz'],
                    marker='s', label=f'{method}', color=color)
        db_ax.plot(data['n_clusters'], data['davies_bouldin'],
                   marker='^', label=f'{method}', color=color)

        # Only methods that recorded an inertia (K-means) reach the last panel
        if 'inertia' in data:
            inertia_ax.plot(data['n_clusters'], data['inertia'],
                            marker='d', label=f'{method} - Inertia', color=color)

    # (axis, title, y-label) for each panel; the x-label is shared
    panel_settings = (
        (sil_ax, 'Silhouette Score (mayor es mejor)', 'Silhouette Score'),
        (cal_ax, 'Calinski-Harabasz Score (mayor es mejor)', 'Calinski-Harabasz Score'),
        (db_ax, 'Davies-Bouldin Score (menor es mejor)', 'Davies-Bouldin Score'),
        (inertia_ax, 'Inertia - Método del Codo', 'Inertia'),
    )
    for ax, title, ylabel in panel_settings:
        ax.set_title(title)
        ax.set_xlabel('Número de Clusters')
        ax.set_ylabel(ylabel)
        ax.legend()
        ax.grid(True, alpha=0.3)

    plt.tight_layout()
    plt.show()

def perform_hierarchical_clustering(distance_matrix, n_clusters=None):
    """
    Realiza clustering jerárquico
    """
    print("=== CLUSTERING JERÁRQUICO ===")
    
    # Convertir a formato condensado para scipy
    distance_array = distance_matrix.values
    
    # Linkage matrix
    linkage_matrix = linkage(squareform(distance_array), method='average')
    
    # Si no se especifica n_clusters, usar dendrograma para decidir
    if n_clusters is None:
        plt.figure(figsize=(15, 6))
        dendrogram(linkage_matrix, labels=distance_matrix.index.tolist(), 
                  leaf_rotation=90, leaf_font_size=8)
        plt.title('Dendrograma - Clustering Jerárquico')
        plt.xlabel('Productos')
        plt.ylabel('Distancia DTW')
        plt.show()
        
        # Sugerir número de clusters basado en la estructura
        n_clusters = 8  # Valor por defecto
        print(f"Usando {n_clusters} clusters por defecto")
    
    # Obtener clusters
    cluster_labels = fcluster(linkage_matrix, n_clusters, criterion='maxclust')
    
    # Crear DataFrame con resultados
    clustering_results = pd.DataFrame({
        'product_id': distance_matrix.index,
        'cluster': cluster_labels
    })
    
    # Estadísticas
    cluster_counts = clustering_results['cluster'].value_counts().sort_index()
    print(f"\nDistribución de clusters:")
    for cluster, count in cluster_counts.items():
        print(f"  Cluster {cluster}: {count} productos")
    
    return clustering_results, linkage_matrix

def perform_dbscan_clustering(distance_matrix, eps=None, min_samples=5):
    """
    Run DBSCAN on a precomputed distance matrix.

    Parameters:
        distance_matrix: square DataFrame of pairwise distances; its index
            provides the product ids.
        eps: neighbourhood radius. When None it is estimated as the 95th
            percentile of every point's k-th smallest row distance
            (k = min_samples, capped to the last available column), and the
            sorted k-distance curve is plotted.
        min_samples: DBSCAN ``min_samples`` parameter.

    Returns:
        DataFrame with the columns 'product_id' and 'cluster'; DBSCAN marks
        noise points with the label -1.
    """
    print("=== CLUSTERING DBSCAN ===")

    distance_array = distance_matrix.values

    # Without an explicit eps, derive one from the k-distance curve
    if eps is None:
        # Each sorted row starts with the self-distance, so column k is the
        # k-th nearest neighbour. Cap k so that matrices with min_samples or
        # fewer rows do not index past the end of the row.
        n_points = len(distance_array)
        k = min(min_samples, max(n_points - 1, 0))
        distances = np.sort(np.sort(distance_array, axis=1)[:, k])

        # Plot the curve so the "elbow" can be inspected visually
        plt.figure(figsize=(10, 6))
        plt.plot(distances)
        plt.xlabel('Puntos ordenados por distancia')
        plt.ylabel(f'Distancia al {k}-ésimo vecino más cercano')
        plt.title('Análisis de Eps para DBSCAN')
        plt.grid(True)
        plt.show()

        # Suggested eps: 95th percentile of the k-distances
        eps = np.percentile(distances, 95)
        print(f"Eps sugerido: {eps:.2f}")

    # Fit DBSCAN directly on the distances
    dbscan = DBSCAN(eps=eps, min_samples=min_samples, metric='precomputed')
    cluster_labels = dbscan.fit_predict(distance_array)

    clustering_results = pd.DataFrame({
        'product_id': distance_matrix.index,
        'cluster': cluster_labels
    })

    # Summary figures; label -1 is noise and does not count as a cluster
    n_noise = int(np.sum(cluster_labels == -1))
    n_clusters = len(set(cluster_labels)) - (1 if n_noise else 0)

    print(f"\nResultados DBSCAN:")
    print(f"  Número de clusters: {n_clusters}")
    print(f"  Puntos de ruido: {n_noise}")
    print(f"  Eps usado: {eps:.2f}")
    print(f"  Min samples: {min_samples}")

    cluster_counts = clustering_results['cluster'].value_counts().sort_index()
    print(f"\nDistribución de clusters:")
    for cluster, count in cluster_counts.items():
        cluster_name = "Ruido" if cluster == -1 else f"Cluster {cluster}"
        print(f"  {cluster_name}: {count} productos")

    return clustering_results

def visualize_clusters_2d(distance_matrix, clustering_results, method='MDS'):
    """
    Project the products to 2D and plot them coloured by cluster.

    Parameters:
        distance_matrix: square DataFrame of pairwise distances; its index
            provides the product ids.
        clustering_results: DataFrame with 'product_id' and 'cluster' columns.
            It may cover only part of the products and be in any order: labels
            are matched to the matrix by product id (compared as text).
        method: 'MDS' (directly on the distances) or 'TSNE' (on a
            10-dimensional MDS embedding of the distances).

    Returns:
        DataFrame with the columns 'x', 'y', 'product_id' and 'cluster', one
        row per product of ``distance_matrix`` that has a cluster label.

    Raises:
        ValueError: if ``method`` is not 'MDS' or 'TSNE'.
    """
    print(f"=== VISUALIZACIÓN 2D - {method} ===")

    distance_array = distance_matrix.values

    # Dimensionality reduction
    if method == 'MDS':
        reducer = MDS(n_components=2, dissimilarity='precomputed', random_state=42)
        coords_2d = reducer.fit_transform(distance_array)
    elif method == 'TSNE':
        # t-SNE runs on coordinates, so embed the distances first
        mds = MDS(n_components=10, dissimilarity='precomputed', random_state=42)
        embeddings = mds.fit_transform(distance_array)
        reducer = TSNE(n_components=2, random_state=42)
        coords_2d = reducer.fit_transform(embeddings)
    else:
        # Fail with a clear message instead of a NameError on coords_2d
        raise ValueError(f"Método de reducción no soportado: {method!r} (use 'MDS' o 'TSNE')")

    # Attach each label to its point by product id. Relying on the Series'
    # own index would break (or silently mislabel points) whenever
    # clustering_results was filtered or reordered.
    product_ids = distance_matrix.index
    cluster_by_id = dict(zip(clustering_results['product_id'].astype(str),
                             clustering_results['cluster']))
    has_cluster = np.array([str(pid) in cluster_by_id for pid in product_ids], dtype=bool)
    labelled_ids = product_ids[has_cluster]

    viz_df = pd.DataFrame({
        'x': coords_2d[has_cluster, 0],
        'y': coords_2d[has_cluster, 1],
        'product_id': labelled_ids,
        'cluster': [cluster_by_id[str(pid)] for pid in labelled_ids]
    })

    fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(20, 8))

    # Plot 1: one colour and legend entry per cluster
    unique_clusters = sorted(viz_df['cluster'].unique())
    colors = plt.cm.Set3(np.linspace(0, 1, len(unique_clusters)))

    for i, cluster in enumerate(unique_clusters):
        cluster_data = viz_df[viz_df['cluster'] == cluster]
        label = f'Ruido ({len(cluster_data)})' if cluster == -1 else f'Cluster {cluster} ({len(cluster_data)})'

        ax1.scatter(cluster_data['x'], cluster_data['y'], 
                   c=[colors[i]], label=label, alpha=0.7, s=50)

    ax1.set_xlabel(f'{method} Dimensión 1')
    ax1.set_ylabel(f'{method} Dimensión 2')
    ax1.set_title(f'Clusters visualizados con {method}')
    ax1.legend(bbox_to_anchor=(1.05, 1), loc='upper left')
    ax1.grid(True, alpha=0.3)

    # Plot 2: all points together, coloured through a continuous colormap
    ax2.scatter(viz_df['x'], viz_df['y'], c=viz_df['cluster'], 
               cmap='viridis', alpha=0.6, s=30)
    ax2.set_xlabel(f'{method} Dimensión 1')
    ax2.set_ylabel(f'{method} Dimensión 2')
    ax2.set_title(f'Densidad de clusters - {method}')
    ax2.grid(True, alpha=0.3)

    plt.tight_layout()
    plt.show()

    return viz_df

def create_interactive_cluster_summary(clustering_results, cluster_characteristics):
    """
    Show a 2x2 bar-chart overview of the clusters and print a text summary.

    Parameters:
        clustering_results: DataFrame with a 'cluster' column.
        cluster_characteristics: dict keyed by cluster id whose values hold
            'internal_avg_distance', 'mean_value', 'trend_correlation' and
            'mean_std'. When empty, only the cluster-size panel is drawn.

    The figure is shown with plt.show(); nothing is returned.
    """
    def draw_bars(ax, tick_labels, heights, color, ylabel, title):
        # One bar per cluster, labelled with the cluster id
        positions = range(len(tick_labels))
        ax.bar(positions, heights, color=color, alpha=0.7)
        ax.set_xlabel('Cluster ID')
        ax.set_ylabel(ylabel)
        ax.set_title(title)
        ax.set_xticks(positions)
        ax.set_xticklabels(tick_labels)
        ax.grid(True, alpha=0.3)

    print("=== RESUMEN DETALLADO DE CLUSTERS ===")

    fig, axes = plt.subplots(2, 2, figsize=(16, 12))

    # Panel 1: number of products per cluster
    sizes = clustering_results['cluster'].value_counts().sort_index()
    draw_bars(axes[0, 0], sizes.index, sizes.values, 'skyblue',
              'Número de productos', 'Distribución de tamaños de clusters')

    if cluster_characteristics:
        ids = list(cluster_characteristics.keys())

        # Panel 2: average internal distance
        cohesion = [cluster_characteristics[c]['internal_avg_distance'] for c in ids]
        draw_bars(axes[0, 1], ids, cohesion, 'lightcoral',
                  'Distancia DTW promedio interna', 'Cohesión interna de clusters')

        # Panel 3: average value
        averages = [cluster_characteristics[c]['mean_value'] for c in ids]
        draw_bars(axes[1, 0], ids, averages, 'lightgreen',
                  'Valor promedio', 'Valor promedio por cluster')

        # Panel 4: trend correlation, red below zero and green otherwise
        trends = [cluster_characteristics[c]['trend_correlation'] for c in ids]
        bar_colors = ['red' if value < 0 else 'green' for value in trends]
        draw_bars(axes[1, 1], ids, trends, bar_colors,
                  'Correlación con tendencia', 'Tendencia temporal por cluster')
        axes[1, 1].axhline(y=0, color='black', linestyle='--', alpha=0.5)

    plt.tight_layout()
    plt.show()

    # Text summary
    separator = "="*60
    print("\n" + separator)
    print("RESUMEN EJECUTIVO DE CLUSTERS")
    print(separator)

    for cluster_id in sorted(clustering_results['cluster'].unique()):
        # Label -1 is skipped (the noise label used elsewhere in this file)
        if cluster_id == -1:
            continue

        count = int((clustering_results['cluster'] == cluster_id).sum())
        print(f"\nCLUSTER {cluster_id}:")
        print(f"  • Productos: {count}")

        if not (cluster_characteristics and cluster_id in cluster_characteristics):
            continue

        char = cluster_characteristics[cluster_id]
        print(f"  • Valor promedio: {char['mean_value']:.2f}")
        print(f"  • Cohesión interna: {char['internal_avg_distance']:.2f}")

        # Trend wording: more than 0.1 up, less than -0.1 down, else stable
        if char['trend_correlation'] > 0.1:
            trend_text = 'Creciente'
        elif char['trend_correlation'] < -0.1:
            trend_text = 'Decreciente'
        else:
            trend_text = 'Estable'
        print(f"  • Tendencia: {trend_text}")

        # Variability wording: std relative to 50% / 20% of the mean value
        if char['mean_std'] > char['mean_value'] * 0.5:
            variability_text = 'Alta'
        elif char['mean_std'] > char['mean_value'] * 0.2:
            variability_text = 'Media'
        else:
            variability_text = 'Baja'
        print(f"  • Variabilidad: {variability_text}")

    print("\n" + separator)

def _resolve_labels(product_ids, index, labels_by_text):
    """Map product ids to the labels actually used by ``index``.

    An id found as-is is kept unchanged; otherwise it is matched through its
    ``str()`` form (so 1 matches '1'). Ids with no match are dropped.
    """
    resolved = []
    for pid in product_ids:
        if pid in index:
            resolved.append(pid)
        elif str(pid) in labels_by_text:
            resolved.append(labels_by_text[str(pid)])
    return resolved


def analyze_cluster_characteristics(distance_matrix, clustering_results, series_matrix):
    """
    Compute descriptive statistics for every cluster.

    Product ids are matched against both matrices directly and, failing that,
    through their text form, which is the same comparison rule used by
    ``verify_and_sync_data``. Clusters labelled -1 (DBSCAN noise) and clusters
    with no product in ``series_matrix`` are skipped.

    Parameters:
        distance_matrix: square DataFrame of pairwise distances, indexed and
            labelled by product id.
        clustering_results: DataFrame with 'product_id' and 'cluster' columns.
        series_matrix: DataFrame with one row per product id and one column
            per period.

    Returns:
        dict keyed by cluster id (plain Python scalars). Each value holds
        'products', 'products_in_distance_matrix', 'products_in_series_matrix',
        'size', 'mean_value', 'mean_std', 'trend_correlation',
        'internal_avg_distance', 'internal_max_distance' and 'mean_series',
        all as plain Python numbers and lists.
    """
    print("=== ANÁLISIS DE CARACTERÍSTICAS DE CLUSTERS ===")

    # Debug: show a sample of the ids and their types
    print("DEBUG: Verificando compatibilidad de índices...")
    print(f"Índices en distance_matrix: {list(distance_matrix.index[:5])} ... (total: {len(distance_matrix.index)})")
    print(f"Índices en series_matrix: {list(series_matrix.index[:5])} ... (total: {len(series_matrix.index)})")
    print(f"Product_ids en clustering_results: {list(clustering_results['product_id'][:5])} ... (total: {len(clustering_results)})")

    print(f"Tipo de índices distance_matrix: {type(distance_matrix.index[0])}")
    print(f"Tipo de índices series_matrix: {type(series_matrix.index[0])}")
    print(f"Tipo de product_id en clustering: {type(clustering_results['product_id'].iloc[0])}")

    # Text -> real label tables, used when id types differ (e.g. int vs str)
    distance_labels = {str(label): label for label in distance_matrix.index.tolist()}
    series_labels = {str(label): label for label in series_matrix.index.tolist()}

    results = {}

    for cluster_id in sorted(clustering_results['cluster'].unique()):
        if cluster_id == -1:  # skip DBSCAN noise
            continue

        cluster_products = clustering_results[clustering_results['cluster'] == cluster_id]['product_id']

        print(f"\n--- CLUSTER {cluster_id} ---")
        print(f"Productos: {len(cluster_products)}")

        cluster_products_list = cluster_products.tolist()

        # Labels of this cluster's products as each matrix knows them
        available_in_distance = _resolve_labels(cluster_products_list, distance_matrix.index, distance_labels)
        available_in_series = _resolve_labels(cluster_products_list, series_matrix.index, series_labels)

        print(f"Productos disponibles en distance_matrix: {len(available_in_distance)}/{len(cluster_products_list)}")
        print(f"Productos disponibles en series_matrix: {len(available_in_series)}/{len(cluster_products_list)}")

        if len(available_in_series) == 0:
            print(f"ADVERTENCIA: No se encontraron productos del cluster {cluster_id} en series_matrix")
            continue

        # Time-series statistics (best effort: a failure yields zeros)
        try:
            cluster_series = series_matrix.loc[available_in_series]

            # Per-period mean and standard deviation across the products
            mean_series = cluster_series.mean()
            std_series = cluster_series.std()

            print(f"Valor promedio: {mean_series.mean():.2f}")
            print(f"Desviación estándar promedio: {std_series.mean():.2f}")

            # Trend: correlation between the period position and the mean value
            if len(mean_series) > 1:
                trend_corr = np.corrcoef(range(len(mean_series)), mean_series)[0,1]
                print(f"Tendencia (correlación con tiempo): {trend_corr:.3f}")
            else:
                trend_corr = 0.0
                print("Tendencia: No se puede calcular (datos insuficientes)")

        except Exception as e:
            print(f"Error calculando estadísticas de series: {e}")
            mean_series = pd.Series([0])
            std_series = pd.Series([0])
            trend_corr = 0.0

        # Internal cohesion: pairwise distances inside the cluster
        avg_distance = 0.0
        max_distance = 0.0
        try:
            if len(available_in_distance) >= 2:
                cluster_distances = distance_matrix.loc[available_in_distance, available_in_distance]
                # Upper triangle without the diagonal: each pair counted once
                internal_distances = cluster_distances.values[np.triu_indices_from(cluster_distances.values, k=1)]

                if len(internal_distances) > 0:
                    avg_distance = np.mean(internal_distances)
                    max_distance = np.max(internal_distances)
                    print(f"Distancia DTW interna promedio: {avg_distance:.2f}")
                    print(f"Distancia DTW interna máxima: {max_distance:.2f}")
                else:
                    print("Distancias DTW: No se pueden calcular (cluster muy pequeño)")
            else:
                print("Distancias DTW: No se pueden calcular (productos insuficientes en matriz)")

        except Exception as e:
            print(f"Error calculando distancias DTW: {e}")
            avg_distance = 0.0
            max_distance = 0.0

        # NumPy scalars are not accepted by json.dump as keys, so store plain
        # Python values. Lookups with NumPy ints still hit the same key
        # because they hash and compare equal to the Python int.
        cluster_key = cluster_id.item() if isinstance(cluster_id, np.generic) else cluster_id

        results[cluster_key] = {
            'products': cluster_products_list,
            'products_in_distance_matrix': available_in_distance,
            'products_in_series_matrix': available_in_series,
            'size': len(cluster_products_list),
            'mean_value': float(mean_series.mean()) if not mean_series.empty else 0.0,
            'mean_std': float(std_series.mean()) if not std_series.empty else 0.0,
            'trend_correlation': float(trend_corr),
            'internal_avg_distance': float(avg_distance),
            'internal_max_distance': float(max_distance),
            'mean_series': mean_series.tolist() if not mean_series.empty else [0]
        }

    return results

def plot_cluster_profiles(cluster_characteristics, series_matrix):
    """
    Plot, for every cluster, its individual series and its mean profile.

    Parameters:
        cluster_characteristics: dict keyed by cluster id whose values hold
            'products', 'mean_series' and 'size' (and optionally
            'products_in_series_matrix').
        series_matrix: DataFrame with one row per product id and one column
            per period.

    Up to 20 series per cluster are drawn. Products that are not in
    ``series_matrix`` are ignored, and the mean profile is only drawn when its
    length matches the number of periods. The figure is shown with
    plt.show(); nothing is returned.
    """
    print("=== PERFILES DE CLUSTERS ===")

    n_clusters = len(cluster_characteristics)
    if n_clusters == 0:
        # plt.subplots(0, 3) would raise; there is simply nothing to draw
        print("No hay clusters para graficar")
        return

    cols = 3
    rows = (n_clusters + cols - 1) // cols

    # squeeze=False always yields a 2D array of axes, whatever the grid size
    fig, axes = plt.subplots(rows, cols, figsize=(15, 5*rows), squeeze=False)

    for i, (cluster_id, data) in enumerate(cluster_characteristics.items()):
        ax = axes[i // cols, i % cols]

        # Prefer the labels already resolved against series_matrix and drop
        # any product the matrix does not contain, so .loc cannot raise
        candidates = data.get('products_in_series_matrix', data['products'])
        cluster_products = [p for p in candidates if p in series_matrix.index]

        # Individual series (faint), limited to keep the plot readable
        sample = series_matrix.loc[cluster_products[:20]]
        for _, product_series in sample.iterrows():
            ax.plot(product_series, alpha=0.3, color='lightblue', linewidth=0.5)

        # Mean profile (highlighted); skipped when it is a placeholder whose
        # length does not match the periods
        mean_values = data['mean_series']
        if len(mean_values) == len(series_matrix.columns):
            mean_series = pd.Series(mean_values, index=series_matrix.columns)
            ax.plot(mean_series, color='red', linewidth=3, label='Promedio')
            ax.legend()

        ax.set_title(f'Cluster {cluster_id} ({data["size"]} productos)')
        ax.set_xlabel('Período')
        ax.set_ylabel('Valor')
        ax.grid(True, alpha=0.3)

    # Hide the unused cells of the grid
    for i in range(n_clusters, rows * cols):
        axes[i // cols, i % cols].set_visible(False)

    plt.tight_layout()
    plt.show()

def _to_json_safe(value):
    """Recursively turn NumPy/pandas scalars and containers into plain Python objects."""
    if isinstance(value, dict):
        return {
            (key.item() if isinstance(key, np.generic) else key): _to_json_safe(item)
            for key, item in value.items()
        }
    if isinstance(value, (list, tuple, set)):
        return [_to_json_safe(item) for item in value]
    if isinstance(value, (np.ndarray, pd.Series, pd.Index)):
        return _to_json_safe(value.tolist())
    if isinstance(value, np.generic):
        return value.item()
    return value


def save_clustering_results(clustering_results, cluster_characteristics, filename_prefix='clustering_results'):
    """
    Write the cluster assignments to CSV and the characteristics to JSON.

    Parameters:
        clustering_results: DataFrame saved to
            '<filename_prefix>_assignments.csv' without the row index.
        cluster_characteristics: (nested) dict saved to
            '<filename_prefix>_characteristics.json'. NumPy scalars, arrays
            and pandas Series inside it, including dict keys, are converted
            to plain Python values first; the input dict is not modified.
        filename_prefix: common prefix (may include a directory) of both files.
    """
    import json

    # Cluster assignments
    clustering_results.to_csv(f'{filename_prefix}_assignments.csv', index=False)

    # Cluster characteristics. json.dump rejects NumPy integer keys and values
    # (cluster ids typically are NumPy ints), hence the conversion.
    with open(f'{filename_prefix}_characteristics.json', 'w', encoding='utf-8') as f:
        json.dump(_to_json_safe(cluster_characteristics), f, indent=2)

    print(f"Resultados guardados:")
    print(f"  - {filename_prefix}_assignments.csv")
    print(f"  - {filename_prefix}_characteristics.json")

# PIPELINE COMPLETO DE CLUSTERING
def full_clustering_pipeline(distance_matrix_path='dtw_matrix_full.csv', 
                           series_matrix_path=None):
    """
    Run the complete DTW clustering workflow.

    Steps: load the distance matrix, obtain the series matrix, score candidate
    cluster counts, run hierarchical clustering (6 clusters) and DBSCAN,
    synchronise the results with the available series, visualise, analyse the
    hierarchical clusters, and save both results to disk.

    Parameters:
        distance_matrix_path: CSV file with the DTW distance matrix.
        series_matrix_path: optional CSV with one row per product and one
            column per period. When omitted, the matrix is rebuilt from
            'data/sell-in.txt' (tab separated; columns 'periodo',
            'product_id' and 'tn').

    Returns:
        dict with the keys 'distance_matrix', 'series_matrix',
        'hierarchical_results', 'dbscan_results' (both synchronised),
        'cluster_characteristics' and 'linkage_matrix'.
    """
    print("=== PIPELINE COMPLETO DE CLUSTERING DTW ===")

    # 1. Load the distance matrix
    distance_matrix = load_dtw_results(distance_matrix_path)

    # 2. Load the series matrix, or rebuild it from the raw sales data
    if series_matrix_path:
        series_matrix = pd.read_csv(series_matrix_path, index_col=0)
    else:
        import os
        pathdata = 'data/'
        filename = 'sell-in.txt'
        filepath = os.path.join(pathdata, filename)
        sell = pd.read_csv(filepath, sep="\t")

        df_agg = sell.groupby(['periodo', 'product_id'], as_index=False).agg({
            'tn': 'sum'
        })

        series_matrix = df_agg.pivot(index='product_id', columns='periodo', values='tn')
        # Rows are products and columns are periods, so gaps must be filled
        # along axis=1 (time). The previous fillna(method=...) call filled
        # along axis=0, copying values from neighbouring products, and that
        # keyword form is deprecated in recent pandas.
        series_matrix = series_matrix.ffill(axis=1).bfill(axis=1).fillna(0)

    # 3. Score candidate numbers of clusters
    cluster_metrics = find_optimal_clusters(distance_matrix, max_clusters=15)
    plot_cluster_metrics(cluster_metrics)

    # 4. Hierarchical clustering
    hierarchical_results, linkage_matrix = perform_hierarchical_clustering(distance_matrix, n_clusters=6)

    # 5. DBSCAN
    dbscan_results = perform_dbscan_clustering(distance_matrix, eps=None, min_samples=5)

    # 5.5. Keep only products that also have a series, for the analysis steps
    print("\n=== SINCRONIZACIÓN DE DATOS ===")
    hierarchical_results_sync = verify_and_sync_data(distance_matrix, hierarchical_results, series_matrix)
    dbscan_results_sync = verify_and_sync_data(distance_matrix, dbscan_results, series_matrix)

    # 6. Visualise. The plots are driven by the distance matrix, so they get
    # the unfiltered results, which have exactly one row per matrix row in the
    # same order; the synchronised tables may be shorter.
    print("\n=== VISUALIZACIÓN CLUSTERING JERÁRQUICO ===")
    visualize_clusters_2d(distance_matrix, hierarchical_results, method='MDS')

    print("\n=== VISUALIZACIÓN CLUSTERING DBSCAN ===")
    visualize_clusters_2d(distance_matrix, dbscan_results, method='MDS')

    # 7. Analyse the hierarchical clusters
    cluster_chars = analyze_cluster_characteristics(distance_matrix, hierarchical_results_sync, series_matrix)

    # 8. Summary, on the same synchronised table the characteristics describe
    create_interactive_cluster_summary(hierarchical_results_sync, cluster_chars)

    # 9. Cluster profiles
    plot_cluster_profiles(cluster_chars, series_matrix)

    # 10. Save results. json cannot serialise NumPy integers as dict keys, so
    # the cluster ids are converted to plain Python values for the dump.
    chars_for_json = {
        (key.item() if isinstance(key, np.generic) else key): value
        for key, value in cluster_chars.items()
    }
    save_clustering_results(hierarchical_results, chars_for_json, 'hierarchical_clustering')
    save_clustering_results(dbscan_results, {}, 'dbscan_clustering')

    print("\n=== PIPELINE COMPLETO ===")
    return {
        'distance_matrix': distance_matrix,
        'series_matrix': series_matrix,
        'hierarchical_results': hierarchical_results_sync,
        'dbscan_results': dbscan_results_sync,
        'cluster_characteristics': cluster_chars,
        'linkage_matrix': linkage_matrix
    }

# EJEMPLO DE USO:
"""
# Ejecutar pipeline completo
results = full_clustering_pipeline('dtw_matrix_full.csv')

# O ejecutar pasos individuales:
# distance_matrix = load_dtw_results('dtw_matrix_full.csv')
# cluster_metrics = find_optimal_clusters(distance_matrix)
# hierarchical_results, linkage_matrix = perform_hierarchical_clustering(distance_matrix, n_clusters=8)
"""

In [None]:
# Load the precomputed DTW distance matrix for this run from the 'clusters/' folder.
distance_matrix = load_dtw_results('clusters/dtw_matrix_octubre19.csv')

In [None]:
# Cut the average-linkage tree into at most 50 flat clusters; the linkage matrix is kept as well.
hierarchical_results, linkage_matrix = perform_hierarchical_clustering(distance_matrix, n_clusters=50)

In [None]:
# Save the (product_id, cluster) assignments as CSV without the row index.
# NOTE(review): the file name says "agosto" while the matrix loaded above is "octubre19" - confirm the intended name.
hierarchical_results.to_csv("clusters/hierarchical_results_agosto_50.csv", index=False)

In [None]:
# Run the end-to-end pipeline on 'dtw_matrix_full.csv' (path relative to the working directory).
# No series_matrix_path is given, so the series matrix is rebuilt from 'data/sell-in.txt'.
results = full_clustering_pipeline('dtw_matrix_full.csv')