In [1]:
import numpy as np
import pandas as pd
import os
import requests
from tqdm import tqdm
import time as tm
import optuna
from hdbscan.validity import validity_index
from sklearn.preprocessing import RobustScaler
from sklearn.decomposition import PCA
from tqdm.notebook import tqdm  
import warnings
import matplotlib.pyplot as plt
import hdbscan
from sklearn.metrics import silhouette_score
from sklearn.kernel_approximation import Nystroem
from sklearn.manifold import Isomap

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
os.chdir('c:/Users/admin/OneDrive/Documents/TrabajoTesis') 

In [3]:
df2 = pd.read_csv('NeoModelos/minas_con_tiempos_puertos.csv')

In [4]:
df2 = df2[df2['Estado'] == 'ACTIVA']
df2 = df2[(df2['RecursoPrimarioInstalacion'] == 'COBRE') | (df2['RecursoMineroInstalacion'] == 'SALMUERA (LITIO)')] 


In [5]:
drop_cols = ['RutEmpresa','NombreEmpresa','RecursoMineroInstalacion','TipoInstalacion',
             'TipoRecursoInstalacion','RecursoPrimarioInstalacion', 'ComunaFaena', 
             'NombreFaena', 'CategoriaFaena', 'IdFaena', 'ProvinciaInstalacion', 
             'ComunaInstalacion','NombreInstalacion','IdTipoInstalacion','IdInstalacion',
             'Norte','Este','Huso','Datum','IdEstado','Estado']

# Also drop distance columns as requested
distance_cols = [col for col in df2.columns if col.startswith('dist_')]
all_cols_to_drop = drop_cols + distance_cols

df = df2.drop(columns=all_cols_to_drop)

print(f"Original shape: {df2.shape}")
print(f"After dropping columns: {df.shape}")
print(f"Remaining columns: {list(df.columns)}")

Original shape: (7930, 96)
After dropping columns: (7930, 60)
Remaining columns: ['RegionFaena', 'ProvinciaFaena', 'RegionInstalacion', 'Cota', 'Latitud', 'Longitud', 'time_Santiago_min', 'time_Valpara√≠so_min', 'time_Vi√±a_del_Mar_min', 'time_Antofagasta_min', 'time_La_Serena_min', 'time_Coquimbo_min', 'time_Rancagua_min', 'time_Talca_min', 'time_Iquique_min', 'time_Arica_min', 'time_Calama_min', 'time_Copiap√≥_min', 'time_San_Bernardo_min', 'time_Curic√≥_min', 'time_Ovalle_min', 'Tiempo_Prt_Antofagasta', 'Tiempo_Prt_Bahia Agua Fresca', 'Tiempo_Prt_Bahia De Valdivia', 'Tiempo_Prt_Bahia De Valparaiso', 'Tiempo_Prt_Bahia Harris', 'Tiempo_Prt_Bahia Herradura Guayacan', 'Tiempo_Prt_Bahia Quintero (Ventanas)', 'Tiempo_Prt_Bahia San Vicente', 'Tiempo_Prt_Caleta Clarencia', 'Tiempo_Prt_Caleta Mina Elena', 'Tiempo_Prt_Caleta Patillos', 'Tiempo_Prt_Coquimbo', 'Tiempo_Prt_Coronel', 'Tiempo_Prt_Huasco', 'Tiempo_Prt_Iquique', 'Tiempo_Prt_Lirquen', 'Tiempo_Prt_Lota', 'Tiempo_Prt_Mejillones', 'Tiem

In [6]:
import numpy as np
import pandas as pd
# --- CONFIGURACI√ìN ---
# Umbral de corte. 0.95 es el est√°ndar en la industria.
# Significa: "Si la columna A y B se parecen en un 95%, borra B".
CORRELATION_THRESHOLD = 0.999

print(f"--- Buscando Columnas Redundantes (Corr > {CORRELATION_THRESHOLD}) ---")

# 1. Calcular matriz de correlaci√≥n (solo num√©ricas)
# Usamos el valor absoluto porque una correlaci√≥n de -0.99 es tan redundante como 0.99
df_numeric = df.select_dtypes(include=[np.number])
corr_matrix = df_numeric.corr().abs()

# 2. Seleccionar el tri√°ngulo superior de la matriz
# (La matriz es sim√©trica, no queremos borrar ambas columnas, solo una)
upper = corr_matrix.where(np.triu(np.ones(corr_matrix.shape), k=1).astype(bool))

# 3. Identificar columnas a borrar
to_drop = [column for column in upper.columns if any(upper[column] > CORRELATION_THRESHOLD)]

if len(to_drop) > 0:
    print(f"‚ö†Ô∏è Se encontraron {len(to_drop)} columnas redundantes para eliminar:")
    # Imprimir las primeras 10 para no llenar la pantalla
    print(to_drop[:10], "..." if len(to_drop) > 10 else "")
    
    # 4. Eliminar del DataFrame original
    df = df.drop(columns=to_drop)
    print(f"‚úÖ Columnas eliminadas. Nueva dimensi√≥n: {df.shape}")
else:
    print("‚úÖ No se encontr√≥ redundancia excesiva.")

# Verificaci√≥n
print(f"Columnas restantes: {df.shape[1]}")

--- Buscando Columnas Redundantes (Corr > 0.999) ---
‚ö†Ô∏è Se encontraron 37 columnas redundantes para eliminar:
['time_Vi√±a_del_Mar_min', 'time_Coquimbo_min', 'time_Talca_min', 'time_San_Bernardo_min', 'time_Curic√≥_min', 'Tiempo_Prt_Antofagasta', 'Tiempo_Prt_Bahia De Valdivia', 'Tiempo_Prt_Bahia De Valparaiso', 'Tiempo_Prt_Bahia Harris', 'Tiempo_Prt_Bahia Herradura Guayacan'] ...
‚úÖ Columnas eliminadas. Nueva dimensi√≥n: (7930, 23)
Columnas restantes: 23


In [7]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 7930 entries, 0 to 15151
Data columns (total 23 columns):
 #   Column                        Non-Null Count  Dtype  
---  ------                        --------------  -----  
 0   RegionFaena                   7930 non-null   object 
 1   ProvinciaFaena                7930 non-null   object 
 2   RegionInstalacion             7930 non-null   object 
 3   Cota                          7930 non-null   int64  
 4   Latitud                       7930 non-null   float64
 5   Longitud                      7930 non-null   float64
 6   time_Santiago_min             7928 non-null   float64
 7   time_Valpara√≠so_min           7928 non-null   float64
 8   time_Antofagasta_min          7930 non-null   float64
 9   time_La_Serena_min            7929 non-null   float64
 10  time_Rancagua_min             7928 non-null   float64
 11  time_Iquique_min              7928 non-null   float64
 12  time_Arica_min                7929 non-null   float64
 13  time_C

In [8]:

# ==========================================
# PREPARACI√ìN DE DATOS (Correcci√≥n dtype=int)
# ==========================================

# 1. One-Hot Encoding con dtype=int expl√≠cito
# 'dtype=int' fuerza a que sean 1 y 0, no True/False.
df_encoded = pd.get_dummies(df, columns=['ProvinciaFaena'], drop_first=True, dtype=int)

# 2. Ahora s√≠, seleccionamos num√©ricas (los int pasar√°n el filtro)
df_model = df_encoded.select_dtypes(include=[np.number]).dropna()

# 3. Guardar nombres y crear matriz X
feature_names = df_model.columns.tolist()
X = df_model.values
valid_indices = df_model.index
print(f"‚úÖ DataFrame corregido.")
print(f"   Filas: {X.shape[0]}")
print(f"   Columnas Totales: {len(feature_names)}")

# --- MAPEO DE √çNDICES ---
geo_idxs = [i for i, col in enumerate(feature_names) if col in ['Latitud', 'Longitud']]
log_idxs = [i for i, col in enumerate(feature_names) if col.startswith('time_') or col.startswith('Tiempo_Prt_')]
# Ahora esto deber√≠a encontrar las columnas porque son 'int' y sobrevivieron el filtro
cat_idxs = [i for i, col in enumerate(feature_names) if col.startswith('ProvinciaFaena_')]
other_idxs = [i for i in range(len(feature_names)) if i not in geo_idxs + log_idxs + cat_idxs]

print(f"\nResumen de Variables:")
print(f"   - Geogr√°ficas: {len(geo_idxs)}")
print(f"   - Log√≠sticas (Tiempos): {len(log_idxs)}")
print(f"   - Categ√≥ricas (Provincias): {len(cat_idxs)} ")
print(f"   - Otras: {len(other_idxs)}")



# 4. Escalado
scaler = RobustScaler()
X_scaled = scaler.fit_transform(X)


‚úÖ DataFrame corregido.
   Filas: 7908
   Columnas Totales: 47

Resumen de Variables:
   - Geogr√°ficas: 2
   - Log√≠sticas (Tiempos): 17
   - Categ√≥ricas (Provincias): 27 
   - Otras: 1


In [9]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 7930 entries, 0 to 15151
Data columns (total 23 columns):
 #   Column                        Non-Null Count  Dtype  
---  ------                        --------------  -----  
 0   RegionFaena                   7930 non-null   object 
 1   ProvinciaFaena                7930 non-null   object 
 2   RegionInstalacion             7930 non-null   object 
 3   Cota                          7930 non-null   int64  
 4   Latitud                       7930 non-null   float64
 5   Longitud                      7930 non-null   float64
 6   time_Santiago_min             7928 non-null   float64
 7   time_Valpara√≠so_min           7928 non-null   float64
 8   time_Antofagasta_min          7930 non-null   float64
 9   time_La_Serena_min            7929 non-null   float64
 10  time_Rancagua_min             7928 non-null   float64
 11  time_Iquique_min              7928 non-null   float64
 12  time_Arica_min                7929 non-null   float64
 13  time_C

In [None]:
# Silenciamos la advertencia espec√≠fica de compatibilidad sklearn/hdbscan
warnings.filterwarnings("ignore", category=FutureWarning, message=".*force_all_finite.*")
warnings.filterwarnings("ignore", category=FutureWarning, message=".*ensure_all_finite.*")
def objective(trial):
    # --- A. Ponderaci√≥n de Variables (Feature Weighting) ---

    # Mantenemos Geo fijo en 1.0 como referencia (ancla).
    w_geo = 1.0
    
    # Optuna probar√° qu√© tan importantes son los tiempos y las provincias
    w_log = trial.suggest_float("weight_logistics", 1.0, 3.0)   
    w_cat = trial.suggest_float("weight_province", 0.5, 1.5)    
    w_oth = trial.suggest_float("weight_other", 0.5, 1.5)       

    # Aplicar pesos 
    X_weighted = X_scaled.copy()
    
    # Columnas por su peso
    if geo_idxs: X_weighted[:, geo_idxs] *= w_geo
    if log_idxs: X_weighted[:, log_idxs] *= w_log
    if cat_idxs: X_weighted[:, cat_idxs] *= w_cat
    if other_idxs: X_weighted[:, other_idxs] *= w_oth

    # --- B. Reducci√≥n de Dimensionalidad (PCA) ---
    # Ajustamos n_components din√°micamente
    max_components = min(25, X.shape[1])
    n_components = trial.suggest_int("n_components", 5, max_components)
    
    pca = PCA(n_components=n_components, random_state=42)
    # IMPORTANTE: Pasamos la matriz pesada (X_weighted)
    X_pca = pca.fit_transform(X_weighted)

    # --- C. Clustering (HDBSCAN) ---
    min_cluster_size = trial.suggest_int("min_cluster_size", 10, 40)
    min_samples = trial.suggest_int("min_samples", 15, 35)

    clusterer = hdbscan.HDBSCAN(
        min_cluster_size=min_cluster_size,
        min_samples=min_samples,
        metric='euclidean',
        cluster_selection_method='leaf',
        gen_min_span_tree=True 
    ).fit(X_pca)

    labels = clusterer.labels_
    unique_labels = set(labels)
    n_clusters = len(unique_labels) - (1 if -1 in unique_labels else 0)

    # --- Penalizaciones y Score ---
    if n_clusters < 30: return -1.0

    try:
        score = validity_index(X_pca, labels, metric='euclidean')
    except Exception:
        score = -1.0

    # Penalizaci√≥n por fragmentaci√≥n excesiva
    if n_clusters > 72:
        score -= 0.02 * (n_clusters - 72)
        
    # Penalizaci√≥n extra si ignora demasiado la geograf√≠a (opcional)
    # Si w_log es muy alto, podr√≠a crear clusters que no tienen sentido geogr√°fico
    if w_log > 2.5: 
        score -= 0.05

    return score

# --- Ejecutar Optimizaci√≥n ---
print("Iniciando optimizaci√≥n con Pesos Din√°micos...")
study = optuna.create_study(direction="maximize")
study.optimize(objective, n_trials=50, show_progress_bar=True)

print("\n‚úÖ ¬°Optimizaci√≥n lista!")
print("Mejores pesos encontrados:")
print(f"  - Log√≠stica (Tiempos): {study.best_params.get('weight_logistics', 1.0):.2f}")
print(f"  - Provincias: {study.best_params.get('weight_province', 1.0):.2f}")

Iniciando optimizaci√≥n con Pesos Din√°micos...

[I 2025-11-20 14:51:15,650] A new study created in memory with name: no-name-e53aa057-54a5-450a-82fb-d26a94fe3218





Best trial: 0. Best value: 0.248596:   2%|‚ñè         | 1/50 [00:01<01:02,  1.28s/it]

[I 2025-11-20 14:51:16,927] Trial 0 finished with value: 0.24859647989893974 and parameters: {'weight_logistics': 2.3189874266659043, 'weight_province': 1.0840736996175475, 'weight_other': 1.0597593286363018, 'n_components': 25, 'min_cluster_size': 37, 'min_samples': 22}. Best is trial 0 with value: 0.24859647989893974.


Best trial: 0. Best value: 0.248596:   4%|‚ñç         | 2/50 [00:02<01:03,  1.32s/it]

[I 2025-11-20 14:51:18,281] Trial 1 finished with value: -0.498436782231232 and parameters: {'weight_logistics': 2.741707925431612, 'weight_province': 0.5776863888248839, 'weight_other': 0.918505240874647, 'n_components': 10, 'min_cluster_size': 15, 'min_samples': 19}. Best is trial 0 with value: 0.24859647989893974.


Best trial: 0. Best value: 0.248596:   6%|‚ñå         | 3/50 [00:03<00:53,  1.14s/it]

[I 2025-11-20 14:51:19,210] Trial 2 finished with value: 0.2219929482387229 and parameters: {'weight_logistics': 2.819610379170944, 'weight_province': 1.400804728153087, 'weight_other': 0.6083083660133827, 'n_components': 10, 'min_cluster_size': 17, 'min_samples': 35}. Best is trial 0 with value: 0.24859647989893974.


Best trial: 0. Best value: 0.248596:   8%|‚ñä         | 4/50 [00:04<00:54,  1.18s/it]

[I 2025-11-20 14:51:20,449] Trial 3 finished with value: 0.055546651592282364 and parameters: {'weight_logistics': 2.164098595981754, 'weight_province': 1.1527201420066167, 'weight_other': 0.8447544957251037, 'n_components': 20, 'min_cluster_size': 16, 'min_samples': 23}. Best is trial 0 with value: 0.24859647989893974.


Best trial: 4. Best value: 0.251023:  10%|‚ñà         | 5/50 [00:06<00:54,  1.21s/it]

[I 2025-11-20 14:51:21,723] Trial 4 finished with value: 0.25102283887053184 and parameters: {'weight_logistics': 1.6705165682767218, 'weight_province': 1.4138029219291761, 'weight_other': 0.9415023937962291, 'n_components': 17, 'min_cluster_size': 20, 'min_samples': 26}. Best is trial 4 with value: 0.25102283887053184.


Best trial: 4. Best value: 0.251023:  12%|‚ñà‚ñè        | 6/50 [00:07<00:50,  1.14s/it]

[I 2025-11-20 14:51:22,720] Trial 5 finished with value: 0.24499634686796495 and parameters: {'weight_logistics': 2.5027724593114264, 'weight_province': 1.0816643793595815, 'weight_other': 0.8807496885318664, 'n_components': 6, 'min_cluster_size': 26, 'min_samples': 21}. Best is trial 4 with value: 0.25102283887053184.


Best trial: 6. Best value: 0.257257:  14%|‚ñà‚ñç        | 7/50 [00:08<00:51,  1.19s/it]

[I 2025-11-20 14:51:24,024] Trial 6 finished with value: 0.2572570331264467 and parameters: {'weight_logistics': 1.6346655406951163, 'weight_province': 1.2834052581043602, 'weight_other': 0.6986149933310482, 'n_components': 21, 'min_cluster_size': 31, 'min_samples': 19}. Best is trial 6 with value: 0.2572570331264467.


Best trial: 6. Best value: 0.257257:  16%|‚ñà‚ñå        | 8/50 [00:09<00:50,  1.19s/it]

[I 2025-11-20 14:51:25,216] Trial 7 finished with value: 0.15906223963931115 and parameters: {'weight_logistics': 2.8786948622983553, 'weight_province': 0.7014402397772767, 'weight_other': 0.8077406055941752, 'n_components': 24, 'min_cluster_size': 15, 'min_samples': 26}. Best is trial 6 with value: 0.2572570331264467.


Best trial: 8. Best value: 0.267822:  18%|‚ñà‚ñä        | 9/50 [00:10<00:49,  1.20s/it]

[I 2025-11-20 14:51:26,437] Trial 8 finished with value: 0.2678221859627455 and parameters: {'weight_logistics': 1.5565237932902711, 'weight_province': 1.1303747474840726, 'weight_other': 0.8932186247822492, 'n_components': 25, 'min_cluster_size': 33, 'min_samples': 34}. Best is trial 8 with value: 0.2678221859627455.


Best trial: 8. Best value: 0.267822:  20%|‚ñà‚ñà        | 10/50 [00:11<00:44,  1.11s/it]

[I 2025-11-20 14:51:27,353] Trial 9 finished with value: 0.23886887949611352 and parameters: {'weight_logistics': 2.8339138363829828, 'weight_province': 1.2123959800420008, 'weight_other': 0.9169375167786735, 'n_components': 8, 'min_cluster_size': 17, 'min_samples': 35}. Best is trial 8 with value: 0.2678221859627455.


Best trial: 8. Best value: 0.267822:  22%|‚ñà‚ñà‚ñè       | 11/50 [00:12<00:44,  1.15s/it]

[I 2025-11-20 14:51:28,587] Trial 10 finished with value: 0.26377128142341705 and parameters: {'weight_logistics': 1.0003125607832, 'weight_province': 0.8677190527117696, 'weight_other': 1.3777380960134178, 'n_components': 14, 'min_cluster_size': 39, 'min_samples': 30}. Best is trial 8 with value: 0.2678221859627455.


Best trial: 8. Best value: 0.267822:  24%|‚ñà‚ñà‚ñç       | 12/50 [00:14<00:49,  1.30s/it]

[I 2025-11-20 14:51:30,215] Trial 11 finished with value: 0.2624078906990618 and parameters: {'weight_logistics': 1.2503048891913067, 'weight_province': 0.8976568805041161, 'weight_other': 1.3453514801297075, 'n_components': 14, 'min_cluster_size': 40, 'min_samples': 30}. Best is trial 8 with value: 0.2678221859627455.


Best trial: 12. Best value: 0.288727:  26%|‚ñà‚ñà‚ñå       | 13/50 [00:16<00:52,  1.41s/it]

[I 2025-11-20 14:51:31,897] Trial 12 finished with value: 0.2887268370298813 and parameters: {'weight_logistics': 1.0239121426207547, 'weight_province': 0.8942423621741848, 'weight_other': 1.44783073695868, 'n_components': 14, 'min_cluster_size': 33, 'min_samples': 30}. Best is trial 12 with value: 0.2887268370298813.


Best trial: 12. Best value: 0.288727:  28%|‚ñà‚ñà‚ñä       | 14/50 [00:17<00:50,  1.42s/it]

[I 2025-11-20 14:51:33,326] Trial 13 finished with value: 0.25510793096469353 and parameters: {'weight_logistics': 1.5680963444537483, 'weight_province': 0.9063734120856334, 'weight_other': 1.1447210120578346, 'n_components': 17, 'min_cluster_size': 33, 'min_samples': 31}. Best is trial 12 with value: 0.2887268370298813.


Best trial: 12. Best value: 0.288727:  30%|‚ñà‚ñà‚ñà       | 15/50 [00:19<00:51,  1.47s/it]

[I 2025-11-20 14:51:34,906] Trial 14 finished with value: 0.26725592038450363 and parameters: {'weight_logistics': 1.3065351265597094, 'weight_province': 0.777270400935521, 'weight_other': 1.2076812453735513, 'n_components': 12, 'min_cluster_size': 28, 'min_samples': 32}. Best is trial 12 with value: 0.2887268370298813.


Best trial: 12. Best value: 0.288727:  32%|‚ñà‚ñà‚ñà‚ñè      | 16/50 [00:20<00:51,  1.53s/it]

[I 2025-11-20 14:51:36,573] Trial 15 finished with value: 0.2844147244317372 and parameters: {'weight_logistics': 1.9424374338437338, 'weight_province': 0.9999116288876788, 'weight_other': 1.4463423420914259, 'n_components': 20, 'min_cluster_size': 34, 'min_samples': 28}. Best is trial 12 with value: 0.2887268370298813.


Best trial: 12. Best value: 0.288727:  34%|‚ñà‚ñà‚ñà‚ñç      | 17/50 [00:22<00:50,  1.54s/it]

[I 2025-11-20 14:51:38,138] Trial 16 finished with value: -0.23306748291811158 and parameters: {'weight_logistics': 1.9174188735965516, 'weight_province': 0.9863619783183182, 'weight_other': 1.499629646357764, 'n_components': 20, 'min_cluster_size': 22, 'min_samples': 15}. Best is trial 12 with value: 0.2887268370298813.


Best trial: 12. Best value: 0.288727:  36%|‚ñà‚ñà‚ñà‚ñå      | 18/50 [00:23<00:46,  1.45s/it]

[I 2025-11-20 14:51:39,373] Trial 17 finished with value: 0.27705536610132453 and parameters: {'weight_logistics': 1.9769110175310398, 'weight_province': 0.556127438683145, 'weight_other': 1.4751331833470318, 'n_components': 17, 'min_cluster_size': 35, 'min_samples': 28}. Best is trial 12 with value: 0.2887268370298813.


Best trial: 12. Best value: 0.288727:  38%|‚ñà‚ñà‚ñà‚ñä      | 19/50 [00:25<00:46,  1.50s/it]

[I 2025-11-20 14:51:40,993] Trial 18 finished with value: 0.28008913453175444 and parameters: {'weight_logistics': 1.1078053474210923, 'weight_province': 0.7217705525996757, 'weight_other': 1.2719967259455502, 'n_components': 21, 'min_cluster_size': 29, 'min_samples': 28}. Best is trial 12 with value: 0.2887268370298813.


Best trial: 12. Best value: 0.288727:  40%|‚ñà‚ñà‚ñà‚ñà      | 20/50 [00:26<00:42,  1.42s/it]

[I 2025-11-20 14:51:42,225] Trial 19 finished with value: 0.26239207673123943 and parameters: {'weight_logistics': 1.8095598924665262, 'weight_province': 0.9924094695376117, 'weight_other': 1.3755087617129538, 'n_components': 18, 'min_cluster_size': 11, 'min_samples': 28}. Best is trial 12 with value: 0.2887268370298813.


Best trial: 20. Best value: 0.304197:  42%|‚ñà‚ñà‚ñà‚ñà‚ñè     | 21/50 [00:28<00:41,  1.44s/it]

[I 2025-11-20 14:51:43,717] Trial 20 finished with value: 0.3041969187181969 and parameters: {'weight_logistics': 1.389686326725506, 'weight_province': 0.7904271768108824, 'weight_other': 1.1002274505584582, 'n_components': 15, 'min_cluster_size': 24, 'min_samples': 24}. Best is trial 20 with value: 0.3041969187181969.


Best trial: 20. Best value: 0.304197:  44%|‚ñà‚ñà‚ñà‚ñà‚ñç     | 22/50 [00:29<00:40,  1.44s/it]

[I 2025-11-20 14:51:45,162] Trial 21 finished with value: 0.3016405829980799 and parameters: {'weight_logistics': 1.3573016285531532, 'weight_province': 0.8138533096989506, 'weight_other': 1.0758528933119829, 'n_components': 15, 'min_cluster_size': 23, 'min_samples': 26}. Best is trial 20 with value: 0.3041969187181969.


Best trial: 20. Best value: 0.304197:  46%|‚ñà‚ñà‚ñà‚ñà‚ñå     | 23/50 [00:30<00:37,  1.39s/it]

[I 2025-11-20 14:51:46,422] Trial 22 finished with value: 0.2901015792938266 and parameters: {'weight_logistics': 1.3093888965107696, 'weight_province': 0.8062224228553193, 'weight_other': 1.0676221937643324, 'n_components': 12, 'min_cluster_size': 23, 'min_samples': 24}. Best is trial 20 with value: 0.3041969187181969.


Best trial: 20. Best value: 0.304197:  48%|‚ñà‚ñà‚ñà‚ñà‚ñä     | 24/50 [00:31<00:34,  1.33s/it]

[I 2025-11-20 14:51:47,616] Trial 23 finished with value: 0.27986685252778465 and parameters: {'weight_logistics': 1.4470150570074551, 'weight_province': 0.6458560095930946, 'weight_other': 1.0527293515952254, 'n_components': 11, 'min_cluster_size': 23, 'min_samples': 24}. Best is trial 20 with value: 0.3041969187181969.


Best trial: 20. Best value: 0.304197:  50%|‚ñà‚ñà‚ñà‚ñà‚ñà     | 25/50 [00:33<00:32,  1.29s/it]

[I 2025-11-20 14:51:48,809] Trial 24 finished with value: 0.2718095224764343 and parameters: {'weight_logistics': 1.2846890384757603, 'weight_province': 0.8022115685298037, 'weight_other': 1.1357596595961976, 'n_components': 12, 'min_cluster_size': 25, 'min_samples': 25}. Best is trial 20 with value: 0.3041969187181969.


Best trial: 20. Best value: 0.304197:  52%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñè    | 26/50 [00:34<00:31,  1.33s/it]

[I 2025-11-20 14:51:50,220] Trial 25 finished with value: -0.07142598962352531 and parameters: {'weight_logistics': 1.3403809326857201, 'weight_province': 0.789760631932525, 'weight_other': 1.0307897378812148, 'n_components': 15, 'min_cluster_size': 21, 'min_samples': 20}. Best is trial 20 with value: 0.3041969187181969.


Best trial: 20. Best value: 0.304197:  54%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñç    | 27/50 [00:35<00:31,  1.35s/it]

[I 2025-11-20 14:51:51,624] Trial 26 finished with value: -0.4353205289999627 and parameters: {'weight_logistics': 1.7734054515033657, 'weight_province': 0.644067560520946, 'weight_other': 1.15647709151293, 'n_components': 8, 'min_cluster_size': 19, 'min_samples': 17}. Best is trial 20 with value: 0.3041969187181969.


Best trial: 20. Best value: 0.304197:  56%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñå    | 28/50 [00:37<00:29,  1.34s/it]

[I 2025-11-20 14:51:52,956] Trial 27 finished with value: 0.28960012016018855 and parameters: {'weight_logistics': 1.439716448641582, 'weight_province': 0.8259541406868295, 'weight_other': 1.221855477677881, 'n_components': 13, 'min_cluster_size': 24, 'min_samples': 25}. Best is trial 20 with value: 0.3041969187181969.


Best trial: 20. Best value: 0.304197:  58%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñä    | 29/50 [00:38<00:28,  1.34s/it]

[I 2025-11-20 14:51:54,302] Trial 28 finished with value: 0.28538488038684173 and parameters: {'weight_logistics': 1.228103621079386, 'weight_province': 0.7137959386022009, 'weight_other': 0.7660640212925187, 'n_components': 16, 'min_cluster_size': 28, 'min_samples': 22}. Best is trial 20 with value: 0.3041969187181969.


Best trial: 20. Best value: 0.304197:  60%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà    | 30/50 [00:39<00:25,  1.26s/it]

[I 2025-11-20 14:51:55,349] Trial 29 finished with value: 0.25035773899337266 and parameters: {'weight_logistics': 2.210309100300326, 'weight_province': 0.517177403819146, 'weight_other': 0.9866128901143341, 'n_components': 8, 'min_cluster_size': 27, 'min_samples': 23}. Best is trial 20 with value: 0.3041969187181969.


Best trial: 20. Best value: 0.304197:  62%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñè   | 31/50 [00:40<00:23,  1.26s/it]

[I 2025-11-20 14:51:56,622] Trial 30 finished with value: 0.11399355970718311 and parameters: {'weight_logistics': 1.149297271676711, 'weight_province': 0.6637981227223674, 'weight_other': 1.0948879171609651, 'n_components': 5, 'min_cluster_size': 11, 'min_samples': 26}. Best is trial 20 with value: 0.3041969187181969.


Best trial: 20. Best value: 0.304197:  64%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñç   | 32/50 [00:42<00:23,  1.32s/it]

[I 2025-11-20 14:51:58,072] Trial 31 finished with value: 0.2996293713431125 and parameters: {'weight_logistics': 1.473170767247251, 'weight_province': 0.7918822004175807, 'weight_other': 1.237985206142662, 'n_components': 13, 'min_cluster_size': 24, 'min_samples': 24}. Best is trial 20 with value: 0.3041969187181969.


Best trial: 20. Best value: 0.304197:  66%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñå   | 33/50 [00:43<00:22,  1.35s/it]

[I 2025-11-20 14:51:59,486] Trial 32 finished with value: 0.24967013718537948 and parameters: {'weight_logistics': 1.4505704259586265, 'weight_province': 0.9478644603754097, 'weight_other': 1.2872790936826544, 'n_components': 10, 'min_cluster_size': 23, 'min_samples': 23}. Best is trial 20 with value: 0.3041969187181969.


Best trial: 20. Best value: 0.304197:  68%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñä   | 34/50 [00:45<00:22,  1.41s/it]

[I 2025-11-20 14:52:01,050] Trial 33 finished with value: -0.18698112171857456 and parameters: {'weight_logistics': 1.4546009260230928, 'weight_province': 0.7554192882014551, 'weight_other': 0.9841432650185451, 'n_components': 13, 'min_cluster_size': 19, 'min_samples': 21}. Best is trial 20 with value: 0.3041969187181969.


Best trial: 20. Best value: 0.304197:  70%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà   | 35/50 [00:46<00:21,  1.44s/it]

[I 2025-11-20 14:52:02,572] Trial 34 finished with value: 0.29861747238807385 and parameters: {'weight_logistics': 1.752017610861197, 'weight_province': 0.8503374631965939, 'weight_other': 1.089489319689155, 'n_components': 15, 'min_cluster_size': 25, 'min_samples': 24}. Best is trial 20 with value: 0.3041969187181969.


Best trial: 35. Best value: 0.312475:  72%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñè  | 36/50 [00:48<00:19,  1.41s/it]

[I 2025-11-20 14:52:03,910] Trial 35 finished with value: 0.31247468016034785 and parameters: {'weight_logistics': 1.753983415297383, 'weight_province': 1.0704530272930746, 'weight_other': 0.5173429012220171, 'n_components': 15, 'min_cluster_size': 30, 'min_samples': 27}. Best is trial 35 with value: 0.31247468016034785.


Best trial: 35. Best value: 0.312475:  74%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñç  | 37/50 [00:49<00:18,  1.43s/it]

[I 2025-11-20 14:52:05,393] Trial 36 finished with value: 0.2998472380417138 and parameters: {'weight_logistics': 2.113850651533817, 'weight_province': 1.0990357755341627, 'weight_other': 0.5373903544705597, 'n_components': 18, 'min_cluster_size': 30, 'min_samples': 27}. Best is trial 35 with value: 0.31247468016034785.


Best trial: 35. Best value: 0.312475:  76%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñå  | 38/50 [00:51<00:16,  1.39s/it]

[I 2025-11-20 14:52:06,668] Trial 37 finished with value: 0.26497311831380194 and parameters: {'weight_logistics': 2.591557119327641, 'weight_province': 1.0429600771430139, 'weight_other': 0.5162654097634436, 'n_components': 18, 'min_cluster_size': 31, 'min_samples': 27}. Best is trial 35 with value: 0.31247468016034785.


Best trial: 35. Best value: 0.312475:  78%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñä  | 39/50 [00:52<00:15,  1.41s/it]

[I 2025-11-20 14:52:08,142] Trial 38 finished with value: 0.26135903164392565 and parameters: {'weight_logistics': 2.1284850306254737, 'weight_province': 1.2363917732734484, 'weight_other': 0.5358372030008689, 'n_components': 18, 'min_cluster_size': 30, 'min_samples': 27}. Best is trial 35 with value: 0.31247468016034785.


Best trial: 35. Best value: 0.312475:  80%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà  | 40/50 [00:53<00:13,  1.38s/it]

[I 2025-11-20 14:52:09,440] Trial 39 finished with value: 0.29370002028084546 and parameters: {'weight_logistics': 2.3748968274294295, 'weight_province': 1.084374857277106, 'weight_other': 0.6206636858497415, 'n_components': 16, 'min_cluster_size': 36, 'min_samples': 29}. Best is trial 35 with value: 0.31247468016034785.


Best trial: 35. Best value: 0.312475:  82%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñè | 41/50 [00:55<00:13,  1.47s/it]

[I 2025-11-20 14:52:11,119] Trial 40 finished with value: 0.2422187675072176 and parameters: {'weight_logistics': 2.9872633032642737, 'weight_province': 1.3534053899763627, 'weight_other': 0.6175429125122917, 'n_components': 23, 'min_cluster_size': 31, 'min_samples': 26}. Best is trial 35 with value: 0.31247468016034785.


Best trial: 35. Best value: 0.312475:  84%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñç | 42/50 [00:56<00:11,  1.48s/it]

[I 2025-11-20 14:52:12,615] Trial 41 finished with value: 0.275926321279277 and parameters: {'weight_logistics': 2.08886694610858, 'weight_province': 1.1778707754575657, 'weight_other': 0.571633277846236, 'n_components': 16, 'min_cluster_size': 26, 'min_samples': 22}. Best is trial 35 with value: 0.31247468016034785.


Best trial: 35. Best value: 0.312475:  86%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñå | 43/50 [00:58<00:10,  1.53s/it]

[I 2025-11-20 14:52:14,267] Trial 42 finished with value: 0.2670324648995958 and parameters: {'weight_logistics': 1.6030861779203862, 'weight_province': 1.0865790336423717, 'weight_other': 0.6979786580932438, 'n_components': 19, 'min_cluster_size': 27, 'min_samples': 25}. Best is trial 35 with value: 0.31247468016034785.


Best trial: 43. Best value: 0.331491:  88%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñä | 44/50 [00:59<00:08,  1.46s/it]

[I 2025-11-20 14:52:15,553] Trial 43 finished with value: 0.3314906249430592 and parameters: {'weight_logistics': 1.8413242126986944, 'weight_province': 0.939514101313094, 'weight_other': 0.6547357986754325, 'n_components': 15, 'min_cluster_size': 29, 'min_samples': 27}. Best is trial 43 with value: 0.3314906249430592.


Best trial: 43. Best value: 0.331491:  90%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà | 45/50 [01:01<00:06,  1.40s/it]

[I 2025-11-20 14:52:16,808] Trial 44 finished with value: 0.2784788596648555 and parameters: {'weight_logistics': 2.27673533116584, 'weight_province': 0.9500234132443929, 'weight_other': 0.6950892238065061, 'n_components': 15, 'min_cluster_size': 29, 'min_samples': 27}. Best is trial 43 with value: 0.3314906249430592.


Best trial: 43. Best value: 0.331491:  92%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñè| 46/50 [01:02<00:05,  1.45s/it]

[I 2025-11-20 14:52:18,373] Trial 45 finished with value: 0.30197283247052004 and parameters: {'weight_logistics': 1.8295229740873062, 'weight_province': 1.0545706130751846, 'weight_other': 0.5014867820140668, 'n_components': 16, 'min_cluster_size': 32, 'min_samples': 32}. Best is trial 43 with value: 0.3314906249430592.


Best trial: 43. Best value: 0.331491:  94%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñç| 47/50 [01:04<00:04,  1.49s/it]

[I 2025-11-20 14:52:19,948] Trial 46 finished with value: 0.3137888588126408 and parameters: {'weight_logistics': 1.6922135749137903, 'weight_province': 1.042619568819659, 'weight_other': 0.6428004962869317, 'n_components': 14, 'min_cluster_size': 38, 'min_samples': 33}. Best is trial 43 with value: 0.3314906249430592.


Best trial: 43. Best value: 0.331491:  96%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñå| 48/50 [01:05<00:02,  1.45s/it]

[I 2025-11-20 14:52:21,322] Trial 47 finished with value: 0.3173574089344624 and parameters: {'weight_logistics': 1.8680202827217436, 'weight_province': 1.0401395665915647, 'weight_other': 0.6564098459168977, 'n_components': 14, 'min_cluster_size': 38, 'min_samples': 33}. Best is trial 43 with value: 0.3314906249430592.


Best trial: 43. Best value: 0.331491:  98%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñä| 49/50 [01:06<00:01,  1.34s/it]

[I 2025-11-20 14:52:22,404] Trial 48 finished with value: 0.2805516264982611 and parameters: {'weight_logistics': 1.6779743469762818, 'weight_province': 1.1480099449831245, 'weight_other': 0.7601863130046245, 'n_components': 10, 'min_cluster_size': 38, 'min_samples': 34}. Best is trial 43 with value: 0.3314906249430592.


Best trial: 43. Best value: 0.331491: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 50/50 [01:07<00:00,  1.36s/it]

[I 2025-11-20 14:52:23,644] Trial 49 finished with value: 0.31011489967050854 and parameters: {'weight_logistics': 1.8765742776677792, 'weight_province': 0.941398964926587, 'weight_other': 0.653961464661173, 'n_components': 13, 'min_cluster_size': 37, 'min_samples': 33}. Best is trial 43 with value: 0.3314906249430592.

‚úÖ ¬°Optimizaci√≥n lista!
Mejores pesos encontrados:
  - Log√≠stica (Tiempos): 1.84
  - Provincias: 0.94





In [11]:
import pandas as pd
import numpy as np
import hdbscan
from sklearn.decomposition import PCA

# ==========================================
# 1. RECUPERAR PAR√ÅMETROS Y PESOS
# ==========================================
best_params = study.best_params
print(f"Mejor Score: {study.best_value:.4f}")
print("Mejores par√°metros:", best_params)

# Recuperamos los pesos ganadores (usamos .get(..., 1.0) por si alguno no existiera)
w_geo = 1  # El ancla fija que definimos
w_log = best_params.get("weight_logistics", 1.0)
w_cat = best_params.get("weight_province", 1.0)
w_oth = best_params.get("weight_other", 1.0)

# ==========================================
# 2. APLICAR LOS PESOS A LA MATRIZ X
# ==========================================
# Creamos una copia para no alterar la original
X_final_weighted = X_scaled.copy()

# Multiplicamos las columnas por sus pesos optimizados
# (Asumiendo que tienes las listas de √≠ndices: geo_idxs, log_idxs, etc. definidas)
if geo_idxs: X_final_weighted[:, geo_idxs] *= w_geo
if log_idxs: X_final_weighted[:, log_idxs] *= w_log
if cat_idxs: X_final_weighted[:, cat_idxs] *= w_cat
if other_idxs: X_final_weighted[:, other_idxs] *= w_oth

print("‚úÖ Pesos aplicados a la matriz de datos.")

# ==========================================
# 3. PCA FINAL (Sobre la matriz pesada)
# ==========================================
pca_final = PCA(n_components=best_params["n_components"], random_state=42)
# ¬°OJO! Aqu√≠ usamos X_final_weighted, NO X_scaled directo
X_pca_final = pca_final.fit_transform(X_final_weighted)

# ==========================================
# 4. HDBSCAN FINAL
# ==========================================
best_clusterer = hdbscan.HDBSCAN(
    min_cluster_size=best_params["min_cluster_size"],
    min_samples=best_params["min_samples"],
    metric='euclidean',
    cluster_selection_method='eom',
    gen_min_span_tree=True  # Recomendado dejarlo True por si quieres validar despu√©s
).fit(X_pca_final)

# ==========================================
# 5. ASIGNACI√ìN DE ETIQUETAS
# ==========================================
# Inicializamos con -2 (valor seguro para "no procesado")
df['cluster_opt'] = -2

# Asignamos las etiquetas usando los √≠ndices v√°lidos
# Aseg√∫rate de que 'valid_indices' existe (viene del paso de limpieza de nulos)
df.loc[valid_indices, 'cluster_opt'] = best_clusterer.labels_

# Reporte
n_clusters_final = len(set(best_clusterer.labels_)) - (1 if -1 in best_clusterer.labels_ else 0)
print(f"\nüéØ Modelo Final Generado.")
print(f"   Clusters encontrados: {n_clusters_final}")
print(f"   Ruido (puntos sin cluster): {sum(best_clusterer.labels_ == -1)}")

Mejor Score: 0.3315
Mejores par√°metros: {'weight_logistics': 1.8413242126986944, 'weight_province': 0.939514101313094, 'weight_other': 0.6547357986754325, 'n_components': 15, 'min_cluster_size': 29, 'min_samples': 27}
‚úÖ Pesos aplicados a la matriz de datos.

üéØ Modelo Final Generado.
   Clusters encontrados: 43
   Ruido (puntos sin cluster): 1509


In [12]:
# 1. RECREAR valid_indices
# Hacemos exactamente el mismo filtro que cuando creamos la matriz X
# (Seleccionar num√©ricos -> Borrar Nulos -> Guardar el √≠ndice)
df_used_for_model = df_encoded.select_dtypes(include=[np.number]).dropna()
valid_indices = df_used_for_model.index

print(f"√çndices recuperados: {len(valid_indices)} filas coinciden con el modelo.")

# 2. ASIGNAR ETIQUETAS (Ahora s√≠ funcionar√°)
# Inicializamos con -2 (para identificar si alguna fila qued√≥ fuera)
df['cluster_opt'] = -2

# Asignamos usando los √≠ndices. Pandas se encarga de emparejar cada ID con su cluster.
df.loc[valid_indices, 'cluster_opt'] = best_clusterer.labels_

# 3. REPORTE FINAL
n_clusters_final = len(set(best_clusterer.labels_)) - (1 if -1 in best_clusterer.labels_ else 0)
print(f"\nüéØ Asignaci√≥n exitosa.")
print(f"   Clusters encontrados: {n_clusters_final}")
print(f"   Ruido: {sum(df['cluster_opt'] == -1)}")

√çndices recuperados: 7908 filas coinciden con el modelo.

üéØ Asignaci√≥n exitosa.
   Clusters encontrados: 43
   Ruido: 1509


In [13]:
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import plotly.express as px

# ==========================================
# 1. PREPARAR COLORES √öNICOS (Sin repetir)
# ==========================================
# Filtramos los datos (quitamos nulos y ruido para asignar colores)
plot_data = df[df['cluster_opt'] != -2].copy()
# Obtenemos la lista de clusters reales ordenados
clusters_reales = sorted(list(set(plot_data['cluster_opt']) - {-1}))
n_clusters = len(clusters_reales)

print(f"Generando paleta √∫nica para {n_clusters} clusters...")

# Usamos 'husl' de Seaborn: permite generar N colores visualmente distintos
# Es mucho mejor que 'tab20' cuando tienes muchos grupos.
palette = sns.color_palette("husl", n_clusters)
# Convertimos a un diccionario {ID_Cluster: Color}
color_map = {cluster: palette[i] for i, cluster in enumerate(clusters_reales)}
# Asignamos un gris claro para el ruido (-1)
color_map[-1] = (0.8, 0.8, 0.8) # Gris



# ==========================================
# 3. MAPA "CHILENO" REAL (Interactivo - Plotly)
# ==========================================
# Este es el mejor para ver "Chile" porque pone el mapa geogr√°fico de fondo.
print("Generando mapa interactivo de Chile...")

# Convertimos la columna a string para que Plotly la trate como categor√≠a (colores discretos)
plot_data['Cluster_String'] = plot_data['cluster_opt'].astype(str)

fig = px.scatter_mapbox(
    plot_data[plot_data['cluster_opt'] != -1], # Omitimos ruido para limpiar la vista
    lat="Latitud",
    lon="Longitud",
    color="Cluster_String",
    color_discrete_sequence=px.colors.qualitative.Dark24, # Paleta de alto contraste
    # Si tienes nombres de faena, descomenta la siguiente l√≠nea:
    # hover_name="NombreFaena", hover_data=["ProvinciaFaena"],
    zoom=4,
    center={"lat": -30.0, "lon": -71.0}, # Centrado aprox en Coquimbo/La Serena
    height=900,
    title="Mapa de Clusters Mineros (Fondo Geogr√°fico)"
)

# Estilo del mapa (OpenStreetMap es gratuito y detallado)
fig.update_layout(mapbox_style="open-street-map")
fig.update_layout(margin={"r":0,"t":40,"l":0,"b":0})
fig.update_traces(marker=dict(size=8)) # Puntos un poco m√°s grandes

fig.show()

Generando paleta √∫nica para 43 clusters...
Generando mapa interactivo de Chile...


  fig = px.scatter_mapbox(


In [14]:
from sklearn.cluster import AgglomerativeClustering
import numpy as np
import pandas as pd

# --- CONFIGURACI√ìN ---
# Umbral para unir centroides.
# 0.1 grados = ~11 km
# 0.2 grados = ~22 km (Si los centros de dos clusters est√°n a <22km, se unen)
MERGE_THRESHOLD = 0.4

# Definimos expl√≠citamente las columnas
col_origen = 'cluster_opt'   # La columna que gener√≥ Optuna
col_destino = 'cluster_merged' # La nueva columna con la fusi√≥n

print(f"--- Iniciando fusi√≥n basada en CENTROIDES (Umbral: {MERGE_THRESHOLD}¬∞) ---")

# 1. Validaci√≥n de seguridad
if col_origen not in df.columns:
    print(f"‚ö†Ô∏è Advertencia: '{col_origen}' no existe. Buscando alternativas...")
    if 'cluster_final' in df.columns: col_origen = 'cluster_final'
    elif 'cluster' in df.columns: col_origen = 'cluster'
    print(f"   -> Usando columna: '{col_origen}'")

# 2. Calcular el Centroide de cada cluster
# Ignoramos el ruido (-1) y los nulos (-2)
unique_labels = set(df[col_origen]) - {-1, -2}
centroids = []
label_map = []

for label in unique_labels:
    # Tomamos todas las minas de ESTE cluster espec√≠fico
    cluster_data = df[df[col_origen] == label]
    points = cluster_data[['Latitud', 'Longitud']].values
    
    # Calculamos el punto medio geogr√°fico (Centroide)
    centroid = points.mean(axis=0)
    
    centroids.append(centroid)
    label_map.append(label)

# 3. Aplicar la Uni√≥n (Clustering sobre Centroides)
if centroids:
    centroids = np.array(centroids)
    
    # AgglomerativeClustering agrupa los CENTROIDES que est√°n cerca.
    # linkage='complete' garantiza que en el grupo resultante, 
    # todos los centroides disten menos de MERGE_THRESHOLD entre s√≠.
    agg = AgglomerativeClustering(
        n_clusters=None, 
        metric='euclidean', 
        linkage='complete', 
        distance_threshold=MERGE_THRESHOLD
    )
    
    # Obtenemos las nuevas etiquetas simplificadas
    merged_labels = agg.fit_predict(centroids)
    
    # 4. Mapear {Viejo -> Nuevo}
    merge_dict = {old: new for old, new in zip(label_map, merged_labels)}
    
    # El ruido y los nulos se quedan igual
    merge_dict[-1] = -1
    merge_dict[-2] = -2
    
    # 5. Aplicar al DataFrame
    df[col_destino] = df[col_origen].map(merge_dict)
    
    # Reporte
    n_before = len(unique_labels)
    n_after = len(set(merged_labels))
    print(f"‚úÖ Fusi√≥n completada.")
    print(f"   Clusters Originales: {n_before}")
    print(f"   Clusters Fusionados: {n_after}")
    print(f"   Se unieron {n_before - n_after} clusters vecinos.")

else:
    print("‚ö†Ô∏è No se encontraron clusters v√°lidos para fusionar.")

--- Iniciando fusi√≥n basada en CENTROIDES (Umbral: 0.4¬∞) ---
‚úÖ Fusi√≥n completada.
   Clusters Originales: 43
   Clusters Fusionados: 26
   Se unieron 17 clusters vecinos.


In [15]:
from hdbscan.validity import validity_index
from sklearn.metrics import silhouette_score
import numpy as np

print("--- Evaluando Calidad de los Clusters Finales ---")

# 1. PREPARAR DATOS
# Necesitamos las etiquetas y la matriz de datos original (X_pca_final o X_scaled)
# Aseg√∫rate de usar la misma matriz con la que entrenaste el modelo.
if 'X_pca_final' in locals():
    data_for_score = X_pca_final
    print("   Usando datos PCA ponderados (X_pca_final).")
else:
    data_for_score = X_scaled
    print("   ‚ö†Ô∏è Variable 'X_pca_final' no encontrada. Usando 'X_scaled'.")

# Obtenemos las etiquetas finales (fusionadas)
# Es importante filtrar el ruido (-1) para el Silhouette, pero DBCV lo maneja.
labels_final = df.loc[valid_indices, 'cluster_merged'].values

# 2. CALCULAR DBCV (Density Based Clustering Validation)
# Este es el "dbscore" que usa Optuna.
try:
    dbcv_score = validity_index(data_for_score, labels_final, metric='euclidean')
    print(f"\n‚úÖ DBCV Score Final: {dbcv_score:.4f}")
    print("   (Rango: -1 a 1. Mayor es mejor. Es normal que sea menor al de Optuna tras la fusi√≥n)")
except Exception as e:
    print(f"\n‚ùå No se pudo calcular DBCV: {e}")

# 3. CALCULAR SILHOUETTE SCORE (Opcional pero recomendado)
# Mide qu√© tan "separados" est√°n los clusters visualmente.
# Silhouette ignora el ruido (-1) para no castigar injustamente.
mask_valid = labels_final != -1
if np.sum(mask_valid) > 0:
    sil_score = silhouette_score(data_for_score[mask_valid], labels_final[mask_valid])
    print(f"‚úÖ Silhouette Score: {sil_score:.4f}")
    print("   (Rango: -1 a 1. Indica qu√© tan compactos y separados est√°n los grupos)")
else:
    print("‚ö†Ô∏è No hay suficientes datos agrupados para calcular Silhouette.")

# 4. COMPARATIVA
print(f"\nResumen:")
print(f"   Clusters Totales: {len(set(labels_final) - {-1})}")
print(f"   Puntos de Ruido: {np.sum(labels_final == -1)}")

--- Evaluando Calidad de los Clusters Finales ---
   Usando datos PCA ponderados (X_pca_final).

‚úÖ DBCV Score Final: 0.2790
   (Rango: -1 a 1. Mayor es mejor. Es normal que sea menor al de Optuna tras la fusi√≥n)
‚úÖ Silhouette Score: 0.4704
   (Rango: -1 a 1. Indica qu√© tan compactos y separados est√°n los grupos)

Resumen:
   Clusters Totales: 26
   Puntos de Ruido: 1509


In [16]:
import plotly.express as px

# =============================================================================
# 5. VISUALIZACI√ìN CON MAPA DE CHILE (PLOTLY)
# =============================================================================

# 1. Preparar los datos
# Filtramos los nulos (-2)
plot_data = df[df['cluster_merged'] != -2].copy()

# Convertimos la columna a texto (String) para que el mapa asigne 
# colores diferentes a cada grupo, en lugar de una barra de color num√©rica.
plot_data['Cluster_ID'] = plot_data['cluster_merged'].astype(str)

# Ordenamos para que la leyenda salga ordenada
plot_data = plot_data.sort_values('cluster_merged')

# 2. Generar el Mapa
fig = px.scatter_mapbox(
    plot_data,
    lat="Latitud",
    lon="Longitud",
    color="Cluster_ID",
    
    # Asignamos gris claro ('lightgray') espec√≠ficamente al Ruido (-1)
    # y dejamos que Plotly asigne colores vibrantes al resto.
    color_discrete_map={'-1': 'lightgray'},
    
    # Datos que aparecer√°n al pasar el mouse sobre un punto
    hover_name="Cluster_ID", 
    hover_data=["ProvinciaFaena", "cluster_opt"], 
    
    # Configuraci√≥n de la vista inicial (Norte Chico/Grande)
    zoom=4.5,
    center={"lat": -28.0, "lon": -70.5}, 
    
    height=900,
    title=f"Mapa de Zonas Log√≠sticas (Fusi√≥n {MERGE_THRESHOLD}¬∞)"
)

# 3. Estilo del Mapa
# "carto-positron" es un mapa limpio, ideal para tesis. 
# Tambi√©n puedes usar "open-street-map".
fig.update_layout(mapbox_style="carto-positron")

# Ajustar tama√±o de los puntos y m√°rgenes
fig.update_traces(marker=dict(size=9, opacity=0.8))
fig.update_layout(margin={"r":0,"t":40,"l":0,"b":0})
fig.update_layout(legend_title_text='Zona Log√≠stica (ID)')

# Mostrar
fig.show()

# Opcional: Guardar como archivo HTML interactivo para la tesis
# fig.write_html("Mapa_Clusters_Fusionados.html")


*scatter_mapbox* is deprecated! Use *scatter_map* instead. Learn more at: https://plotly.com/python/mapbox-to-maplibre/

