In [19]:
import numpy as np
import pandas as pd
import hdbscan
import optuna
from sklearn.preprocessing import RobustScaler
from sklearn.decomposition import PCA
from hdbscan.validity import validity_index
from sklearn.cluster import AgglomerativeClustering
import warnings

# Ignorar warnings molestos
warnings.filterwarnings("ignore")

# 1. Cargar y Limpiar B√°sicamente (Igual que antes)
df = pd.read_csv('NeoModelos/minas_con_tiempos_puertos.csv') # Ajusta tu ruta si es necesario
df = df[df['Estado'] == 'ACTIVA']
df = df[(df['RecursoPrimarioInstalacion'] == 'COBRE') | (df['RecursoMineroInstalacion'] == 'SALMUERA (LITIO)')] 

# Eliminar columnas innecesarias (versi√≥n resumida de tu c√≥digo)
cols_to_drop = ['RutEmpresa','NombreEmpresa','RecursoMineroInstalacion','TipoInstalacion',
                'TipoRecursoInstalacion','RecursoPrimarioInstalacion', 'ComunaFaena', 
                'NombreFaena', 'CategoriaFaena', 'IdFaena', 'ProvinciaInstalacion', 
                'ComunaInstalacion','NombreInstalacion','IdTipoInstalacion','IdInstalacion',
                'Norte','Este','Huso','Datum','IdEstado','Estado']
# Eliminar distancias
cols_to_drop += [c for c in df.columns if c.startswith('dist_')]
df = df.drop(columns=cols_to_drop, errors='ignore')



In [20]:
# ==========================================
# 2. NUEVO PASO: ELIMINACI√ìN POR CORRELACI√ìN (>99.9%)
# ==========================================
CORRELATION_THRESHOLD = 0.999
print(f"\n--- Buscando Columnas Redundantes (Corr > {CORRELATION_THRESHOLD}) ---")

# 1. Calcular matriz de correlaci√≥n (solo num√©ricas)
df_numeric = df.select_dtypes(include=[np.number])
corr_matrix = df_numeric.corr().abs()

# 2. Seleccionar el tri√°ngulo superior de la matriz
upper = corr_matrix.where(np.triu(np.ones(corr_matrix.shape), k=1).astype(bool))

# 3. Identificar columnas a borrar
to_drop = [column for column in upper.columns if any(upper[column] > CORRELATION_THRESHOLD)]

if len(to_drop) > 0:
    print(f"‚ö†Ô∏è Se encontraron {len(to_drop)} columnas redundantes para eliminar.")
    # Ejemplo de las primeras 5
    print(f"   Ejemplos: {to_drop[:5]} ...")
    
    # 4. Eliminar del DataFrame original
    df = df.drop(columns=to_drop)
    print(f"‚úÖ Columnas eliminadas. Nueva dimensi√≥n: {df.shape}")
else:
    print("‚úÖ No se encontr√≥ redundancia excesiva.")


--- Buscando Columnas Redundantes (Corr > 0.999) ---
‚ö†Ô∏è Se encontraron 37 columnas redundantes para eliminar.
   Ejemplos: ['time_Vi√±a_del_Mar_min', 'time_Coquimbo_min', 'time_Talca_min', 'time_San_Bernardo_min', 'time_Curic√≥_min'] ...
‚úÖ Columnas eliminadas. Nueva dimensi√≥n: (7930, 23)


In [21]:
# --- ESTRATEGIA DIVIDE Y VENCER√ÅS ---
REGIONES_NORTE = ['XV', 'I', 'II']
REGIONES_SUR   = ['III', 'IV', 'V', 'RM', 'VI', 'VII']

print(f"--- Separando Chile en 2 Modelos ---")
df_norte = df[df['RegionFaena'].isin(REGIONES_NORTE)].copy()
df_sur   = df[df['RegionFaena'].isin(REGIONES_SUR)].copy()

print(f"1. Modelo Norte (XV, I, II): {len(df_norte)} instalaciones")
print(f"2. Modelo Sur (III a VII):   {len(df_sur)} instalaciones (Zona Densa)")

print(f"--- Divisi√≥n Geogr√°fica ---")
print(f"Zona Norte (Antofagasta y arriba): {len(df_norte)} minas")
print(f"Zona Sur (Atacama y abajo): {len(df_sur)} minas")

--- Separando Chile en 2 Modelos ---
1. Modelo Norte (XV, I, II): 1652 instalaciones
2. Modelo Sur (III a VII):   6278 instalaciones (Zona Densa)
--- Divisi√≥n Geogr√°fica ---
Zona Norte (Antofagasta y arriba): 1652 minas
Zona Sur (Atacama y abajo): 6278 minas


In [22]:
import numpy as np
import pandas as pd
import hdbscan
import optuna
from sklearn.preprocessing import RobustScaler
from sklearn.decomposition import PCA
from hdbscan.validity import validity_index

def entrenar_modelo_regional(df_region, nombre_zona, n_trials=30, param_ranges=None):
    """
    Entrena un modelo HDBSCAN optimizado para una regi√≥n espec√≠fica.
    
    Args:
        df_region (pd.DataFrame): Datos de la regi√≥n.
        nombre_zona (str): Nombre para identificar la zona en los logs.
        n_trials (int): N√∫mero de intentos de optimizaci√≥n.
        param_ranges (dict, optional): Diccionario con los rangos de b√∫squeda.
            Claves aceptadas: 'w_log', 'w_cat', 'w_oth', 'min_cluster', 'min_samples'.
            Cada valor debe ser una tupla (min, max).
            Ejemplo: {'min_cluster': (5, 20), 'w_log': (1.0, 5.0)}
    """
    
    # --- Configuraci√≥n de Rangos por Defecto ---
    # Estos se usan si no env√≠as nada en param_ranges
    defaults = {
        'w_log': (1.0, 3.5),
        'w_cat': (0.5, 2.0),
        'w_oth': (0.5, 1.5),
        'min_cluster': (10, 40),
        'min_samples': (5, 30)
    }
    
    # Actualizar rangos con lo que env√≠e el usuario (si env√≠a algo)
    ranges = defaults.copy()
    if param_ranges:
        ranges.update(param_ranges)

    print(f"\nüöÄ Iniciando optimizaci√≥n para: {nombre_zona}")
    print(f"   ‚öôÔ∏è Rangos config: {ranges}")
    
    # 1. Preprocesamiento Espec√≠fico de la Regi√≥n
    df_encoded = pd.get_dummies(df_region, columns=['ProvinciaFaena'], drop_first=True, dtype=int)
    df_model = df_encoded.select_dtypes(include=[np.number]).dropna()
    
    X = df_model.values
    valid_indices = df_model.index
    feature_names = df_model.columns.tolist()
    
    # Scaling local
    scaler = RobustScaler()
    X_scaled = scaler.fit_transform(X)
    
    # Identificar √≠ndices
    geo_idxs = [i for i, col in enumerate(feature_names) if col in ['Latitud', 'Longitud']]
    log_idxs = [i for i, col in enumerate(feature_names) if col.startswith('time_') or col.startswith('Tiempo_Prt_')]
    cat_idxs = [i for i, col in enumerate(feature_names) if col.startswith('ProvinciaFaena_')]
    other_idxs = [i for i in range(len(feature_names)) if i not in geo_idxs + log_idxs + cat_idxs]

    # 2. Funci√≥n Objetivo
    def objective(trial):
        # Pesos din√°micos usando los rangos configurados
        w_geo = 1.0
        w_log = trial.suggest_float("w_log", ranges['w_log'][0], ranges['w_log'][1]) 
        w_cat = trial.suggest_float("w_cat", ranges['w_cat'][0], ranges['w_cat'][1])
        w_oth = trial.suggest_float("w_oth", ranges['w_oth'][0], ranges['w_oth'][1])

        X_weighted = X_scaled.copy()
        if geo_idxs: X_weighted[:, geo_idxs] *= w_geo
        if log_idxs: X_weighted[:, log_idxs] *= w_log
        if cat_idxs: X_weighted[:, cat_idxs] *= w_cat
        if other_idxs: X_weighted[:, other_idxs] *= w_oth

        # PCA Din√°mico
        max_comp = min(20, X.shape[1])
        n_components = trial.suggest_int("n_components", 5, max_comp)
        pca = PCA(n_components=n_components, random_state=42)
        X_pca = pca.fit_transform(X_weighted)

        # HDBSCAN Din√°mico usando los rangos configurados
        min_cluster = trial.suggest_int("min_cluster", ranges['min_cluster'][0], ranges['min_cluster'][1])
        min_samples = trial.suggest_int("min_samples", ranges['min_samples'][0], ranges['min_samples'][1])

        clusterer = hdbscan.HDBSCAN(
            min_cluster_size=min_cluster,
            min_samples=min_samples,
            metric='euclidean',
            gen_min_span_tree=True,
            cluster_selection_method='leaf',
        ).fit(X_pca)
        
        labels = clusterer.labels_
        n_clusters = len(set(labels)) - (1 if -1 in labels else 0)
        
        # Filtro de seguridad: al menos 3 clusters para considerar v√°lido el modelo
        if n_clusters < 3: return -1.0

        try:
            score = validity_index(X_pca, labels, metric='euclidean')
        except:
            score = -1.0
            
        # Penalizaci√≥n por ruido excesivo
        if np.sum(labels == -1) / len(labels) > 0.4:
            score -= 0.2

        return score

    # 3. Ejecutar Optuna
    study = optuna.create_study(direction="maximize")
    study.optimize(objective, n_trials=n_trials, show_progress_bar=True)
    
    print(f"‚úÖ Mejor Score {nombre_zona}: {study.best_value:.4f}")
    print(f"   (Params: {study.best_params})")
    
    # 4. Reconstruir el mejor modelo
    bp = study.best_params
    
    X_final = X_scaled.copy()
    if geo_idxs: X_final[:, geo_idxs] *= 1.0
    if log_idxs: X_final[:, log_idxs] *= bp['w_log']
    if cat_idxs: X_final[:, cat_idxs] *= bp['w_cat']
    if other_idxs: X_final[:, other_idxs] *= bp['w_oth']
    
    pca_final = PCA(n_components=bp['n_components'], random_state=42)
    X_pca_final = pca_final.fit_transform(X_final)
    
    final_clusterer = hdbscan.HDBSCAN(
        min_cluster_size=bp['min_cluster'],
        min_samples=bp['min_samples'],
        metric='euclidean'
    ).fit(X_pca_final)
    
    return final_clusterer.labels_, valid_indices

In [23]:
# --- CONFIGURACI√ìN PERSONALIZADA ---

# Configuraci√≥n para el Norte (Disperso -> clusters m√°s grandes, menos peso a provincias)
config_norte = {
    'min_cluster': (10, 30),    # Clusters m√°s grandes
    'min_samples': (10, 25),    # M√°s exigente para formar n√∫cleos
}

# Configuraci√≥n para el Sur (Denso -> clusters peque√±os, m√°s detalle local)
config_sur = {
    'min_cluster': (5, 25),     # Clusters peque√±os permitidos (peque√±os valles)
    'min_samples': (3, 15),     # Menos exigente para detectar grupos chicos
}

# --- EJECUCI√ìN ---

# 1. Entrenar Norte
labels_norte, idx_norte = entrenar_modelo_regional(
    df_norte, 
    "ZONA NORTE", 
    n_trials=30, 
    param_ranges=config_norte
)

# 2. Entrenar Sur
labels_sur, idx_sur = entrenar_modelo_regional(
    df_sur, 
    "ZONA SUR", 
    n_trials=30, 
    param_ranges=config_sur
)

# (Luego contin√∫as con la unificaci√≥n y el scan igual que antes)

[I 2025-11-19 17:03:14,674] A new study created in memory with name: no-name-08b0475d-8c54-418c-a1ab-1ddc8b2c10b5



üöÄ Iniciando optimizaci√≥n para: ZONA NORTE
   ‚öôÔ∏è Rangos config: {'w_log': (1.0, 3.5), 'w_cat': (0.5, 2.0), 'w_oth': (0.5, 1.5), 'min_cluster': (10, 30), 'min_samples': (10, 25)}


  0%|          | 0/30 [00:00<?, ?it/s]

[I 2025-11-19 17:03:14,911] Trial 0 finished with value: 0.3151703264676218 and parameters: {'w_log': 1.988574949446071, 'w_cat': 0.80025173727667, 'w_oth': 1.1960748875042282, 'n_components': 14, 'min_cluster': 23, 'min_samples': 24}. Best is trial 0 with value: 0.3151703264676218.
[I 2025-11-19 17:03:15,203] Trial 1 finished with value: 0.27956436364614745 and parameters: {'w_log': 1.7013274332628205, 'w_cat': 0.8611572913732588, 'w_oth': 1.4914499012415492, 'n_components': 14, 'min_cluster': 30, 'min_samples': 13}. Best is trial 0 with value: 0.3151703264676218.
[I 2025-11-19 17:03:15,365] Trial 2 finished with value: 0.25002651260937075 and parameters: {'w_log': 1.8748607400171764, 'w_cat': 0.591830412479255, 'w_oth': 1.3111823842948915, 'n_components': 14, 'min_cluster': 27, 'min_samples': 23}. Best is trial 0 with value: 0.3151703264676218.
[I 2025-11-19 17:03:15,586] Trial 3 finished with value: 0.3750128939547532 and parameters: {'w_log': 1.2659119938651533, 'w_cat': 1.84711824

[I 2025-11-19 17:03:21,721] A new study created in memory with name: no-name-6aa3c7de-52d3-40a7-926c-fbff5dab078a


[I 2025-11-19 17:03:21,664] Trial 29 finished with value: 0.41733477345295333 and parameters: {'w_log': 2.0808762835336707, 'w_cat': 1.952275757112276, 'w_oth': 0.5159217071000646, 'n_components': 16, 'min_cluster': 14, 'min_samples': 15}. Best is trial 4 with value: 0.43764428271931216.
‚úÖ Mejor Score ZONA NORTE: 0.4376
   (Params: {'w_log': 2.2040808099625124, 'w_cat': 0.9624651851331969, 'w_oth': 1.4561239681560525, 'n_components': 5, 'min_cluster': 13, 'min_samples': 15})

üöÄ Iniciando optimizaci√≥n para: ZONA SUR
   ‚öôÔ∏è Rangos config: {'w_log': (1.0, 3.5), 'w_cat': (0.5, 2.0), 'w_oth': (0.5, 1.5), 'min_cluster': (5, 25), 'min_samples': (3, 15)}


  0%|          | 0/30 [00:00<?, ?it/s]

[I 2025-11-19 17:03:23,249] Trial 0 finished with value: 0.08508409653461046 and parameters: {'w_log': 1.0757890800247851, 'w_cat': 1.509628456749071, 'w_oth': 1.1487145933442404, 'n_components': 13, 'min_cluster': 15, 'min_samples': 12}. Best is trial 0 with value: 0.08508409653461046.
[I 2025-11-19 17:03:26,112] Trial 1 finished with value: 0.332676963382605 and parameters: {'w_log': 2.3468379201660956, 'w_cat': 0.548850521699163, 'w_oth': 1.176575594393484, 'n_components': 11, 'min_cluster': 11, 'min_samples': 8}. Best is trial 1 with value: 0.332676963382605.
[I 2025-11-19 17:03:34,624] Trial 2 finished with value: 0.44509097060605585 and parameters: {'w_log': 1.9564339543220508, 'w_cat': 0.8690390817255376, 'w_oth': 0.5770835170231662, 'n_components': 12, 'min_cluster': 7, 'min_samples': 5}. Best is trial 2 with value: 0.44509097060605585.
[I 2025-11-19 17:03:36,034] Trial 3 finished with value: 0.26734969109601814 and parameters: {'w_log': 2.7470489045019555, 'w_cat': 1.858400445

In [24]:
# --- UNIFICACI√ìN DE RESULTADOS ---
df['cluster_raw'] = -2 # Inicializar

# Asignar Norte (Mantenemos etiquetas originales)
df.loc[idx_norte, 'cluster_raw'] = labels_norte

# Asignar Sur (Desplazamos los IDs para que no choquen con los del Norte)
# Encontramos el ID m√°s alto del norte (ignorando ruido -1)
max_id_norte = max(set(labels_norte) - {-1}) if len(set(labels_norte) - {-1}) > 0 else 0
offset = max_id_norte + 1

# Funci√≥n lambda para desplazar solo si no es ruido (-1)
shifted_labels_sur = [x + offset if x >= 0 else -1 for x in labels_sur]
df.loc[idx_sur, 'cluster_raw'] = shifted_labels_sur

print("\n‚úÖ Modelos unificados exitosamente.")
print(f"Total Clusters Brutos: {len(set(df['cluster_raw']) - {-1, -2})}")


‚úÖ Modelos unificados exitosamente.
Total Clusters Brutos: 599


In [25]:
# --- FUSI√ìN DE CLUSTERS POR RADIO (CENTROIDES) ---
col_analisis = 'cluster_raw'
col_final = 'cluster_final'
RADIO_FUSION = 0.25  # Grados (~25-30km). Ajusta esto si quieres unir valles m√°s grandes.

# 1. Calcular Centroides
unique_labels = set(df[col_analisis]) - {-1, -2}
centroids = []
label_map_list = []

for label in unique_labels:
    # Obtener lat/lon promedio del cluster
    data = df[df[col_analisis] == label][['Latitud', 'Longitud']].values
    centroid = data.mean(axis=0)
    centroids.append(centroid)
    label_map_list.append(label)

# 2. Clustering Aglomerativo sobre los Centroides
if centroids:
    centroids = np.array(centroids)
    # Usamos 'complete' linkage para asegurar que todos los miembros del nuevo grupo
    # est√©n dentro de la distancia umbral entre s√≠.
    agg_cluster = AgglomerativeClustering(
        n_clusters=None,
        metric='euclidean',
        linkage='complete', 
        distance_threshold=RADIO_FUSION
    )
    merged_ids = agg_cluster.fit_predict(centroids)
    
    # Crear diccionario de mapeo {Cluster_Viejo: Cluster_Nuevo}
    mapping_dict = {old: new for old, new in zip(label_map_list, merged_ids)}
    # Mapear ruido y nulos
    mapping_dict[-1] = -1
    mapping_dict[-2] = -2
    
    # Aplicar
    df[col_final] = df[col_analisis].map(mapping_dict)
    
    n_final = len(set(df[col_final]) - {-1, -2})
    print(f"‚úÖ Fusi√≥n por Radio completada ({RADIO_FUSION}¬∞).")
    print(f"   Clusters Iniciales: {len(unique_labels)}")
    print(f"   Clusters Finales: {n_final}")
else:
    df[col_final] = df[col_analisis]
    print("‚ö†Ô∏è No hay clusters suficientes para fusionar.")

# --- VISUALIZACI√ìN ---
import plotly.express as px

# Filtrar ruido para el mapa (opcional)
plot_data = df[df[col_final] != -2].copy()
plot_data['Cluster_ID'] = plot_data[col_final].astype(str)
plot_data = plot_data.sort_values(col_final)

fig = px.scatter_mapbox(
    plot_data,
    lat="Latitud",
    lon="Longitud",
    color="Cluster_ID",
    color_discrete_map={'-1': 'lightgray'}, # Ruido en gris
    hover_name="Cluster_ID",
    hover_data=["ProvinciaFaena"],
    zoom=4.5,
    center={"lat": -28.0, "lon": -70.0},
    title="Modelo Dual (Norte/Sur) con Fusi√≥n de Radio",
    height=900
)

fig.update_layout(mapbox_style="carto-positron")
fig.show()

‚úÖ Fusi√≥n por Radio completada (0.25¬∞).
   Clusters Iniciales: 599
   Clusters Finales: 132
