In [1]:
!pip install river scikit-learn mlflow numpy==1.23.5

Collecting river
  Downloading river-0.22.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (3.1 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.1/3.1 MB[0m [31m31.9 MB/s[0m eta [36m0:00:00[0m
Collecting mlflow
  Downloading mlflow-3.1.0-py3-none-any.whl (24.7 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m24.7/24.7 MB[0m [31m49.1 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting numpy==1.23.5
  Downloading numpy-1.23.5-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (17.1 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m17.1/17.1 MB[0m [31m59.3 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting pandas<3.0.0,>=2.2.3
  Downloading pandas-2.3.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (12.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.3/12.3 MB[0m [31m71.0 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting scipy<2.0.0,>=1.14.1
  Downloading scipy-1.15.3-cp310-cp

In [2]:
import numpy as np
#from skmultiflow.clustering import DenStream
from river.cluster import DenStream
import mlflow
import pickle
from datetime import datetime
import os
import logging
import glob
from sklearn.metrics import silhouette_score
from sklearn.preprocessing import StandardScaler
import pandas as pd

In [3]:
# Configure logging: INFO level via basicConfig, plus the module-level logger
# that every function in this notebook writes to.
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

In [4]:
def find_latest_model():
    """Locate the newest saved DenStream pickle and compute the next version number.

    The ``DENStream_Model`` directory is searched recursively for files named
    ``denstream_*.pkl``. The most recently modified one (by mtime) is treated
    as the latest model. The next version is one more than the highest ``N``
    found among ``denstream_v<N>.pkl`` files living in the same directory as
    that latest model; file names whose version part is not an integer are
    ignored.

    Returns:
        tuple: ``(latest_model_path, next_version)``.
            ``(None, 0)`` when the base directory does not exist or holds no
            matching pickle. Otherwise the path of the latest model and the
            next version (``1`` when no versioned file name could be parsed).
    """
    models_base_dir = "DENStream_Model"

    if not os.path.exists(models_base_dir):
        logger.info("No existe directorio de modelos")
        return None, 0

    pattern = os.path.join(models_base_dir, "**/denstream_*.pkl")
    model_files = glob.glob(pattern, recursive=True)

    if not model_files:
        logger.info("No se encontraron modelos existentes")
        return None, 0

    # Newest file by modification time; only the winner is needed, so no sort.
    latest_model = max(model_files, key=os.path.getmtime)

    model_dir = os.path.dirname(latest_model)
    # Escape the directory part so characters such as "[" in a sub-directory
    # name are matched literally instead of being read as glob syntax.
    version_files = glob.glob(os.path.join(glob.escape(model_dir), "denstream_v*.pkl"))

    # Collect every version number that can be parsed from the file names.
    versions = []
    for vf in version_files:
        try:
            version_num = int(os.path.basename(vf).split('_v')[1].split('.pkl')[0])
        except (IndexError, ValueError):
            # Name does not follow denstream_v<int>.pkl; skip it.
            continue
        versions.append(version_num)

    next_version = max(versions) + 1 if versions else 1

    logger.info(f"Último modelo encontrado: {latest_model}")
    logger.info(f"Nueva versión será: v{next_version}")

    return latest_model, next_version

In [5]:
def _build_denstream(config):
    """Create a fresh DenStream clusterer from the hyper-parameters in ``config``."""
    return DenStream(
        epsilon=config['epsilon'],
        beta=config['beta'],
        mu=config['mu'],
        decaying_factor=config['decaying_factor'],
    )


def _load_or_create_clusterer(latest_model_path, config):
    """Return ``(clusterer, provenance)``: unpickle the previous model or build a new one.

    ``provenance`` is a dict of MLflow params describing where the clusterer
    came from. A failed load is logged and falls back to a new model.
    """
    if latest_model_path and os.path.exists(latest_model_path):
        logger.info(f"Cargando modelo existente: {latest_model_path}")
        try:
            # NOTE(security): pickle.load can execute arbitrary code. Only load
            # model files that this pipeline wrote itself.
            with open(latest_model_path, "rb") as f:
                clusterer = pickle.load(f)
        except Exception as e:
            logger.error(f"Error cargando modelo: {e}")
            logger.info("Creando nuevo modelo debido al error de carga")
            return _build_denstream(config), {
                "is_retrained_model": False,
                "creation_reason": "load_error",
            }
        logger.info("Modelo cargado exitosamente")
        return clusterer, {
            "base_model": os.path.basename(latest_model_path),
            "is_retrained_model": True,
        }

    logger.info("Creando primer modelo DenStream")
    return _build_denstream(config), {
        "is_retrained_model": False,
        "creation_reason": "first_model",
    }


def _estimate_pairwise_distances(X, max_points=100):
    """Return ``(avg_distance, min_positive_distance)`` over a random row subsample.

    At most ``max_points`` rows are sampled without replacement. Either value
    is NaN when it cannot be computed (fewer than two rows, or no two distinct
    rows in the sample).
    """
    from sklearn.metrics.pairwise import euclidean_distances

    if len(X) > max_points:
        sample = X[np.random.choice(len(X), max_points, replace=False)]
    else:
        sample = X

    distances = euclidean_distances(sample)
    # Upper triangle without the diagonal: each unordered pair exactly once.
    upper_tri = distances[np.triu_indices_from(distances, k=1)]
    positive = upper_tri[upper_tri > 0]
    avg_distance = np.mean(upper_tri) if upper_tri.size else np.nan
    min_distance = np.min(positive) if positive.size else np.nan
    return avg_distance, min_distance


def _learn_row(clusterer, features, row, position):
    """Feed one row to ``clusterer.learn_one``; return True on success.

    Non-finite rows and exceptions raised by the clusterer are logged as
    warnings and reported as failures instead of being raised.
    """
    try:
        if np.isfinite(row).all():
            clusterer.learn_one(dict(zip(features, row)))
            return True
        logger.warning(f"Muestra inválida en posición {position}: {row}")
    except Exception as e:
        logger.warning(f"Error procesando muestra {position}: {e}")
    return False


def _count_micro_clusters(clusterer):
    """Return the number of micro-clusters held by ``clusterer`` (0 if unknown).

    river's DenStream keeps its potential micro-clusters in
    ``p_micro_clusters``; ``micro_clusters`` is kept as a fallback for models
    pickled from other implementations.
    """
    for attr in ('p_micro_clusters', 'micro_clusters'):
        clusters = getattr(clusterer, attr, None)
        if clusters is not None:
            return len(clusters)
    return 0


def train_denstream_model(df_scaled, config=None):
    """Incrementally train a DenStream model, save it as a new version and log to MLflow.

    The latest pickled model found by ``find_latest_model`` is loaded and
    updated with the rows of ``df_scaled``; when none exists (or loading
    fails) a new ``DenStream`` is created from ``config``. The result is
    pickled next to the previous model as ``denstream_v<N>.pkl`` and logged as
    an MLflow artifact together with parameters and metrics.

    Args:
        df_scaled: DataFrame containing the columns ``x_km``, ``y_km``,
            ``altitude_km`` and ``time_minutes``. Rows with null or infinite
            values in those columns are dropped.
        config: Optional dict with ``epsilon``, ``beta``, ``mu`` and
            ``decaying_factor``. It is copied, never modified. The copy may be
            adjusted (beta halved relative to epsilon, epsilon shrunk when it
            exceeds the sampled average distance, mu raised so that
            ``mu * beta > 1``); the effective values are returned in the
            summary. The config is only used when a new model is created; a
            loaded model keeps its own hyper-parameters.

    Returns:
        tuple: ``(clusterer, summary)`` where ``summary`` is a dict with the
        version, sample counts, success rate, micro-cluster count, model
        path, features, effective config and distance statistics.

    Raises:
        ValueError: If required columns are missing or no rows remain after
            cleaning.
        Exception: Any error raised while training, saving or logging is
            logged and re-raised.
    """
    if config is None:
        config = {
            'epsilon': 0.5,
            'beta': 0.2,
            'mu': 3,
            'decaying_factor': 0.001
        }
    else:
        # Work on a copy so the adjustments below never leak into the caller's dict.
        config = dict(config)

    if config['beta'] >= config['epsilon']:
        logger.warning(f"Beta ({config['beta']}) debe ser menor que epsilon ({config['epsilon']})")
        config['beta'] = config['epsilon'] * 0.5

    today = datetime.now().strftime("%Y-%m-%d")

    latest_model_path, next_version = find_latest_model()

    if latest_model_path:
        model_dir = os.path.dirname(latest_model_path)
    else:
        model_dir = "DENStream_Model"
        os.makedirs(model_dir, exist_ok=True)

    new_model_path = f"{model_dir}/denstream_v{next_version}.pkl"

    features = ['x_km', 'y_km', 'altitude_km', 'time_minutes']

    # --- Input validation and cleaning ---
    missing = [col for col in features if col not in df_scaled.columns]
    if missing:
        raise ValueError(f"Faltan columnas: {missing}")

    if df_scaled[features].isnull().any().any():
        logger.warning("Se encontraron valores nulos, se eliminarán")
        df_scaled = df_scaled.dropna(subset=features)

    if len(df_scaled) == 0:
        raise ValueError("No hay datos para procesar despues de limpiar")

    X_new = df_scaled[features].values

    inf_rows = np.isinf(X_new).any(axis=1)
    if inf_rows.any():
        logger.error("Se encontraron valores infinitos en los datos")
        X_new = X_new[~inf_rows]
        logger.info(f"Datos despues de eliminar infinitos: {len(X_new)}")

    nan_rows = np.isnan(X_new).any(axis=1)
    if nan_rows.any():
        logger.error("Se encontraron valores NaN en los datos")
        X_new = X_new[~nan_rows]
        logger.info(f"Datos despues de eliminar NaN: {len(X_new)}")

    # Dropping non-finite rows may have emptied the matrix; re-check here.
    if len(X_new) == 0:
        raise ValueError("No hay datos para procesar despues de limpiar")

    # --- Descriptive statistics (logging only) ---
    std_values = np.std(X_new, axis=0)
    if np.any(std_values == 0):
        logger.warning("Algunas características tienen varianza cero:")
        for feature, std_val in zip(features, std_values):
            if std_val == 0:
                logger.warning(f"  {feature}: std = {std_val}")

    logger.info("Estadisticas de datos:")
    for i, feature in enumerate(features):
        column = X_new[:, i]
        min_val, max_val, mean_val, std_val = np.min(column), np.max(column), np.mean(column), np.std(column)
        logger.info(f"  {feature}: min={min_val:.4f}, max={max_val:.4f}, mean={mean_val:.4f}, std={std_val:.4f}")

    avg_distance, min_distance = _estimate_pairwise_distances(X_new)

    logger.info(f"Distancia promedio entre puntos: {avg_distance:.4f}")
    logger.info(f"Distancia minima entre puntos: {min_distance:.4f}")

    # Shrink epsilon when it exceeds the typical point spacing. The
    # `avg_distance > 0` guard skips NaN and all-identical samples, which
    # would otherwise yield epsilon == 0.
    if avg_distance > 0 and config['epsilon'] > avg_distance:
        suggested_epsilon = avg_distance * 0.1
        logger.warning(f"Epsilon ({config['epsilon']}) parece muy grande. Sugerido: {suggested_epsilon:.4f}")
        config['epsilon'] = suggested_epsilon
        config['beta'] = config['epsilon'] * 0.5

    # river's DenStream documents that beta * mu must be greater than 1. The
    # beta adjustments above (or the defaults) can violate that, so raise mu
    # the same way the notebook's main cell does.
    if config['beta'] > 0 and config['mu'] * config['beta'] <= 1:
        adjusted_mu = int(1.5 / config['beta']) + 1
        logger.warning(
            f"mu * beta = {config['mu'] * config['beta']} <= 1. Ajustando mu a {adjusted_mu}"
        )
        config['mu'] = adjusted_mu

    logger.info(f"Procesando {len(X_new)} muestras con parámetros: {config}")

    # Start the MLflow run.
    with mlflow.start_run(run_name=f"denstream_v{next_version}_{today}"):

        try:
            clusterer, provenance = _load_or_create_clusterer(latest_model_path, config)
            for param, value in provenance.items():
                mlflow.log_param(param, value)

            # The config is logged even when an existing model was loaded; in
            # that case the loaded model keeps its own hyper-parameters.
            for param, value in config.items():
                mlflow.log_param(param, value)

            mlflow.log_param("model_version", next_version)
            mlflow.log_param("training_samples", len(X_new))
            mlflow.log_param("avg_distance", avg_distance)
            mlflow.log_param("min_distance", min_distance)

            logger.info("Iniciando reentrenamiento incremental")

            processed_samples = 0
            failed_samples = 0

            try:
                if len(X_new) > 1000:
                    # Large inputs: walk in slices of 50 rows and report
                    # progress whenever the slice start is a multiple of 500.
                    batch_size = 50
                    total_batches = (len(X_new) - 1) // batch_size + 1
                    for start in range(0, len(X_new), batch_size):
                        for offset, row in enumerate(X_new[start:start + batch_size]):
                            if _learn_row(clusterer, features, row, start + offset):
                                processed_samples += 1
                            else:
                                failed_samples += 1

                        if start % 500 == 0:
                            logger.info(f"Procesado lote {start//batch_size + 1}/{total_batches}")
                            logger.info(f"Muestras procesadas: {processed_samples}, Fallidas: {failed_samples}")
                else:
                    for i, row in enumerate(X_new):
                        if _learn_row(clusterer, features, row, i):
                            processed_samples += 1
                        else:
                            failed_samples += 1

                        if i % 100 == 0 and i > 0:
                            logger.info(f"Procesadas {i}/{len(X_new)} muestras")

                logger.info(f"Reentrenamiento completado: {processed_samples} exitosas, {failed_samples} fallidas")

            except Exception as e:
                logger.error(f"Error crítico durante el reentrenamiento: {e}")
                raise

            try:
                n_clusters = _count_micro_clusters(clusterer)
                logger.info(f"Número de micro-clusters: {n_clusters}")
            except Exception as e:
                logger.warning(f"No se pudo obtener información de clusters: {e}")
                n_clusters = 0

            success_rate = processed_samples / len(X_new) if len(X_new) > 0 else 0

            mlflow.log_metric("n_microclusters", n_clusters)
            mlflow.log_metric("n_samples_processed", processed_samples)
            mlflow.log_metric("n_samples_failed", failed_samples)
            mlflow.log_metric("success_rate", success_rate)
            mlflow.log_metric("model_version", next_version)
            mlflow.log_metric("timestamp", datetime.now().timestamp())

            try:
                with open(new_model_path, "wb") as f:
                    pickle.dump(clusterer, f)
                mlflow.log_artifact(new_model_path, artifact_path="model")
                logger.info(f"Nuevo modelo guardado en: {new_model_path}")
            except Exception as e:
                logger.error(f"Error guardando modelo: {e}")
                raise

            summary = {
                'date': today,
                'version': next_version,
                # None when no previous model was actually loaded.
                'base_model': provenance.get('base_model'),
                'n_samples': len(X_new),
                'n_samples_processed': processed_samples,
                'n_samples_failed': failed_samples,
                'success_rate': success_rate,
                'n_microclusters': n_clusters,
                'model_path': new_model_path,
                'features': features,
                'config': config,
                'model_dir': model_dir,
                'avg_distance': avg_distance,
                'min_distance': min_distance
            }

            logger.info(f"Reentrenamiento completado: {summary}")
            return clusterer, summary

        except Exception as e:
            logger.error(f"Error durante el reentrenamiento: {e}")
            # Truncated so an over-long message cannot exceed MLflow's
            # param-value length limit and mask the original error.
            mlflow.log_param("error", str(e)[:250])
            raise

In [6]:
def get_model_history():
    """List every saved DenStream pickle, most recently modified first.

    Scans ``DENStream_Model`` recursively for ``denstream_*.pkl`` files and
    describes each one with its path, file name, directory, size in MiB and
    two timestamps (``created`` is taken from ``st_ctime``, ``modified`` from
    ``st_mtime``). Files that cannot be inspected are logged and skipped.

    Returns:
        list[dict]: One record per model file, sorted by ``modified``
        descending; an empty list when the base directory does not exist.
    """
    base_dir = "DENStream_Model"
    if not os.path.exists(base_dir):
        return []

    def describe(path):
        # Build the record for a single model file from its stat result.
        info = os.stat(path)
        return {
            'path': path,
            'name': os.path.basename(path),
            'directory': os.path.dirname(path),
            'size_mb': info.st_size / (1024 * 1024),
            'created': datetime.fromtimestamp(info.st_ctime),
            'modified': datetime.fromtimestamp(info.st_mtime),
        }

    records = []
    for candidate in glob.glob(os.path.join(base_dir, "**/denstream_*.pkl"), recursive=True):
        try:
            records.append(describe(candidate))
        except Exception as e:
            logger.warning(f"Error procesando {candidate}: {e}")

    return sorted(records, key=lambda record: record['modified'], reverse=True)

In [7]:

def evaluate_model(clusterer, X_test, save_results=True, features=None):
    """Assign each test sample to a cluster and summarise the outcome.

    Clusterers exposing ``predict_one`` (the river API, which takes a
    feature dict) are called once per row with ``dict(zip(features, row))``.
    Clusterers without it fall back to ``predict([row])``. A missing or empty
    prediction is recorded as ``-1`` (noise).

    Args:
        clusterer: Trained clustering model.
        X_test: Rows to evaluate. Either an iterable of rows whose values are
            ordered like ``features``, or a DataFrame containing the
            ``features`` columns.
        save_results: Currently unused; kept for backward compatibility.
        features: Feature names used to build the per-row dict. Defaults to
            the four columns used for training in this notebook.

    Returns:
        dict | None: ``n_samples``, ``n_clusters_found`` (distinct labels
        other than -1), ``noise_ratio`` (share of -1 labels, ``0.0`` for
        empty input) and ``predictions`` (numpy array). ``None`` if any
        error occurs; the error is logged.
    """
    if features is None:
        features = ['x_km', 'y_km', 'altitude_km', 'time_minutes']

    try:
        # Iterating a DataFrame yields column names, so extract the rows first.
        if isinstance(X_test, pd.DataFrame):
            rows = X_test[features].to_numpy()
        else:
            rows = X_test

        use_predict_one = hasattr(clusterer, 'predict_one')

        predictions = []
        for row in rows:
            if use_predict_one:
                label = clusterer.predict_one(dict(zip(features, row)))
            else:
                pred = clusterer.predict([row])
                label = pred[0] if pred is not None and len(pred) > 0 else -1
            predictions.append(-1 if label is None else label)

        predictions = np.array(predictions)

        unique_labels = np.unique(predictions)
        n_clusters_found = len(unique_labels[unique_labels != -1])
        # Avoid a 0/0 division when there is nothing to evaluate.
        noise_ratio = np.sum(predictions == -1) / len(predictions) if len(predictions) > 0 else 0.0

        results = {
            'n_samples': len(predictions),
            'n_clusters_found': n_clusters_found,
            'noise_ratio': noise_ratio,
            'predictions': predictions
        }

        logger.info(f"Evaluación completada: {n_clusters_found} clusters, {noise_ratio:.2%} ruido")
        return results

    except Exception as e:
        logger.error(f"Error en evaluación: {e}")
        return None

In [8]:
# Entry point of the notebook: validate the hyper-parameters, show the model
# history, load the preprocessed dataset and run one training pass.
if __name__ == "__main__":
    # Hyper-parameters handed to train_denstream_model.
    custom_config = {
        'epsilon': 0.5,
        'beta': 0.2,
        'mu': 6,
        'decaying_factor': 0.001
    }

    def validate_denstream_params(config):
        """Validate the DenStream parameters.

        Checks that beta < epsilon, that mu * beta > 1 and that every value
        in ``config`` is strictly positive.

        Returns:
            list[str]: One message per violated rule; empty when valid.
        """
        errors = []

        if config['beta'] >= config['epsilon']:
            errors.append(f"beta ({config['beta']}) debe ser menor que epsilon ({config['epsilon']})")

        mu_beta_product = config['mu'] * config['beta']
        if mu_beta_product <= 1:
            errors.append(f"mu * beta ({mu_beta_product}) debe ser > 1 para evitar errores matemáticos")

        for param, value in config.items():
            if value <= 0:
                errors.append(f"{param} ({value}) debe ser > 0")
        
        return errors

    validation_errors = validate_denstream_params(custom_config)
    if validation_errors:
        logger.error("Errores de validación de parámetros:")
        for error in validation_errors:
            logger.error(f"  - {error}")

        # NOTE(review): this fallback is identical to the initial custom_config
        # above, so it only helps if that initial dict is edited to something invalid.
        logger.info("Usando configuración alternativa válida...")
        custom_config = {
            'epsilon': 0.5,
            'beta': 0.2,
            'mu': 6,
            'decaying_factor': 0.001
        }

    # Re-check the mu * beta > 1 condition on the config that will be used.
    mu_beta_check = custom_config['mu'] * custom_config['beta']
    logger.info(f"Verificación: mu * beta = {custom_config['mu']} * {custom_config['beta']} = {mu_beta_check}")
    
    if mu_beta_check <= 1:
        logger.error(f"CRÍTICO: mu * beta = {mu_beta_check} <= 1. Ajustando parámetros...")
        # Raise mu to at least 6 and high enough that mu * beta exceeds 1.5.
        custom_config['mu'] = max(6, int(1.5 / custom_config['beta']) + 1)
        logger.info(f"Nuevo mu ajustado: {custom_config['mu']}")
    
    try:
        # Show up to five saved models (get_model_history sorts newest first).
        history = get_model_history()
        if history:
            logger.info("Historial de modelos:")
            for model in history[:5]:
                logger.info(f"  - {model['name']} ({model['modified'].strftime('%Y-%m-%d %H:%M')})")

        # Path is relative to the notebook's working directory.
        df = pd.read_csv('DENStream_Datasets/max_denstream_preprocessed.csv')
        
        logger.info(f"Iniciando entrenamiento con parámetros validados: {custom_config}")
        clusterer, summary = train_denstream_model(df, custom_config)
        logger.info(f"Modelo v{summary['version']} creado exitosamente")

        logger.info("=== RESUMEN DEL ENTRENAMIENTO ===")
        logger.info(f"Versión del modelo: v{summary['version']}")
        logger.info(f"Muestras procesadas: {summary['n_samples_processed']}/{summary['n_samples']}")
        logger.info(f"Tasa de éxito: {summary['success_rate']:.2%}")
        logger.info(f"Micro-clusters creados: {summary['n_microclusters']}")
        logger.info(f"Modelo guardado en: {summary['model_path']}")
        
    except Exception as e:
        # Any failure is logged with its full traceback instead of propagating,
        # so the cell itself never raises.
        logger.error(f"Error en ejecución principal: {e}")
        import traceback
        logger.error(f"Traceback completo: {traceback.format_exc()}")

def test_denstream_configurations():
    """Print three preset DenStream configurations and whether each is valid.

    A preset counts as valid when ``mu * beta`` is strictly greater than 1.
    Nothing is trained and nothing is returned; the report goes to stdout.
    """
    presets = {
        'conservadora': {'epsilon': 0.5, 'beta': 0.2, 'mu': 6, 'decaying_factor': 0.001},
        'estricta': {'epsilon': 0.3, 'beta': 0.1, 'mu': 12, 'decaying_factor': 0.001},
        'permisiva': {'epsilon': 0.8, 'beta': 0.3, 'mu': 4, 'decaying_factor': 0.001},
    }

    for label, params in presets.items():
        product = params['mu'] * params['beta']
        is_valid = product > 1
        verdict = '✓ VÁLIDO' if is_valid else '✗ INVÁLIDO'

        print(f"\n=== Configuración {label.upper()} ===")
        print(f"Parámetros: {params}")
        print(f"mu * beta = {product} ({verdict})")

        if is_valid:
            print("Esta configuración debería funcionar correctamente.")
        else:
            print("Esta configuración causará un error matemático.")

# Print a banner, then report which preset configurations satisfy mu * beta > 1.
print("=== PRUEBA DE CONFIGURACIONES ===")
test_denstream_configurations()

INFO:__main__:Verificación: mu * beta = 6 * 0.2 = 1.2000000000000002
INFO:__main__:Historial de modelos:
INFO:__main__:  - denstream_v0.pkl (2025-06-23 00:56)
INFO:__main__:Iniciando entrenamiento con parámetros validados: {'epsilon': 0.5, 'beta': 0.2, 'mu': 6, 'decaying_factor': 0.001}
INFO:__main__:Último modelo encontrado: DENStream_Model/denstream_v0.pkl
INFO:__main__:Nueva versión será: v1
INFO:__main__:Estadísticas de datos:
INFO:__main__:  x_km: min=-2.7676, max=1.7235, mean=-0.0000, std=1.0000
INFO:__main__:  y_km: min=-2.8663, max=2.5239, mean=0.0000, std=1.0000
INFO:__main__:  altitude_km: min=-2.0026, max=7.1977, mean=-0.0000, std=1.0000
INFO:__main__:  time_minutes: min=-0.6584, max=2.0201, mean=0.0000, std=1.0000
INFO:__main__:Distancia promedio entre puntos: 2.5991
INFO:__main__:Distancia mínima entre puntos: 0.0192
INFO:__main__:Procesando 15622 muestras con parámetros: {'epsilon': 0.5, 'beta': 0.2, 'mu': 6, 'decaying_factor': 0.001}
INFO:__main__:Cargando modelo existen

In [12]:
!mlflow ui --port 5001

[2025-06-23 02:20:39 +0000] [320] [INFO] Starting gunicorn 23.0.0
[2025-06-23 02:20:39 +0000] [320] [INFO] Listening at: http://127.0.0.1:5001 (320)
[2025-06-23 02:20:39 +0000] [320] [INFO] Using worker: sync
[2025-06-23 02:20:39 +0000] [321] [INFO] Booting worker with pid: 321
[2025-06-23 02:20:39 +0000] [322] [INFO] Booting worker with pid: 322
[2025-06-23 02:20:39 +0000] [323] [INFO] Booting worker with pid: 323
[2025-06-23 02:20:39 +0000] [324] [INFO] Booting worker with pid: 324
^C
[2025-06-23 02:49:10 +0000] [320] [INFO] Handling signal: int
[2025-06-23 02:49:10 +0000] [321] [INFO] Worker exiting (pid: 321)
[2025-06-23 02:49:10 +0000] [324] [INFO] Worker exiting (pid: 324)
[2025-06-23 02:49:10 +0000] [322] [INFO] Worker exiting (pid: 322)
[2025-06-23 02:49:10 +0000] [323] [INFO] Worker exiting (pid: 323)


<a style='text-decoration:none;line-height:16px;display:flex;color:#5B5B62;padding:10px;justify-content:end;' href='https://deepnote.com?utm_source=created-in-deepnote-cell&projectId=b1c0c563-baec-4f70-b443-fb48e15d9efd' target="_blank">
 </img>
Created in <span style='font-weight:600;margin-left:4px;'>Deepnote</span></a>