# 04d - Análisis Temporal y Modelos de Series de Tiempo

**Objetivo:** Desarrollar modelos temporales para analizar la evolución del riesgo de Alzheimer.
 
**Enfoque:**
- Series temporales para predicción de composite_risk_score
- Análisis de tendencias en biomarcadores
- Patrones temporales en actividad/sueño
- Modelos de supervivencia para progresión del riesgo

---

## Importar librerías

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.model_selection import TimeSeriesSplit
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
import mlflow
import mlflow.sklearn
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import Ridge
import warnings
warnings.filterwarnings('ignore')

In [None]:
# MLflow configuration: set the experiment used by the runs started later in this notebook
mlflow.set_experiment("temporal_analysis_alzheimer")

# Banner marking the start of the notebook output
print("🕒 Iniciando Análisis Temporal - Monitorización Multimodal Alzheimer")
print("=" * 60)

## 1. Carga y Preparación de Datos Temporales

In [None]:
def _generate_synthetic_temporal_data(n_subjects=1000, n_timepoints=5, seed=42):
    """Build a synthetic longitudinal dataset with one row per subject visit.

    Args:
        n_subjects: Number of simulated subjects.
        n_timepoints: Number of visits per subject (visits are 6 months apart).
        seed: Seed passed to ``np.random.seed`` so the output is reproducible.
            Note that this reseeds NumPy's global random state.

    Returns:
        DataFrame with ``n_subjects * n_timepoints`` rows.
    """
    np.random.seed(seed)

    data = []
    for subject_id in range(n_subjects):
        # Subject-level constants shared by all of the subject's visits.
        base_risk = np.random.uniform(0.1, 0.8)
        base_age = np.random.uniform(60, 85)
        apoe_status = np.random.choice([0, 1], p=[0.7, 0.3])

        for visit in range(n_timepoints):
            # Risk drifts upwards by 0.1 per visit plus Gaussian noise; the
            # same noise draw feeds both risk scores.
            time_factor = visit * 0.1
            noise = np.random.normal(0, 0.05)

            # The order of the random draws below determines the generated
            # values for a given seed; do not reorder the keys.
            data.append({
                'subject_id': f'SUBJ_{subject_id:04d}',
                'visit_number': visit + 1,
                'months_from_baseline': visit * 6,
                'composite_risk_score': np.clip(base_risk + time_factor + noise, 0, 1),
                'age_standardized': base_age + visit * 0.5,
                'APOE_e4_present': apoe_status,
                'biomarker_risk_score': np.clip(base_risk * 1.2 + time_factor + noise, 0, 1),
                'sleep_minutes_mean': np.random.normal(420 - visit * 10, 30),
                'sleep_disruptions_mean': np.random.poisson(2 + visit),
                'steps_mean': np.random.normal(8000 - visit * 200, 1000),
                'CDRSB_LOG': np.random.exponential(0.5 + visit * 0.1),
                'ABETA42': np.random.normal(200 - visit * 5, 20),
                'tau_pathology_score': np.random.uniform(0, 1) + visit * 0.05,
            })

    return pd.DataFrame(data)


def load_temporal_data(path='../data/processed/fe_master_dataset.csv'):
    """Load the processed dataset, falling back to synthetic temporal data.

    Args:
        path: CSV file to read. Defaults to the processed master dataset.

    Returns:
        The DataFrame read from ``path``; if the file cannot be read or
        parsed, a reproducible synthetic dataset (1000 subjects x 5 visits).
    """
    try:
        df = pd.read_csv(path)
    except (OSError, UnicodeDecodeError,
            pd.errors.EmptyDataError, pd.errors.ParserError) as exc:
        # Only I/O and parsing failures trigger the demo fallback; anything
        # else (e.g. KeyboardInterrupt) propagates instead of being swallowed.
        print("⚠️  Generando datos sintéticos para demostración")
        print(f"   Motivo: {type(exc).__name__}: {exc}")
        df = _generate_synthetic_temporal_data()
        print(f"✅ Datos sintéticos generados: {df.shape}")
    else:
        print(f"✅ Dataset cargado: {df.shape}")

    return df

# Load the dataset once; later cells read this notebook-level variable
df_temporal = load_temporal_data()

# Basic information about the temporal dataset.
# Each optional column falls back to 'N/A' when it is absent from the frame.
print(f"\n📊 Información del Dataset Temporal:")
print(f"   Sujetos únicos: {df_temporal['subject_id'].nunique() if 'subject_id' in df_temporal.columns else 'N/A'}")
print(f"   Rango temporal: {df_temporal['months_from_baseline'].min() if 'months_from_baseline' in df_temporal.columns else 'N/A'} - {df_temporal['months_from_baseline'].max() if 'months_from_baseline' in df_temporal.columns else 'N/A'} meses")
# Counts columns whose lowercase name contains 'mean', 'std', 'min' or 'max' as a substring
# (so e.g. 'sleep_minutes_mean' matches on both 'min' and 'mean' but is counted once)
print(f"   Variables temporales detectadas: {len([col for col in df_temporal.columns if any(temp in col.lower() for temp in ['mean', 'std', 'min', 'max'])])}")


## 2. Análisis de Patrones Temporales

In [None]:
def _subject_slope(x, y):
    """Return the least-squares slope of ``y`` over ``x``, or None if undefined.

    Observations where either value is NaN are ignored. A slope needs at least
    two valid points spread over at least two distinct ``x`` values.
    """
    valid = ~(np.isnan(x) | np.isnan(y))
    if valid.sum() < 2:
        return None
    x_valid, y_valid = x[valid], y[valid]
    if x_valid.max() == x_valid.min():
        return None
    return np.polyfit(x_valid, y_valid, 1)[0]


def analyze_temporal_patterns(df, progression_threshold=0.01):
    """Summarise per-subject linear trends of the key longitudinal variables.

    For every analysed variable a straight line is fitted per subject against
    ``months_from_baseline`` and the slopes are aggregated.

    Args:
        df: Long-format DataFrame with one row per subject visit.
        progression_threshold: A subject counts as "progressing" on a variable
            when its slope is strictly greater than this value.

    Returns:
        Tuple ``(trends_data, available_vars)``. ``trends_data`` maps each
        variable to ``mean_slope``, ``std_slope`` and ``progression_subjects``;
        it is empty when ``subject_id`` or ``months_from_baseline`` is
        missing. ``available_vars`` lists the analysed variables.
    """
    print("🔍 Analizando Patrones Temporales...")

    # Key variables for the temporal analysis.
    temporal_vars = [
        'composite_risk_score', 'biomarker_risk_score', 'sleep_minutes_mean',
        'sleep_disruptions_mean', 'steps_mean', 'CDRSB_LOG', 'ABETA42', 'tau_pathology_score'
    ]
    available_vars = [var for var in temporal_vars if var in df.columns]

    if not available_vars:
        print("⚠️  No se encontraron variables temporales específicas")
        # Fall back to the first 8 numeric columns.
        available_vars = df.select_dtypes(include=[np.number]).columns[:8].tolist()

    print(f"   Variables analizadas: {len(available_vars)}")

    trends_data = {}
    if 'subject_id' not in df.columns or 'months_from_baseline' not in df.columns:
        return trends_data, available_vars

    # Split the frame by subject once instead of re-filtering the whole frame
    # for every (variable, subject) pair.
    slopes = {var: [] for var in available_vars}
    for _, subject_data in df.groupby('subject_id', sort=False):
        if len(subject_data) < 2:
            continue
        x = subject_data['months_from_baseline'].to_numpy(dtype=float)
        for var in available_vars:
            y = subject_data[var].to_numpy(dtype=float)
            slope = _subject_slope(x, y)
            # Subjects without an estimable slope are skipped so that missing
            # data cannot turn the aggregates into NaN or bias them to zero.
            if slope is not None:
                slopes[var].append(slope)

    for var, subject_trends in slopes.items():
        trends_data[var] = {
            'mean_slope': np.mean(subject_trends) if subject_trends else 0,
            'std_slope': np.std(subject_trends) if subject_trends else 0,
            'progression_subjects': sum(1 for s in subject_trends if s > progression_threshold),
        }

    return trends_data, available_vars

# Analyse temporal patterns; `temporal_variables` is reused by the final summary cell
trends_analysis, temporal_variables = analyze_temporal_patterns(df_temporal)

# `trends_analysis` is empty when the frame lacks subject/time columns, so nothing is printed then
if trends_analysis:
    print("\n📈 Tendencias Temporales Detectadas:")
    for var, stats in trends_analysis.items():
        print(f"   {var}:")
        print(f"      Pendiente promedio: {stats['mean_slope']:.4f}")
        print(f"      Sujetos con progresión: {stats['progression_subjects']}")


## 3. Modelos de Series Temporales

In [None]:
class TemporalRiskModel:
    """Container for temporal models that predict Alzheimer's risk scores.

    Fitted estimators and their scalers are stored by name in ``models`` and
    ``scalers``; Random Forest importances are kept in ``feature_importance``.
    Every ``fit_*`` method opens its own MLflow run and reports metrics
    computed on the training data only (no held-out evaluation happens here).
    """

    # Bookkeeping columns that identify a row rather than describe the subject.
    _ID_COLUMNS = ('subject_id', 'visit_number', 'months_from_baseline')

    def __init__(self, model_type='autoregressive'):
        self.model_type = model_type
        self.models = {}
        self.scalers = {}
        self.feature_importance = {}

    def create_sequences(self, df, target_col, sequence_length=3, prediction_horizon=1):
        """Build sliding-window training samples.

        Args:
            df: Long-format DataFrame. With a ``subject_id`` column, windows
                are built per subject ordered by ``months_from_baseline``.
            target_col: Column to predict.
            sequence_length: Number of consecutive visits in each input window.
            prediction_horizon: Number of following visits to predict.

        Returns:
            Tuple ``(X, y)`` of NumPy arrays.

            * With ``subject_id``: each row of ``X`` is the flattened window
              of all numeric/boolean feature columns (the target's own past
              values included), and ``y`` holds the mean target over the
              horizon. Windows containing NaN are dropped.
            * Without ``subject_id``: ``X`` holds windows of the target only
              and ``y`` has shape ``(n_samples, prediction_horizon)``.
        """
        if 'subject_id' not in df.columns:
            print("⚠️  No hay estructura de sujetos, usando secuencias simples")
            data = df[target_col].values
            window_ends = range(sequence_length, len(data) - prediction_horizon + 1)
            X = [data[i - sequence_length:i] for i in window_ends]
            y = [data[i:i + prediction_horizon] for i in window_ends]
            return np.array(X), np.array(y)

        # Only numeric/boolean columns can be flattened into a float feature
        # vector; string or datetime columns would make np.isnan raise.
        numeric_cols = df.select_dtypes(include=[np.number, 'bool']).columns
        feature_cols = [col for col in numeric_cols if col not in self._ID_COLUMNS]

        min_visits = sequence_length + prediction_horizon
        X, y = [], []
        # sort=False keeps subjects in order of first appearance.
        for _, subject_data in df.groupby('subject_id', sort=False):
            if len(subject_data) < min_visits:
                continue

            subject_data = subject_data.sort_values('months_from_baseline')
            subject_features = subject_data[feature_cols].to_numpy(dtype=float)
            subject_target = subject_data[target_col].to_numpy(dtype=float)

            for i in range(sequence_length, len(subject_data) - prediction_horizon + 1):
                X_seq = subject_features[i - sequence_length:i].flatten()
                y_seq = subject_target[i:i + prediction_horizon]

                if np.isnan(X_seq).any() or np.isnan(y_seq).any():
                    continue
                X.append(X_seq)
                y.append(y_seq.mean())  # average target over the horizon

        return np.array(X), np.array(y)

    def fit_autoregressive_model(self, X, y, model_name="AR_Model",
                                 n_estimators=100, max_depth=10):
        """Fit a Random Forest on windowed features and log the run to MLflow.

        Args:
            X: 2-D array of input features, one row per window.
            y: Targets; a single-column 2-D array is flattened to 1-D.
            model_name: Key under which the model and scaler are stored.
            n_estimators: Number of trees in the forest.
            max_depth: Maximum depth of each tree.

        Returns:
            The fitted ``RandomForestRegressor``.
        """
        y = np.asarray(y)
        if y.ndim == 2 and y.shape[1] == 1:
            # The no-subject branch of create_sequences returns a column vector.
            y = y.ravel()

        with mlflow.start_run(run_name=f"temporal_{model_name}"):
            scaler = StandardScaler()
            X_scaled = scaler.fit_transform(X)

            # Random Forest to capture non-linear temporal effects.
            model = RandomForestRegressor(
                n_estimators=n_estimators,
                max_depth=max_depth,
                random_state=42,
                n_jobs=-1
            )
            model.fit(X_scaled, y)

            self.models[model_name] = model
            self.scalers[model_name] = scaler
            self.feature_importance[model_name] = model.feature_importances_

            # In-sample metrics only.
            y_pred = model.predict(X_scaled)
            mse = mean_squared_error(y, y_pred)
            mae = mean_absolute_error(y, y_pred)
            r2 = r2_score(y, y_pred)

            mlflow.log_param("model_type", "RandomForest")
            mlflow.log_param("n_estimators", n_estimators)
            mlflow.log_param("max_depth", max_depth)
            # X is already flattened, so its width is window length times the
            # number of features, not the sequence length itself.
            mlflow.log_param("n_input_features", X_scaled.shape[1])
            mlflow.log_metric("train_mse", mse)
            mlflow.log_metric("train_mae", mae)
            mlflow.log_metric("train_r2", r2)

            mlflow.sklearn.log_model(model, "temporal_model")

            print(f"✅ Modelo {model_name} entrenado:")
            print(f"   MSE: {mse:.4f}")
            print(f"   MAE: {mae:.4f}")
            print(f"   R²: {r2:.4f}")

            return model

    def fit_trend_model(self, df, target_col, model_name="Trend_Model", alpha=1.0):
        """Fit a Ridge regression of the target on time and baseline covariates.

        Args:
            df: DataFrame containing ``months_from_baseline`` and the target.
            target_col: Column to predict.
            model_name: Key under which the model and scaler are stored.
            alpha: Ridge regularisation strength.

        Returns:
            The fitted ``Ridge`` model, or None when ``months_from_baseline``
            is missing.
        """
        if 'months_from_baseline' not in df.columns:
            print("⚠️  No hay información temporal para modelo de tendencias")
            return None

        with mlflow.start_run(run_name=f"temporal_{model_name}"):
            # Time is always a feature; covariates are added when present.
            # The target is never used as its own predictor, which would leak
            # the answer (e.g. when modelling biomarker_risk_score).
            additional_features = ['age_standardized', 'APOE_e4_present', 'biomarker_risk_score']
            features = ['months_from_baseline'] + [
                feat for feat in additional_features
                if feat in df.columns and feat != target_col
            ]

            # Missing values are imputed with the column mean.
            X = df[features].fillna(df[features].mean())
            y = df[target_col].fillna(df[target_col].mean())

            scaler = StandardScaler()
            X_scaled = scaler.fit_transform(X)

            model = Ridge(alpha=alpha, random_state=42)
            model.fit(X_scaled, y)

            self.models[model_name] = model
            self.scalers[model_name] = scaler

            # In-sample metrics only.
            y_pred = model.predict(X_scaled)
            mse = mean_squared_error(y, y_pred)
            mae = mean_absolute_error(y, y_pred)
            r2 = r2_score(y, y_pred)

            mlflow.log_param("model_type", "Ridge")
            mlflow.log_param("alpha", alpha)
            mlflow.log_param("features", features)
            mlflow.log_metric("train_mse", mse)
            mlflow.log_metric("train_mae", mae)
            mlflow.log_metric("train_r2", r2)

            mlflow.sklearn.log_model(model, "trend_model")

            print(f"✅ Modelo {model_name} entrenado:")
            print(f"   Features utilizadas: {len(features)}")
            print(f"   MSE: {mse:.4f}")
            print(f"   R²: {r2:.4f}")

            return model

# Instantiate the temporal model container with its default model_type ('autoregressive')
temporal_model = TemporalRiskModel()

print("🤖 Desarrollando Modelos Temporales...")

## 4. Entrenamiento de Modelos Autoregresivos

In [None]:
# Autoregressive model for composite_risk_score
if 'composite_risk_score' in df_temporal.columns:
    print("\n🔄 Entrenando Modelo Autoregresivo...")
    
    # Build temporal sequences: windows of 3 visits used to predict the following visit
    X_seq, y_seq = temporal_model.create_sequences(
        df_temporal, 
        'composite_risk_score', 
        sequence_length=3,
        prediction_horizon=1
    )
    
    if len(X_seq) > 0:
        print(f"   Secuencias creadas: {X_seq.shape}")
        
        # Train the model on the windowed data
        ar_model = temporal_model.fit_autoregressive_model(
            X_seq, y_seq, "Autoregressive_Risk"
        )
    else:
        print("⚠️  No se pudieron crear suficientes secuencias temporales")
        
        # Simplified fallback: regress the target on every other int64/float64 column,
        # row by row (no windowing). NOTE: `ar_model` stays undefined if no such column exists.
        target_col = 'composite_risk_score'
        available_features = [col for col in df_temporal.columns 
                            if col != target_col and df_temporal[col].dtype in ['int64', 'float64']]
        
        if available_features:
            # Missing values are imputed with the column mean
            X_simple = df_temporal[available_features].fillna(df_temporal[available_features].mean())
            y_simple = df_temporal[target_col].fillna(df_temporal[target_col].mean())
            
            ar_model = temporal_model.fit_autoregressive_model(
                X_simple.values, y_simple.values, "Simple_Risk_Model"
            )

## 5. Modelos de Tendencias Temporales

In [None]:
# Trend model for the composite risk score
print("\n📈 Entrenando Modelo de Tendencias...")

if 'composite_risk_score' in df_temporal.columns:
    trend_model = temporal_model.fit_trend_model(
        df_temporal, 'composite_risk_score', "Risk_Trend"
    )

# One additional trend model per biomarker column present in the dataset
biomarker_cols = ['biomarker_risk_score', 'ABETA42', 'tau_pathology_score']
available_biomarkers = [col for col in biomarker_cols if col in df_temporal.columns]

for biomarker in available_biomarkers:
    print(f"\n🧬 Modelo de tendencias para {biomarker}...")
    # Each model is stored in temporal_model.models under "Trend_<biomarker>"
    temporal_model.fit_trend_model(
        df_temporal, biomarker, f"Trend_{biomarker}"
    )

## 6. Análisis de Supervivencia Temporal

In [None]:
class SurvivalRiskModel:
    """Time-to-high-risk model built from baseline covariates.

    The "event" is the first visit at which ``composite_risk_score`` reaches a
    threshold. The fitted estimator is a weighted Ridge regression on the
    observed time, used as a rough stand-in for a Cox model: censored
    subjects are only down-weighted, their censoring is not modelled.
    """

    def __init__(self):
        self.model = None

    def create_survival_data(self, df, risk_threshold=0.6):
        """Derive per-subject time-to-event data from longitudinal visits.

        Args:
            df: Long-format DataFrame with ``subject_id``,
                ``months_from_baseline`` and ``composite_risk_score``.
            risk_threshold: Score at or above which a visit counts as the event.

        Returns:
            Tuple ``(X, T, E)``: baseline covariates, time to event (or last
            follow-up when censored) and the event indicator (1 = event,
            0 = censored). Returns ``(None, None, None)`` when a required
            column is missing or no subject is available.
        """
        required = ('subject_id', 'months_from_baseline', 'composite_risk_score')
        if any(col not in df.columns for col in required):
            print("⚠️  Datos insuficientes para análisis de supervivencia")
            return None, None, None

        survival_data = []
        # One pass over subjects; sort=False keeps first-appearance order.
        for subject, subject_data in df.groupby('subject_id', sort=False):
            subject_data = subject_data.sort_values('months_from_baseline')
            months = subject_data['months_from_baseline']
            high_risk = subject_data['composite_risk_score'] >= risk_threshold

            if high_risk.any():
                # Event occurred: time of the first high-risk visit.
                time_to_event = months[high_risk].iloc[0]
                event = 1
            else:
                # Censored at the last observed visit.
                time_to_event = months.max()
                event = 0

            # Covariates come from the earliest visit; optional columns fall
            # back to fixed defaults when absent.
            baseline_features = subject_data.iloc[0]
            survival_data.append({
                'subject_id': subject,
                'time_to_event': time_to_event,
                'event': event,
                'baseline_risk': baseline_features['composite_risk_score'],
                'age': baseline_features.get('age_standardized', 70),
                'apoe_status': baseline_features.get('APOE_e4_present', 0),
                'baseline_biomarker': baseline_features.get('biomarker_risk_score', 0),
            })

        if not survival_data:
            print("⚠️  Datos insuficientes para análisis de supervivencia")
            return None, None, None

        survival_df = pd.DataFrame(survival_data)

        feature_cols = ['baseline_risk', 'age', 'apoe_status', 'baseline_biomarker']
        X = survival_df[feature_cols].fillna(survival_df[feature_cols].mean())
        T = survival_df['time_to_event']
        E = survival_df['event']

        return X, T, E

    def fit_cox_approximation(self, X, T, E, model_name="Cox_Approximation",
                              alpha=0.1, event_weight=2.0):
        """Fit the weighted Ridge approximation and log the run to MLflow.

        Args:
            X: Baseline covariates, one row per subject.
            T: Time to event or censoring.
            E: Event indicator (1 = event observed, 0 = censored).
            model_name: Suffix for the MLflow run name.
            alpha: Ridge regularisation strength.
            event_weight: Sample weight for subjects with an observed event;
                censored subjects get weight 1.0.

        Returns:
            The fitted ``Ridge`` model (also stored in ``self.model``).
        """
        with mlflow.start_run(run_name=f"survival_{model_name}"):
            # Observed events are more informative than censored times, so
            # they get a larger weight.
            sample_weight = np.where(E == 1, event_weight, 1.0)

            from sklearn.linear_model import Ridge
            model = Ridge(alpha=alpha, random_state=42)
            model.fit(X, T, sample_weight=sample_weight)

            self.model = model

            # In-sample weighted error.
            T_pred = model.predict(X)
            mse = mean_squared_error(T, T_pred, sample_weight=sample_weight)

            mlflow.log_param("model_type", "Weighted_Ridge_Survival")
            mlflow.log_param("alpha", alpha)
            mlflow.log_param("events_observed", E.sum())
            mlflow.log_param("total_subjects", len(E))
            mlflow.log_metric("weighted_mse", mse)
            mlflow.log_metric("event_rate", E.mean())

            mlflow.sklearn.log_model(model, "survival_model")

            print(f"✅ Modelo de supervivencia entrenado:")
            print(f"   Eventos observados: {E.sum()}/{len(E)} ({E.mean():.2%})")
            print(f"   MSE ponderado: {mse:.4f}")

            return model

# Survival analysis
print("\n⏱️  Desarrollando Modelo de Supervivencia...")

survival_model = SurvivalRiskModel()
# Uses the default risk_threshold of create_survival_data
X_surv, T_surv, E_surv = survival_model.create_survival_data(df_temporal)

# create_survival_data signals insufficient data by returning None values
if X_surv is not None:
    print(f"   Sujetos para análisis: {len(X_surv)}")
    cox_model = survival_model.fit_cox_approximation(X_surv, T_surv, E_surv)
else:
    print("⚠️  Análisis de supervivencia no disponible con datos actuales")


## 7. Ensemble Temporal

In [None]:
class TemporalEnsemble:
    """Weighted average of the predictions of several fitted models."""

    def __init__(self):
        self.models = {}
        self.weights = {}

    def add_model(self, name, model, weight=1.0):
        """Register ``model`` under ``name`` with the given averaging weight."""
        self.models[name] = model
        self.weights[name] = weight

    def predict_ensemble(self, X, scaler_dict=None):
        """Return the weighted mean prediction of the registered models.

        Prediction is best-effort: a model that raises, or whose output shape
        differs from the first successful model, is reported and left out of
        the average.

        Args:
            X: Input features passed to every model.
            scaler_dict: Optional mapping of model name to a fitted scaler;
                when a model has an entry, ``X`` is transformed with it first.

        Returns:
            Array of averaged predictions, or None when there are no models,
            no model produced a usable prediction, or the contributing
            weights sum to zero.
        """
        if not self.models:
            return None

        predictions = []
        total_weight = 0
        expected_shape = None

        for name, model in self.models.items():
            try:
                if scaler_dict and name in scaler_dict:
                    X_scaled = scaler_dict[name].transform(X)
                else:
                    X_scaled = X

                pred = np.asarray(model.predict(X_scaled))
                if expected_shape is None:
                    expected_shape = pred.shape
                elif pred.shape != expected_shape:
                    # Mismatched outputs cannot be averaged element-wise.
                    raise ValueError(
                        f"prediction shape {pred.shape} != {expected_shape}"
                    )

                weight = self.weights[name]
                predictions.append(pred * weight)
                total_weight += weight

            except Exception as e:
                print(f"⚠️  Error en modelo {name}: {str(e)}")
                continue

        if not predictions:
            return None
        if total_weight == 0:
            # Avoid dividing by zero (which would yield NaN/inf predictions).
            print("⚠️  La suma de pesos del ensemble es cero")
            return None

        return np.sum(predictions, axis=0) / total_weight

# Create the (initially empty) temporal ensemble; models are registered in the next cell
print("\n🎯 Creando Ensemble Temporal...")

ensemble = TemporalEnsemble()

In [None]:
# Register every fitted temporal model in the ensemble with equal weight.
# NOTE(review): these models were trained in the cells above on different inputs and targets
# (windowed sequences vs. trend features; risk score vs. biomarkers), so averaging their
# predictions for a single X may not be meaningful — confirm the intended usage.
for model_name, model in temporal_model.models.items():
    ensemble.add_model(model_name, model, weight=1.0)
    print(f"   ✅ Agregado: {model_name}")

# The survival model, when fitted, joins with half the weight of the others
if survival_model.model is not None:
    ensemble.add_model("survival", survival_model.model, weight=0.5)
    print("   ✅ Agregado: Survival Model")

print(f"\n📊 Ensemble creado con {len(ensemble.models)} modelos")


## 8. Validación Temporal Básica

In [None]:
def temporal_validation_split(df, test_ratio=0.2):
    """Split ``df`` into train/test parts, holding out the latest visits.

    The cut-off is the ``1 - test_ratio`` quantile of ``months_from_baseline``:
    rows strictly before it go to training, rows at or after it go to test.
    Rows whose time value is NaN satisfy neither comparison and end up in
    neither part. Without a ``months_from_baseline`` column the function falls
    back to a seeded random split.

    Args:
        df: DataFrame to split.
        test_ratio: Approximate fraction of rows reserved for testing.

    Returns:
        The pair ``(train_df, test_df)``.
    """
    time_col = 'months_from_baseline'

    if time_col in df.columns:
        months = df[time_col]
        time_threshold = months.quantile(1 - test_ratio)

        # Two explicit comparisons (not a negated mask) so NaN rows stay out of both parts.
        before_cutoff = months < time_threshold
        from_cutoff = months >= time_threshold
        train_df, test_df = df[before_cutoff], df[from_cutoff]

        print("🔄 Split temporal:")
        print(f"   Entrenamiento: {len(train_df)} registros (< {time_threshold:.1f} meses)")
        print(f"   Prueba: {len(test_df)} registros (≥ {time_threshold:.1f} meses)")

        return train_df, test_df

    # No temporal information available: random split with a fixed seed.
    from sklearn.model_selection import train_test_split
    return train_test_split(df, test_size=test_ratio, random_state=42)

# Perform the temporal split with the default test_ratio
train_temporal, test_temporal = temporal_validation_split(df_temporal)

print("\n✅ Validación temporal preparada")

## 9. Guardado de Modelos Temporales

In [None]:
import joblib
import os

# Create the output directory for the models (no error if it already exists)
os.makedirs('../models/temporal/', exist_ok=True)

print("💾 Guardando Modelos Temporales...")

# Save each fitted model to its own file, named after its key in temporal_model.models
for model_name, model in temporal_model.models.items():
    model_path = f'../models/temporal/{model_name}.joblib'
    joblib.dump(model, model_path)
    print(f"   ✅ {model_name} guardado")

In [None]:
# Save the scaler fitted alongside each model as "<model name>_scaler.joblib"
for scaler_name, scaler in temporal_model.scalers.items():
    scaler_path = f'../models/temporal/{scaler_name}_scaler.joblib'
    joblib.dump(scaler, scaler_path)
    print(f"   ✅ {scaler_name} scaler guardado")

# Save the survival model, if one was fitted
if survival_model.model is not None:
    survival_path = '../models/temporal/survival_model.joblib'
    joblib.dump(survival_model.model, survival_path)
    print("   ✅ Survival model guardado")

# Save the whole ensemble object.
# NOTE(review): TemporalEnsemble is defined in this notebook, so loading this file elsewhere
# presumably requires the same class definition to be importable — confirm before reuse.
ensemble_path = '../models/temporal/temporal_ensemble.joblib'
joblib.dump(ensemble, ensemble_path)
print("   ✅ Temporal ensemble guardado")

## 10. Resumen del Desarrollo Temporal

In [None]:
def generate_temporal_summary():
    """Print and return an overview of the temporal modelling work.

    Reads the notebook-level objects ``temporal_model``, ``ensemble``,
    ``temporal_variables``, ``df_temporal`` and ``survival_model``.

    Returns:
        Dict with the number of models, ensemble components, temporal
        features, data points and unique subjects ('N/A' when the dataset
        has no ``subject_id`` column).
    """
    trained_names = list(temporal_model.models.keys())
    survival_fitted = survival_model.model is not None

    if 'subject_id' in df_temporal.columns:
        subject_count = df_temporal['subject_id'].nunique()
    else:
        subject_count = 'N/A'

    summary = {
        'models_developed': len(trained_names),
        'ensemble_components': len(ensemble.models),
        'temporal_features': len(temporal_variables),
        'data_points': len(df_temporal),
        'unique_subjects': subject_count,
    }

    # Header and headline figures.
    banner = "=" * 60
    print("\n" + banner)
    print("📋 RESUMEN DESARROLLO MODELOS TEMPORALES")
    print(banner)
    print(f"🤖 Modelos desarrollados: {summary['models_developed']}")
    print(f"🎯 Componentes ensemble: {summary['ensemble_components']}")
    print(f"📊 Features temporales: {summary['temporal_features']}")
    print(f"💾 Puntos de datos: {summary['data_points']:,}")
    print(f"👥 Sujetos únicos: {summary['unique_subjects']}")

    # Numbered list of models; the survival model, if fitted, comes last.
    print("\n🔧 Modelos específicos desarrollados:")
    for position, model_name in enumerate(trained_names, start=1):
        print(f"   {position}. {model_name}")
    if survival_fitted:
        print(f"   {len(trained_names) + 1}. Survival Analysis Model")

    capability_lines = [
        "   ✅ Predicción autoregresiva",
        "   ✅ Análisis de tendencias",
        "   ✅ Ensemble temporal",
    ]
    if survival_fitted:
        capability_lines.append("   ✅ Análisis de supervivencia")
    print("\n📈 Capacidades temporales implementadas:")
    for line in capability_lines:
        print(line)

    next_step_lines = (
        "   - Evaluación exhaustiva en 04e_model_evaluation.ipynb",
        "   - Optimización de hiperparámetros",
        "   - Validación cruzada temporal",
        "   - Análisis de explicabilidad temporal",
    )
    print("\n🎯 Próximos pasos:")
    for line in next_step_lines:
        print(line)

    return summary

# Generate the final summary (printed by the function and kept for later inspection)
temporal_summary = generate_temporal_summary()

# Closing messages: where the artefacts were written and which MLflow experiment holds the runs
print(f"\n🏁 Desarrollo de Modelos Temporales COMPLETADO")
print(f"   📁 Modelos guardados en: ../models/temporal/")
print(f"   📊 MLflow experiments: temporal_analysis_alzheimer")

---

**Abraham Tartalos**