# Train with time-300B 

We'll  training the two architectures (LSTM and Neural ODE) using time-300B taking windows of 7, 14 and 21 sequenses.

This will be the experiments that we are going to do for both architectures:

- Baseline $100 \%$ of dataset

- Baseline $10 \%$ of dataset

- Baseline $3 \%$ of dataset

- Destillation with Time-Moe $3 \%$ of dataset

In [1]:
# Cargar la data
# Seleccionar las ventanas de cada sequencia de serie
# Dividir el porcentaje con muestreos (Separar 80-20)
# Entrenar los modelos
# Hacer destilación


### Load data

In [2]:
import random
import pandas as pd
import numpy as np
from Time_MoE.time_moe.datasets.time_moe_dataset import TimeMoEDataset

# Importing custom functions
import sys
import os
root_path = os.path.abspath(os.path.join(os.getcwd(), '..'))
sys.path.append(root_path)

from baseline.functions import load_data,create_intervals,create_windows,smape,smape_chunked
from sklearn.metrics import mean_absolute_error, mean_squared_error

def load_data_clean():
    ds = TimeMoEDataset(data_folder='Time-300B\healthcare',normalization_method='zero')

    verbose = True
    total = len(ds)
    valid_indices = []
    # Iterar y filtrar
    for i in range(total):
        try:
            seq = ds[i]  # seq es numpy.ndarray según comprobaste
        except Exception as e:
            # Si hay error al obtener la secuencia, lo avisamos y saltamos
            if verbose:
                print(f"Advertencia: no se pudo obtener ds[{i}]: {e}")
            continue
        
        # Comprobación: si todos los valores son NaN, lo descartamos
        # seq es numpy.ndarray; cuidado si dims especiales, pero np.isnan funcionará elementwise.
        try:
            if not np.all(np.isnan(seq)):
                valid_indices.append(i)
        except Exception as e:
            # En caso de que seq no sea array puro, convertir primero:
            try:
                arr = np.array(seq)
                if not np.all(np.isnan(arr)):
                    valid_indices.append(i)
            except Exception as e2:
                if verbose:
                    print(f"Error al verificar NaN en secuencia índice {i}: {e2}")
                # Decidir si incluirla o no. Aquí optamos por descartarla:
                continue
    
    valid_count = len(valid_indices)
    if verbose:
        print(f"Secuencias totales en ds: {total}")
        print(f"Secuencias válidas (no todo NaN): {valid_count}")
        print(f"Secuencias descartadas: {total - valid_count}")
        sequences_validas = []

    for idx in valid_indices:
        try:
            sequences_validas.append(ds[idx])
        except Exception as e:
            if verbose:
                print(f"Error al extraer ds[{idx}] después de filtrar: {e}")
            # Podrías decidir saltar o detener. Aquí solo saltamos.
    return sequences_validas

def create_windows_from_sequences(sequences, window_size=15, horizon=1):
    """
    Dada una lista de secuencias (numpy arrays 1D), crea ventanas deslizantes:
    - X: array de shape (num_samples, window_size, 1)
    - y: array de shape (num_samples,)
    Cada muestra usa window_size pasos para predecir el siguiente valor (horizon=1).
    """
    X_list = []
    y_list = []
    for seq in sequences:
        # Asegurar numpy array
        arr = np.array(seq).astype(float)
        T = arr.shape[0]
        # Solo si la longitud es mayor que window_size + horizon - 1
        if T >= window_size + horizon:
            for start in range(0, T - window_size - horizon + 1):
                window = arr[start:start+window_size]
                target = arr[start+window_size:start+window_size+horizon]
                # Para horizon=1, target es un array de longitud 1; tomamos el escalar
                X_list.append(window.reshape(window_size, 1))
                y_list.append(target[0] if horizon == 1 else target)
    if len(X_list) == 0:
        return np.empty((0, window_size, 1)), np.empty((0,))
    X = np.stack(X_list, axis=0)
    y = np.array(y_list)

    # Supongamos X tiene forma (N, window_size, 1), y y forma (N,)
    mask_valid = ~np.isnan(X).any(axis=(1,2)) & ~np.isnan(y)
    # Mantener solo muestras sin NaN:
    X_clean = X[mask_valid]
    y_clean = y[mask_valid]
    print("De", X.shape[0], "muestras, quedan", X_clean.shape[0], "sin NaN")

    return X_clean, y_clean

In [3]:
ds = load_data_clean()

X, y = create_windows_from_sequences(ds, window_size=15, horizon=1)

Secuencias totales en ds: 1752
Secuencias válidas (no todo NaN): 1752
Secuencias descartadas: 0
De 433317 muestras, quedan 433317 sin NaN


In [4]:
from sklearn.model_selection import train_test_split
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, shuffle=True, random_state=42)

### LSTM - 100 %

In [None]:
## Multi head training 
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense
from tensorflow.keras.callbacks import EarlyStopping

def build_lstm_model(window_size=15, n_features=1, lstm_units=50):
    model = Sequential([
        LSTM(lstm_units, input_shape=(window_size, n_features)),
        Dense(1)  # para predicción de un valor escalar siguiente
    ])
    model.compile(optimizer='adam', loss='mse')
    return model

# Suponiendo que X_train, y_train, X_val, y_val están listos:
model = build_lstm_model(window_size=15, n_features=1, lstm_units=50)
es = EarlyStopping(monitor='val_loss', patience=5, restore_best_weights=True)
history = model.fit(
    X_train, y_train,
    validation_data=(X_val, y_val),
    epochs=20,
    batch_size=32,
    callbacks=[es]
)



Epoch 1/20


  super().__init__(**kwargs)


[1m10833/10833[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m51s[0m 5ms/step - loss: 0.3021 - val_loss: 0.2583
Epoch 2/20
[1m10833/10833[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m67s[0m 6ms/step - loss: 0.2623 - val_loss: 0.2557
Epoch 3/20
[1m10833/10833[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m66s[0m 6ms/step - loss: 0.2592 - val_loss: 0.2489
Epoch 4/20
[1m10833/10833[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m65s[0m 6ms/step - loss: 0.2534 - val_loss: 0.2487
Epoch 5/20
[1m10833/10833[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m67s[0m 6ms/step - loss: 0.2507 - val_loss: 0.2456
Epoch 6/20
[1m10833/10833[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m66s[0m 6ms/step - loss: 0.2504 - val_loss: 0.2453
Epoch 7/20
[1m10833/10833[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m67s[0m 6ms/step - loss: 0.2549 - val_loss: 0.2439
Epoch 8/20
[1m10833/10833[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m66s[0m 6ms/step - loss: 0.2496 - val_loss: 0.2445
Epoch 9/20


In [8]:
# Evaluar en conjunto de validación

y_pred = model.predict(X_val)

mae = mean_absolute_error(y_val, y_pred)
mse = mean_squared_error(y_val, y_pred)
#smape_value = smape_new(y_val, y_pred)

print("MAE:", mae)
print("MSE:", mse)
#print("SMAPE:", smape_value)

[1m2709/2709[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 1ms/step
MAE: 0.2671545822468988
MSE: 0.24234376171207261


In [9]:
smape_val = smape_chunked(y_val, y_pred, chunk_size=500_000)
print("SMAPE:", smape_val)

SMAPE: 55.25035085214161


In [None]:
# Guardar el modelo entrenado
model.save('Models_lstm/lstm_healthcare_model_100.keras')

### LSTM - 10 %



In [12]:
# function to take only a percentage of the data

def sample_fraction(X, y, fraction, random_state=None, min_samples=1):
    """
    Muestra aleatoriamente una fracción de los datos (X, y).
    
    Args:
        X (np.ndarray): Array de entrada de forma (N, ..., ...), donde N es el número de muestras.
        y (np.ndarray): Array de etiquetas de forma (N, ...) correspondiente a X.
        fraction (float): Fracción de datos a tomar. Debe estar en (0, 1]. Por ejemplo, 0.10 para 10% o 0.03 para 3%.
        random_state (int o None): Semilla para reproducibilidad. Si None, aleatorio.
        min_samples (int): Número mínimo de muestras a retornar si fraction*N < min_samples. Por defecto 1.
    
    Retorna:
        X_sample (np.ndarray): Subconjunto muestreado de X de tamaño aproximadamente floor(N * fraction) o al menos min_samples.
        y_sample (np.ndarray): Subconjunto muestreado de y correspondiente.
    """
    X = np.asarray(X)
    y = np.asarray(y)
    assert X.shape[0] == y.shape[0], f"X e y deben tener el mismo número de muestras en la dimensión 0: {X.shape[0]} vs {y.shape[0]}"
    assert 0 < fraction <= 1, f"fraction debe estar en (0, 1], se recibió: {fraction}"
    
    N = X.shape[0]
    # Calcular tamaño de la muestra
    sample_size = int(np.floor(N * fraction))
    # Asegurar al menos min_samples si sample_size es menor
    sample_size = max(sample_size, min_samples) if N > 0 else 0
    sample_size = min(sample_size, N)  # no exceder N
    
    # Generar índices aleatorios sin reemplazo
    rng = np.random.default_rng(random_state)
    indices = rng.choice(N, size=sample_size, replace=False)
    
    return X[indices], y[indices]

X_10, y_10 = sample_fraction(X, y, 0.10, random_state=42)

In [14]:
X_train, X_val, y_train, y_val = train_test_split(X_10, y_10, test_size=0.2, shuffle=True, random_state=42)

In [None]:
# Suponiendo que X_train, y_train, X_val, y_val están listos:
model = build_lstm_model(window_size=15, n_features=1, lstm_units=50)
es = EarlyStopping(monitor='val_loss', patience=5, restore_best_weights=True)
history = model.fit(
    X_train, y_train,
    validation_data=(X_val, y_val),
    epochs=20,
    batch_size=32,
    callbacks=[es]
)

Epoch 1/20


  super().__init__(**kwargs)


[1m1084/1084[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 4ms/step - loss: 0.3657 - val_loss: 0.2746
Epoch 2/20
[1m1084/1084[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 4ms/step - loss: 0.2964 - val_loss: 0.2593
Epoch 3/20
[1m1084/1084[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 4ms/step - loss: 0.2817 - val_loss: 0.2501
Epoch 4/20
[1m1084/1084[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 3ms/step - loss: 0.2538 - val_loss: 0.2524
Epoch 5/20
[1m1084/1084[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 4ms/step - loss: 0.2723 - val_loss: 0.2501
Epoch 6/20
[1m1084/1084[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 4ms/step - loss: 0.2532 - val_loss: 0.2478
Epoch 7/20
[1m1084/1084[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 4ms/step - loss: 0.2554 - val_loss: 0.2491
Epoch 8/20
[1m1084/1084[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 6ms/step - loss: 0.2509 - val_loss: 0.2480
Epoch 9/20
[1m1084/1084[0m [32m━

In [16]:
# Evaluar en conjunto de validación

y_pred = model.predict(X_val)
mae = mean_absolute_error(y_val, y_pred)
mse = mean_squared_error(y_val, y_pred)
smape_val = smape_chunked(y_val, y_pred, chunk_size=500_000)

print("MAE:", mae)
print("MSE:", mse)
print("SMAPE:", smape_val)

[1m271/271[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 3ms/step
MAE: 0.28106132670530165
MSE: 0.24634181455920554
SMAPE: 59.27840290397485


In [17]:
# Guardar el modelo entrenado
model.save('Models_lstm/lstm_healthcare_model_10.keras')

### LSTM - 3 %

In [18]:
X_3, y_3   = sample_fraction(X, y, 0.03, random_state=42)
X_train, X_val, y_train, y_val = train_test_split(X_3, y_3, test_size=0.2, shuffle=True, random_state=42)

In [19]:
# Suponiendo que X_train, y_train, X_val, y_val están listos:
model = build_lstm_model(window_size=15, n_features=1, lstm_units=50)
es = EarlyStopping(monitor='val_loss', patience=5, restore_best_weights=True)
history = model.fit(
    X_train, y_train,
    validation_data=(X_val, y_val),
    epochs=20,
    batch_size=32,
    callbacks=[es]
)

Epoch 1/20


  super().__init__(**kwargs)


[1m325/325[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 4ms/step - loss: 0.4545 - val_loss: 0.3020
Epoch 2/20
[1m325/325[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 4ms/step - loss: 0.3038 - val_loss: 0.3009
Epoch 3/20
[1m325/325[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 5ms/step - loss: 0.2906 - val_loss: 0.2900
Epoch 4/20
[1m325/325[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 4ms/step - loss: 0.2798 - val_loss: 0.2832
Epoch 5/20
[1m325/325[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 4ms/step - loss: 0.2763 - val_loss: 0.2773
Epoch 6/20
[1m325/325[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 4ms/step - loss: 0.2563 - val_loss: 0.2736
Epoch 7/20
[1m325/325[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 5ms/step - loss: 0.2638 - val_loss: 0.2709
Epoch 8/20
[1m325/325[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 5ms/step - loss: 0.2461 - val_loss: 0.2654
Epoch 9/20
[1m325/325[0m [32m━━━━━━━━━━━━━━━━━━━

In [20]:
# Evaluar en conjunto de validación

y_pred = model.predict(X_val)
mae = mean_absolute_error(y_val, y_pred)
mse = mean_squared_error(y_val, y_pred)
smape_val = smape_chunked(y_val, y_pred, chunk_size=500_000)

print("MAE:", mae)
print("MSE:", mse)
print("SMAPE:", smape_val)

[1m82/82[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step
MAE: 0.28537223200023826
MSE: 0.2600540885694726
SMAPE: 59.34711632361779


In [21]:
# Guardar el modelo entrenado
model.save('Models_lstm/lstm_healthcare_model_3.keras')

### LSTM Distillation 