In [None]:
import warnings
import numpy as np
import pandas as pd
import joblib
from sklearn.neural_network import MLPRegressor
from sklearn.metrics import mean_squared_error
from itertools import product
import psutil



warnings.filterwarnings("ignore")

# -----------------------------------------------------------------------------
# CONFIGURACIÓN
# -----------------------------------------------------------------------------
RUTA_DATOS = '/home/cony/Documentos/Cony/trabajo_final/para_subir/datos/datos_sla_y_era.csv'
STEP_DIFF = 1440
LAGS = 0
RUTA_BEST = 'mlp_diff_best.pkl'

# -----------------------------------------------------------------------------
# CARGA Y DIFERENCIACIÓN
# -----------------------------------------------------------------------------
df_raw = (
    pd.read_csv(RUTA_DATOS, parse_dates=['date'])
      .set_index('date')
      .asfreq('1min')
      .interpolate('time')
      .dropna()
)

# Diferenciamos ambas columnas con el mismo paso
df_diff = pd.DataFrame(index=df_raw.index)
df_diff['delta_obs'] = df_raw['Temp'].diff(STEP_DIFF)   # Target
df_diff['delta_era'] = df_raw['t2m'].diff(STEP_DIFF)      # Entrada
df_diff = df_diff.dropna()

COLUMNA_OBJ = 'delta_obs'   # ya no es Temp, es la diferencia
EXOG_COL    = 'delta_era'   # ahora es la diferencia de ERA5

# -----------------------------------------------------------------------------
# FUNCIÓN AUXILIAR – CREAR DATASET
# -----------------------------------------------------------------------------
def make_dataset(df, lags):
    X, y = [], []
    for i in range(lags, len(df)):
        row = df[COLUMNA_OBJ].iloc[i - lags:i].tolist()
        row.append(df[EXOG_COL].iloc[i])
        X.append(row)
        y.append(df[COLUMNA_OBJ].iloc[i])
    return np.array(X), np.array(y)

# -----------------------------------------------------------------------------
# CORTE TRAIN / VAL / TEST
# -----------------------------------------------------------------------------
TOTAL = len(df_diff)
tr_end = int(0.70 * TOTAL)
val_end = int(0.85 * TOTAL)

train_df = df_diff.iloc[:tr_end]
val_df   = df_diff.iloc[tr_end:val_end]
test_df  = df_diff.iloc[val_end:]

X_train, y_train = make_dataset(train_df, LAGS)
X_val,   y_val   = make_dataset(val_df,   LAGS)
X_test,  y_test  = make_dataset(test_df,  LAGS)

print(f"Total: {TOTAL} | Train: {len(y_train)} | Val: {len(y_val)} | Test: {len(y_test)}")



Total: 764681 | Train: 535276 | Val: 114702 | Test: 114703


In [2]:

# GRID 
# -----------------------------------------------------------------------------
grid = {
    'hidden_layer_sizes': [(4,), (8,), (8, 4), (8, 8)],
    'activation': ['relu', 'tanh', 'logistic'],
    'alpha': [1e-2, 1e-1, 1],         # defecto 1e-4, mayor alpha menor riesgo de sobreajuste
    'learning_rate_init': [1e-3, 1e-2],
    'max_iter': [80, 100],
    'early_stopping': [True],
    'n_iter_no_change': [7],
}


best_val_rmse = np.inf
best_model = None
best_params = None


# -----------------------------------------------------------------------------
print("\nIniciando grid-search sobre validación...")
combinaciones = list(product(
    grid['hidden_layer_sizes'],
    grid['activation'],
    grid['alpha'],
    grid['learning_rate_init'],
    grid['max_iter']))
print(f"Total de combinaciones: {len(combinaciones)}\n")

best_val_mse = np.inf
best_model   = None
best_params  = None

for idx, (hs, act, alpha, lr, it) in enumerate(combinaciones, 1):
    tic = pd.Timestamp.now()
    mlp = MLPRegressor(
        hidden_layer_sizes=hs,
        activation=act,
        alpha=alpha,
        learning_rate_init=lr,
        max_iter=it,
        solver='adam',
        random_state=42,
        early_stopping=True,
        n_iter_no_change=10
    )
    mlp.fit(X_train, y_train)
    val_pred = mlp.predict(X_val)
    val_mse  = mean_squared_error(y_val, val_pred)
    mem      = psutil.Process().memory_info().rss / 1024**2
    elapsed  = (pd.Timestamp.now() - tic).total_seconds()
    print(f"[{idx:>3}/{len(combinaciones)}] "
          f"{hs}-{act}-α{alpha} - lr{lr} - it{it} |  act: {act}"
          f" MSE: {val_mse:.4f} | RAM: {mem:.1f} MiB | {elapsed:.1f}s")

    if val_mse < best_val_mse:
        best_val_mse, best_model, best_params = val_mse, mlp, {
            'hidden_layer_sizes': hs,
            'activation': act,
            'alpha': alpha,
            'learning_rate_init': lr,
            'max_iter': it
        }
    train_pred = mlp.predict(X_train)
    train_mse  = mean_squared_error(y_train, train_pred)
    print(f"  → MSE train: {train_mse:.4f} | MSE val: {val_mse:.4f}")
    

joblib.dump(best_model, RUTA_BEST)
print("Mejores hiper-parámetros:", best_params)
print("MSE en val:", best_val_mse)


Iniciando grid-search sobre validación...
Total de combinaciones: 144

[  1/144] (4,)-relu-α0.01 - lr0.001 - it80 |  act: relu MSE: 7.7117 | RAM: 278.7 MiB | 26.5s
  → MSE train: 4.9582 | MSE val: 7.7117
[  2/144] (4,)-relu-α0.01 - lr0.001 - it100 |  act: relu MSE: 7.7117 | RAM: 286.6 MiB | 25.8s
  → MSE train: 4.9582 | MSE val: 7.7117
[  3/144] (4,)-relu-α0.01 - lr0.01 - it80 |  act: relu MSE: 7.7500 | RAM: 288.1 MiB | 12.0s
  → MSE train: 4.9560 | MSE val: 7.7500
[  4/144] (4,)-relu-α0.01 - lr0.01 - it100 |  act: relu MSE: 7.7500 | RAM: 288.1 MiB | 11.7s
  → MSE train: 4.9560 | MSE val: 7.7500
[  5/144] (4,)-relu-α0.1 - lr0.001 - it80 |  act: relu MSE: 7.6860 | RAM: 288.1 MiB | 22.0s
  → MSE train: 4.9594 | MSE val: 7.6860
[  6/144] (4,)-relu-α0.1 - lr0.001 - it100 |  act: relu MSE: 7.6860 | RAM: 288.1 MiB | 22.0s
  → MSE train: 4.9594 | MSE val: 7.6860
[  7/144] (4,)-relu-α0.1 - lr0.01 - it80 |  act: relu MSE: 7.6932 | RAM: 288.1 MiB | 11.8s
  → MSE train: 4.9542 | MSE val: 7.6932


In [None]:
import pandas as pd, matplotlib.pyplot as plt, sys
ruta_directorio = r'/home/cony/Documentos/GEERS/codigos_pc'
sys.path.append(ruta_directorio)
import Metrics as m

# --- funciones limpias -------------------------------------------
def plots(x, y_original, y_estimado, serie='serie'):
    plt.figure(figsize=(14,4))
    plt.plot(x, y_original, label=f'{serie} original')
    plt.plot(x, y_estimado, label=f'{serie} estimado')
    plt.xlabel('Etiqueta temporal', fontsize=15)
    plt.ylabel('T (°C)', fontsize=15)
    plt.xticks(fontsize=15)
    plt.yticks(fontsize=15)
    plt.legend(fontsize=20)
    plt.tight_layout()
    plt.grid(True)
    plt.show()

def calculo_m(true, pred):
    rmbe  = m.rmbe(true, pred)
    rrmsd = m.rrmsd(true, pred)
    rmae  = m.rmae(true, pred)
    return rmbe, rrmsd, rmae


In [6]:
%matplotlib qt
# -----------------------------------------------------------------------------
# 1)  TEST
# -----------------------------------------------------------------------------
test_idx   = test_df.index
base_test  = df_raw['Temp'].shift(STEP_DIFF).loc[test_idx]
delta_pred = best_model.predict(X_test)

df_pred = pd.DataFrame(index=test_idx)
df_pred['Temp']      = base_test + delta_pred
df_pred['y_pred']    = df_pred['Temp']          # mismo valor, alias para tu función
df_pred['y_true']    = df_raw['Temp'].loc[test_idx]

rmbe_test, rrmsd_test, rmae_test = calculo_m(df_pred['y_true'], df_pred['y_pred'])
print("\n=== TEST (reconstruido) ===")
print(f"rMBE:  {rmbe_test:.4f}%")
print(f"rRMSD: {rrmsd_test:.4f}%")
print(f"rMAE:  {rmae_test:.4f}%")

plots(df_pred.index, df_pred['y_true'], df_pred['y_pred'], serie='Test')

# -----------------------------------------------------------------------------
# 2)  ENTRENAMIENTO
# -----------------------------------------------------------------------------
train_idx   = train_df.index
base_train  = df_raw['Temp'].shift(STEP_DIFF).loc[train_idx]
delta_pred_train = best_model.predict(X_train)

df_pred_train = pd.DataFrame(index=train_idx)
df_pred_train['Temp']      = base_train + delta_pred_train
df_pred_train['y_pred']    = df_pred_train['Temp']
df_pred_train['y_true']    = df_raw['Temp'].loc[train_idx]

rmbe_train, rrmsd_train, rmae_train = calculo_m(
    df_pred_train['y_true'], df_pred_train['y_pred'])

print("\n=== ENTRENAMIENTO (reconstruido) ===")
print(f"rMBE:  {rmbe_train:.4f}%")
print(f"rRMSD: {rrmsd_train:.4f}%")
print(f"rMAE:  {rmae_train:.4f}%")

plots(df_pred_train.index,
      df_pred_train['y_true'],
      df_pred_train['y_pred'],
      serie='Train')


=== TEST (reconstruido) ===
rMBE:  -0.2817%
rRMSD: 12.3945%
rMAE:  8.5248%

=== ENTRENAMIENTO (reconstruido) ===
rMBE:  -0.1703%
rRMSD: 10.9541%
rMAE:  8.0946%
