In [None]:
import io

import pandas as pd
from google.colab import files

# Open Colab's upload dialog; the result maps each uploaded file name to its content.
uploaded = files.upload()

# The mapping can be empty (for example when the dialog is dismissed). Fail with a
# clear message instead of the bare StopIteration that next(iter(...)) would raise.
if not uploaded:
    raise ValueError("No se subió ningún archivo CSV")

# Only the first uploaded file is used. It is parsed from the returned bytes, so the
# result does not depend on the name under which the file was stored on disk.
csv_name = next(iter(uploaded))
df = pd.read_csv(io.BytesIO(uploaded[csv_name]))

# Report the size and a short preview instead of printing the whole frame.
print(f"{csv_name}: {df.shape[0]} filas x {df.shape[1]} columnas")
print(df.head())

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# Cleaning stage: validates the uploaded frame `df`, builds a calendar date from
# (año, dia), coerces the measurement columns to numbers and derives `df_clean`
# (out-of-range values imputed, outliers clipped, frost flag added).

required_cols = ["año", "dia", "temp_min_2m"]
for c in required_cols:
    if c not in df.columns:
        raise ValueError(f"Falta la columna obligatoria: {c}")

# Rows whose year or day-of-year is not numeric cannot be dated, so they are dropped.
df["año"] = pd.to_numeric(df["año"], errors="coerce")
df["dia"] = pd.to_numeric(df["dia"], errors="coerce")
df = df.dropna(subset=["año", "dia"]).copy()

df["año"] = df["año"].astype(int)
df["dia"] = df["dia"].astype(int)

# Day 366 only exists in leap years. Accepting it for every year would turn
# "day 366 of a common year" into 1 January of the following year below.
is_leap = ((df["año"] % 4 == 0) & (df["año"] % 100 != 0)) | (df["año"] % 400 == 0)
last_day = np.where(is_leap, 366, 365)
df = df[(df["dia"] >= 1) & (df["dia"] <= last_day)].copy()

# 1 January of the year plus (dia - 1) days.
df["fecha"] = (
    pd.to_datetime(df["año"].astype(str), format="%Y")
    + pd.to_timedelta(df["dia"] - 1, unit="D")
)

df = df.sort_values("fecha").reset_index(drop=True)

# Cyclic encoding of the day of year, so the end of December sits next to January.
df["dia_sin"] = np.sin(2 * np.pi * df["dia"] / 365.25)
df["dia_cos"] = np.cos(2 * np.pi * df["dia"] / 365.25)

num_cols = [
    "temp_2m","temp_max_2m","temp_min_2m",
    "punto_rocio_2m","humedad_rel_2m","humedad_suelo",
    "viento_2m","viento_min_2m","viento_max_2m",
    "viento_10m","viento_min_10m","viento_max_10m",
    "precipitacion","latitud","longitud","altitud"
]

# Non-numeric entries in the measurement columns become NaN.
for c in num_cols:
    if c in df.columns:
        df[c] = pd.to_numeric(df[c], errors="coerce")

df = df.dropna(subset=["temp_min_2m", "fecha"]).reset_index(drop=True)

# Accepted (min, max) per variable; values outside are treated as missing below.
# NOTE(review): units are presumably °C, %, fraction, mm and m/s — confirm with the
# data source.
physical_ranges = {
    "temp_min_2m": (-30, 25),
    "temp_max_2m": (-10, 40),
    "temp_2m": (-20, 30),
    "punto_rocio_2m": (-30, 25),
    "humedad_rel_2m": (0, 100),
    "humedad_suelo": (0, 1),
    "precipitacion": (0, 500),
    "viento_2m": (0, 60),
    "viento_min_2m": (0, 60),
    "viento_max_2m": (0, 60),
    "viento_10m": (0, 60),
    "viento_min_10m": (0, 60),
    "viento_max_10m": (0, 60),
}

df_clean = df.copy()

for var, (vmin, vmax) in physical_ranges.items():
    if var in df_clean.columns:
        mask = (df_clean[var] < vmin) | (df_clean[var] > vmax)
        df_clean.loc[mask, var] = np.nan

df_clean["mes"] = df_clean["fecha"].dt.month

# Impute with the median of the same calendar month; fall back to the overall median
# when a whole month is missing.
to_impute = [v for v in physical_ranges.keys() if v in df_clean.columns]
for var in to_impute:
    df_clean[var] = df_clean.groupby("mes")[var].transform(lambda x: x.fillna(x.median()))
    df_clean[var] = df_clean[var].fillna(df_clean[var].median())

# The frost flag is taken before the outlier clipping below: clipping moves low values
# up to the lower fence, which would erase frost days if that fence were >= 0.
df_clean["helada"] = (df_clean["temp_min_2m"] < 0).astype(int)

iqr_vars = [
    "temp_2m","temp_max_2m","temp_min_2m",
    "punto_rocio_2m","humedad_rel_2m","humedad_suelo",
    "viento_2m","viento_min_2m","viento_max_2m",
    "viento_10m","viento_min_10m","viento_max_10m",
    "precipitacion"
]
iqr_vars = [v for v in iqr_vars if v in df_clean.columns]

# Clip each variable to the Tukey fences [Q1 - 1.5*IQR, Q3 + 1.5*IQR].
for var in iqr_vars:
    s = df_clean[var].dropna()
    q1 = s.quantile(0.25)
    q3 = s.quantile(0.75)
    iqr = q3 - q1
    # With a zero IQR (e.g. a column that is mostly 0) both fences coincide and
    # clipping would flatten the whole column to one value, so leave it untouched.
    # A non-finite IQR means the column has no data at all.
    if not np.isfinite(iqr) or iqr <= 0:
        continue
    lower = q1 - 1.5 * iqr
    upper = q3 + 1.5 * iqr
    df_clean[var] = df_clean[var].clip(lower, upper)

# Exploratory plots of the cleaned data. Every figure carries a title and axis labels
# so it can be read on its own.

# Histogram of each cleaned variable.
for var in iqr_vars:
    plt.figure()
    plt.hist(df_clean[var], bins=60)
    plt.title(f"Distribución de {var}")
    plt.xlabel(var)
    plt.ylabel("Frecuencia")
    plt.show()

# Class balance of the frost flag.
counts = df_clean["helada"].value_counts().sort_index()
plt.figure()
plt.bar(["No helada", "Helada"], [counts.get(0,0), counts.get(1,0)])
plt.title("Días con y sin helada")
plt.ylabel("Número de registros")
plt.show()

# Mean minimum temperature per calendar month.
temp_by_month = df_clean.groupby("mes")["temp_min_2m"].mean()
plt.figure()
plt.plot(temp_by_month.index, temp_by_month.values, marker="o")
plt.title("temp_min_2m media por mes")
plt.xlabel("Mes")
plt.ylabel("temp_min_2m")
plt.show()

# Share of frost records per calendar month, in percent.
frost_by_month = df_clean.groupby("mes")["helada"].mean() * 100
plt.figure()
plt.bar(frost_by_month.index, frost_by_month.values)
plt.title("Porcentaje de heladas por mes")
plt.xlabel("Mes")
plt.ylabel("% de registros con helada")
plt.show()

# Mean minimum temperature per day of year.
doy_mean = df_clean.groupby("dia")["temp_min_2m"].mean()
plt.figure()
plt.plot(doy_mean.index, doy_mean.values)
plt.title("temp_min_2m media por día del año")
plt.xlabel("Día del año")
plt.ylabel("temp_min_2m")
plt.show()

for var in ["temp_min_2m","temp_max_2m","humedad_rel_2m","viento_2m","precipitacion"]:
    if var in df_clean.columns:
        plt.figure()
        plt.boxplot(df_clean[var].dropna(), vert=False)
        plt.title(f"Diagrama de caja de {var}")
        plt.xlabel(var)
        plt.show()

if "altitud" in df_clean.columns and df_clean["altitud"].notna().any():
    plt.figure()
    plt.scatter(df_clean["altitud"], df_clean["temp_min_2m"], s=5, alpha=0.3)
    plt.title("temp_min_2m frente a altitud")
    plt.xlabel("altitud")
    plt.ylabel("temp_min_2m")
    plt.show()

for v in ["temp_2m","temp_max_2m","humedad_rel_2m","punto_rocio_2m","viento_2m","precipitacion"]:
    if v in df_clean.columns:
        plt.figure()
        plt.scatter(df_clean[v], df_clean["temp_min_2m"], s=5, alpha=0.3)
        plt.title(f"temp_min_2m frente a {v}")
        plt.xlabel(v)
        plt.ylabel("temp_min_2m")
        plt.show()

# Correlation matrix over rows that have every numeric column present.
corr_cols = [c for c in num_cols if c in df_clean.columns]
corr = df_clean[corr_cols].dropna().corr()

plt.figure(figsize=(12,10))
# A diverging colormap fixed to [-1, 1] keeps 0 at the neutral colour and makes both
# extremes dark, which is what the white-text rule for |r| > 0.5 below relies on.
im = plt.imshow(corr.values, aspect="auto", cmap="coolwarm", vmin=-1, vmax=1)
plt.colorbar(im, fraction=0.046, pad=0.04)
plt.title("Matriz de correlación")

plt.xticks(range(len(corr.columns)), corr.columns, rotation=90)
plt.yticks(range(len(corr.columns)), corr.columns)

# Write each coefficient inside its cell.
for i in range(len(corr.columns)):
    for j in range(len(corr.columns)):
        value = corr.values[i, j]
        plt.text(
            j, i,
            f"{value:.2f}",
            ha="center",
            va="center",
            color="white" if abs(value) > 0.5 else "black",
            fontsize=8
        )

plt.tight_layout()
plt.show()

In [None]:
# RandomForestRegressor

import time
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

# Forecasts temp_min_2m H = 1, 2 and 3 days ahead for a single site. Rows dated before
# CUTOFF_DATE are used for training and the remaining rows for evaluation.

START_DATE = "2000-01-01"
END_DATE = "2025-12-31"
CUTOFF_DATE = "2019-01-01"

FROST_THRESHOLD = 0.0  # drawn as a reference line on the forecast plots

# Only rows within these tolerances of the target coordinates/altitude are modelled.
TARGET_LAT = -12.6679
TARGET_LON = -75.3245
TARGET_ALT = 4226.68

LAT_TOL = 0.005
LON_TOL = 0.005
ALT_TOL = 50

# "temp_min_2m" (the current day's value) is included so this model sees the same
# inputs as the other regressors in the notebook and the comparison is like for like.
feature_cols = [
    "temp_2m",
    "humedad_rel_2m",
    "humedad_suelo",
    "precipitacion",
    "viento_2m",
    "altitud",
    "dia_sin",
    "dia_cos",
    "temp_min_2m",
    "temp_min_lag_1",
    "temp_min_lag_2",
    "temp_min_lag_3",
]

# Check every input column up front; the lag columns are created further below.
for col in ["fecha", "latitud", "longitud", "altitud"] + feature_cols:
    if not col.startswith("temp_min_lag_") and col not in df_clean.columns:
        raise ValueError(f"Falta la columna {col}")

df_model = df_clean.copy()
df_model["fecha"] = pd.to_datetime(df_model["fecha"])

in_period = (df_model["fecha"] >= START_DATE) & (df_model["fecha"] <= END_DATE)
at_site = (
    df_model["latitud"].between(TARGET_LAT - LAT_TOL, TARGET_LAT + LAT_TOL)
    & df_model["longitud"].between(TARGET_LON - LON_TOL, TARGET_LON + LON_TOL)
    & df_model["altitud"].between(TARGET_ALT - ALT_TOL, TARGET_ALT + ALT_TOL)
)
df_model = df_model[in_period & at_site].copy()
if df_model.empty:
    raise ValueError("No hay filas para el periodo y la ubicación seleccionados")

# shift() moves values by rows, not by days. Keep one row per date and re-index on a
# gap-free daily calendar so that every lag and target below is a true day offset;
# days missing from the data become NaN rows and are dropped inside the loop.
df_model = (
    df_model.sort_values("fecha", kind="stable")
    .drop_duplicates(subset="fecha", keep="first")
    .set_index("fecha")
    .asfreq("D")
    .reset_index()
)

for lag in [1, 2, 3]:
    df_model[f"temp_min_lag_{lag}"] = df_model["temp_min_2m"].shift(lag)

cutoff = pd.Timestamp(CUTOFF_DATE)

rows = []
pred_series = {}

for H in [1, 2, 3]:
    df_h = df_model.copy()
    df_h["y"] = df_h["temp_min_2m"].shift(-H)  # target: minimum temperature H days later
    # Drop only rows lacking a model input or the target; NaNs in unrelated columns
    # must not discard usable rows.
    df_h = df_h.dropna(subset=feature_cols + ["y"]).reset_index(drop=True)

    X = df_h[feature_cols]
    y = df_h["y"]

    train_mask = df_h["fecha"] < cutoff
    X_train, X_test = X[train_mask], X[~train_mask]
    y_train, y_test = y[train_mask], y[~train_mask]
    fechas_test = df_h.loc[~train_mask, "fecha"].values
    if X_train.empty or X_test.empty:
        raise ValueError(f"Sin datos de entrenamiento o de prueba para t+{H}")

    model = RandomForestRegressor(
        n_estimators=300,
        max_depth=12,
        random_state=42,
        n_jobs=-1
    )

    t0 = time.perf_counter()
    model.fit(X_train, y_train)
    train_time = time.perf_counter() - t0

    y_pred = model.predict(X_test)

    mae = mean_absolute_error(y_test, y_pred)
    rmse = np.sqrt(mean_squared_error(y_test, y_pred))
    r2 = r2_score(y_test, y_pred)
    # Symmetric MAPE in percent; 1e-6 only guards against a zero denominator.
    smape = np.mean(
        2 * np.abs(y_test - y_pred) /
        (np.abs(y_test) + np.abs(y_pred) + 1e-6)
    ) * 100

    # Same summary columns as the other model cells (fit time and split sizes).
    rows.append({
        "Horizonte": f"t+{H}",
        "MAE": mae,
        "RMSE": rmse,
        "R2": r2,
        "sMAPE_%": smape,
        "Tiempo_s": train_time,
        "N_train": len(X_train),
        "N_test": len(X_test),
    })

    pred_series[H] = (fechas_test, y_test.values, y_pred)

summary = pd.DataFrame(rows).sort_values("Horizonte")
pd.set_option("display.max_columns", None)
print(summary)

# Observed vs. predicted series on the test period, one figure per horizon.
for H in [1, 2, 3]:
    fechas_test, y_test_vals, y_pred_vals = pred_series[H]

    plt.figure(figsize=(11, 4))
    plt.plot(fechas_test, y_test_vals, label="Observado")
    plt.plot(fechas_test, y_pred_vals, label="Predicho")
    plt.axhline(FROST_THRESHOLD, linestyle="--")
    plt.title(f"RandomForestRegressor — temp_min_2m, horizonte t+{H}")
    plt.xlabel("Fecha")
    plt.ylabel("°C")
    plt.legend()
    plt.tight_layout(pad=2.5)
    plt.show()


In [None]:
# DecisionTreeRegressor

import time
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

# Forecasts temp_min_2m H = 1, 2 and 3 days ahead for a single site. Rows dated before
# CUTOFF_DATE are used for training and the remaining rows for evaluation.

START_DATE = "2000-01-01"
END_DATE = "2025-12-31"
CUTOFF_DATE = "2019-01-01"

FROST_THRESHOLD = 0.0  # drawn as a reference line on the forecast plots

# Only rows within these tolerances of the target coordinates/altitude are modelled.
TARGET_LAT = -12.6679
TARGET_LON = -75.3245
TARGET_ALT = 4226.68

LAT_TOL = 0.005
LON_TOL = 0.005
ALT_TOL = 50

# "temp_min_2m" (the current day's value) is included so this model sees the same
# inputs as the other regressors in the notebook and the comparison is like for like.
feature_cols = [
    "temp_2m",
    "humedad_rel_2m",
    "humedad_suelo",
    "precipitacion",
    "viento_2m",
    "altitud",
    "dia_sin",
    "dia_cos",
    "temp_min_2m",
    "temp_min_lag_1",
    "temp_min_lag_2",
    "temp_min_lag_3",
]

# Check every input column up front; the lag columns are created further below.
for col in ["fecha", "latitud", "longitud", "altitud"] + feature_cols:
    if not col.startswith("temp_min_lag_") and col not in df_clean.columns:
        raise ValueError(f"Falta la columna {col}")

df_model = df_clean.copy()
df_model["fecha"] = pd.to_datetime(df_model["fecha"])

in_period = (df_model["fecha"] >= START_DATE) & (df_model["fecha"] <= END_DATE)
at_site = (
    df_model["latitud"].between(TARGET_LAT - LAT_TOL, TARGET_LAT + LAT_TOL)
    & df_model["longitud"].between(TARGET_LON - LON_TOL, TARGET_LON + LON_TOL)
    & df_model["altitud"].between(TARGET_ALT - ALT_TOL, TARGET_ALT + ALT_TOL)
)
df_model = df_model[in_period & at_site].copy()
if df_model.empty:
    raise ValueError("No hay filas para el periodo y la ubicación seleccionados")

# shift() moves values by rows, not by days. Keep one row per date and re-index on a
# gap-free daily calendar so that every lag and target below is a true day offset;
# days missing from the data become NaN rows and are dropped inside the loop.
df_model = (
    df_model.sort_values("fecha", kind="stable")
    .drop_duplicates(subset="fecha", keep="first")
    .set_index("fecha")
    .asfreq("D")
    .reset_index()
)

for lag in [1, 2, 3]:
    df_model[f"temp_min_lag_{lag}"] = df_model["temp_min_2m"].shift(lag)

cutoff = pd.Timestamp(CUTOFF_DATE)

rows = []
pred_series = {}

for H in [1, 2, 3]:
    df_h = df_model.copy()
    df_h["y"] = df_h["temp_min_2m"].shift(-H)  # target: minimum temperature H days later
    # Drop only rows lacking a model input or the target; NaNs in unrelated columns
    # must not discard usable rows.
    df_h = df_h.dropna(subset=feature_cols + ["y"]).reset_index(drop=True)

    X = df_h[feature_cols]
    y = df_h["y"]

    train_mask = df_h["fecha"] < cutoff
    X_train, X_test = X[train_mask], X[~train_mask]
    y_train, y_test = y[train_mask], y[~train_mask]
    fechas_test = df_h.loc[~train_mask, "fecha"].values
    if X_train.empty or X_test.empty:
        raise ValueError(f"Sin datos de entrenamiento o de prueba para t+{H}")

    model = DecisionTreeRegressor(
        max_depth=12,
        min_samples_leaf=10,
        min_samples_split=20,
        random_state=42
    )

    t0 = time.perf_counter()
    model.fit(X_train, y_train)
    train_time = time.perf_counter() - t0

    y_pred = model.predict(X_test)

    mae = mean_absolute_error(y_test, y_pred)
    rmse = np.sqrt(mean_squared_error(y_test, y_pred))
    r2 = r2_score(y_test, y_pred)
    # Symmetric MAPE in percent; 1e-6 only guards against a zero denominator.
    smape = np.mean(
        2 * np.abs(y_test - y_pred) /
        (np.abs(y_test) + np.abs(y_pred) + 1e-6)
    ) * 100

    rows.append({
        "Horizonte": f"t+{H}",
        "MAE": mae,
        "RMSE": rmse,
        "R2": r2,
        "sMAPE_%": smape,
        "Tiempo_s": train_time,
        "N_train": len(X_train),
        "N_test": len(X_test),
    })

    pred_series[H] = (fechas_test, y_test.values, y_pred)

summary = pd.DataFrame(rows).sort_values("Horizonte")
pd.set_option("display.max_columns", None)
print(summary)

# Observed vs. predicted series on the test period, one figure per horizon.
for H in [1, 2, 3]:
    fechas_test, y_test_vals, y_pred_vals = pred_series[H]

    plt.figure(figsize=(11, 4))
    plt.plot(fechas_test, y_test_vals, label="Observado")
    plt.plot(fechas_test, y_pred_vals, label="Predicho")
    plt.axhline(FROST_THRESHOLD, linestyle="--")
    plt.title(f"DecisionTreeRegressor — temp_min_2m, horizonte t+{H}")
    plt.xlabel("Fecha")
    plt.ylabel("°C")
    plt.legend()
    plt.tight_layout(pad=2.5)
    plt.show()


In [None]:
# XGBRegressor

import time
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from xgboost import XGBRegressor

# Forecasts temp_min_2m H = 1, 2 and 3 days ahead for a single site. Rows dated before
# CUTOFF_DATE are used for training and the remaining rows for evaluation.

START_DATE = "2000-01-01"
END_DATE = "2025-12-31"
CUTOFF_DATE = "2019-01-01"

FROST_THRESHOLD = 0.0  # drawn as a reference line on the forecast plots

# Only rows within these tolerances of the target coordinates/altitude are modelled.
TARGET_LAT = -12.6679
TARGET_LON = -75.3245
TARGET_ALT = 4226.68

LAT_TOL = 0.005
LON_TOL = 0.005
ALT_TOL = 50

feature_cols = [
    "temp_2m",
    "humedad_rel_2m",
    "humedad_suelo",
    "precipitacion",
    "viento_2m",
    "altitud",
    "dia_sin",
    "dia_cos",
    "temp_min_2m",
    "temp_min_lag_1",
    "temp_min_lag_2",
    "temp_min_lag_3",
]

# Check every input column up front; the lag columns are created further below.
for col in ["fecha", "latitud", "longitud", "altitud"] + feature_cols:
    if not col.startswith("temp_min_lag_") and col not in df_clean.columns:
        raise ValueError(f"Falta la columna {col}")

df_model = df_clean.copy()
df_model["fecha"] = pd.to_datetime(df_model["fecha"])

in_period = (df_model["fecha"] >= START_DATE) & (df_model["fecha"] <= END_DATE)
at_site = (
    df_model["latitud"].between(TARGET_LAT - LAT_TOL, TARGET_LAT + LAT_TOL)
    & df_model["longitud"].between(TARGET_LON - LON_TOL, TARGET_LON + LON_TOL)
    & df_model["altitud"].between(TARGET_ALT - ALT_TOL, TARGET_ALT + ALT_TOL)
)
df_model = df_model[in_period & at_site].copy()
if df_model.empty:
    raise ValueError("No hay filas para el periodo y la ubicación seleccionados")

# shift() moves values by rows, not by days. Keep one row per date and re-index on a
# gap-free daily calendar so that every lag and target below is a true day offset;
# days missing from the data become NaN rows and are dropped inside the loop.
df_model = (
    df_model.sort_values("fecha", kind="stable")
    .drop_duplicates(subset="fecha", keep="first")
    .set_index("fecha")
    .asfreq("D")
    .reset_index()
)

for lag in [1, 2, 3]:
    df_model[f"temp_min_lag_{lag}"] = df_model["temp_min_2m"].shift(lag)

cutoff = pd.Timestamp(CUTOFF_DATE)

rows = []
pred_series = {}

for H in [1, 2, 3]:
    df_h = df_model.copy()
    df_h["y"] = df_h["temp_min_2m"].shift(-H)  # target: minimum temperature H days later
    # Drop only rows lacking a model input or the target; NaNs in unrelated columns
    # must not discard usable rows.
    df_h = df_h.dropna(subset=feature_cols + ["y"]).reset_index(drop=True)

    X = df_h[feature_cols]
    y = df_h["y"]

    train_mask = df_h["fecha"] < cutoff
    X_train, X_test = X[train_mask], X[~train_mask]
    y_train, y_test = y[train_mask], y[~train_mask]
    fechas_test = df_h.loc[~train_mask, "fecha"].values
    if X_train.empty or X_test.empty:
        raise ValueError(f"Sin datos de entrenamiento o de prueba para t+{H}")

    model = XGBRegressor(
        objective="reg:squarederror",
        n_estimators=800,
        learning_rate=0.03,
        max_depth=5,
        min_child_weight=5,
        subsample=0.8,
        colsample_bytree=0.8,
        reg_lambda=1.0,
        reg_alpha=0.0,
        random_state=42,
        n_jobs=-1
    )

    t0 = time.perf_counter()
    model.fit(X_train, y_train)
    train_time = time.perf_counter() - t0

    y_pred = model.predict(X_test)

    mae = mean_absolute_error(y_test, y_pred)
    rmse = np.sqrt(mean_squared_error(y_test, y_pred))
    r2 = r2_score(y_test, y_pred)
    # Symmetric MAPE in percent; 1e-6 only guards against a zero denominator.
    smape = np.mean(
        2 * np.abs(y_test - y_pred) /
        (np.abs(y_test) + np.abs(y_pred) + 1e-6)
    ) * 100

    rows.append({
        "Horizonte": f"t+{H}",
        "MAE": mae,
        "RMSE": rmse,
        "R2": r2,
        "sMAPE_%": smape,
        "Tiempo_s": train_time,
        "N_train": len(X_train),
        "N_test": len(X_test),
    })

    pred_series[H] = (fechas_test, y_test.values, y_pred)

summary = pd.DataFrame(rows).sort_values("Horizonte")
pd.set_option("display.max_columns", None)
print(summary)

# Observed vs. predicted series on the test period, one figure per horizon.
for H in [1, 2, 3]:
    fechas_test, y_test_vals, y_pred_vals = pred_series[H]

    plt.figure(figsize=(11, 4))
    plt.plot(fechas_test, y_test_vals, label="Observado")
    plt.plot(fechas_test, y_pred_vals, label="Predicho")
    plt.axhline(FROST_THRESHOLD, linestyle="--")
    plt.title(f"XGBRegressor — temp_min_2m, horizonte t+{H}")
    plt.xlabel("Fecha")
    plt.ylabel("°C")
    plt.legend()
    plt.tight_layout(pad=2.5)
    plt.show()


In [None]:
# LGBMRegressor

import time
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from lightgbm import LGBMRegressor, early_stopping
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

# Forecasts temp_min_2m H = 1, 2 and 3 days ahead for a single site. Rows dated before
# CUTOFF_DATE are used for training and the remaining rows for evaluation.

START_DATE = "2000-01-01"
END_DATE = "2025-12-31"
CUTOFF_DATE = "2019-01-01"

FROST_THRESHOLD = 0.0  # drawn as a reference line on the forecast plots

# Only rows within these tolerances of the target coordinates/altitude are modelled.
TARGET_LAT = -12.6679
TARGET_LON = -75.3245
TARGET_ALT = 4226.68

LAT_TOL = 0.005
LON_TOL = 0.005
ALT_TOL = 50

# Early stopping is monitored on the most recent VAL_FRACTION of the training rows,
# never on the test rows, so the reported test metrics stay untouched by model tuning.
VAL_FRACTION = 0.15
EARLY_STOPPING_ROUNDS = 100

feature_cols = [
    "temp_2m",
    "humedad_rel_2m",
    "humedad_suelo",
    "precipitacion",
    "viento_2m",
    "altitud",
    "dia_sin",
    "dia_cos",
    "temp_min_2m",
    "temp_min_lag_1",
    "temp_min_lag_2",
    "temp_min_lag_3",
]

# Check every input column up front; the lag columns are created further below.
for col in ["fecha", "latitud", "longitud", "altitud"] + feature_cols:
    if not col.startswith("temp_min_lag_") and col not in df_clean.columns:
        raise ValueError(f"Falta la columna {col}")

df_model = df_clean.copy()
df_model["fecha"] = pd.to_datetime(df_model["fecha"])

in_period = (df_model["fecha"] >= START_DATE) & (df_model["fecha"] <= END_DATE)
at_site = (
    df_model["latitud"].between(TARGET_LAT - LAT_TOL, TARGET_LAT + LAT_TOL)
    & df_model["longitud"].between(TARGET_LON - LON_TOL, TARGET_LON + LON_TOL)
    & df_model["altitud"].between(TARGET_ALT - ALT_TOL, TARGET_ALT + ALT_TOL)
)
df_model = df_model[in_period & at_site].copy()
if df_model.empty:
    raise ValueError("No hay filas para el periodo y la ubicación seleccionados")

# shift() moves values by rows, not by days. Keep one row per date and re-index on a
# gap-free daily calendar so that every lag and target below is a true day offset;
# days missing from the data become NaN rows and are dropped inside the loop.
df_model = (
    df_model.sort_values("fecha", kind="stable")
    .drop_duplicates(subset="fecha", keep="first")
    .set_index("fecha")
    .asfreq("D")
    .reset_index()
)

for lag in [1, 2, 3]:
    df_model[f"temp_min_lag_{lag}"] = df_model["temp_min_2m"].shift(lag)

cutoff = pd.Timestamp(CUTOFF_DATE)

rows = []
pred_series = {}

for H in [1, 2, 3]:
    df_h = df_model.copy()
    df_h["y"] = df_h["temp_min_2m"].shift(-H)  # target: minimum temperature H days later
    # Drop only rows lacking a model input or the target; NaNs in unrelated columns
    # must not discard usable rows.
    df_h = df_h.dropna(subset=feature_cols + ["y"]).reset_index(drop=True)

    X = df_h[feature_cols]
    y = df_h["y"]

    train_mask = df_h["fecha"] < cutoff
    X_train, X_test = X[train_mask], X[~train_mask]
    y_train, y_test = y[train_mask], y[~train_mask]
    fechas_test = df_h.loc[~train_mask, "fecha"].values
    if X_train.empty or X_test.empty:
        raise ValueError(f"Sin datos de entrenamiento o de prueba para t+{H}")

    # Rows are in date order, so the tail of the training block is its latest period.
    n_val = int(len(X_train) * VAL_FRACTION)
    if n_val < 1 or n_val >= len(X_train):
        raise ValueError(f"Muy pocas filas de entrenamiento para validar en t+{H}")
    X_fit, y_fit = X_train.iloc[:-n_val], y_train.iloc[:-n_val]
    X_val, y_val = X_train.iloc[-n_val:], y_train.iloc[-n_val:]

    model = LGBMRegressor(
        n_estimators=2000,
        learning_rate=0.03,
        num_leaves=31,
        max_depth=-1,
        min_child_samples=30,
        subsample=0.8,
        subsample_freq=1,  # row subsampling is only applied when this is > 0
        colsample_bytree=0.8,
        reg_alpha=0.0,
        reg_lambda=0.0,
        random_state=42,
        n_jobs=-1
    )

    t0 = time.perf_counter()
    model.fit(
        X_fit, y_fit,
        eval_set=[(X_val, y_val)],
        eval_metric="l1",
        callbacks=[early_stopping(EARLY_STOPPING_ROUNDS, verbose=False)]
    )
    train_time = time.perf_counter() - t0

    y_pred = model.predict(X_test)

    mae = mean_absolute_error(y_test, y_pred)
    rmse = np.sqrt(mean_squared_error(y_test, y_pred))
    r2 = r2_score(y_test, y_pred)
    # Symmetric MAPE in percent; 1e-6 only guards against a zero denominator.
    smape = np.mean(
        2 * np.abs(y_test - y_pred) /
        (np.abs(y_test) + np.abs(y_pred) + 1e-6)
    ) * 100

    # N_train counts all rows before the cutoff; the last N_val of them were held
    # out of fitting and used only to choose Best_iter.
    rows.append({
        "Horizonte": f"t+{H}",
        "MAE": mae,
        "RMSE": rmse,
        "R2": r2,
        "sMAPE_%": smape,
        "Tiempo_s": train_time,
        "N_train": len(X_train),
        "N_val": len(X_val),
        "N_test": len(X_test),
        "Best_iter": getattr(model, "best_iteration_", None)
    })

    pred_series[H] = (fechas_test, y_test.values, y_pred)

summary = pd.DataFrame(rows).sort_values("Horizonte")
pd.set_option("display.max_columns", None)
print(summary)

# Observed vs. predicted series on the test period, one figure per horizon.
for H in [1, 2, 3]:
    fechas_test, y_test_vals, y_pred_vals = pred_series[H]

    plt.figure(figsize=(11, 4))
    plt.plot(fechas_test, y_test_vals, label="Observado")
    plt.plot(fechas_test, y_pred_vals, label="Predicho")
    plt.axhline(FROST_THRESHOLD, linestyle="--")
    plt.title(f"LGBMRegressor — temp_min_2m, horizonte t+{H}")
    plt.xlabel("Fecha")
    plt.ylabel("°C")
    plt.legend()
    plt.tight_layout(pad=2.5)
    plt.show()


In [None]:
# SVR

import time
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.svm import SVR
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

# Forecasts temp_min_2m H = 1, 2 and 3 days ahead for a single site. Rows dated before
# CUTOFF_DATE are used for training and the remaining rows for evaluation.

START_DATE = "2000-01-01"
END_DATE = "2025-12-31"
CUTOFF_DATE = "2019-01-01"

FROST_THRESHOLD = 0.0  # drawn as a reference line on the forecast plots

# Only rows within these tolerances of the target coordinates/altitude are modelled.
TARGET_LAT = -12.6679
TARGET_LON = -75.3245
TARGET_ALT = 4226.68

LAT_TOL = 0.005
LON_TOL = 0.005
ALT_TOL = 50

feature_cols = [
    "temp_2m",
    "humedad_rel_2m",
    "humedad_suelo",
    "precipitacion",
    "viento_2m",
    "altitud",
    "dia_sin",
    "dia_cos",
    "temp_min_2m",
    "temp_min_lag_1",
    "temp_min_lag_2",
    "temp_min_lag_3",
]

# Check every input column up front; the lag columns are created further below.
for col in ["fecha", "latitud", "longitud", "altitud"] + feature_cols:
    if not col.startswith("temp_min_lag_") and col not in df_clean.columns:
        raise ValueError(f"Falta la columna {col}")

df_model = df_clean.copy()
df_model["fecha"] = pd.to_datetime(df_model["fecha"])

in_period = (df_model["fecha"] >= START_DATE) & (df_model["fecha"] <= END_DATE)
at_site = (
    df_model["latitud"].between(TARGET_LAT - LAT_TOL, TARGET_LAT + LAT_TOL)
    & df_model["longitud"].between(TARGET_LON - LON_TOL, TARGET_LON + LON_TOL)
    & df_model["altitud"].between(TARGET_ALT - ALT_TOL, TARGET_ALT + ALT_TOL)
)
df_model = df_model[in_period & at_site].copy()
if df_model.empty:
    raise ValueError("No hay filas para el periodo y la ubicación seleccionados")

# shift() moves values by rows, not by days. Keep one row per date and re-index on a
# gap-free daily calendar so that every lag and target below is a true day offset;
# days missing from the data become NaN rows and are dropped inside the loop.
df_model = (
    df_model.sort_values("fecha", kind="stable")
    .drop_duplicates(subset="fecha", keep="first")
    .set_index("fecha")
    .asfreq("D")
    .reset_index()
)

for lag in [1, 2, 3]:
    df_model[f"temp_min_lag_{lag}"] = df_model["temp_min_2m"].shift(lag)

cutoff = pd.Timestamp(CUTOFF_DATE)

rows = []
pred_series = {}

for H in [1, 2, 3]:
    df_h = df_model.copy()
    df_h["y"] = df_h["temp_min_2m"].shift(-H)  # target: minimum temperature H days later
    # Drop only rows lacking a model input or the target; NaNs in unrelated columns
    # must not discard usable rows.
    df_h = df_h.dropna(subset=feature_cols + ["y"]).reset_index(drop=True)

    X = df_h[feature_cols]
    y = df_h["y"]

    train_mask = df_h["fecha"] < cutoff
    X_train, X_test = X[train_mask], X[~train_mask]
    y_train, y_test = y[train_mask], y[~train_mask]
    fechas_test = df_h.loc[~train_mask, "fecha"].values
    if X_train.empty or X_test.empty:
        raise ValueError(f"Sin datos de entrenamiento o de prueba para t+{H}")

    # The scaler sits inside the pipeline, so it is fitted on the training rows only.
    model = Pipeline([
        ("scaler", StandardScaler()),
        ("svr", SVR(
            kernel="rbf",
            C=10.0,
            epsilon=0.2,
            gamma="scale"
        ))
    ])

    t0 = time.perf_counter()
    model.fit(X_train, y_train)
    train_time = time.perf_counter() - t0

    y_pred = model.predict(X_test)

    mae = mean_absolute_error(y_test, y_pred)
    rmse = np.sqrt(mean_squared_error(y_test, y_pred))
    r2 = r2_score(y_test, y_pred)
    # Symmetric MAPE in percent; 1e-6 only guards against a zero denominator.
    smape = np.mean(
        2 * np.abs(y_test - y_pred) /
        (np.abs(y_test) + np.abs(y_pred) + 1e-6)
    ) * 100

    rows.append({
        "Horizonte": f"t+{H}",
        "MAE": mae,
        "RMSE": rmse,
        "R2": r2,
        "sMAPE_%": smape,
        "Tiempo_s": train_time,
        "N_train": len(X_train),
        "N_test": len(X_test),
    })

    pred_series[H] = (fechas_test, y_test.values, y_pred)

summary = pd.DataFrame(rows).sort_values("Horizonte")
pd.set_option("display.max_columns", None)
print(summary)

# Observed vs. predicted series on the test period, one figure per horizon.
for H in [1, 2, 3]:
    fechas_test, y_test_vals, y_pred_vals = pred_series[H]

    plt.figure(figsize=(11, 4))
    plt.plot(fechas_test, y_test_vals, label="Observado")
    plt.plot(fechas_test, y_pred_vals, label="Predicho")
    plt.axhline(FROST_THRESHOLD, linestyle="--")
    plt.title(f"SVR — temp_min_2m, horizonte t+{H}")
    plt.xlabel("Fecha")
    plt.ylabel("°C")
    plt.legend()
    plt.tight_layout(pad=2.5)
    plt.show()


In [None]:
# MLPRegressor

import time
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.neural_network import MLPRegressor
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

# Forecasts temp_min_2m H = 1, 2 and 3 days ahead for a single site. Rows dated before
# CUTOFF_DATE are used for training and the remaining rows for evaluation.

START_DATE = "2000-01-01"
END_DATE = "2025-12-31"
CUTOFF_DATE = "2019-01-01"

FROST_THRESHOLD = 0.0  # drawn as a reference line on the forecast plots

# Only rows within these tolerances of the target coordinates/altitude are modelled.
TARGET_LAT = -12.6679
TARGET_LON = -75.3245
TARGET_ALT = 4226.68

LAT_TOL = 0.005
LON_TOL = 0.005
ALT_TOL = 50

feature_cols = [
    "temp_2m",
    "humedad_rel_2m",
    "humedad_suelo",
    "precipitacion",
    "viento_2m",
    "altitud",
    "dia_sin",
    "dia_cos",
    "temp_min_2m",
    "temp_min_lag_1",
    "temp_min_lag_2",
    "temp_min_lag_3",
]

# Check every input column up front; the lag columns are created further below.
for col in ["fecha", "latitud", "longitud", "altitud"] + feature_cols:
    if not col.startswith("temp_min_lag_") and col not in df_clean.columns:
        raise ValueError(f"Falta la columna {col}")

df_model = df_clean.copy()
df_model["fecha"] = pd.to_datetime(df_model["fecha"])

in_period = (df_model["fecha"] >= START_DATE) & (df_model["fecha"] <= END_DATE)
at_site = (
    df_model["latitud"].between(TARGET_LAT - LAT_TOL, TARGET_LAT + LAT_TOL)
    & df_model["longitud"].between(TARGET_LON - LON_TOL, TARGET_LON + LON_TOL)
    & df_model["altitud"].between(TARGET_ALT - ALT_TOL, TARGET_ALT + ALT_TOL)
)
df_model = df_model[in_period & at_site].copy()
if df_model.empty:
    raise ValueError("No hay filas para el periodo y la ubicación seleccionados")

# shift() moves values by rows, not by days. Keep one row per date and re-index on a
# gap-free daily calendar so that every lag and target below is a true day offset;
# days missing from the data become NaN rows and are dropped inside the loop.
df_model = (
    df_model.sort_values("fecha", kind="stable")
    .drop_duplicates(subset="fecha", keep="first")
    .set_index("fecha")
    .asfreq("D")
    .reset_index()
)

for lag in [1, 2, 3]:
    df_model[f"temp_min_lag_{lag}"] = df_model["temp_min_2m"].shift(lag)

cutoff = pd.Timestamp(CUTOFF_DATE)

rows = []
pred_series = {}

for H in [1, 2, 3]:
    df_h = df_model.copy()
    df_h["y"] = df_h["temp_min_2m"].shift(-H)  # target: minimum temperature H days later
    # Drop only rows lacking a model input or the target; NaNs in unrelated columns
    # must not discard usable rows.
    df_h = df_h.dropna(subset=feature_cols + ["y"]).reset_index(drop=True)

    X = df_h[feature_cols]
    y = df_h["y"]

    train_mask = df_h["fecha"] < cutoff
    X_train, X_test = X[train_mask], X[~train_mask]
    y_train, y_test = y[train_mask], y[~train_mask]
    fechas_test = df_h.loc[~train_mask, "fecha"].values
    if X_train.empty or X_test.empty:
        raise ValueError(f"Sin datos de entrenamiento o de prueba para t+{H}")

    # The scaler sits inside the pipeline, so it is fitted on the training rows only.
    model = Pipeline([
        ("scaler", StandardScaler()),
        ("mlp", MLPRegressor(
            hidden_layer_sizes=(64, 32),
            activation="relu",
            solver="adam",
            alpha=1e-4,
            learning_rate_init=1e-3,
            max_iter=300,
            early_stopping=True,
            validation_fraction=0.15,
            n_iter_no_change=20,
            random_state=42
        ))
    ])

    t0 = time.perf_counter()
    model.fit(X_train, y_train)
    train_time = time.perf_counter() - t0

    y_pred = model.predict(X_test)

    mae = mean_absolute_error(y_test, y_pred)
    rmse = np.sqrt(mean_squared_error(y_test, y_pred))
    r2 = r2_score(y_test, y_pred)
    # Symmetric MAPE in percent; 1e-6 only guards against a zero denominator.
    smape = np.mean(
        2 * np.abs(y_test - y_pred) /
        (np.abs(y_test) + np.abs(y_pred) + 1e-6)
    ) * 100

    rows.append({
        "Horizonte": f"t+{H}",
        "MAE": mae,
        "RMSE": rmse,
        "R2": r2,
        "sMAPE_%": smape,
        "Tiempo_s": train_time,
        "N_train": len(X_train),
        "N_test": len(X_test),
    })

    pred_series[H] = (fechas_test, y_test.values, y_pred)

summary = pd.DataFrame(rows).sort_values("Horizonte")
pd.set_option("display.max_columns", None)
print(summary)

# Observed vs. predicted series on the test period, one figure per horizon.
for H in [1, 2, 3]:
    fechas_test, y_test_vals, y_pred_vals = pred_series[H]

    plt.figure(figsize=(11, 4))
    plt.plot(fechas_test, y_test_vals, label="Observado")
    plt.plot(fechas_test, y_pred_vals, label="Predicho")
    plt.axhline(FROST_THRESHOLD, linestyle="--")
    plt.title(f"MLPRegressor — temp_min_2m, horizonte t+{H}")
    plt.xlabel("Fecha")
    plt.ylabel("°C")
    plt.legend()
    plt.tight_layout(pad=2.5)
    plt.show()


In [None]:
# Ridge regression (linear model with an L2 penalty)

import time
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.linear_model import Ridge
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

# Forecasts temp_min_2m H = 1, 2 and 3 days ahead for a single site. Rows dated before
# CUTOFF_DATE are used for training and the remaining rows for evaluation.

START_DATE = "2000-01-01"
END_DATE = "2025-12-31"
CUTOFF_DATE = "2019-01-01"

FROST_THRESHOLD = 0.0  # drawn as a reference line on the forecast plots

# Only rows within these tolerances of the target coordinates/altitude are modelled.
TARGET_LAT = -12.6679
TARGET_LON = -75.3245
TARGET_ALT = 4226.68

LAT_TOL = 0.005
LON_TOL = 0.005
ALT_TOL = 50

feature_cols = [
    "temp_2m",
    "humedad_rel_2m",
    "humedad_suelo",
    "precipitacion",
    "viento_2m",
    "altitud",
    "dia_sin",
    "dia_cos",
    "temp_min_2m",
    "temp_min_lag_1",
    "temp_min_lag_2",
    "temp_min_lag_3",
]

# Check every input column up front; the lag columns are created further below.
for col in ["fecha", "latitud", "longitud", "altitud"] + feature_cols:
    if not col.startswith("temp_min_lag_") and col not in df_clean.columns:
        raise ValueError(f"Falta la columna {col}")

df_model = df_clean.copy()
df_model["fecha"] = pd.to_datetime(df_model["fecha"])

in_period = (df_model["fecha"] >= START_DATE) & (df_model["fecha"] <= END_DATE)
at_site = (
    df_model["latitud"].between(TARGET_LAT - LAT_TOL, TARGET_LAT + LAT_TOL)
    & df_model["longitud"].between(TARGET_LON - LON_TOL, TARGET_LON + LON_TOL)
    & df_model["altitud"].between(TARGET_ALT - ALT_TOL, TARGET_ALT + ALT_TOL)
)
df_model = df_model[in_period & at_site].copy()
if df_model.empty:
    raise ValueError("No hay filas para el periodo y la ubicación seleccionados")

# shift() moves values by rows, not by days. Keep one row per date and re-index on a
# gap-free daily calendar so that every lag and target below is a true day offset;
# days missing from the data become NaN rows and are dropped inside the loop.
df_model = (
    df_model.sort_values("fecha", kind="stable")
    .drop_duplicates(subset="fecha", keep="first")
    .set_index("fecha")
    .asfreq("D")
    .reset_index()
)

for lag in [1, 2, 3]:
    df_model[f"temp_min_lag_{lag}"] = df_model["temp_min_2m"].shift(lag)

cutoff = pd.Timestamp(CUTOFF_DATE)

rows = []
pred_series = {}
coefs_by_h = {}

ALPHA = 1.0  # strength of the Ridge penalty

for H in [1, 2, 3]:
    df_h = df_model.copy()
    df_h["y"] = df_h["temp_min_2m"].shift(-H)  # target: minimum temperature H days later
    # Drop only rows lacking a model input or the target; NaNs in unrelated columns
    # must not discard usable rows.
    df_h = df_h.dropna(subset=feature_cols + ["y"]).reset_index(drop=True)

    X = df_h[feature_cols]
    y = df_h["y"]

    train_mask = df_h["fecha"] < cutoff
    X_train, X_test = X[train_mask], X[~train_mask]
    y_train, y_test = y[train_mask], y[~train_mask]
    fechas_test = df_h.loc[~train_mask, "fecha"].values
    if X_train.empty or X_test.empty:
        raise ValueError(f"Sin datos de entrenamiento o de prueba para t+{H}")

    # The scaler sits inside the pipeline, so it is fitted on the training rows only.
    model = Pipeline([
        ("scaler", StandardScaler()),
        ("ridge", Ridge(alpha=ALPHA, random_state=42))
    ])

    t0 = time.perf_counter()
    model.fit(X_train, y_train)
    train_time = time.perf_counter() - t0

    y_pred = model.predict(X_test)

    mae = mean_absolute_error(y_test, y_pred)
    rmse = np.sqrt(mean_squared_error(y_test, y_pred))
    r2 = r2_score(y_test, y_pred)
    # Symmetric MAPE in percent; 1e-6 only guards against a zero denominator.
    smape = np.mean(
        2 * np.abs(y_test - y_pred) /
        (np.abs(y_test) + np.abs(y_pred) + 1e-6)
    ) * 100

    rows.append({
        "Horizonte": f"t+{H}",
        "MAE": mae,
        "RMSE": rmse,
        "R2": r2,
        "sMAPE_%": smape,
        "Tiempo_s": train_time,
        "N_train": len(X_train),
        "N_test": len(X_test),
        "alpha": ALPHA
    })

    pred_series[H] = (fechas_test, y_test.values, y_pred)

    # Coefficients apply to the standardized features; sorted by absolute size.
    ridge_model = model.named_steps["ridge"]
    coefs_by_h[H] = pd.Series(
        ridge_model.coef_, index=feature_cols
    ).sort_values(key=np.abs, ascending=False)

summary = pd.DataFrame(rows).sort_values("Horizonte")
pd.set_option("display.max_columns", None)
print(summary)

for H in [1, 2, 3]:
    print(coefs_by_h[H].head(10))

# Observed vs. predicted series on the test period, one figure per horizon.
for H in [1, 2, 3]:
    fechas_test, y_test_vals, y_pred_vals = pred_series[H]

    plt.figure(figsize=(11, 4))
    plt.plot(fechas_test, y_test_vals, label="Observado")
    plt.plot(fechas_test, y_pred_vals, label="Predicho")
    plt.axhline(FROST_THRESHOLD, linestyle="--")
    plt.title(f"Ridge — temp_min_2m, horizonte t+{H}")
    plt.xlabel("Fecha")
    plt.ylabel("°C")
    plt.legend()
    plt.tight_layout(pad=2.5)
    plt.show()


In [None]:
# GradientBoostingRegressor

import time
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.ensemble import GradientBoostingRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

# Forecasts temp_min_2m H = 1, 2 and 3 days ahead for a single site. Rows dated before
# CUTOFF_DATE are used for training and the remaining rows for evaluation.

START_DATE = "2000-01-01"
END_DATE = "2025-12-31"
CUTOFF_DATE = "2019-01-01"

FROST_THRESHOLD = 0.0  # drawn as a reference line on the forecast plots

# Only rows within these tolerances of the target coordinates/altitude are modelled.
TARGET_LAT = -12.6679
TARGET_LON = -75.3245
TARGET_ALT = 4226.68

LAT_TOL = 0.005
LON_TOL = 0.005
ALT_TOL = 50

feature_cols = [
    "temp_2m",
    "humedad_rel_2m",
    "humedad_suelo",
    "precipitacion",
    "viento_2m",
    "altitud",
    "dia_sin",
    "dia_cos",
    "temp_min_2m",
    "temp_min_lag_1",
    "temp_min_lag_2",
    "temp_min_lag_3",
]

# Check every input column up front; the lag columns are created further below.
for col in ["fecha", "latitud", "longitud", "altitud"] + feature_cols:
    if not col.startswith("temp_min_lag_") and col not in df_clean.columns:
        raise ValueError(f"Falta la columna {col}")

df_model = df_clean.copy()
df_model["fecha"] = pd.to_datetime(df_model["fecha"])

in_period = (df_model["fecha"] >= START_DATE) & (df_model["fecha"] <= END_DATE)
at_site = (
    df_model["latitud"].between(TARGET_LAT - LAT_TOL, TARGET_LAT + LAT_TOL)
    & df_model["longitud"].between(TARGET_LON - LON_TOL, TARGET_LON + LON_TOL)
    & df_model["altitud"].between(TARGET_ALT - ALT_TOL, TARGET_ALT + ALT_TOL)
)
df_model = df_model[in_period & at_site].copy()
if df_model.empty:
    raise ValueError("No hay filas para el periodo y la ubicación seleccionados")

# shift() moves values by rows, not by days. Keep one row per date and re-index on a
# gap-free daily calendar so that every lag and target below is a true day offset;
# days missing from the data become NaN rows and are dropped inside the loop.
df_model = (
    df_model.sort_values("fecha", kind="stable")
    .drop_duplicates(subset="fecha", keep="first")
    .set_index("fecha")
    .asfreq("D")
    .reset_index()
)

for lag in [1, 2, 3]:
    df_model[f"temp_min_lag_{lag}"] = df_model["temp_min_2m"].shift(lag)

cutoff = pd.Timestamp(CUTOFF_DATE)

rows = []
pred_series = {}
importances_by_h = {}

for H in [1, 2, 3]:
    df_h = df_model.copy()
    df_h["y"] = df_h["temp_min_2m"].shift(-H)  # target: minimum temperature H days later
    # Drop only rows lacking a model input or the target; NaNs in unrelated columns
    # must not discard usable rows.
    df_h = df_h.dropna(subset=feature_cols + ["y"]).reset_index(drop=True)

    X = df_h[feature_cols]
    y = df_h["y"]

    train_mask = df_h["fecha"] < cutoff
    X_train, X_test = X[train_mask], X[~train_mask]
    y_train, y_test = y[train_mask], y[~train_mask]
    fechas_test = df_h.loc[~train_mask, "fecha"].values
    if X_train.empty or X_test.empty:
        raise ValueError(f"Sin datos de entrenamiento o de prueba para t+{H}")

    model = GradientBoostingRegressor(
        n_estimators=600,
        learning_rate=0.03,
        max_depth=3,
        subsample=0.8,
        random_state=42
    )

    t0 = time.perf_counter()
    model.fit(X_train, y_train)
    train_time = time.perf_counter() - t0

    y_pred = model.predict(X_test)

    mae = mean_absolute_error(y_test, y_pred)
    rmse = np.sqrt(mean_squared_error(y_test, y_pred))
    r2 = r2_score(y_test, y_pred)
    # Symmetric MAPE in percent; 1e-6 only guards against a zero denominator.
    smape = np.mean(
        2 * np.abs(y_test - y_pred) /
        (np.abs(y_test) + np.abs(y_pred) + 1e-6)
    ) * 100

    rows.append({
        "Horizonte": f"t+{H}",
        "MAE": mae,
        "RMSE": rmse,
        "R2": r2,
        "sMAPE_%": smape,
        "Tiempo_s": train_time,
        "N_train": len(X_train),
        "N_test": len(X_test),
    })

    pred_series[H] = (fechas_test, y_test.values, y_pred)

    # Feature importances of the fitted model, largest first.
    importances_by_h[H] = pd.Series(
        model.feature_importances_, index=feature_cols
    ).sort_values(ascending=False)

summary = pd.DataFrame(rows).sort_values("Horizonte")
pd.set_option("display.max_columns", None)
print(summary)

for H in [1, 2, 3]:
    print(importances_by_h[H].head(10))

# Observed vs. predicted series on the test period, one figure per horizon.
for H in [1, 2, 3]:
    fechas_test, y_test_vals, y_pred_vals = pred_series[H]

    plt.figure(figsize=(11, 4))
    plt.plot(fechas_test, y_test_vals, label="Observado")
    plt.plot(fechas_test, y_pred_vals, label="Predicho")
    plt.axhline(FROST_THRESHOLD, linestyle="--")
    plt.title(f"GradientBoostingRegressor — temp_min_2m, horizonte t+{H}")
    plt.xlabel("Fecha")
    plt.ylabel("°C")
    plt.legend()
    plt.tight_layout(pad=2.5)
    plt.show()


In [None]:
# KNeighborsRegressor

import time
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.neighbors import KNeighborsRegressor
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

# --- Experiment configuration ------------------------------------------------
# Rows with START_DATE <= fecha <= END_DATE are kept; rows before CUTOFF_DATE
# are used for training and the remaining rows for testing.
START_DATE = "2000-01-01"
END_DATE = "2025-12-31"
CUTOFF_DATE = "2019-01-01"

# Reference level drawn as a horizontal line in the plots of this cell.
FROST_THRESHOLD = 0.0

# Target site: a row is kept when each coordinate lies within +/- its tolerance.
TARGET_LAT = -12.6679
TARGET_LON = -75.3245
TARGET_ALT = 4226.68

LAT_TOL = 0.005
LON_TOL = 0.005
ALT_TOL = 50

# Number of neighbours used by the KNN regressor.
K = 25

# --- Row selection -------------------------------------------------------------
df_model = df_clean.copy()
df_model["fecha"] = pd.to_datetime(df_model["fecha"])

in_period = (df_model["fecha"] >= START_DATE) & (df_model["fecha"] <= END_DATE)
near_lat = df_model["latitud"].between(TARGET_LAT - LAT_TOL, TARGET_LAT + LAT_TOL)
near_lon = df_model["longitud"].between(TARGET_LON - LON_TOL, TARGET_LON + LON_TOL)
near_alt = df_model["altitud"].between(TARGET_ALT - ALT_TOL, TARGET_ALT + ALT_TOL)

df_model = df_model[in_period & near_lat & near_lon & near_alt].copy()

# --- Feature engineering -------------------------------------------------------
# The seasonal encodings must already be present in the cleaned frame.
missing_col = next(
    (name for name in ("dia_sin", "dia_cos") if name not in df_model.columns),
    None,
)
if missing_col is not None:
    raise ValueError(f"Falta la columna {missing_col}")

# Lagged minimum temperature. The shift is positional over the filtered rows,
# so a lag equals "k days ago" only if the rows are consecutive daily records
# (assumed -- TODO confirm there are no gaps or duplicate dates).
temp_min = df_model["temp_min_2m"]
df_model = df_model.assign(
    **{f"temp_min_lag_{k}": temp_min.shift(k) for k in (1, 2, 3)}
)

feature_cols = [
    "temp_2m",
    "humedad_rel_2m",
    "humedad_suelo",
    "precipitacion",
    "viento_2m",
    "altitud",
    "dia_sin",
    "dia_cos",
    "temp_min_2m",
] + [f"temp_min_lag_{k}" for k in (1, 2, 3)]

cutoff = pd.Timestamp(CUTOFF_DATE)

# Containers filled by the training loop: metric rows and per-horizon series.
rows = []
pred_series = {}

# Train and evaluate one KNN pipeline per forecast horizon H.
# H counts rows ahead in df_model; it equals days ahead only if the rows are
# consecutive daily records (assumed -- TODO confirm).
for H in [1, 2, 3]:
    df_h = df_model.copy()
    # Target: temp_min_2m observed H rows after the current row.
    df_h["y"] = df_h["temp_min_2m"].shift(-H)

    # Drop rows with gaps only in the columns the model actually uses.
    # A bare dropna() would also discard rows whose NaN sits in a column that
    # is not a feature, silently shrinking the train/test sets.
    # NOTE: keep the same row filter in every model cell so that metrics
    # remain comparable across models.
    df_h = df_h.dropna(subset=feature_cols + ["y"]).reset_index(drop=True)

    X = df_h[feature_cols]
    y = df_h["y"]

    # Chronological split: rows strictly before the cutoff train the model.
    train_mask = df_h["fecha"] < cutoff
    X_train, X_test = X.loc[train_mask], X.loc[~train_mask]
    y_train, y_test = y.loc[train_mask], y.loc[~train_mask]
    fechas_test = df_h.loc[~train_mask, "fecha"].values

    # Fail early with an explicit message instead of an opaque estimator error.
    if X_train.empty or X_test.empty:
        raise ValueError(
            f"Partición vacía para t+{H}: "
            f"N_train={len(X_train)}, N_test={len(X_test)}"
        )

    # Features are standardised before the distance computation; neighbours
    # are weighted by inverse distance (Minkowski with p=2, i.e. Euclidean).
    model = Pipeline([
        ("scaler", StandardScaler()),
        ("knn", KNeighborsRegressor(
            n_neighbors=K,
            weights="distance",
            metric="minkowski",
            p=2
        ))
    ])

    # perf_counter is a monotonic clock intended for measuring durations.
    t0 = time.perf_counter()
    model.fit(X_train, y_train)
    train_time = time.perf_counter() - t0

    y_pred = model.predict(X_test)
    y_true = y_test.to_numpy()

    mae = mean_absolute_error(y_true, y_pred)
    rmse = np.sqrt(mean_squared_error(y_true, y_pred))
    r2 = r2_score(y_true, y_pred)
    # Symmetric MAPE in percent; the 1e-6 term avoids division by zero.
    smape = np.mean(
        2 * np.abs(y_true - y_pred) /
        (np.abs(y_true) + np.abs(y_pred) + 1e-6)
    ) * 100

    rows.append({
        "Horizonte": f"t+{H}",
        "MAE": mae,
        "RMSE": rmse,
        "R2": r2,
        "sMAPE_%": smape,
        "Tiempo_s": train_time,
        "N_train": len(X_train),
        "N_test": len(X_test),
        "k": K
    })

    # Stored for plotting: (test dates, observed values, predicted values).
    pred_series[H] = (fechas_test, y_true, y_pred)

# Summary table: one row of metrics per forecast horizon, ordered by its label.
summary = pd.DataFrame(rows).sort_values("Horizonte")
pd.set_option("display.max_columns", None)  # never truncate the metric columns
print(summary)

# One figure per horizon: observed vs. predicted values on the test period,
# with the frost threshold drawn as a dashed reference line. Labels and a
# legend are set so each figure can be read on its own.
for H in [1, 2, 3]:
    fechas_test, y_test_vals, y_pred_vals = pred_series[H]

    fig, ax = plt.subplots(figsize=(11, 4))
    ax.plot(fechas_test, y_test_vals, label="Observado")
    ax.plot(fechas_test, y_pred_vals, label="Predicho")
    ax.axhline(
        FROST_THRESHOLD,
        linestyle="--",
        color="gray",
        label=f"Umbral de helada ({FROST_THRESHOLD} °C)",
    )
    ax.set_title(f"KNN: temperatura mínima observada vs. predicha (t+{H})")
    ax.set_xlabel("Fecha")
    ax.set_ylabel("Temperatura mínima (°C)")
    ax.legend()
    fig.tight_layout(pad=2.5)
    plt.show()


In [None]:
# AdaBoostRegressor

import time
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.ensemble import AdaBoostRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

# --- Experiment configuration ------------------------------------------------
# Date window of the rows to keep, and the date that separates train from test.
START_DATE = "2000-01-01"
END_DATE = "2025-12-31"
CUTOFF_DATE = "2019-01-01"

# Reference level drawn as a horizontal line in the plots of this cell.
FROST_THRESHOLD = 0.0

# Target site and the tolerance applied around each coordinate.
TARGET_LAT = -12.6679
TARGET_LON = -75.3245
TARGET_ALT = 4226.68

LAT_TOL = 0.005
LON_TOL = 0.005
ALT_TOL = 50

# --- Row selection -------------------------------------------------------------
# First restrict to the date window (both ends inclusive), then to the rows
# whose coordinates fall inside the tolerance box around the target site.
df_model = df_clean.copy()
df_model["fecha"] = pd.to_datetime(df_model["fecha"])
df_model = (
    df_model
    .loc[lambda d: d["fecha"].between(START_DATE, END_DATE)]
    .loc[
        lambda d: d["latitud"].between(TARGET_LAT - LAT_TOL, TARGET_LAT + LAT_TOL)
        & d["longitud"].between(TARGET_LON - LON_TOL, TARGET_LON + LON_TOL)
        & d["altitud"].between(TARGET_ALT - ALT_TOL, TARGET_ALT + ALT_TOL)
    ]
    .copy()
)

# --- Feature engineering -------------------------------------------------------
# The seasonal encodings must already exist; report the first one missing.
absent = [name for name in ["dia_sin", "dia_cos"] if name not in df_model.columns]
if absent:
    raise ValueError(f"Falta la columna {absent[0]}")

# Lagged minimum temperature, shifted by row position (not by calendar date).
# Assumes consecutive daily rows -- TODO confirm.
for n_back in range(1, 4):
    df_model[f"temp_min_lag_{n_back}"] = df_model["temp_min_2m"].shift(n_back)

feature_cols = [
    "temp_2m", "humedad_rel_2m", "humedad_suelo", "precipitacion",
    "viento_2m", "altitud", "dia_sin", "dia_cos",
    "temp_min_2m", "temp_min_lag_1", "temp_min_lag_2", "temp_min_lag_3",
]

cutoff = pd.Timestamp(CUTOFF_DATE)

# Containers filled by the training loop: metric rows and per-horizon series.
rows, pred_series = [], {}

# Train and evaluate one AdaBoost model per forecast horizon H.
# H counts rows ahead in df_model; it equals days ahead only if the rows are
# consecutive daily records (assumed -- TODO confirm).
for H in [1, 2, 3]:
    df_h = df_model.copy()
    # Target: temp_min_2m observed H rows after the current row.
    df_h["y"] = df_h["temp_min_2m"].shift(-H)

    # Drop rows with gaps only in the columns the model actually uses.
    # A bare dropna() would also discard rows whose NaN sits in a column that
    # is not a feature, silently shrinking the train/test sets.
    # NOTE: keep the same row filter in every model cell so that metrics
    # remain comparable across models.
    df_h = df_h.dropna(subset=feature_cols + ["y"]).reset_index(drop=True)

    X = df_h[feature_cols]
    y = df_h["y"]

    # Chronological split: rows strictly before the cutoff train the model.
    train_mask = df_h["fecha"] < cutoff
    X_train, X_test = X.loc[train_mask], X.loc[~train_mask]
    y_train, y_test = y.loc[train_mask], y.loc[~train_mask]
    fechas_test = df_h.loc[~train_mask, "fecha"].values

    # Fail early with an explicit message instead of an opaque estimator error.
    if X_train.empty or X_test.empty:
        raise ValueError(
            f"Partición vacía para t+{H}: "
            f"N_train={len(X_train)}, N_test={len(X_test)}"
        )

    # Shallow regression tree used as the weak learner being boosted.
    base_tree = DecisionTreeRegressor(
        max_depth=3,
        min_samples_leaf=10,
        random_state=42
    )

    model = AdaBoostRegressor(
        estimator=base_tree,
        n_estimators=400,
        learning_rate=0.05,
        loss="linear",
        random_state=42
    )

    # perf_counter is a monotonic clock intended for measuring durations.
    t0 = time.perf_counter()
    model.fit(X_train, y_train)
    train_time = time.perf_counter() - t0

    y_pred = model.predict(X_test)
    y_true = y_test.to_numpy()

    mae = mean_absolute_error(y_true, y_pred)
    rmse = np.sqrt(mean_squared_error(y_true, y_pred))
    r2 = r2_score(y_true, y_pred)
    # Symmetric MAPE in percent; the 1e-6 term avoids division by zero.
    smape = np.mean(
        2 * np.abs(y_true - y_pred) /
        (np.abs(y_true) + np.abs(y_pred) + 1e-6)
    ) * 100

    rows.append({
        "Horizonte": f"t+{H}",
        "MAE": mae,
        "RMSE": rmse,
        "R2": r2,
        "sMAPE_%": smape,
        "Tiempo_s": train_time,
        "N_train": len(X_train),
        "N_test": len(X_test),
    })

    # Stored for plotting: (test dates, observed values, predicted values).
    pred_series[H] = (fechas_test, y_true, y_pred)

# Summary table: one row of metrics per forecast horizon, ordered by its label.
summary = pd.DataFrame(rows).sort_values("Horizonte")
pd.set_option("display.max_columns", None)  # never truncate the metric columns
print(summary)

# One figure per horizon: observed vs. predicted values on the test period,
# with the frost threshold drawn as a dashed reference line. Labels and a
# legend are set so each figure can be read on its own.
for H in [1, 2, 3]:
    fechas_test, y_test_vals, y_pred_vals = pred_series[H]

    fig, ax = plt.subplots(figsize=(11, 4))
    ax.plot(fechas_test, y_test_vals, label="Observado")
    ax.plot(fechas_test, y_pred_vals, label="Predicho")
    ax.axhline(
        FROST_THRESHOLD,
        linestyle="--",
        color="gray",
        label=f"Umbral de helada ({FROST_THRESHOLD} °C)",
    )
    ax.set_title(f"AdaBoost: temperatura mínima observada vs. predicha (t+{H})")
    ax.set_xlabel("Fecha")
    ax.set_ylabel("Temperatura mínima (°C)")
    ax.legend()
    fig.tight_layout(pad=2.5)
    plt.show()


In [None]:
# ExtraTreesRegressor

import time
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.ensemble import ExtraTreesRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

# --- Experiment configuration ------------------------------------------------
# Date window of the rows to keep, and the date that separates train from test.
START_DATE = "2000-01-01"
END_DATE = "2025-12-31"
CUTOFF_DATE = "2019-01-01"

# Reference level drawn as a horizontal line in the plots of this cell.
FROST_THRESHOLD = 0.0

# Target site and the tolerance applied around each coordinate.
TARGET_LAT = -12.6679
TARGET_LON = -75.3245
TARGET_ALT = 4226.68

LAT_TOL = 0.005
LON_TOL = 0.005
ALT_TOL = 50

# --- Row selection -------------------------------------------------------------
df_model = df_clean.copy()
df_model["fecha"] = pd.to_datetime(df_model["fecha"])

# Start from the date-window mask and narrow it with one tolerance band per
# coordinate column; a row survives only if every condition holds.
keep_rows = (df_model["fecha"] >= START_DATE) & (df_model["fecha"] <= END_DATE)
for column, lower, upper in [
    ("latitud", TARGET_LAT - LAT_TOL, TARGET_LAT + LAT_TOL),
    ("longitud", TARGET_LON - LON_TOL, TARGET_LON + LON_TOL),
    ("altitud", TARGET_ALT - ALT_TOL, TARGET_ALT + ALT_TOL),
]:
    keep_rows = keep_rows & df_model[column].between(lower, upper)

df_model = df_model[keep_rows].copy()

# --- Feature engineering -------------------------------------------------------
# The seasonal encodings must already be present in the cleaned frame.
for seasonal in ("dia_sin", "dia_cos"):
    if seasonal not in df_model.columns:
        raise ValueError(f"Falta la columna {seasonal}")

# Lagged minimum temperature, shifted by row position (not by calendar date).
# Assumes consecutive daily rows -- TODO confirm.
lag_names = []
for step in (1, 2, 3):
    lag_name = f"temp_min_lag_{step}"
    df_model[lag_name] = df_model["temp_min_2m"].shift(step)
    lag_names.append(lag_name)

feature_cols = [
    "temp_2m",
    "humedad_rel_2m",
    "humedad_suelo",
    "precipitacion",
    "viento_2m",
    "altitud",
    "dia_sin",
    "dia_cos",
    "temp_min_2m",
    *lag_names,
]

cutoff = pd.Timestamp(CUTOFF_DATE)

# Containers filled by the training loop: metric rows, per-horizon series and
# per-horizon feature importances.
rows = []
pred_series = {}
importances_by_h = {}

# Train and evaluate one ExtraTrees model per forecast horizon H.
# H counts rows ahead in df_model; it equals days ahead only if the rows are
# consecutive daily records (assumed -- TODO confirm).
for H in [1, 2, 3]:
    df_h = df_model.copy()
    # Target: temp_min_2m observed H rows after the current row.
    df_h["y"] = df_h["temp_min_2m"].shift(-H)

    # Drop rows with gaps only in the columns the model actually uses.
    # A bare dropna() would also discard rows whose NaN sits in a column that
    # is not a feature, silently shrinking the train/test sets.
    # NOTE: keep the same row filter in every model cell so that metrics
    # remain comparable across models.
    df_h = df_h.dropna(subset=feature_cols + ["y"]).reset_index(drop=True)

    X = df_h[feature_cols]
    y = df_h["y"]

    # Chronological split: rows strictly before the cutoff train the model.
    train_mask = df_h["fecha"] < cutoff
    X_train, X_test = X.loc[train_mask], X.loc[~train_mask]
    y_train, y_test = y.loc[train_mask], y.loc[~train_mask]
    fechas_test = df_h.loc[~train_mask, "fecha"].values

    # Fail early with an explicit message instead of an opaque estimator error.
    if X_train.empty or X_test.empty:
        raise ValueError(
            f"Partición vacía para t+{H}: "
            f"N_train={len(X_train)}, N_test={len(X_test)}"
        )

    model = ExtraTreesRegressor(
        n_estimators=600,
        max_depth=None,
        min_samples_leaf=5,
        min_samples_split=10,
        max_features="sqrt",
        bootstrap=False,
        random_state=42,
        n_jobs=-1
    )

    # perf_counter is a monotonic clock intended for measuring durations.
    t0 = time.perf_counter()
    model.fit(X_train, y_train)
    train_time = time.perf_counter() - t0

    y_pred = model.predict(X_test)
    y_true = y_test.to_numpy()

    mae = mean_absolute_error(y_true, y_pred)
    rmse = np.sqrt(mean_squared_error(y_true, y_pred))
    r2 = r2_score(y_true, y_pred)
    # Symmetric MAPE in percent; the 1e-6 term avoids division by zero.
    smape = np.mean(
        2 * np.abs(y_true - y_pred) /
        (np.abs(y_true) + np.abs(y_pred) + 1e-6)
    ) * 100

    rows.append({
        "Horizonte": f"t+{H}",
        "MAE": mae,
        "RMSE": rmse,
        "R2": r2,
        "sMAPE_%": smape,
        "Tiempo_s": train_time,
        "N_train": len(X_train),
        "N_test": len(X_test),
    })

    # Stored for plotting: (test dates, observed values, predicted values).
    pred_series[H] = (fechas_test, y_true, y_pred)

    # Feature importances of the fitted model, most important first.
    importances_by_h[H] = pd.Series(
        model.feature_importances_, index=feature_cols
    ).sort_values(ascending=False)

# Summary table: one row of metrics per forecast horizon, ordered by its label.
summary = pd.DataFrame(rows).sort_values("Horizonte")
pd.set_option("display.max_columns", None)  # never truncate the metric columns
print(summary)

# Ten most important features per horizon. Each series was sorted in
# descending order when it was stored, so head(10) is the top ten.
# A header is printed so the three tables can be told apart in the output.
for H in [1, 2, 3]:
    print(f"\nImportancia de variables (t+{H}):")
    print(importances_by_h[H].head(10))

# One figure per horizon: observed vs. predicted values on the test period,
# with the frost threshold drawn as a dashed reference line.
for H in [1, 2, 3]:
    fechas_test, y_test_vals, y_pred_vals = pred_series[H]

    fig, ax = plt.subplots(figsize=(11, 4))
    ax.plot(fechas_test, y_test_vals, label="Observado")
    ax.plot(fechas_test, y_pred_vals, label="Predicho")
    ax.axhline(
        FROST_THRESHOLD,
        linestyle="--",
        color="gray",
        label=f"Umbral de helada ({FROST_THRESHOLD} °C)",
    )
    ax.set_title(f"Extra Trees: temperatura mínima observada vs. predicha (t+{H})")
    ax.set_xlabel("Fecha")
    ax.set_ylabel("Temperatura mínima (°C)")
    ax.legend()
    fig.tight_layout(pad=2.5)
    plt.show()


In [None]:
# GaussianProcessRegressor

import time
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.gaussian_process import GaussianProcessRegressor
from sklearn.gaussian_process.kernels import RBF, WhiteKernel
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

# --- Experiment configuration ------------------------------------------------
# Rows with START_DATE <= fecha <= END_DATE are kept; rows before CUTOFF_DATE
# are used for training and the remaining rows for testing.
START_DATE = "2000-01-01"
END_DATE = "2025-12-31"
CUTOFF_DATE = "2019-01-01"

# Reference level drawn as a horizontal line in the plots of this cell.
FROST_THRESHOLD = 0.0

# Target site. TARGET_LAT is defined here explicitly (same value as in the
# other model cells) so the location filter below does not depend on a
# variable leaking in from a previously executed cell.
TARGET_LAT = -12.6679
TARGET_LON = -75.3245
TARGET_ALT = 4226.68

# A row is kept when each coordinate lies within +/- its tolerance.
LAT_TOL = 0.005
LON_TOL = 0.005
ALT_TOL = 50

# --- Row selection -------------------------------------------------------------
df_model = df_clean.copy()
df_model["fecha"] = pd.to_datetime(df_model["fecha"])

df_model = df_model[
    (df_model["fecha"] >= START_DATE) &
    (df_model["fecha"] <= END_DATE)
].copy()

df_model = df_model[
    (df_model["latitud"].between(TARGET_LAT - LAT_TOL, TARGET_LAT + LAT_TOL)) &
    (df_model["longitud"].between(TARGET_LON - LON_TOL, TARGET_LON + LON_TOL)) &
    (df_model["altitud"].between(TARGET_ALT - ALT_TOL, TARGET_ALT + ALT_TOL))
].copy()

# --- Feature engineering -------------------------------------------------------
# The seasonal encodings must already be present in the cleaned frame.
for col in ["dia_sin", "dia_cos"]:
    if col not in df_model.columns:
        raise ValueError(f"Falta la columna {col}")

# Lagged minimum temperature, shifted by row position (not by calendar date).
# Assumes consecutive daily rows -- TODO confirm.
for lag in [1, 2, 3]:
    df_model[f"temp_min_lag_{lag}"] = df_model["temp_min_2m"].shift(lag)

feature_cols = [
    "temp_2m", "humedad_rel_2m", "humedad_suelo", "precipitacion",
    "viento_2m", "altitud", "dia_sin", "dia_cos",
    "temp_min_2m", "temp_min_lag_1", "temp_min_lag_2", "temp_min_lag_3",
]

cutoff = pd.Timestamp(CUTOFF_DATE)

# Containers filled by the training loop: metric rows and per-horizon series.
rows = []
pred_series = {}

# Train and evaluate one Gaussian-process pipeline per forecast horizon H.
# H counts rows ahead in df_model; it equals days ahead only if the rows are
# consecutive daily records (assumed -- TODO confirm).
for H in [1, 2, 3]:
    df_h = df_model.copy()
    # Target: temp_min_2m observed H rows after the current row.
    df_h["y"] = df_h["temp_min_2m"].shift(-H)

    # Drop rows with gaps only in the columns the model actually uses.
    # A bare dropna() would also discard rows whose NaN sits in a column that
    # is not a feature, silently shrinking the train/test sets.
    # NOTE: keep the same row filter in every model cell so that metrics
    # remain comparable across models.
    df_h = df_h.dropna(subset=feature_cols + ["y"]).reset_index(drop=True)

    X = df_h[feature_cols]
    y = df_h["y"]

    # Chronological split: rows strictly before the cutoff train the model.
    train_mask = df_h["fecha"] < cutoff
    X_train, X_test = X.loc[train_mask], X.loc[~train_mask]
    y_train, y_test = y.loc[train_mask], y.loc[~train_mask]
    fechas_test = df_h.loc[~train_mask, "fecha"].values

    # Fail early with an explicit message instead of an opaque estimator error.
    if X_train.empty or X_test.empty:
        raise ValueError(
            f"Partición vacía para t+{H}: "
            f"N_train={len(X_train)}, N_test={len(X_test)}"
        )

    # Scaled RBF kernel plus a white-noise term for observation noise.
    kernel = 1.0 * RBF(length_scale=1.0) + WhiteKernel(noise_level=0.5)

    # Features are standardised before entering the Gaussian process.
    # NOTE(review): exact GP fitting scales poorly with the number of training
    # rows; the time reported in "Tiempo_s" shows the actual cost.
    model = Pipeline([
        ("scaler", StandardScaler()),
        ("gpr", GaussianProcessRegressor(
            kernel=kernel,
            alpha=1e-6,
            normalize_y=True,
            random_state=42
        ))
    ])

    # perf_counter is a monotonic clock intended for measuring durations.
    t0 = time.perf_counter()
    model.fit(X_train, y_train)
    train_time = time.perf_counter() - t0

    # return_std is forwarded by the pipeline to the final estimator's predict.
    y_pred, y_std = model.predict(X_test, return_std=True)
    y_true = y_test.to_numpy()

    mae = mean_absolute_error(y_true, y_pred)
    rmse = np.sqrt(mean_squared_error(y_true, y_pred))
    r2 = r2_score(y_true, y_pred)
    # Symmetric MAPE in percent; the 1e-6 term avoids division by zero.
    smape = np.mean(
        2 * np.abs(y_true - y_pred) /
        (np.abs(y_true) + np.abs(y_pred) + 1e-6)
    ) * 100

    rows.append({
        "Horizonte": f"t+{H}",
        "MAE": mae,
        "RMSE": rmse,
        "R2": r2,
        "sMAPE_%": smape,
        "Tiempo_s": train_time,
        "N_train": len(X_train),
        "N_test": len(X_test),
    })

    # Stored for plotting: (test dates, observed, predicted mean, predicted std).
    pred_series[H] = (fechas_test, y_true, y_pred, y_std)

# Summary table: one row of metrics per forecast horizon, ordered by its label.
summary = pd.DataFrame(rows).sort_values("Horizonte")
pd.set_option("display.max_columns", None)  # never truncate the metric columns
print(summary)

# One figure per horizon: observed vs. predicted mean on the test period, the
# predictive standard deviation drawn as a shaded band around the mean, and
# the frost threshold as a dashed reference line.
for H in [1, 2, 3]:
    fechas, y_true_vals, y_pred_vals, y_std_vals = pred_series[H]

    fig, ax = plt.subplots(figsize=(11, 4))
    ax.plot(fechas, y_true_vals, label="Observado")
    ax.plot(fechas, y_pred_vals, label="Predicho")
    # Band of +/- 1.96 predictive standard deviations around the mean.
    ax.fill_between(
        fechas,
        y_pred_vals - 1.96 * y_std_vals,
        y_pred_vals + 1.96 * y_std_vals,
        alpha=0.2,
        label="Predicción ± 1.96σ",
    )
    ax.axhline(
        FROST_THRESHOLD,
        linestyle="--",
        color="gray",
        label=f"Umbral de helada ({FROST_THRESHOLD} °C)",
    )
    ax.set_title(f"Proceso gaussiano: temperatura mínima observada vs. predicha (t+{H})")
    ax.set_xlabel("Fecha")
    ax.set_ylabel("Temperatura mínima (°C)")
    ax.legend()
    fig.tight_layout(pad=2.5)
    plt.show()
