# series_de_tiempo_auto

In [19]:
import numpy as np
import pandas as pd
from statsmodels.tsa.stattools import grangercausalitytests 
from itertools import product
from sklearn.model_selection import TimeSeriesSplit
from statsmodels.tsa.statespace.sarimax import SARIMAX
from scipy.stats import variation, boxcox, yeojohnson, linregress, chi2  
from statsmodels.stats.outliers_influence import variance_inflation_factor 
from statsmodels.tsa.stattools import arma_order_select_ic

In [21]:
def inv_yj(y_trans, lmbda):
    return np.power(y_trans * lmbda + 1, 1 / lmbda) - 1 if lmbda != 0 else np.exp(y_trans) - 1

In [12]:
data=pd.read_parquet('df_consol.parquet', engine='pyarrow').reset_index()  
data=data.set_index('FECHA').sort_index()   
df_train=data.iloc[:-300]
df_test=data.iloc[-300:]  
y_train=df_train[['oro']].iloc[:-120] 
transformacion, lambda_opt = yeojohnson(y_train['oro'])
df_train['oro_yj'] = yeojohnson(df_train['oro'], lmbda=lambda_opt)   
df_train['oro_yj_diff'] = df_train['oro_yj'].diff()
df_final_train = df_train.copy()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_train['oro_yj'] = yeojohnson(df_train['oro'], lmbda=lambda_opt)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_train['oro_yj_diff'] = df_train['oro_yj'].diff()


In [13]:
df_granger = df_final_train.copy().dropna()
target= 'oro_yj_diff' 
max_lags = 10
resultados = []
X = df_granger.drop(columns=[target]) 
print(f"Calculando ranking de variables para: {target}...")

for col in X.columns:
    test = grangercausalitytests(df_granger[[target, col]], maxlag=max_lags, verbose=False)
    # Buscar el mejor lag (el que tenga mayor F-Score)
    best_f = 0
    best_p = 1
    best_lag = 0
        
    for i in range(1, max_lags + 1):
        f_score = test[i][0]['ssr_ftest'][0]
        p_val   = test[i][0]['ssr_ftest'][1]
            
        if f_score > best_f:
            best_f = f_score
            best_p = p_val
            best_lag = i
        
        # Guardar resultado
    resultados.append({
        'Variable': col,
        'Mejor_Lag': best_lag,
        'F_Score': best_f,
        'P_Value': best_p
        })

# Crear tabla y ordenar
df_ranking = pd.DataFrame(resultados)
df_ranking = df_ranking.sort_values(by='F_Score', ascending=False).reset_index(drop=True)

# Filtrar solo las útiles (p < 0.05)
variables_utiles = df_ranking[df_ranking['P_Value'] < 0.05]['Variable'].tolist()
variables_utiles.remove('oro')
print("\nRanking de Importancia:")
display(df_ranking.head(10))

print("\nLista de variables:")
print(variables_utiles) 

Calculando ranking de variables para: oro_yj_diff...





Ranking de Importancia:




Unnamed: 0,Variable,Mejor_Lag,F_Score,P_Value
0,aem,2,66.915379,2.327956e-29
1,nem,2,53.120562,1.5878820000000003e-23
2,kgc,2,51.623533,6.855435000000001e-23
3,wpm,2,49.022339,8.72492e-22
4,paas,2,41.044001,2.1729560000000002e-18
5,ag,2,22.99391,1.161615e-10
6,tasa_5y,2,14.616161,4.709962e-07
7,tasa_10y,2,13.651256,1.228649e-06
8,fcx,2,10.309746,3.410861e-05
9,dxy_fut,2,8.670285,0.0001745229



Lista de variables:
['aem', 'nem', 'kgc', 'wpm', 'paas', 'ag', 'tasa_5y', 'tasa_10y', 'fcx', 'dxy_fut', 'bhp', 'tasa_2y', 'desempleo_ita', 'shel', 'vix', 'emergentes', 'tsm', 'amd', 'desempleo_usa', 'unh', 'desempleo_can', 'nvda', 'xom', 'petroleo_brent', 'eqt', 'scco', 'ctra', 'spy', 'pib_gbr', 'ftse', 'sp500', 'qqq', 'dia', 'dinero_circulante_eeuu_m2_aprox_lcu', 'pib_ita', 'desempleo_mex', 'pib_fra', 'cvx', 'pib_jpn', 'pib_deu', 'pib_can', 'desempleo_ingl', 'amzn']


In [14]:
X = df_final_train[variables_utiles].dropna()  
X_temp = X.copy()
umbral = 10

while True:
    vif_vals = [variance_inflation_factor(X_temp.values, i) for i in range(X_temp.shape[1])]
    max_vif = max(vif_vals)
    
    if max_vif < umbral:
        break
    
    max_idx = vif_vals.index(max_vif)
    col_eliminar = X_temp.columns[max_idx]
    print(f"Eliminando: {col_eliminar} (VIF={max_vif:.2f})")
    X_temp = X_temp.drop(columns=col_eliminar)

print("\nVariables finales:")
print(X_temp.columns.tolist()) 

Eliminando: spy (VIF=51400.77)
Eliminando: sp500 (VIF=3607.16)
Eliminando: dinero_circulante_eeuu_m2_aprox_lcu (VIF=1449.31)
Eliminando: pib_fra (VIF=93155.15)
Eliminando: pib_deu (VIF=56402.99)
Eliminando: pib_can (VIF=29705.04)
Eliminando: pib_jpn (VIF=23632.63)
Eliminando: pib_gbr (VIF=11731.24)
Eliminando: pib_ita (VIF=3628.79)
Eliminando: dia (VIF=2329.79)
Eliminando: ftse (VIF=1502.59)
Eliminando: qqq (VIF=1287.97)
Eliminando: tasa_5y (VIF=1276.60)
Eliminando: dxy_fut (VIF=781.96)
Eliminando: cvx (VIF=705.79)
Eliminando: desempleo_can (VIF=629.49)
Eliminando: emergentes (VIF=523.10)
Eliminando: desempleo_mex (VIF=466.07)
Eliminando: desempleo_ingl (VIF=359.93)
Eliminando: xom (VIF=318.94)
Eliminando: bhp (VIF=243.29)
Eliminando: tsm (VIF=160.17)
Eliminando: shel (VIF=144.00)
Eliminando: aem (VIF=135.86)
Eliminando: tasa_10y (VIF=126.10)
Eliminando: scco (VIF=105.89)
Eliminando: ctra (VIF=104.25)
Eliminando: nem (VIF=100.51)
Eliminando: wpm (VIF=88.47)
Eliminando: amzn (VIF=66.13)

## 1) Armado TRAIN/TEST (sin bfill)

In [16]:
# TRAIN 
cols_finales = list(X_temp.columns)
x = df_final_train[cols_finales].asfreq("B").ffill()
y_yj  = df_final_train["oro_yj"].asfreq("B").ffill()
y_usd = df_final_train["oro"].asfreq("B").ffill()

# TEST
df_test_proc = df_test.asfreq("B").ffill()

y_test_yj_values = yeojohnson(df_test_proc["oro"], lmbda=lambda_opt)
y_test_yj = pd.Series(y_test_yj_values, index=df_test_proc.index)

X_test_proc = df_test_proc.drop(columns=["oro"], errors="ignore")
X_test_proc = X_test_proc[cols_finales].asfreq("B").ffill()

# si al inicio del test faltan exógenas, rellena con el último del train (sin fuga)
X_test_proc = X_test_proc.fillna(x.iloc[-1])

print("NaNs en x:", int(x.isna().sum().sum()))
print("NaNs en X_test_proc:", int(X_test_proc.isna().sum().sum()))


NaNs en x: 0
NaNs en X_test_proc: 0


## 2) Auto-SARIMAX 


In [22]:
h = 10
delay = 1
s = 5  
d = 1
D = 0

tscv = TimeSeriesSplit(n_splits=12, test_size=h)

registros = []

for fold, (train_idx, test_idx) in enumerate(tscv.split(y_yj), 1):

    X_train_w = x.iloc[train_idx]
    y_train_w = y_yj.iloc[train_idx]

    X_train_shifted = X_train_w.shift(delay).dropna()
    y_train_aligned = y_train_w.loc[X_train_shifted.index]

    if X_train_shifted.empty or y_train_aligned.empty:
        continue

    sel = arma_order_select_ic(y_train_aligned.dropna(), max_ar=2, max_ma=2, ic="aic")
    p, q = sel.aic_min_order

    # 2) Elegir (P,Q) mirando la serie con diferencia estacional (aprox)
    y_seas = y_train_aligned.diff(s).dropna()
    if len(y_seas) > 20:
        sel_s = arma_order_select_ic(y_seas, max_ar=2, max_ma=2, ic="aic")
        P, Q = sel_s.aic_min_order
    else:
        P, Q = 0, 0

    order = (p, d, q)
    seasonal_order = (P, D, Q, s)

    # fit SARIMAX
    res = SARIMAX(
        y_train_aligned,
        exog=X_train_shifted,
        order=order,
        seasonal_order=seasonal_order,
        enforce_stationarity=False,
        enforce_invertibility=False
    ).fit(disp=False)

    ultimo = X_train_w.iloc[-delay]
    exog_futura = pd.DataFrame(
        np.repeat(ultimo.values.reshape(1, -1), len(test_idx), axis=0),
        index=y_yj.index[test_idx],
        columns=x.columns
    )

    fc_yj = res.forecast(steps=len(test_idx), exog=exog_futura)
    fc_usd = inv_yj(fc_yj, lambda_opt)

    real_usd = y_usd.iloc[test_idx].values
    mae = float(np.mean(np.abs(real_usd - fc_usd)))

    registros.append({
        "fold": fold,
        "order": order,
        "seasonal_order": seasonal_order,
        "aic": float(res.aic),
        "mae": mae
    })

    print(f"Fold {fold}: order={order}, seasonal={seasonal_order} | AIC={res.aic:.2f} | MAE={mae:.4f}")

df_cv = pd.DataFrame(registros)
df_cv.head()

  warn('Non-stationary starting autoregressive parameters'
  warn('Non-invertible starting MA parameters found.'


Fold 1: order=(np.int64(2), 1, np.int64(1)), seasonal=(np.int64(2), 0, np.int64(2), 5) | AIC=18938.41 | MAE=50.3679


  warn('Non-stationary starting autoregressive parameters'
  warn('Non-invertible starting MA parameters found.'


Fold 2: order=(np.int64(2), 1, np.int64(1)), seasonal=(np.int64(2), 0, np.int64(2), 5) | AIC=18986.48 | MAE=50.6480


  warn('Non-invertible starting MA parameters found.'
  warn('Non-stationary starting autoregressive parameters'


Fold 3: order=(np.int64(2), 1, np.int64(1)), seasonal=(np.int64(2), 0, np.int64(2), 5) | AIC=19034.02 | MAE=43.6651


  warn('Non-invertible starting MA parameters found.'
  warn('Non-stationary starting autoregressive parameters'


Fold 4: order=(np.int64(2), 1, np.int64(1)), seasonal=(np.int64(2), 0, np.int64(2), 5) | AIC=19095.80 | MAE=35.4034


  warn('Non-stationary starting autoregressive parameters'
  warn('Non-invertible starting MA parameters found.'


Fold 5: order=(np.int64(2), 1, np.int64(1)), seasonal=(np.int64(2), 0, np.int64(2), 5) | AIC=19142.40 | MAE=35.6380


  warn('Non-invertible starting MA parameters found.'
  warn('Non-stationary starting autoregressive parameters'


Fold 6: order=(np.int64(2), 1, np.int64(1)), seasonal=(np.int64(2), 0, np.int64(2), 5) | AIC=19198.72 | MAE=18.4281


  warn('Non-stationary starting autoregressive parameters'
  warn('Non-invertible starting MA parameters found.'


Fold 7: order=(np.int64(2), 1, np.int64(1)), seasonal=(np.int64(2), 0, np.int64(2), 5) | AIC=19258.41 | MAE=17.3585


  warn('Non-invertible starting MA parameters found.'
  warn('Non-stationary starting autoregressive parameters'


Fold 8: order=(np.int64(2), 1, np.int64(2)), seasonal=(np.int64(2), 0, np.int64(2), 5) | AIC=19309.62 | MAE=48.3617


  warn('Non-stationary starting autoregressive parameters'
  warn('Non-invertible starting MA parameters found.'


Fold 9: order=(np.int64(2), 1, np.int64(2)), seasonal=(np.int64(2), 0, np.int64(2), 5) | AIC=19354.67 | MAE=45.5460


  warn('Non-stationary starting autoregressive parameters'
  warn('Non-invertible starting MA parameters found.'


Fold 10: order=(np.int64(2), 1, np.int64(1)), seasonal=(np.int64(2), 0, np.int64(2), 5) | AIC=19415.07 | MAE=21.1985


  warn('Non-stationary starting autoregressive parameters'
  warn('Non-invertible starting MA parameters found.'


Fold 11: order=(np.int64(2), 1, np.int64(2)), seasonal=(np.int64(2), 0, np.int64(2), 5) | AIC=19465.46 | MAE=86.8453


  warn('Non-stationary starting autoregressive parameters'
  warn('Non-invertible starting MA parameters found.'


Fold 12: order=(np.int64(2), 1, np.int64(1)), seasonal=(np.int64(2), 0, np.int64(2), 5) | AIC=19524.52 | MAE=14.4111


Unnamed: 0,fold,order,seasonal_order,aic,mae
0,1,"(2, 1, 1)","(2, 0, 2, 5)",18938.406305,50.367868
1,2,"(2, 1, 1)","(2, 0, 2, 5)",18986.480997,50.647976
2,3,"(2, 1, 1)","(2, 0, 2, 5)",19034.019294,43.665076
3,4,"(2, 1, 1)","(2, 0, 2, 5)",19095.796717,35.403438
4,5,"(2, 1, 1)","(2, 0, 2, 5)",19142.403262,35.637962


In [31]:
criterio = "mae"   

best_combo = df_cv.groupby(["order", "seasonal_order"])[criterio].mean().idxmin()
best_order, best_seasonal = best_combo

print("BEST FINAL", criterio, "promedio:", best_order, best_seasonal)

BEST FINAL mae promedio: (np.int64(2), 1, np.int64(1)) (np.int64(2), 0, np.int64(2), 5)


## 3) TEST final (rolling origin + expanding train) usando `best_order`

In [32]:
# Total (train + test)
y_total = pd.concat([y_yj, y_test_yj]).asfreq("B").ffill()
X_total = pd.concat([x, X_test_proc]).asfreq("B").ffill()
X_total = X_total.loc[y_total.index]

# posiciones del tramo test dentro del total
test_pos = y_total.index.get_indexer(y_test_yj.index)
test_pos = test_pos[test_pos >= 0]
n_windows = len(test_pos) // h

preds_test_usd, real_test_usd = [], []
preds_test_yj, fechas_test = [], []

for w in range(n_windows):
    block_pos = test_pos[w*h:(w+1)*h]
    fechas_ventana = y_total.index[block_pos]

    train_end_pos = block_pos[0]
    y_train_w = y_total.iloc[:train_end_pos]
    X_train_w = X_total.iloc[:train_end_pos]

    # lag sin fuga
    X_train_shifted = X_train_w.shift(delay).dropna()
    y_train_aligned = y_train_w.loc[X_train_shifted.index]

    if X_train_shifted.empty or y_train_aligned.empty:
        continue

    res = SARIMAX(
        y_train_aligned,
        exog=X_train_shifted,
        order=best_order,
        seasonal_order=best_seasonal,  
        enforce_stationarity=False,
        enforce_invertibility=False
    ).fit(disp=False)

    #EXOGENAS YA OCUPADAS
    ultimo_dato_conocido = X_train_shifted.iloc[-1]
    exog_futura = pd.DataFrame(
        np.repeat(ultimo_dato_conocido.values.reshape(1, -1), len(block_pos), axis=0),
        index=fechas_ventana,
        columns=X_total.columns
    )

    # Forecast
    fc_yj = res.forecast(steps=len(block_pos), exog=exog_futura)
    fc_usd = inv_yj(fc_yj, lambda_opt)

    # Real en USD
    real_val = df_test_proc.loc[fechas_ventana, "oro"].values

    preds_test_yj.extend(fc_yj)
    preds_test_usd.extend(fc_usd)
    real_test_usd.extend(real_val)
    fechas_test.extend(fechas_ventana)

df_res_test = pd.DataFrame({
    "Real_USD": real_test_usd,
    "Pred_USD": preds_test_usd,
    "Pred_YJ":  preds_test_yj,
}, index=pd.DatetimeIndex(fechas_test))

df_res_test["Error_Abs"] = np.abs(df_res_test["Real_USD"] - df_res_test["Pred_USD"])
df_res_test.head(), df_res_test.tail()



(               Real_USD     Pred_USD     Pred_YJ  Error_Abs
 2024-09-05  2511.399902  2492.917804  430.097667  18.482099
 2024-09-06  2493.500000  2491.843075  429.960701   1.656925
 2024-09-09  2501.800049  2491.746992  429.948455  10.053056
 2024-09-10  2512.300049  2491.774766  429.951995  20.525283
 2024-09-11  2512.100098  2492.198264  430.005969  19.901834,
                Real_USD     Pred_USD     Pred_YJ   Error_Abs
 2025-10-23  4125.500000  4172.118762  629.056166   46.618762
 2025-10-24  4118.399902  4172.205946  629.065867   53.806043
 2025-10-27  4001.899902  4172.412430  629.088844  170.512528
 2025-10-28  3993.100098  4172.447264  629.092720  179.347166
 2025-10-29  3993.100098  4172.544300  629.103517  179.444202)

## 4) Métricas rápidas en TEST

In [33]:
mae_test = float(np.mean(np.abs(df_res_test["Real_USD"] - df_res_test["Pred_USD"])))
rmse_test = float(np.sqrt(np.mean((df_res_test["Real_USD"] - df_res_test["Pred_USD"])**2)))

print("MAE TEST:", mae_test)
print("RMSE TEST:", rmse_test)


MAE TEST: 1938998.8836602871
RMSE TEST: 32985638.901100416
