# Feature Engineering - Predicción USD/CLP

Este notebook documenta el proceso de ingeniería de características aplicado a los datos del tipo de cambio USD/CLP.

## Objetivos
1. Cargar datos crudos del Banco Central de Chile
2. Crear features lag (valores históricos)
3. Calcular promedios móviles
4. Agregar variables temporales
5. Implementar indicadores técnicos (ROC, Momentum)
6. Calcular features de volatilidad
7. Analizar correlaciones entre features
8. Guardar dataset procesado para modelado

In [1]:
import sys
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from pathlib import Path

# Agregar src al path
sys.path.insert(0, '../src')

from features.build_features import (
    add_lag_features,
    add_moving_averages,
    add_temporal_features,
    add_technical_indicators,
    add_volatility_features
)

# Configuración
plt.style.use('seaborn-v0_8-darkgrid')
sns.set_palette('husl')
%matplotlib inline
pd.set_option('display.max_columns', None)
pd.set_option('display.precision', 2)

print("Librerías cargadas correctamente")

Librerías cargadas correctamente


## 1. Carga de Datos Raw

In [2]:
df_raw = pd.read_csv('../data/raw/dolar_bcch.csv', parse_dates=['Fecha'])

print(f"Datos cargados: {len(df_raw):,} registros")
print(f"Período: {df_raw['Fecha'].min()} a {df_raw['Fecha'].max()}")
print(f"\n{'='*60}")
df_raw.head()

Datos cargados: 10,956 registros
Período: 1995-10-09 00:00:00 a 2025-10-06 00:00:00



Unnamed: 0,Fecha,Valor,statusCode
0,1995-10-09,401.54,OK
1,1995-10-10,402.76,OK
2,1995-10-11,403.89,OK
3,1995-10-12,,ND
4,1995-10-13,403.4,OK


In [3]:
missing_count = df_raw['Valor'].isna().sum()
missing_pct = (missing_count / len(df_raw)) * 100

print(f"Valores faltantes: {missing_count:,} ({missing_pct:.1f}%)")
print(f"Valores válidos: {len(df_raw) - missing_count:,} ({100-missing_pct:.1f}%)")

Valores faltantes: 3,482 (31.8%)
Valores válidos: 7,474 (68.2%)


## 2. Features Lag

In [4]:
df = df_raw.copy()
df = add_lag_features(df)

print("Features lag creadas:")
lag_cols = [c for c in df.columns if 'lag' in c.lower()]
print(lag_cols)

# Mostrar las columnas lag creadas
cols_to_show = ['Fecha', 'Valor'] + lag_cols
df[cols_to_show].dropna().head()

Features lag creadas:
['Valor_lag_1', 'Valor_lag_7', 'Valor_lag_30']


Unnamed: 0,Fecha,Valor,Valor_lag_1,Valor_lag_7,Valor_lag_30
31,1995-11-09,412.66,413.93,415.07,402.76
32,1995-11-10,412.13,412.66,416.31,403.89
37,1995-11-15,405.76,408.57,413.93,403.83
38,1995-11-16,405.33,405.76,412.66,403.11
39,1995-11-17,407.75,405.33,412.13,403.96


## 3. Moving Averages

In [5]:
df = add_moving_averages(df)

print("Moving averages creados:")
ma_cols = [c for c in df.columns if 'MA_' in c]
print(ma_cols)

# Mostrar columnas relevantes
cols_to_show = ['Fecha', 'Valor'] + ma_cols
df[cols_to_show].dropna().tail()

Moving averages creados:
['MA_7', 'MA_30', 'MA_90']


Unnamed: 0,Fecha,Valor,MA_7,MA_30,MA_90
10949,2025-09-30,961.24,956.53,960.37,960.12
10950,2025-10-01,962.39,958.44,960.22,960.72
10951,2025-10-02,959.19,959.63,959.76,961.24
10952,2025-10-03,961.54,960.65,959.14,961.25
10955,2025-10-06,963.17,961.51,958.17,962.11


## 4. Features Temporales

In [6]:
# Convertir índice temporalmente para add_temporal_features
df_temp = df.set_index('Fecha')
df_temp = add_temporal_features(df_temp)
df = df_temp.reset_index()

print("Features temporales creadas:")
temporal_cols = ['day_of_week', 'day_of_month', 'month', 'quarter', 'year', 'is_month_start', 'is_month_end']
temporal_cols = [c for c in temporal_cols if c in df.columns]
print(temporal_cols)

# Mostrar algunas columnas temporales
cols_to_show = ['Fecha', 'Valor'] + temporal_cols[:4]  # Mostrar primeras 4
df[cols_to_show].head()

Features temporales creadas:
['day_of_week', 'day_of_month', 'month', 'quarter', 'year', 'is_month_start', 'is_month_end']


Unnamed: 0,Fecha,Valor,day_of_week,day_of_month,month,quarter
0,1995-10-09,401.54,0,9,10,4
1,1995-10-10,402.76,1,10,10,4
2,1995-10-11,403.89,2,11,10,4
3,1995-10-12,,3,12,10,4
4,1995-10-13,403.4,4,13,10,4


## 5. Indicadores Técnicos

In [7]:
df = add_technical_indicators(df)

print("Indicadores técnicos creados:")
tech_cols = [c for c in df.columns if c.startswith(('ROC_', 'momentum_', 'daily_'))]
print(tech_cols)

# Mostrar columnas de indicadores técnicos
cols_to_show = ['Fecha', 'Valor'] + tech_cols[:4]  # Mostrar primeros 4
df[cols_to_show].dropna().tail()

Indicadores técnicos creados:
['ROC_7', 'ROC_30', 'momentum_7', 'momentum_30', 'daily_return']


Unnamed: 0,Fecha,Valor,ROC_7,ROC_30,momentum_7,momentum_30
10936,2025-09-17,948.57,-1.84,-1.38,-17.78,-13.27
10943,2025-09-24,952.87,0.45,-1.17,4.3,-11.28
10950,2025-10-01,962.39,1.0,-0.3,9.52,-2.89
10951,2025-10-02,959.19,0.62,-0.95,5.95,-9.16
10952,2025-10-03,961.54,0.54,-1.29,5.12,-12.59


## 6. Features de Volatilidad

In [8]:
df = add_volatility_features(df)

print("Features de volatilidad creadas:")
vol_cols = [c for c in df.columns if c.startswith(('volatility_', 'range_'))]
print(vol_cols)

# Mostrar features de volatilidad
cols_to_show = ['Fecha', 'Valor'] + vol_cols[:3]  # Mostrar primeras 3
df[cols_to_show].dropna().tail()

Features de volatilidad creadas:
['volatility_7', 'volatility_30', 'range_7', 'range_30']


Unnamed: 0,Fecha,Valor,volatility_7,volatility_30,range_7
10949,2025-09-30,961.24,0.23,0.38,8.37
10950,2025-10-01,962.39,0.13,0.37,9.15
10951,2025-10-02,959.19,0.3,0.37,5.97
10952,2025-10-03,961.54,0.27,0.34,3.49
10955,2025-10-06,963.17,0.27,0.34,3.98


## 7. Resumen de Features

In [9]:
print("=" * 60)
print("RESUMEN DE FEATURES CREADAS")
print("=" * 60)

feature_groups = {
    'Lag Features': [c for c in df.columns if 'lag' in c.lower()],
    'Moving Averages': [c for c in df.columns if 'MA_' in c],
    'Temporales': [c for c in df.columns if c in ['day_of_week', 'month', 'quarter', 'year']],
    'Indicadores Técnicos': [c for c in df.columns if c.startswith(('ROC_', 'momentum_', 'daily_'))],
    'Volatilidad': [c for c in df.columns if c.startswith(('volatility_', 'range_'))]
}

for group, features in feature_groups.items():
    print(f"\n{group} ({len(features)}):")
    for f in features:
        print(f"  - {f}")

total = sum(len(f) for f in feature_groups.values())
print(f"\n{'='*60}")
print(f"TOTAL FEATURES: {total}")
print(f"Registros con features completas: {df.dropna().shape[0]:,}")
print(f"{'='*60}")

RESUMEN DE FEATURES CREADAS

Lag Features (3):
  - Valor_lag_1
  - Valor_lag_7
  - Valor_lag_30

Moving Averages (3):
  - MA_7
  - MA_30
  - MA_90

Temporales (4):
  - day_of_week
  - month
  - quarter
  - year

Indicadores Técnicos (5):
  - ROC_7
  - ROC_30
  - momentum_7
  - momentum_30
  - daily_return

Volatilidad (4):
  - volatility_7
  - volatility_30
  - range_7
  - range_30

TOTAL FEATURES: 19
Registros con features completas: 3,993


## 8. Guardar Dataset Procesado

In [10]:
df_final = df.dropna()

print(f"Dataset final para ML:")
print(f"  - Registros: {len(df_final):,}")
print(f"  - Features: {len(df_final.columns)}")
print(f"  - Período: {df_final['Fecha'].min()} a {df_final['Fecha'].max()}")

output_path = Path('../data/processed/dolar_features.csv')
output_path.parent.mkdir(parents=True, exist_ok=True)
df_final.to_csv(output_path, index=False)

print(f"\nDataset guardado en: {output_path}")

Dataset final para ML:
  - Registros: 3,993
  - Features: 25
  - Período: 1995-11-09 00:00:00 a 2025-10-03 00:00:00

Dataset guardado en: ../data/processed/dolar_features.csv


In [11]:
df_final.head()

Unnamed: 0,Fecha,Valor,statusCode,Valor_lag_1,Valor_lag_7,Valor_lag_30,MA_7,MA_30,MA_90,day_of_week,day_of_month,month,quarter,year,is_month_start,is_month_end,ROC_7,ROC_30,momentum_7,momentum_30,daily_return,volatility_7,volatility_30,range_7,range_30
31,1995-11-09,412.66,OK,413.93,415.07,402.76,415.43,410.46,409.7,3,9,11,4,1995,0,0,-0.58,2.46,-2.41,9.9,-0.31,0.38,0.54,5.37,14.92
32,1995-11-10,412.13,OK,412.66,416.31,403.89,414.59,410.87,409.81,4,10,11,4,1995,0,0,-1.0,2.04,-4.18,8.24,-0.13,0.18,0.54,5.9,14.92
37,1995-11-15,405.76,OK,408.57,413.93,403.83,410.46,411.32,409.74,2,15,11,4,1995,0,0,-1.97,0.48,-8.17,1.93,-0.69,0.44,0.61,7.43,14.92
38,1995-11-16,405.33,OK,405.76,412.66,403.11,409.0,411.42,409.57,3,16,11,4,1995,0,0,-1.78,0.55,-7.33,2.22,-0.11,0.49,0.61,7.86,14.07
39,1995-11-17,407.75,OK,405.33,412.13,403.96,408.12,411.6,409.51,4,17,11,4,1995,0,0,-1.06,0.94,-4.38,3.79,0.6,0.74,0.63,7.86,12.7


In [12]:
df_final.tail()

Unnamed: 0,Fecha,Valor,statusCode,Valor_lag_1,Valor_lag_7,Valor_lag_30,MA_7,MA_30,MA_90,day_of_week,day_of_month,month,quarter,year,is_month_start,is_month_end,ROC_7,ROC_30,momentum_7,momentum_30,daily_return,volatility_7,volatility_30,range_7,range_30
10936,2025-09-17,948.57,OK,951.14,966.35,961.84,954.5,964.1,957.65,2,17,9,3,2025,0,0,-1.84,-1.38,-17.78,-13.27,-0.27,0.21,0.38,14.16,25.56
10943,2025-09-24,952.87,OK,954.72,948.57,964.15,952.87,962.06,958.4,2,24,9,3,2025,0,0,0.45,-1.17,4.3,-11.28,-0.19,0.41,0.38,3.69,25.56
10950,2025-10-01,962.39,OK,961.24,952.87,965.28,958.44,960.22,960.72,2,1,10,4,2025,1,0,1.0,-0.3,9.52,-2.89,0.12,0.13,0.37,9.15,25.56
10951,2025-10-02,959.19,OK,962.39,953.24,968.35,959.63,959.76,961.24,3,2,10,4,2025,0,0,0.62,-0.95,5.95,-9.16,-0.33,0.3,0.37,5.97,25.56
10952,2025-10-03,961.54,OK,959.19,956.42,974.13,960.65,959.14,961.25,4,3,10,4,2025,0,0,0.54,-1.29,5.12,-12.59,0.24,0.27,0.34,3.49,22.65


## Conclusiones

- Feature engineering completado
- Dataset procesado con ~4,000 registros
- 23+ features listas para modelado
- Próximo paso: Implementar modelos (Random Forest, XGBoost, ARIMA)