# Feature Engineering

In [37]:
import os
from pathlib import Path
from datetime import datetime
import requests
import pandas as pd
from dotenv import load_dotenv
# ---------- configuration ----------
# Load the local dotenv file before reading the key (the assertion message below
# points the reader at `.env` when the key is missing).
load_dotenv()
SYMBOL = "CME"
API_KEY = os.environ.get("ALPHA_VANTAGE_KEY")

# Stop early with a readable message when the key is absent or empty.
assert API_KEY, "Missing ALPHA_VANTAGE_KEY in .env"

# Raw downloads are stored under <parent of the working directory>/data/raw;
# the folder is created on first use.
ROOT = Path("..").resolve()
RAW = ROOT.joinpath("data", "raw")
RAW.mkdir(parents=True, exist_ok=True)

# ---------- request ----------
# Query-string parameters for the daily time-series call for SYMBOL.
params = {
    "function": "TIME_SERIES_DAILY",
    "symbol": SYMBOL,
    "outputsize": "compact",
    "apikey": API_KEY,
}

resp = requests.get("https://www.alphavantage.co/query", params=params, timeout=30)
resp.raise_for_status()  # HTTP-level failures (4xx/5xx) stop here
data = resp.json()

ts_key = "Time Series (Daily)"

# raise_for_status() only looks at the HTTP status code; API-level problems
# arrive as keys inside the JSON body, so the payload is inspected as well.
if "Error Message" in data:
    raise RuntimeError(f"Alpha Vantage error: {data['Error Message']}")
if "Note" in data:
    raise RuntimeError(f"Alpha Vantage notice (likely rate limit): {data['Note']}")
# Quota / plan messages can also be delivered under a top-level "Information"
# key. Only treat it as fatal when the time series itself is absent, so a
# payload that carries usable data is never rejected by this check.
if "Information" in data and ts_key not in data:
    raise RuntimeError(
        f"Alpha Vantage information (likely rate limit or plan restriction): {data['Information']}"
    )

# Explicit raise instead of `assert` so the schema check still runs when
# Python is started with -O (which strips assert statements).
if ts_key not in data:
    raise RuntimeError(f"Unexpected response keys: {list(data.keys())}")

# ---------- normalize ----------
# The payload maps each date string to a dict of price fields; transposing
# yields one row per date with the fields as columns.
df_api = pd.DataFrame(data[ts_key]).T

# ---------- validate ----------
# Checked before any column is touched: on an empty payload the column
# selection further down would otherwise fail with a bare KeyError and this
# message would never be shown.
assert not df_api.empty, "API returned empty dataset — check symbol or connectivity"

df_api.index.name = "date"
df_api = df_api.rename(columns={
    "1. open": "open",
    "2. high": "high",
    "3. low":  "low",
    "4. close": "close",
    "5. volume": "volume",
})

# Same reasoning: confirm the expected columns exist before converting them.
num_cols = ["open", "high", "low", "close", "volume"]
assert set(num_cols).issubset(df_api.columns), f"Missing cols: {set(num_cols) - set(df_api.columns)}"

# types + sort
# errors="coerce" turns any unparseable value into NaN instead of raising.
df_api[num_cols] = df_api[num_cols].apply(pd.to_numeric, errors="coerce")
df_api = df_api.reset_index()  # move the date index into a regular "date" column
df_api["date"] = pd.to_datetime(df_api["date"])
df_api = df_api.sort_values("date").reset_index(drop=True)  # oldest row first, fresh 0..n-1 index

# ---------- save ----------
# The file name carries the symbol and a minute-resolution local timestamp,
# so two runs within the same minute write to the same file.
timestamp = datetime.now().strftime("%Y%m%d-%H%M")
out_path = RAW / f"api_alphavantage_{SYMBOL}_{timestamp}.csv"
df_api.to_csv(out_path, index=False)
print("Saved:", out_path.as_posix())

# Work on a copy from here on: the cells below add feature columns to `df`,
# and a plain `df = df_api` alias would add them to the raw frame as well.
df = df_api.copy()


Saved: /Users/yuyuezhu/bootcamp_carolina_zhu/project/data/raw/api_alphavantage_CME_20250826-1928.csv


In [39]:
# Simple row-over-row return of the close (rows are sorted oldest first, so
# this is the change versus the previous row's close). Row 0 has no
# predecessor and stays NaN.
# fill_method=None keeps pct_change from forward-filling missing closes before
# dividing: the numeric conversion upstream coerces bad values to NaN, and the
# default padding would turn such a gap into a fabricated 0% return. The
# padding default is also deprecated in recent pandas releases.
df['daily_return'] = df['close'].pct_change(fill_method=None)
df

Unnamed: 0,date,open,high,low,close,volume,daily_return
0,2025-04-03,264.800,271.220,263.4800,268.36,4242858,
1,2025-04-04,269.445,273.420,254.1900,254.46,5819157,-0.051796
2,2025-04-07,254.340,259.420,250.1652,252.37,5472796,-0.008213
3,2025-04-08,256.540,258.850,252.0000,255.03,5514600,0.010540
4,2025-04-09,252.790,259.380,248.5300,254.13,6336298,-0.003529
...,...,...,...,...,...,...,...
95,2025-08-20,272.990,275.585,271.5250,274.61,1738747,0.011492
96,2025-08-21,275.000,275.610,273.0350,273.32,1325711,-0.004698
97,2025-08-22,274.140,274.950,268.6600,269.52,1591625,-0.013903
98,2025-08-25,269.580,271.660,268.5400,271.39,1712588,0.006938


In [41]:
# Rolling mean of the close over a 5-row window. The first four rows do not
# have a full window yet, so they are left as NaN.
df["close_ma_5"] = df["close"].rolling(window=5).mean()
df

Unnamed: 0,date,open,high,low,close,volume,daily_return,close_ma_5
0,2025-04-03,264.800,271.220,263.4800,268.36,4242858,,
1,2025-04-04,269.445,273.420,254.1900,254.46,5819157,-0.051796,
2,2025-04-07,254.340,259.420,250.1652,252.37,5472796,-0.008213,
3,2025-04-08,256.540,258.850,252.0000,255.03,5514600,0.010540,
4,2025-04-09,252.790,259.380,248.5300,254.13,6336298,-0.003529,256.870
...,...,...,...,...,...,...,...,...
95,2025-08-20,272.990,275.585,271.5250,274.61,1738747,0.011492,273.026
96,2025-08-21,275.000,275.610,273.0350,273.32,1325711,-0.004698,272.750
97,2025-08-22,274.140,274.950,268.6600,269.52,1591625,-0.013903,271.870
98,2025-08-25,269.580,271.660,268.5400,271.39,1712588,0.006938,272.066


In [43]:
# Previous row's close, aligned to the current row; row 0 has no predecessor
# and is NaN.
df["close_lag1"] = df["close"].shift(periods=1)
df

Unnamed: 0,date,open,high,low,close,volume,daily_return,close_ma_5,close_lag1
0,2025-04-03,264.800,271.220,263.4800,268.36,4242858,,,
1,2025-04-04,269.445,273.420,254.1900,254.46,5819157,-0.051796,,268.36
2,2025-04-07,254.340,259.420,250.1652,252.37,5472796,-0.008213,,254.46
3,2025-04-08,256.540,258.850,252.0000,255.03,5514600,0.010540,,252.37
4,2025-04-09,252.790,259.380,248.5300,254.13,6336298,-0.003529,256.870,255.03
...,...,...,...,...,...,...,...,...,...
95,2025-08-20,272.990,275.585,271.5250,274.61,1738747,0.011492,273.026,271.49
96,2025-08-21,275.000,275.610,273.0350,273.32,1325711,-0.004698,272.750,274.61
97,2025-08-22,274.140,274.950,268.6600,269.52,1591625,-0.013903,271.870,273.32
98,2025-08-25,269.580,271.660,268.5400,271.39,1712588,0.006938,272.066,269.52
