# 01 — EDA & Preprocessing (Ticker-Agnostic)

This notebook:
- Reads `TICKER` from environment (no hardcoded tickers)
- Downloads OHLCV data via `yfinance`
- Writes raw CSV → `data/raw/{TICKER}.csv`
- Runs validation + EDA (plots)
- Engineers features and applies a time-aware train/validation split
- Writes processed CSV → `data/processed/{TICKER}.csv`
- Writes EDA log JSON → `data/logs/{TICKER}_eda.json`

In [None]:
# Parameters (injected by backend/runner)
import os

# Normalize BEFORE validating: a whitespace-only value (e.g. TICKER="  ") is
# truthy, so checking first would let it through and leave TICKER == "" after
# stripping, which would later produce file names such as "data/raw/.csv".
TICKER = (os.environ.get("TICKER") or "").strip().upper()
if not TICKER:
    raise ValueError("TICKER env var is required (e.g., set TICKER=AAPL)")

print("TICKER:", TICKER)

In [None]:
# Imports
import json
from pathlib import Path
from datetime import datetime

import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
import seaborn as sns

import yfinance as yf

# Global plotting defaults for every figure in this notebook:
# seaborn's white grid theme and a wide 12x5 inch canvas.
sns.set_style(style="whitegrid")
plt.rcParams.update({"figure.figsize": (12, 5)})

In [None]:
# Output locations. Everything is resolved against the current working
# directory (presumably the repo root when the notebook is run - confirm
# against the runner).
ROOT = Path(".").resolve()
DATA_DIR = ROOT.joinpath("data")
RAW_DIR, PROC_DIR, LOG_DIR = (DATA_DIR / sub for sub in ("raw", "processed", "logs"))

# Create the folders up front; existing folders are left untouched.
for folder in (RAW_DIR, PROC_DIR, LOG_DIR):
    folder.mkdir(parents=True, exist_ok=True)

# Per-ticker artifact paths: raw and processed CSVs share the same file name.
csv_name = f"{TICKER}.csv"
raw_path = RAW_DIR / csv_name
proc_path = PROC_DIR / csv_name
eda_log_path = LOG_DIR / f"{TICKER}_eda.json"

raw_path, proc_path, eda_log_path

## 1) Ingestion (yfinance)

We keep ingestion rerunnable: the data is always downloaded fresh from the source, and if the raw file already exists it is overwritten.

In [None]:
# Download the full available history for TICKER. auto_adjust=False asks
# yfinance to leave prices unadjusted and report the adjusted close separately.
df = yf.download(TICKER, period="max", auto_adjust=False, progress=False)
if df is None or df.empty:
    raise RuntimeError(f"No data returned for ticker: {TICKER}")

# Recent yfinance releases return (field, ticker) MultiIndex columns even for
# a single ticker. The labels are then tuples, which breaks the string-based
# column standardization below, so keep only the field level.
if isinstance(df.columns, pd.MultiIndex):
    df.columns = df.columns.get_level_values(0)

# Name the index "date" before turning it into a column, so the result does
# not depend on what the downloader called its index.
df = df.rename_axis("date").reset_index()

# Standardize column names (e.g. "Adj Close" -> "adj_close")
df.columns = [str(c).strip().lower().replace(" ", "_") for c in df.columns]

df["date"] = pd.to_datetime(df["date"], utc=False)

df.head()

In [None]:
# Persist the downloaded frame as the raw snapshot for this ticker.
# The positional index carries no information, so it is not written.
df.to_csv(path_or_buf=raw_path, index=False)
print("Saved raw:", raw_path)

## 2) Validation
- duplicates
- nulls
- missing business dates (approx. check)

In [None]:
# Validation report. Every value is converted to a plain Python type so the
# dict can be embedded in the JSON log later.

# duplicates by date
dup_count = int(df.duplicated(subset=["date"]).sum())

# null counts per column
null_counts = df.isna().sum().to_dict()

# missing business days: "B" frequency is a Mon-Fri calendar, so exchange
# holidays are counted as missing and the figure is only a rough indicator.
date_col = df["date"]
dmin = date_col.min()
dmax = date_col.max()
expected = pd.date_range(dmin.normalize(), dmax.normalize(), freq="B")
observed = pd.to_datetime(date_col.dt.normalize().unique())
missing = sorted(set(expected) - set(observed))

val = {
    "duplicate_dates": dup_count,
    "null_counts": {col: int(cnt) for col, cnt in null_counts.items()},
    "missing_business_days_count": int(len(missing)),
    "date_range": {"min": dmin.isoformat(), "max": dmax.isoformat()},
}

val

## 3) EDA
- price + volume
- returns distribution
- rolling mean + volatility

In [None]:
# Prefer the adjusted close when the download provides it.
if "adj_close" in df.columns:
    close_col = "adj_close"
else:
    close_col = "close"

# Price over time
fig, ax1 = plt.subplots()
ax1.plot(df["date"], df[close_col], label=close_col)
ax1.set(title=f"{TICKER} — Price", xlabel="Date", ylabel="Price")
plt.show()

# Traded volume over time; fall back to an all-zero series when the column
# is absent so the chart still renders.
if "volume" in df.columns:
    volume_values = df["volume"]
else:
    volume_values = pd.Series([0] * len(df))

fig, ax2 = plt.subplots()
ax2.bar(df["date"], volume_values, width=1.0)
ax2.set(title=f"{TICKER} — Volume", xlabel="Date", ylabel="Volume")
plt.show()

In [None]:
# Chronologically ordered price series with simple period-over-period returns.
ts = df.loc[:, ["date", close_col]].copy()
ts = ts.sort_values(by="date")
ts["returns"] = ts[close_col].pct_change()

# The first row has no previous price, so its return is NaN and is dropped
# before plotting.
daily_returns = ts["returns"].dropna()
sns.histplot(daily_returns, bins=100, kde=True)
plt.title(f"{TICKER} — Daily Returns Distribution")
plt.show()

In [None]:
# Rolling statistics over a 20-row window.
window = 20
price_series = ts[close_col]
ts["rolling_mean"] = price_series.rolling(window).mean()
# Scale the rolling std of returns by sqrt(252) to annualize it
# (252 is the conventional number of trading days per year).
ts["rolling_vol"] = ts["returns"].rolling(window).std() * np.sqrt(252)

# Price with its rolling mean overlaid
fig, ax = plt.subplots()
ax.plot(ts["date"], price_series, label="price", alpha=0.6)
ax.plot(ts["date"], ts["rolling_mean"], label=f"{window}d rolling mean")
ax.set_title(f"{TICKER} — Rolling Mean")
ax.legend()
plt.show()

# Rolling annualized volatility
fig, ax = plt.subplots()
ax.plot(ts["date"], ts["rolling_vol"], label=f"{window}d annualized vol")
ax.set_title(f"{TICKER} — Rolling Volatility")
ax.legend()
plt.show()

## 4) Log summary stats → `data/logs/{TICKER}_eda.json`

In [None]:
# Build the EDA summary (row/column info, validation report, price and return
# statistics) and write it to the per-ticker JSON log.
import math
from datetime import timezone


def _json_float(value):
    """Return *value* as a float, or None when it is NaN or infinite.

    json.dumps would otherwise emit the bare tokens NaN/Infinity, which are
    not valid JSON and break strict parsers. Statistics such as std, skew and
    kurt are NaN when there are too few observations.
    """
    number = float(value)
    return number if math.isfinite(number) else None


price = ts[close_col]
returns = ts["returns"].dropna()

summary = {
    "ticker": TICKER,
    # datetime.utcnow() is deprecated since Python 3.12. Take an aware UTC
    # timestamp and drop tzinfo so the emitted "...Z" format is unchanged.
    "generated_at_utc": datetime.now(timezone.utc).replace(tzinfo=None).isoformat() + "Z",
    "rows": int(len(df)),
    "columns": list(df.columns),
    "validation": val,
    "price_summary": {
        "close_col": close_col,
        "min": _json_float(price.min()),
        "max": _json_float(price.max()),
        "mean": _json_float(price.mean()),
        "std": _json_float(price.std()),
    },
    "returns_summary": {
        "mean": _json_float(returns.mean()),
        "std": _json_float(returns.std()),
        "skew": _json_float(returns.skew()),
        "kurt": _json_float(returns.kurt()),
    },
}

# Explicit encoding so the log does not depend on the platform locale.
eda_log_path.write_text(json.dumps(summary, indent=2), encoding="utf-8")
print("Saved EDA log:", eda_log_path)

## 5) Preprocessing + Feature Engineering

Features:
- returns
- rolling averages
- rolling volatility

Then a time-aware split and save to `data/processed/{TICKER}.csv`.

In [None]:
# Modelling table: date, price and (when available) volume, one row per date
# in chronological order.
keep_cols = ["date", close_col]
if "volume" in df.columns:
    keep_cols.append("volume")

proc = (
    df[keep_cols]
    .copy()
    .sort_values("date")
    .drop_duplicates(subset=["date"])
    .reset_index(drop=True)
)

# basic cleaning: enforce numeric types, then drop rows without a date or price
proc[close_col] = proc[close_col].astype(float)
if "volume" in proc.columns:
    proc["volume"] = pd.to_numeric(proc["volume"], errors="coerce")

proc = proc.dropna(subset=["date", close_col])

# features: returns plus rolling mean of price and rolling std of returns
proc["returns"] = proc[close_col].pct_change()
for span in (5, 10, 20):
    proc[f"roll_mean_{span}"] = proc[close_col].rolling(span).mean()
    proc[f"roll_vol_{span}"] = proc["returns"].rolling(span).std()

# final clean: treat +/-inf as missing, then drop every incomplete row
# (this removes the warm-up rows of the longest rolling window)
proc = proc.replace([np.inf, -np.inf], np.nan).dropna().reset_index(drop=True)

# time-aware split (last 20% for validation); rows are in date order and the
# index was just reset to 0..n-1, so position and index label coincide
n = len(proc)
split_idx = int(n * 0.8)
proc["split"] = "train"
proc.loc[proc.index >= split_idx, "split"] = "val"

proc.to_csv(proc_path, index=False)
print("Saved processed:", proc_path)
proc.head()