# In [2]:
# src/preprocessing/smci_preprocessing.py

import numpy as np
import pandas as pd
import yfinance as yf


def download_raw_smci(
    ticker: str = "SMCI",
    start: str = "2017-01-01",
) -> pd.DataFrame:
    """Fetch price/volume history for ``ticker`` through ``yf.download``.

    Args:
        ticker: Symbol handed to ``yf.download``.
        start: Start date string handed to ``yf.download``.

    Returns:
        A copy of the Open/High/Low/Close/Volume columns, sorted by index.
    """
    ohlcv_columns = ["Open", "High", "Low", "Close", "Volume"]
    downloaded = yf.download(ticker, start=start, progress=False)
    # Copy the selection before sorting so the result is detached from
    # whatever object the download call returned.
    return downloaded[ohlcv_columns].copy().sort_index()


def _flatten_column_label(label, preferred) -> str:
    """Collapse one (possibly tuple) column label into a plain string."""
    if not isinstance(label, tuple):
        return str(label)

    parts = [part for part in label if part not in (None, "", " ")]
    # Prefer the level that names a price field so that both
    # (field, ticker) and (ticker, field) layouts resolve to the field.
    for part in parts:
        if part in preferred:
            return str(part)
    # Otherwise fall back to the first non-empty level, or a placeholder
    # when every level is blank.
    return str(parts[0]) if parts else "col"


def clean_ohlcv(df: pd.DataFrame) -> pd.DataFrame:
    """Flatten columns if needed, drop NaNs, fix zero volumes.

    Args:
        df: Frame with Open/High/Low/Close/Volume columns. Column labels may
            be plain values or tuples (e.g. from a MultiIndex); for tuples the
            level naming an OHLCV field is used regardless of its position.

    Returns:
        A new frame holding only the five OHLCV columns, sorted by index,
        without rows that have a missing OHLCV value. Zero volumes are
        replaced by the previous non-zero volume; rows that still have no
        volume (zeros at the very start) are dropped.

    Raises:
        ValueError: If an expected column is missing, or if flattening
            produces more than one column with the same OHLCV name.
    """
    expected_cols = ["Open", "High", "Low", "Close", "Volume"]
    df = df.copy()

    # 1) Flatten columns: handle MultiIndex or tuple columns
    flat_cols = [_flatten_column_label(c, expected_cols) for c in df.columns]
    df.columns = flat_cols

    missing = [c for c in expected_cols if c not in flat_cols]
    if missing:
        raise ValueError(
            f"Expected columns {expected_cols}, but these are missing: {missing}."
        )

    # Several tuple labels collapsing to the same field name would make
    # df["Volume"] a frame rather than a series; fail loudly instead.
    ambiguous = [c for c in expected_cols if flat_cols.count(c) > 1]
    if ambiguous:
        raise ValueError(
            f"Ambiguous columns after flattening, found more than once: {ambiguous}."
        )

    # 2) Keep only OHLCV, order by index, drop rows with any missing value.
    #    The trailing copy gives an independent frame to assign into below.
    df = df[expected_cols].sort_index().dropna(subset=expected_cols).copy()

    # 3) Replace zero volume with NaN, forward-fill, then drop remaining
    #    NaNs in Volume (zeros with no earlier value to fill from).
    df["Volume"] = df["Volume"].replace(0, np.nan).ffill()
    return df.dropna(subset=["Volume"])


def build_features(df_clean: pd.DataFrame):
    """Create returns, rolling means/vol, and next-day target, then df_model.

    Args:
        df_clean: Frame with at least ``Close`` and ``Volume`` columns; rows
            are used in their existing order.

    Returns:
        A ``(df_model, feature_cols)`` pair. ``feature_cols`` lists the
        feature column names; ``df_model`` holds those columns plus
        ``target_next_log_ret`` and contains only rows where every one of
        them is present.

    Features built:
        * ``ret_1d``: one-row difference of log Close.
        * ``ret_mean_{3,5,10,21}d``: rolling means of ``ret_1d``.
        * ``ret_vol_10d``: 10-row rolling standard deviation of ``ret_1d``.
        * ``log_vol_chg_1d``: one-row difference of log Volume.
        * ``target_next_log_ret``: next row's log Close minus this row's.
    """
    df = df_clean.copy()

    # A non-positive price or volume has no finite logarithm. Mask such
    # values to NaN so the affected rows are removed by dropna() below
    # instead of carrying +/-inf into the features or the target.
    close = df["Close"].where(df["Close"] > 0)
    volume = df["Volume"].where(df["Volume"] > 0)

    # Log close and 1-day log return
    df["log_close"] = np.log(close)
    df["ret_1d"] = df["log_close"].diff()

    # Rolling mean returns; names are derived from the window list so the
    # feature list cannot drift out of sync with the columns created here.
    windows = (3, 5, 10, 21)
    mean_cols = []
    for w in windows:
        name = f"ret_mean_{w}d"
        df[name] = df["ret_1d"].rolling(window=w).mean()
        mean_cols.append(name)

    # 10-day rolling volatility
    df["ret_vol_10d"] = df["ret_1d"].rolling(window=10).std()

    # Log volume + 1-day change
    df["log_vol"] = np.log(volume)
    df["log_vol_chg_1d"] = df["log_vol"].diff()

    # Target: next-day log return
    df["target_next_log_ret"] = df["log_close"].shift(-1) - df["log_close"]

    feature_cols = ["ret_1d", *mean_cols, "ret_vol_10d", "log_vol_chg_1d"]

    # One dropna() covers both the features and the target.
    df_model = df[feature_cols + ["target_next_log_ret"]].dropna().copy()

    return df_model, feature_cols


def make_time_splits(df_model: pd.DataFrame, feature_cols, train_frac=0.6, val_frac=0.2):
    """Cut the model table into consecutive train / validation / test parts.

    The split is positional: the first ``int(train_frac * n)`` rows are the
    training part, rows up to ``int((train_frac + val_frac) * n)`` are the
    validation part, and the rest are the test part (60/20/20 with the
    defaults). Rows are taken in the order they appear in ``df_model``.

    Args:
        df_model: Table holding the feature columns and
            ``target_next_log_ret``.
        feature_cols: Names of the feature columns to put into ``X_*``.
        train_frac: Fraction of rows assigned to the training part.
        val_frac: Fraction of rows assigned to the validation part.

    Returns:
        A dict with ``X_*``, ``y_*`` (regression target), ``y_class_*``
        (1 where the target is > 0, else 0) and ``dates_*`` (index slices)
        for ``train``/``val``/``test``, plus the two boundary positions
        ``train_end`` and ``val_end``.
    """
    n_rows = len(df_model)
    train_end = int(train_frac * n_rows)
    val_end = int((train_frac + val_frac) * n_rows)

    # Name the three positional ranges once and reuse them for every array.
    train_part = slice(None, train_end)
    val_part = slice(train_end, val_end)
    test_part = slice(val_end, None)

    features = df_model[feature_cols].values
    next_returns = df_model["target_next_log_ret"].values
    # Classification labels: 1 if next-day return > 0
    up_labels = (next_returns > 0).astype(int)
    row_index = df_model.index

    return {
        "X_train": features[train_part],
        "X_val": features[val_part],
        "X_test": features[test_part],
        "y_train": next_returns[train_part],
        "y_val": next_returns[val_part],
        "y_test": next_returns[test_part],
        "y_class_train": up_labels[train_part],
        "y_class_val": up_labels[val_part],
        "y_class_test": up_labels[test_part],
        "dates_train": row_index[train_part],
        "dates_val": row_index[val_part],
        "dates_test": row_index[test_part],
        "train_end": train_end,
        "val_end": val_end,
    }


def load_smci_dataset(
    ticker="SMCI",
    start="2017-01-01",
    train_frac=0.6,
    val_frac=0.2,
):
    """Run the whole pipeline: download, clean, build features, split.

    Args:
        ticker: Symbol forwarded to ``download_raw_smci``.
        start: Start date forwarded to ``download_raw_smci``.
        train_frac: Training fraction forwarded to ``make_time_splits``.
        val_frac: Validation fraction forwarded to ``make_time_splits``.

    Returns:
        ``(df_model, feature_cols, splits)``: the first two come from
        ``build_features`` and the last from ``make_time_splits``.
    """
    cleaned = clean_ohlcv(download_raw_smci(ticker=ticker, start=start))
    df_model, feature_cols = build_features(cleaned)
    splits = make_time_splits(
        df_model,
        feature_cols,
        train_frac=train_frac,
        val_frac=val_frac,
    )
    return df_model, feature_cols, splits
