In [13]:
# ============================ Level-29 / Clean & Robust ============================
# Triple-barrier labeling with robust price extraction (duplicates-safe), tz-safe, adaptive events
# ==================================================================================

import warnings
warnings.filterwarnings("ignore", category=UserWarning)
warnings.filterwarnings("ignore", category=FutureWarning)

import numpy as np
import pandas as pd
import yfinance as yf
from datetime import datetime, timedelta
from typing import Optional, Tuple

from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report
from sklearn.preprocessing import StandardScaler

# --------------------------- Config ---------------------------
TICKER        = "AAPL"
YEARS         = 3
FREQ          = "1d"       # switch to "1h"/"1m" if needed (consider increasing YEARS for more labels)
VOL_SPAN      = 50
CUSUM_MIN_EVT = 50
BASE_H        = 10
UP_M, DN_M    = 1.0, 1.0

# --------------------------- Utilities ---------------------------
def ensure_series(x: pd.Series, name: Optional[str] = None) -> pd.Series:
    if isinstance(x, pd.DataFrame):
        # squeeze 1st column deterministically
        x = x.iloc[:, 0]
    if not isinstance(x, pd.Series):
        raise TypeError("Expected a pandas Series.")
    if name:
        x = x.rename(name)
    x = pd.to_numeric(x, errors="coerce")
    x = x.dropna()
    x = x[~x.index.duplicated(keep="last")].sort_index()
    return x

def tz_to_naive_eastern(idx: pd.DatetimeIndex) -> pd.DatetimeIndex:
    if idx.tz is None:
        return idx
    return idx.tz_convert("US/Eastern").tz_localize(None)

def extract_close_series(df: pd.DataFrame) -> pd.Series:
    """Return a clean Close series even if columns are duplicated."""
    candidates = []
    if "Adj Close" in df.columns:
        candidates.append("Adj Close")
    if "Close" in df.columns:
        candidates.append("Close")
    if not candidates:
        raise RuntimeError("No Close/Adj Close column in downloaded data.")

    # Prefer Adj Close; fall back to Close
    for col in candidates:
        s = df[col]
        if isinstance(s, pd.DataFrame):  # duplicate columns case
            s = s.iloc[:, 0]
        s = ensure_series(s, "Close")
        if not s.empty:
            return s
    raise RuntimeError("Unable to extract a non-empty Close series.")

def load_prices(ticker: str, years: int, freq: str = "1d") -> pd.Series:
    start = (datetime.utcnow() - timedelta(days=int(365*years + 10))).date()
    df = yf.download(ticker, start=str(start), interval=freq, auto_adjust=True, progress=False)
    if df.empty:
        raise RuntimeError("No data returned. Check ticker/frequency/network.")

    # Normalize index to US/Eastern tz-naive
    if isinstance(df.index, pd.DatetimeIndex):
        idx = df.index
        try:
            # if tz-aware, this will work; if tz-naive, it will raise -> handled by except
            idx = tz_to_naive_eastern(idx.tz_convert("UTC"))
        except Exception:
            if idx.tz is not None:
                idx = tz_to_naive_eastern(idx)
        df.index = idx

    px = extract_close_series(df)  # duplicates-safe path
    return px

def ewma_vol(rets: pd.Series, span: int = 50) -> pd.Series:
    r = ensure_series(rets, "r")
    v = r.ewm(span=span, adjust=False).std()
    return v.rename("vol")

def cusum_filter(r: pd.Series, threshold: float) -> pd.DatetimeIndex:
    r = ensure_series(r, "r")
    s_pos = 0.0
    s_neg = 0.0
    t_events = []
    for t, x in r.items():
        xf = float(x)
        s_pos = max(0.0, s_pos + xf)
        s_neg = min(0.0, s_neg + xf)
        if s_pos > threshold:
            s_pos = 0.0
            t_events.append(t)
        elif s_neg < -threshold:
            s_neg = 0.0
            t_events.append(t)
    return pd.DatetimeIndex(t_events)

def get_vertical_barriers(t_events: pd.DatetimeIndex, H: int, full_index: pd.DatetimeIndex) -> pd.Series:
    full_index = pd.DatetimeIndex(full_index)
    vbar = {}
    pos_map = {ts: i for i, ts in enumerate(full_index)}
    n = len(full_index)
    for t0 in t_events:
        if t0 in pos_map:
            i = pos_map[t0]
        else:
            i = full_index.searchsorted(t0, side="left")
        j = min(n - 1, i + int(H))
        vbar[t0] = full_index[j]
    return pd.Series(vbar, name="t1")

def first_cross_idx(path_vals: np.ndarray, level: float, cmp: str) -> Optional[int]:
    if cmp == "ge":
        mask = path_vals >= level
    elif cmp == "le":
        mask = path_vals <= level
    else:
        raise ValueError("cmp must be 'ge' or 'le'")
    where = np.where(mask)[0]
    return int(where[0]) if where.size else None

def get_triple_barrier_labels(
    close: pd.Series,
    t_events: pd.DatetimeIndex,
    vbar: pd.Series,
    up_m: float,
    dn_m: float,
    daily_vol: pd.Series
) -> pd.DataFrame:
    close = ensure_series(close, "Close")
    daily_vol = ensure_series(daily_vol, "vol")
    vbar = vbar.dropna()
    rows = []
    for t0 in t_events:
        if t0 not in close.index or t0 not in vbar.index:
            continue
        t1 = vbar.loc[t0]
        if t1 not in close.index:
            j = close.index.searchsorted(t1, side="left")
            j = max(0, min(j, len(close) - 1))
            t1 = close.index[j]
        c0 = float(close.loc[t0])
        trgt = float(daily_vol.get(t0, np.nan))
        if not np.isfinite(trgt) or trgt <= 0:
            continue

        up_lvl = c0 * (1 + up_m * trgt)
        dn_lvl = c0 * (1 - dn_m * trgt)

        # include t0 so a cross at the first bar can be detected consistently
        path_idx = close.loc[t0:t1].index
        path = close.loc[path_idx].to_numpy(dtype=float)

        iu = first_cross_idx(path, up_lvl, "ge")
        idn = first_cross_idx(path, dn_lvl, "le")

        label = 0
        t_hit = t1
        if iu is not None and idn is not None:
            if iu < idn:
                label = 1
                t_hit = path_idx[iu]
            elif idn < iu:
                label = -1
                t_hit = path_idx[idn]
            else:
                label = 0
                t_hit = path_idx[iu]
        elif iu is not None:
            label = 1
            t_hit = path_idx[iu]
        elif idn is not None:
            label = -1
            t_hit = path_idx[idn]
        else:
            rvt = float(close.loc[t1] / c0 - 1.0)
            label = 1 if rvt > 0 else (-1 if rvt < 0 else 0)
            t_hit = t1

        rows.append({"t0": t0, "t1": t1, "t_hit": t_hit, "label": label, "trgt": trgt})

    out = pd.DataFrame.from_records(rows).set_index("t0").sort_index()
    return out

def build_features(px: pd.Series) -> pd.DataFrame:
    px = ensure_series(px, "Close")
    r1 = px.pct_change().fillna(0.0)
    r5 = px.pct_change(5).fillna(0.0)
    vol20 = r1.rolling(20).std().fillna(0.0)
    mom10 = px.pct_change(10).fillna(0.0)
    ma10 = px.rolling(10).mean()
    ma20 = px.rolling(20).mean()
    ma_ratio = (ma10 / ma20 - 1.0).replace([np.inf, -np.inf], 0.0).fillna(0.0)
    feat = pd.DataFrame({"r1": r1, "r5": r5, "vol20": vol20, "mom10": mom10, "ma_ratio": ma_ratio}, index=px.index)
    return feat

def adaptive_events_and_labels(
    close: pd.Series,
    rets: pd.Series,
    vol_span: int = 50,
    min_events: int = 50,
    base_H: int = 10,
    up_m: float = 1.0,
    dn_m: float = 1.0
) -> Tuple[pd.DataFrame, pd.Series, pd.DatetimeIndex, int, float]:
    close = ensure_series(close, "Close")
    rets = ensure_series(rets, "r")
    daily_vol = ewma_vol(rets, span=vol_span).clip(lower=1e-8).fillna(0.0)

    vol_std = float(rets.std()) if np.isfinite(float(rets.std())) else 0.01
    thr_fracs = [0.50, 0.35, 0.25, 0.18, 0.12, 0.08, 0.05, 0.03]
    H_choices = [base_H, int(base_H*1.5), base_H*2, base_H*3]

    labels = pd.DataFrame()
    used_thr, used_H, events_idx = None, None, pd.DatetimeIndex([])

    for H_try in H_choices:
        for frac in thr_fracs:
            thr = max(1e-4, frac * vol_std)
            events_idx = cusum_filter(rets, thr)
            if len(events_idx) == 0:
                continue
            vbar = get_vertical_barriers(events_idx, H_try, close.index)
            lab = get_triple_barrier_labels(close, events_idx, vbar, up_m, dn_m, daily_vol).dropna()
            if len(lab) >= min_events:
                labels = lab
                used_thr, used_H = thr, H_try
                break
        if len(labels) >= min_events:
            break

    if labels.empty:
        for H_try in H_choices[max(1, len(H_choices)//2):]:
            for frac in thr_fracs + [0.02, 0.015, 0.01]:
                thr = max(5e-5, frac * vol_std)
                events_idx = cusum_filter(rets, thr)
                if len(events_idx) == 0:
                    continue
                vbar = get_vertical_barriers(events_idx, H_try, close.index)
                lab = get_triple_barrier_labels(close, events_idx, vbar, up_m*0.5, dn_m*0.5, daily_vol).dropna()
                if len(lab) >= max(10, min_events//2):
                    labels = lab
                    used_thr, used_H = thr, H_try
                    break
            if not labels.empty:
                break

    return labels, daily_vol, events_idx, (used_H or base_H), float(used_thr or 0.0)

# --------------------------- Main ---------------------------
px = load_prices(TICKER, YEARS, FREQ)
rets = px.pct_change().replace([np.inf, -np.inf], 0.0).fillna(0.0)

labels, daily_vol, events_idx, H_used, thr_used = adaptive_events_and_labels(
    close=px,
    rets=rets,
    vol_span=VOL_SPAN,
    min_events=CUSUM_MIN_EVT,
    base_H=BASE_H,
    up_m=UP_M,
    dn_m=DN_M
)

print(f"[Adaptive] events={len(events_idx)} labels={len(labels)}  H={H_used}  thr≈{thr_used:.6g}")

if labels.empty:
    raise RuntimeError(
        "Adaptive labeling still produced no labels. "
        "Increase YEARS, switch to higher frequency, or lower CUSUM_MIN_EVT."
    )

# Features & target (align on t0)
X_all = build_features(px)
X_evt = X_all.reindex(labels.index).dropna()
y = (labels["label"] == 1).astype(int).reindex(X_evt.index)

# Time split
cut = int(len(X_evt) * 0.7)
X_tr, X_te = X_evt.iloc[:cut], X_evt.iloc[cut:]
y_tr, y_te = y.iloc[:cut], y.iloc[cut:]

# Scale & model
scaler = StandardScaler(with_mean=True, with_std=True)
X_tr_sc = pd.DataFrame(scaler.fit_transform(X_tr), index=X_tr.index, columns=X_tr.columns)
X_te_sc = pd.DataFrame(scaler.transform(X_te), index=X_te.index, columns=X_te.columns)

clf = RandomForestClassifier(
    n_estimators=300,
    max_depth=6,
    random_state=42,
    n_jobs=-1,
    class_weight="balanced_subsample"
)
clf.fit(X_tr_sc, y_tr)
pred = clf.predict(X_te_sc)
proba = clf.predict_proba(X_te_sc)[:, 1]

print("\n=== Classification Report (long = 1) ===")
print(classification_report(y_te, pred, digits=3))

out = pd.DataFrame({"y_true": y_te, "y_pred": pred, "proba": proba}, index=X_te_sc.index)
out.to_csv(f"{TICKER}_labels_predictions.csv", index_label="date")
print(f"Saved predictions to {TICKER}_labels_predictions.csv")


[Adaptive] events=410 labels=410  H=10  thr≈0.00867296

=== Classification Report (long = 1) ===
              precision    recall  f1-score   support

           0      0.417     0.328     0.367        61
           1      0.453     0.548     0.496        62

    accuracy                          0.439       123
   macro avg      0.435     0.438     0.432       123
weighted avg      0.435     0.439     0.432       123

Saved predictions to AAPL_labels_predictions.csv
