# forex_signal_v29

PF-first Top-down Multi-Timeframe Ensemble (robust loader + resampling fallbacks)

**Trade TF:** 15m (decision at close(t), entry at open(t+1))

**Macro TF:** 1H & 4H (from files if present; otherwise resampled from 15m)

**Entry TF:** 1m (from file if present; otherwise approximated from 5m/15m)

This notebook is designed to **never silently produce 0 trades** without telling you why. It prints diagnostics at each stage.

In [5]:

# ===== 0) Imports & Config =====
import os, glob, math, warnings
import numpy as np
import pandas as pd

warnings.filterwarnings("ignore")

# Optional models
HAS_XGB = False
try:
    from xgboost import XGBClassifier
    HAS_XGB = True
except Exception as e:
    HAS_XGB = False

from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import roc_auc_score
from sklearn.calibration import CalibratedClassifierCV

class CFG:
    # Update these paths to match your machine
    # Stepped up from ml_models_train/ to root
    train_dir = "../data/train"
    test_dir  = "../data/test"
    pair_hint = "EURUSD"  # used to auto-pick files

    # execution costs (set realistically!)
    spread_pips: float = 1.0
    slippage_pips: float = 0.2
    commission_per_lot_usd: float = 0.0   # round-trip commission

    # risk
    risk_per_trade: float = 0.01

    # triple-barrier defaults (tuned on validation later)
    tp_atr: float = 2.0
    sl_atr: float = 1.5
    max_hold_bars: int = 16  # on 15m => 4h

    # initial thresholds (will be tuned)
    edge_th: float = 0.60
    entry_th: float = 0.55

    # validation split inside train
    core_end = "2021-12-31"
    val_end  = "2023-12-31"

cfg = CFG()
print("HAS_XGB:", HAS_XGB)
print("CFG:", {k:getattr(cfg,k) for k in dir(cfg) if not k.startswith('_') and k.islower()})


HAS_XGB: True
CFG: {'commission_per_lot_usd': 0.0, 'core_end': '2021-12-31', 'edge_th': 0.6, 'entry_th': 0.55, 'max_hold_bars': 16, 'pair_hint': 'EURUSD', 'risk_per_trade': 0.01, 'sl_atr': 1.5, 'slippage_pips': 0.2, 'spread_pips': 1.0, 'test_dir': '../data/test', 'tp_atr': 2.0, 'train_dir': '../data/train', 'val_end': '2023-12-31'}


In [6]:

# ===== 1) Robust CSV loader =====
import re

def _guess_tf_from_name(name: str) -> str:
    n = name.lower()
    
    # Check for specific patterns with boundaries or clear delimiters
    # e.g. _15m, m15.csv, -15m-
    
    # Strict regex checks first
    # 15m, 1m etc.
    if re.search(r"(?:_|-|\b)15m(?:_|-|\b|\.)", n) or re.search(r"(?:_|-|\b)m15(?:_|-|\b|\.)", n):
        return "15m"
    if re.search(r"(?:_|-|\b)1m(?:_|-|\b|\.)", n) or re.search(r"(?:_|-|\b)m1(?:_|-|\b|\.)", n):
        return "1m"
    if re.search(r"(?:_|-|\b)5m(?:_|-|\b|\.)", n) or re.search(r"(?:_|-|\b)m5(?:_|-|\b|\.)", n):
        return "5m"
    if re.search(r"(?:_|-|\b)30m(?:_|-|\b|\.)", n) or re.search(r"(?:_|-|\b)m30(?:_|-|\b|\.)", n):
        return "30m"
    if re.search(r"(?:_|-|\b)1h(?:_|-|\b|\.)", n) or re.search(r"(?:_|-|\b)h1(?:_|-|\b|\.)", n):
        return "1h"
    if re.search(r"(?:_|-|\b)4h(?:_|-|\b|\.)", n) or re.search(r"(?:_|-|\b)h4(?:_|-|\b|\.)", n):
        return "4h"
    if re.search(r"(?:_|-|\b)1d(?:_|-|\b|\.)", n) or re.search(r"(?:_|-|\b)d1(?:_|-|\b|\.)", n):
        return "1d"
        
    # Fallback to simple containment but ordered by length
    mapping = {
        "15m": "15m", "m15": "15m",
        "30m": "30m", "m30": "30m",
        "4h": "4h", "h4": "4h",
        "1h": "1h", "h1": "1h",
        "5m": "5m", "m5": "5m",
        "1m": "1m", "m1": "1m"
    }
    
    # Sort keys by length descending to avoid partial matches (e.g. m1 matching m15)
    sorted_keys = sorted(mapping.keys(), key=len, reverse=True)
    
    for k in sorted_keys:
        if k in n:
            return mapping[k]
            
    return ""


def _pick_files(data_dir: str, pair_hint: str):
    # Also look for parent dir if data_dir is relative
    if not os.path.exists(data_dir):
        # try try stepping up
        if os.path.exists(os.path.join("..", data_dir)):
             data_dir = os.path.join("..", data_dir)
             
    files = glob.glob(os.path.join(data_dir, "*.csv")) + glob.glob(os.path.join(data_dir, "*.parquet"))
    if not files:
        raise FileNotFoundError(f"No data files found in {data_dir}. Put your CSVs there.")
        
    # keep only those containing pair hint if possible
    hinted = [f for f in files if pair_hint.lower() in os.path.basename(f).lower()]
    if hinted:
        files = hinted
        
    by_tf = {}
    for f in files:
        tf = _guess_tf_from_name(os.path.basename(f))
        if tf:
            # If duplicates, prefer the one with clearer naming or just overwrite
            if tf not in by_tf:
                by_tf[tf] = f
            else:
                # heuristic: prefer shorter filename (often cleaner)
                if len(os.path.basename(f)) < len(os.path.basename(by_tf[tf])):
                    by_tf[tf] = f
    return by_tf

def _standardize_ohlc(df: pd.DataFrame) -> pd.DataFrame:
    # Accept common column variants
    cols = {c.lower():c for c in df.columns}
    def pick(*names):
        for n in names:
            if n in cols: return cols[n]
        return None

    tcol = pick("time","timestamp","date","datetime")
    if tcol is None:
        # Fallback: assume first column if it looks like time? 
        # Or if columns are ['<DATE>', '<TIME>', ...]
        # Let's try to be smart.
        if "<DATE>" in df.columns: # MT4 export
             df["timestamp"] = df["<DATE>"] + " " + df["<TIME>"]
             tcol = "timestamp"
        elif len(df.columns) >= 5: # Blind guess: col 0
             tcol = df.columns[0]
        else:
            raise ValueError(f"Couldn't find a timestamp column. Have: {list(df.columns)[:20]}")
            
    df = df.rename(columns={tcol:"time"})
    df["time"] = pd.to_datetime(df["time"], utc=True, errors="coerce")
    df = df.dropna(subset=["time"]).sort_values("time").drop_duplicates("time")
    
    # OHLC
    for need, opts in [("open",("open","o","<open>")),("high",("high","h","<high>")),("low",("low","l","<low>")),("close",("close","c","<close>"))]:
        c = pick(*opts)
        if c is None:
             # Blind guess if 5 cols?
             if len(df.columns) >= 5 and need in ["open","high","low","close"]:
                 # map indices 1,2,3,4 to OHLC
                 idx_map = {"open":1, "high":2, "low":3, "close":4}
                 if idx_map[need] < len(df.columns):
                     c = df.columns[idx_map[need]]
        
        if c is None:
            raise ValueError(f"Missing {need} column. Have: {list(df.columns)[:20]}")
        df = df.rename(columns={c:need})
        
    # optional volume
    v = pick("volume","vol","tick_volume","<vol>","<tickvol>")
    if v:
        df = df.rename(columns={v:"volume"})
    else:
        df["volume"] = np.nan
        
    df = df.set_index("time")
    df = df[["open","high","low","close","volume"]].astype(float)
    return df

def load_tf(data_dir: str, pair_hint: str):
    by_tf = _pick_files(data_dir, pair_hint)
    out = {}
    for tf in ["1m","5m","15m","30m","1h","4h"]:
        if tf in by_tf:
            f = by_tf[tf]
            try:
                if f.endswith(".parquet"):
                    df = pd.read_parquet(f)
                else:
                    df = pd.read_csv(f)
                out[tf] = _standardize_ohlc(df)
                print(f"Loaded {tf}: {os.path.basename(f)}  rows={len(out[tf])}")
            except Exception as e:
                print(f"Failed to load {f}: {e}")
                
    return out

train_tf = load_tf(cfg.train_dir, cfg.pair_hint)
test_tf  = load_tf(cfg.test_dir, cfg.pair_hint)

assert "15m" in train_tf and "15m" in test_tf, "Need at least 15m data files."


Loaded 1m: EURUSD_m1.csv  rows=3354904
Loaded 5m: EURUSD_m5.csv  rows=671581
Loaded 15m: EURUSD_m15.csv  rows=224382
Loaded 30m: EURUSD_m30.csv  rows=112194
Loaded 1h: EURUSD_h1.csv  rows=56098
Loaded 4h: EURUSD_h4.csv  rows=14498
Loaded 1m: EURUSD_m1.csv  rows=743476
Loaded 5m: EURUSD_m5.csv  rows=148502
Loaded 15m: EURUSD_m15.csv  rows=49807
Loaded 30m: EURUSD_m30.csv  rows=24907
Loaded 1h: EURUSD_h1.csv  rows=12454
Loaded 4h: EURUSD_h4.csv  rows=3220


In [7]:

# ===== 2) Resampling fallbacks (so macro features never go NaN silently) =====
def resample_ohlc(df: pd.DataFrame, rule: str) -> pd.DataFrame:
    o = df["open"].resample(rule).first()
    h = df["high"].resample(rule).max()
    l = df["low"].resample(rule).min()
    c = df["close"].resample(rule).last()
    v = df["volume"].resample(rule).sum() if "volume" in df else None
    out = pd.DataFrame({"open":o,"high":h,"low":l,"close":c})
    out["volume"] = v if v is not None else np.nan
    out = out.dropna(subset=["open","high","low","close"])
    return out

def ensure_tf(bundle: dict, tf: str, base_tf: str="15m"):
    if tf in bundle:
        return bundle[tf]
    base = bundle[base_tf]
    rule = {"1h":"1H","4h":"4H","30m":"30T","5m":"5T","1m":"1T"}[tf]
    # We only safely resample to *higher* tf from 15m here
    if tf in ("1h","4h","30m"):
        df = resample_ohlc(base, rule)
        bundle[tf] = df
        print(f"[fallback] Built {tf} by resampling {base_tf} -> {rule}. rows={len(df)}")
        return df
    return None

_ = ensure_tf(train_tf,"1h")
_ = ensure_tf(train_tf,"4h")
_ = ensure_tf(test_tf,"1h")
_ = ensure_tf(test_tf,"4h")


In [8]:

# ===== 3) Indicators (EMA, ATR, ADX, Bollinger width) =====
def ema(s: pd.Series, span: int) -> pd.Series:
    return s.ewm(span=span, adjust=False).mean()

def atr(df: pd.DataFrame, n: int=14) -> pd.Series:
    h,l,c = df["high"], df["low"], df["close"]
    prev_c = c.shift(1)
    tr = pd.concat([(h-l).abs(), (h-prev_c).abs(), (l-prev_c).abs()], axis=1).max(axis=1)
    return tr.rolling(n).mean()

def adx(df: pd.DataFrame, n: int=14) -> pd.Series:
    h,l,c = df["high"], df["low"], df["close"]
    up = h.diff()
    dn = -l.diff()
    plus_dm = np.where((up > dn) & (up > 0), up, 0.0)
    minus_dm = np.where((dn > up) & (dn > 0), dn, 0.0)
    tr = pd.concat([(h-l).abs(), (h-c.shift(1)).abs(), (l-c.shift(1)).abs()], axis=1).max(axis=1)
    atr_n = tr.rolling(n).sum()
    plus_di = 100 * pd.Series(plus_dm, index=df.index).rolling(n).sum() / atr_n
    minus_di= 100 * pd.Series(minus_dm, index=df.index).rolling(n).sum() / atr_n
    dx = (100 * (plus_di - minus_di).abs() / (plus_di + minus_di)).replace([np.inf,-np.inf], np.nan)
    return dx.rolling(n).mean()

def bb_width(df: pd.DataFrame, n: int=20, k: float=2.0) -> pd.Series:
    m = df["close"].rolling(n).mean()
    s = df["close"].rolling(n).std()
    upper = m + k*s
    lower = m - k*s
    return (upper-lower) / m

def add_tf_features(tf_df: pd.DataFrame, prefix: str) -> pd.DataFrame:
    out = pd.DataFrame(index=tf_df.index)
    out[f"{prefix}ema50"] = ema(tf_df["close"], 50)
    out[f"{prefix}ema200"] = ema(tf_df["close"], 200)
    out[f"{prefix}ema200_slope"] = out[f"{prefix}ema200"].diff(5)
    out[f"{prefix}atr14"] = atr(tf_df, 14)
    out[f"{prefix}adx14"] = adx(tf_df, 14)
    out[f"{prefix}bb_width"] = bb_width(tf_df, 20, 2.0)
    # distance to ema200 in ATR units
    out[f"{prefix}dist_ema200_atr"] = (tf_df["close"] - out[f"{prefix}ema200"]) / (out[f"{prefix}atr14"] + 1e-12)
    return out



In [9]:

# ===== 4) Build 15m master dataset with multi-TF context =====
def attach_higher_tf(master_15m: pd.DataFrame, higher_df: pd.DataFrame, higher_feat: pd.DataFrame, tag: str) -> pd.DataFrame:
    # forward-fill last completed higher candle features onto 15m index (no lookahead)
    aligned = higher_feat.reindex(master_15m.index, method="ffill")
    aligned.columns = [f"{tag}_{c}" for c in aligned.columns]
    return master_15m.join(aligned, how="left")

def build_master(bundle: dict) -> pd.DataFrame:
    m15 = bundle["15m"].copy()
    # base features on 15m
    f15 = add_tf_features(m15, prefix="15m_")
    df = m15.join(f15)

    # attach 1h, 4h
    h1 = ensure_tf(bundle,"1h"); f1 = add_tf_features(h1, prefix="1h_")
    h4 = ensure_tf(bundle,"4h"); f4 = add_tf_features(h4, prefix="4h_")
    df = attach_higher_tf(df, h1, f1, "1h")
    df = attach_higher_tf(df, h4, f4, "4h")

    # micro features from 1m if available
    if "1m" in bundle:
        m1 = bundle["1m"]
        # aggregate last 15m window ending at time t (decision time). Use shift(1) to avoid peeking into current forming minute.
        m1r = m1.copy()
        m1r["ret1"] = m1r["close"].pct_change()
        agg = pd.DataFrame(index=df.index)
        # resample to 15m bins aligned to master index
        r = m1r["ret1"].shift(1).resample("15T")
        agg["m1_ret_std"] = r.std()
        agg["m1_ret_mean"]= r.mean()
        # candle anatomy
        m1r["range"] = (m1r["high"]-m1r["low"])
        m1r["lower_wick"] = (np.minimum(m1r["open"], m1r["close"]) - m1r["low"]).clip(lower=0)
        rr = m1r["range"].shift(1).resample("15T")
        lw = m1r["lower_wick"].shift(1).resample("15T")
        agg["m1_range_mean"] = rr.mean()
        agg["m1_lowerwick_ratio"] = (lw.mean() / (rr.mean()+1e-12))
        agg = agg.reindex(df.index)
        df = df.join(agg)
    else:
        df["m1_ret_std"] = np.nan
        df["m1_ret_mean"] = np.nan
        df["m1_range_mean"] = np.nan
        df["m1_lowerwick_ratio"] = np.nan

    # simple 15m returns
    df["r1"]  = df["close"].pct_change(1)
    df["r3"]  = df["close"].pct_change(3)
    df["r6"]  = df["close"].pct_change(6)
    df["r12"] = df["close"].pct_change(12)
    # normalize some distances by 15m ATR
    df["dist_ema50_atr"]  = (df["close"] - df["15m_ema50"]) / (df["15m_atr14"] + 1e-12)
    df["dist_ema200_atr"] = (df["close"] - df["15m_ema200"]) / (df["15m_atr14"] + 1e-12)
    # clean
    return df

train_df = build_master(train_tf)
test_df  = build_master(test_tf)

print("Master built. Columns:", len(train_df.columns))
print("Train date span:", train_df.index.min(), "->", train_df.index.max())
print("Test  date span:", test_df.index.min(), "->", test_df.index.max())


Master built. Columns: 36
Train date span: 2015-01-01 22:00:00+00:00 -> 2023-12-29 21:45:00+00:00
Test  date span: 2024-01-01 22:00:00+00:00 -> 2025-12-30 23:45:00+00:00


In [10]:

# ===== 5) Macro gate + diagnostics (NO silent 0-trade) =====
def macro_gate(df: pd.DataFrame) -> pd.Series:
    # Fill NaNs conservatively: if macro features missing, DO NOT block everything.
    d = df.copy()
    for c in ["4h_dist_ema200_atr","4h_ema200_slope","4h_adx14","4h_bb_width",
              "1h_dist_ema200_atr","1h_adx14"]:
        if c not in d:
            d[c] = np.nan
    # permissive fill: missing -> pass (so we can at least backtest)
    # then you can tighten once you confirm features exist
    dist4 = d["4h_dist_ema200_atr"].fillna(0.1)
    slope4= d["4h_ema200_slope"].fillna(0.0)
    adx4  = d["4h_adx14"].fillna(30)
    bbw4  = d["4h_bb_width"].fillna(0.01)
    dist1 = d["1h_dist_ema200_atr"].fillna(0.0)
    adx1  = d["1h_adx14"].fillna(25)

    ok = (
        (dist4 > 0.0) &
        (slope4 >= 0.0) &
        (adx4 > 16) &
        (dist1 > -0.35) &
        (adx1 > 14) &
        (bbw4 > 0.0005)
    )
    return ok.astype(int)

train_df["macro_ok"] = macro_gate(train_df)
test_df["macro_ok"]  = macro_gate(test_df)

print("macro_ok (train):", train_df["macro_ok"].value_counts().to_dict())
print("macro_ok (test) :", test_df["macro_ok"].value_counts().to_dict())

# Hard guard: if macro_ok is all zeros, automatically relax and warn.
if train_df["macro_ok"].sum() < 100:
    print("\n[WARN] macro_ok too strict or macro features missing. Auto-relaxing to keep pipeline alive.")
    train_df["macro_ok"] = 1
    test_df["macro_ok"]  = 1


macro_ok (train): {1: 224382}
macro_ok (test) : {1: 49807}


In [11]:

# ===== 6) Labels (15m triple-barrier) =====
def triple_barrier_label(df: pd.DataFrame, tp_atr: float, sl_atr: float, max_hold: int) -> pd.Series:
    # decision at t close; entry at t+1 open
    tp = df["open"].shift(-1) + tp_atr * df["15m_atr14"]
    sl = df["open"].shift(-1) - sl_atr * df["15m_atr14"]

    y = np.zeros(len(df), dtype=int)
    for i in range(len(df) - max_hold - 2):
        entry_i = i + 1
        if not np.isfinite(tp.iloc[i]) or not np.isfinite(sl.iloc[i]):
            continue
        hi = df["high"].iloc[entry_i: entry_i + max_hold]
        lo = df["low"].iloc[entry_i: entry_i + max_hold]
        hit_tp = np.where(hi.values >= tp.iloc[i])[0]
        hit_sl = np.where(lo.values <= sl.iloc[i])[0]
        if len(hit_tp)==0 and len(hit_sl)==0:
            y[i]=0
            continue
        # worst-case ambiguity on same bar: assume SL first (conservative PF)
        tp_first = hit_tp[0] if len(hit_tp)>0 else 10**9
        sl_first = hit_sl[0] if len(hit_sl)>0 else 10**9
        y[i] = 1 if tp_first < sl_first else 0
    return pd.Series(y, index=df.index)

train_df["y"] = triple_barrier_label(train_df, cfg.tp_atr, cfg.sl_atr, cfg.max_hold_bars)
test_df["y"]  = triple_barrier_label(test_df, cfg.tp_atr, cfg.sl_atr, cfg.max_hold_bars)

print("Label balance (train macro_ok==1):", train_df.loc[train_df["macro_ok"]==1,"y"].value_counts(normalize=True).to_dict())


Label balance (train macro_ok==1): {0: 0.6714888003494042, 1: 0.3285111996505959}


In [12]:

# ===== 7) Train models (RF macro optional, XGB edge, LR entry) + Calibrate =====
def split_core_val(df: pd.DataFrame):
    core = df.loc[:cfg.core_end].copy()
    val  = df.loc[cfg.core_end:cfg.val_end].copy()
    # avoid overlap at boundary
    val = val.iloc[1:].copy()
    return core, val

FEATURES_EDGE = [
    "r1","r3","r6","r12",
    "15m_adx14","15m_bb_width","15m_dist_ema200_atr","dist_ema50_atr",
    "1h_1h_dist_ema200_atr","1h_1h_adx14","4h_4h_dist_ema200_atr","4h_4h_adx14","4h_4h_bb_width",
]
FEATURES_ENTRY = [
    "m1_ret_std","m1_ret_mean","m1_range_mean","m1_lowerwick_ratio",
    "15m_bb_width","15m_adx14"
]

def safe_matrix(df: pd.DataFrame, cols):
    X = df[cols].copy()
    # fill NaNs robustly
    X = X.replace([np.inf,-np.inf], np.nan)
    X = X.fillna(method="ffill").fillna(method="bfill").fillna(0.0)
    return X

core, val = split_core_val(train_df)
core = core[core["macro_ok"]==1].copy()
val  = val[val["macro_ok"]==1].copy()

# if too few samples, relax (avoid "no result")
if len(core) < 2000:
    print("[WARN] Too few macro_ok samples in core. Forcing macro_ok=1 for training.")
    core, val = split_core_val(train_df)

Xc = safe_matrix(core, FEATURES_EDGE)
yc = core["y"].astype(int)
Xv = safe_matrix(val, FEATURES_EDGE)
yv = val["y"].astype(int)

# Edge model
if HAS_XGB:
    edge = XGBClassifier(
        n_estimators=600,
        max_depth=4,
        learning_rate=0.04,
        subsample=0.8,
        colsample_bytree=0.8,
        reg_lambda=1.0,
        min_child_weight=5,
        objective="binary:logistic",
        eval_metric="logloss",
        tree_method="hist",
        random_state=42,
    )
else:
    edge = RandomForestClassifier(
        n_estimators=500,
        max_depth=10,
        min_samples_leaf=50,
        random_state=42,
        n_jobs=-1
    )

edge.fit(Xc, yc)
p_val_raw = edge.predict_proba(Xv)[:,1]

# calibration helps threshold stability
cal = CalibratedClassifierCV(edge, method="sigmoid", cv=3)
cal.fit(Xc, yc)
p_val = cal.predict_proba(Xv)[:,1]
print("Edge AUC raw:", roc_auc_score(yv, p_val_raw), "cal:", roc_auc_score(yv, p_val))

# Entry model (use label = y as proxy; you can replace with adverse-move label later)
Xe_c = safe_matrix(core, FEATURES_ENTRY)
Xe_v = safe_matrix(val, FEATURES_ENTRY)
entry = LogisticRegression(max_iter=2000, C=0.5)
entry.fit(Xe_c, yc)
p_entry_val = entry.predict_proba(Xe_v)[:,1]
print("Entry AUC:", roc_auc_score(yv, p_entry_val))


Edge AUC raw: 0.580923115569892 cal: 0.5821072031011185
Entry AUC: 0.5151619389316484


In [13]:

# ===== 8) Backtest engine (truthful) + PF-first threshold tuning =====
def pip_to_price(pips: float) -> float:
    return pips * 0.0001  # EURUSD-style. If you trade JPY pairs, adjust.

def compute_lot_size(equity: float, entry: float, sl: float, risk: float) -> float:
    # 1 lot = 100,000 base; pip value approx $10 per pip for EURUSD per lot
    # risk_amount = equity*risk ; stop_pips = |entry-sl|/0.0001 ; lot = risk_amount / (stop_pips*10)
    risk_amount = equity * risk
    stop_pips = abs(entry - sl) / 0.0001
    if stop_pips <= 0:
        return 0.0
    lots = risk_amount / (stop_pips * 10.0)
    return max(0.0, lots)

def run_backtest(df: pd.DataFrame, p_edge: np.ndarray, p_entry: np.ndarray,
                 EDGE_TH: float, ENTRY_TH: float, tp_atr: float, sl_atr: float,
                 max_hold: int, equity0: float=10_000.0):
    spread = pip_to_price(cfg.spread_pips)
    slip   = pip_to_price(cfg.slippage_pips)

    eq = equity0
    equity_curve = []
    trades = []
    in_pos = False
    exit_i = -1

    for i in range(len(df)-max_hold-2):
        equity_curve.append((df.index[i], eq))
        if in_pos and i < exit_i:
            continue
        in_pos = False

        if df.iloc[i]["macro_ok"] != 1:
            continue
        if not (p_edge[i] > EDGE_TH and p_entry[i] > ENTRY_TH):
            continue

        # decision at t close, entry at t+1 open with costs
        entry_i = i + 1
        entry_mid = df["open"].iloc[entry_i]
        entry = entry_mid + spread/2 + slip

        atrv = df["15m_atr14"].iloc[i]
        if not np.isfinite(atrv) or atrv <= 0:
            continue
        tp = entry_mid + tp_atr * atrv
        sl = entry_mid - sl_atr * atrv

        lots = compute_lot_size(eq, entry, sl, cfg.risk_per_trade)
        if lots <= 0:
            continue

        # walk forward up to max_hold bars
        hi = df["high"].iloc[entry_i: entry_i+max_hold]
        lo = df["low"].iloc[entry_i: entry_i+max_hold]
        # conservative: if both hit same bar -> SL first
        hit_tp = np.where(hi.values >= tp)[0]
        hit_sl = np.where(lo.values <= sl)[0]
        if len(hit_tp)==0 and len(hit_sl)==0:
            # time exit at last bar close
            ex_i = entry_i + max_hold - 1
            exit_mid = df["close"].iloc[ex_i]
            exit_px = exit_mid - spread/2 - slip
            pnl_pips = (exit_px - entry) / 0.0001
            reason = "time"
        else:
            tp_first = hit_tp[0] if len(hit_tp)>0 else 10**9
            sl_first = hit_sl[0] if len(hit_sl)>0 else 10**9
            if tp_first < sl_first:
                ex_i = entry_i + tp_first
                exit_mid = tp
                exit_px = exit_mid - spread/2 - slip
                pnl_pips = (exit_px - entry) / 0.0001
                reason="tp"
            else:
                ex_i = entry_i + sl_first
                exit_mid = sl
                exit_px = exit_mid - spread/2 - slip
                pnl_pips = (exit_px - entry) / 0.0001
                reason="sl"

        pnl_usd = pnl_pips * 10.0 * lots
        pnl_usd -= cfg.commission_per_lot_usd * lots
        eq += pnl_usd

        trades.append({
            "entry_time": df.index[entry_i],
            "exit_time": df.index[ex_i],
            "entry": float(entry),
            "exit": float(exit_px),
            "lots": float(lots),
            "pnl_usd": float(pnl_usd),
            "reason": reason
        })
        in_pos = True
        exit_i = ex_i

    equity_curve.append((df.index[min(len(df)-1, len(df)-1)], eq))
    eq_df = pd.DataFrame(equity_curve, columns=["time","equity"]).set_index("time")
    tr = pd.DataFrame(trades)
    return tr, eq_df

def profit_factor(trades: pd.DataFrame) -> float:
    if trades is None or len(trades)==0:
        return 0.0
    g = trades.loc[trades["pnl_usd"]>0, "pnl_usd"].sum()
    l = -trades.loc[trades["pnl_usd"]<0, "pnl_usd"].sum()
    return float(g/l) if l>0 else float("inf")

def max_drawdown(eq: pd.Series) -> float:
    peak = eq.cummax()
    dd = (eq - peak) / peak
    return float(dd.min())

# Build validation predictions aligned to val index
p_edge_val = cal.predict_proba(safe_matrix(val, FEATURES_EDGE))[:,1]
p_ent_val  = entry.predict_proba(safe_matrix(val, FEATURES_ENTRY))[:,1]

# PF-first tuning: search thresholds + TP/SL around defaults
grid_edge = np.linspace(0.55, 0.75, 9)
grid_ent  = np.linspace(0.50, 0.70, 9)
grid_tp   = [1.8, 2.0, 2.2, 2.4]
grid_sl   = [1.2, 1.4, 1.5, 1.6]

best = None
for et in grid_edge:
    for it in grid_ent:
        for tp in grid_tp:
            for sl in grid_sl:
                tr, eq = run_backtest(val, p_edge_val, p_ent_val, et, it, tp, sl, cfg.max_hold_bars)
                if len(tr) < 80:  # avoid ultra few trades
                    continue
                pf = profit_factor(tr)
                dd = max_drawdown(eq["equity"])
                if dd < -0.20:  # dd constraint
                    continue
                score = pf
                cand = (score, pf, dd, len(tr), et, it, tp, sl)
                if best is None or cand[0] > best[0]:
                    best = cand

if best is None:
    print("[WARN] No parameter set met constraints. We'll use defaults (may trade less).")
    EDGE_TH, ENTRY_TH, TP_ATR, SL_ATR = cfg.edge_th, cfg.entry_th, cfg.tp_atr, cfg.sl_atr
else:
    _, pf, dd, ntr, EDGE_TH, ENTRY_TH, TP_ATR, SL_ATR = best
    print("BEST on VAL:", {"PF":pf, "MaxDD":dd, "Trades":ntr, "EDGE_TH":EDGE_TH, "ENTRY_TH":ENTRY_TH, "TP_ATR":TP_ATR, "SL_ATR":SL_ATR})



[WARN] No parameter set met constraints. We'll use defaults (may trade less).


In [14]:

# ===== 9) Final train on full train (macro_ok==1), then TEST backtest =====
train_ok = train_df[train_df["macro_ok"]==1].copy()
if len(train_ok) < 2000:
    train_ok = train_df.copy()

X_train = safe_matrix(train_ok, FEATURES_EDGE)
y_train = train_ok["y"].astype(int)

# re-fit base edge then calibrate
if HAS_XGB:
    edge2 = XGBClassifier(
        n_estimators=900,
        max_depth=4,
        learning_rate=0.03,
        subsample=0.85,
        colsample_bytree=0.85,
        reg_lambda=1.0,
        min_child_weight=5,
        objective="binary:logistic",
        eval_metric="logloss",
        tree_method="hist",
        random_state=42,
    )
else:
    edge2 = RandomForestClassifier(n_estimators=800, max_depth=10, min_samples_leaf=30, random_state=42, n_jobs=-1)

edge2.fit(X_train, y_train)
cal2 = CalibratedClassifierCV(edge2, method="sigmoid", cv=3)
cal2.fit(X_train, y_train)

entry2 = LogisticRegression(max_iter=2000, C=0.5)
entry2.fit(safe_matrix(train_ok, FEATURES_ENTRY), y_train)

p_edge_test = cal2.predict_proba(safe_matrix(test_df, FEATURES_EDGE))[:,1]
p_ent_test  = entry2.predict_proba(safe_matrix(test_df, FEATURES_ENTRY))[:,1]

tr_test, eq_test = run_backtest(test_df, p_edge_test, p_ent_test, EDGE_TH, ENTRY_TH, TP_ATR, SL_ATR, cfg.max_hold_bars)

pf_test = profit_factor(tr_test)
dd_test = max_drawdown(eq_test["equity"])
print("TEST RESULTS:", {"ProfitFactor": pf_test, "Trades": int(len(tr_test)), "MaxDD": dd_test, "FinalEquity": float(eq_test['equity'].iloc[-1])})

tr_test.head(), tr_test.tail()


TEST RESULTS: {'ProfitFactor': 0.0, 'Trades': 0, 'MaxDD': 0.0, 'FinalEquity': 10000.0}


(Empty DataFrame
 Columns: []
 Index: [],
 Empty DataFrame
 Columns: []
 Index: [])

In [15]:

# ===== 10) Quick sanity checks (why 0 trades?) =====
def explain_zero_trades(df: pd.DataFrame, p_edge: np.ndarray, p_ent: np.ndarray, EDGE_TH: float, ENTRY_TH: float):
    c_macro = int((df["macro_ok"]==1).sum())
    c_edge  = int((p_edge > EDGE_TH).sum())
    c_ent   = int((p_ent > ENTRY_TH).sum())
    c_both  = int(((df["macro_ok"]==1) & (p_edge>EDGE_TH) & (p_ent>ENTRY_TH)).sum())
    print("Candidates breakdown:")
    print("  macro_ok==1:", c_macro)
    print("  p_edge > TH:", c_edge)
    print("  p_entry> TH:", c_ent)
    print("  all conditions:", c_both)

explain_zero_trades(test_df, p_edge_test, p_ent_test, EDGE_TH, ENTRY_TH)

# If you still see 0, temporarily drop thresholds to verify engine works:
# tr_dbg, _ = run_backtest(test_df, p_edge_test, p_ent_test, 0.50, 0.50, TP_ATR, SL_ATR, cfg.max_hold_bars)
# print("DEBUG trades:", len(tr_dbg), "PF:", profit_factor(tr_dbg))


Candidates breakdown:
  macro_ok==1: 49807
  p_edge > TH: 0
  p_entry> TH: 0
  all conditions: 0
