# forex_signal_v26 — PF-first Multi-Timeframe Hierarchical Ensemble
Энэ notebook нь:
- **4H + 1H Macro Gate** (LONG зөвшөөрөх эсэх)
- **15m Edge Model (XGBoost)** (TP хүрэх магадлал)
- **1m Entry Quality Model (LogReg)** (муу entry-г шүүх)
- **Next-open execution**, spread/slippage, 1% risk sizing, triple-barrier label

Дата бүтэц:
- `data/train/` → 2015–2023
- `data/test/` → 2024–2025

Timeframe файлуудыг `*1m*.csv`, `*15m*.csv`, `*1h*.csv`, `*4h*.csv` гэх мэт нэрээр байрлуулсан гэж таамаглана.


In [4]:
# ===== 0) Imports & Global Config =====
import os, re, glob, math
import numpy as np
import pandas as pd

from dataclasses import dataclass

from sklearn.model_selection import TimeSeriesSplit
from sklearn.metrics import roc_auc_score
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression

from sklearn.ensemble import RandomForestClassifier

import xgboost as xgb

pd.set_option("display.max_columns", 200)
np.set_printoptions(suppress=True)

SEED = 42
np.random.seed(SEED)

@dataclass
class Config:
    # Adjusted paths to step up from ml_models_train/ to root
    train_dir: str = "../data/train"
    test_dir: str = "../data/test"

    trade_tf: str = "15m"
    macro_tfs: tuple = ("1h", "4h")
    entry_tf: str = "1m"

    # Validation split inside train
    train_core_end: str = "2021-12-31"
    val_start: str = "2022-01-01"
    val_end: str = "2023-12-31"

    # Triple barrier defaults (tune on validation)
    atr_period: int = 14
    tp_atr: float = 2.0
    sl_atr: float = 1.5
    max_hold_bars: int = 16  # 16 * 15m = 4h

    # Execution / costs
    starting_equity: float = 10_000.0
    risk_per_trade: float = 0.01  # 1%
    spread_pips: float = 1.2      # change to your broker typical
    slippage_pips: float = 0.2
    commission_per_lot_usd: float = 0.0  # set if needed

    # Decision thresholds (tune on validation)
    edge_th: float = 0.60
    entry_th: float = 0.55

    # Forex pip conventions (EURUSD)
    pip_size: float = 0.0001
    lot_size: float = 100_000  # 1.0 lot = 100k base
    usd_per_pip_per_lot: float = 10.0  # approx for EURUSD (USD quote)

CFG = Config()
print(CFG)

Config(train_dir='../data/train', test_dir='../data/test', trade_tf='15m', macro_tfs=('1h', '4h'), entry_tf='1m', train_core_end='2021-12-31', val_start='2022-01-01', val_end='2023-12-31', atr_period=14, tp_atr=2.0, sl_atr=1.5, max_hold_bars=16, starting_equity=10000.0, risk_per_trade=0.01, spread_pips=1.2, slippage_pips=0.2, commission_per_lot_usd=0.0, edge_th=0.6, entry_th=0.55, pip_size=0.0001, lot_size=100000, usd_per_pip_per_lot=10.0)


In [5]:
# ===== 1) Data Loading (robust to filenames) =====
def _find_csv_by_tf(folder: str, tf: str) -> str:
    # Accept patterns like *15m*.csv, *15min*.csv, etc.
    patterns = [
        f"*{tf}*.csv",
        f"*{tf.replace('m','min')}*.csv",
        f"*{tf.upper()}*.csv",
        f"*{tf.lower()}*.csv",
    ]
    
    # Add reversed pattern for filenames like EURUSD_m15.csv (tf="15m")
    if len(tf) > 1 and tf[-1].isalpha() and tf[:-1].isdigit():
        rev = f"{tf[-1]}{tf[:-1]}" # 15m -> m15
        patterns.append(f"*{rev}*.csv")
        patterns.append(f"*{rev.upper()}*.csv")
        patterns.append(f"*{rev.lower()}*.csv")
        
    files = []
    for p in patterns:
        files.extend(glob.glob(os.path.join(folder, p)))
    files = sorted(set(files))
    if not files:
        raise FileNotFoundError(f"No CSV found for tf='{tf}' in {folder}. Looked for patterns: {patterns}")
    # Prefer the shortest name (often the intended one)
    files = sorted(files, key=lambda x: len(os.path.basename(x)))
    return files[0]

def load_ohlcv(folder: str, tf: str) -> pd.DataFrame:
    path = _find_csv_by_tf(folder, tf)
    df = pd.read_csv(path)

    # Heuristic column mapping
    cols = {c.lower(): c for c in df.columns}
    # timestamp column
    ts_col = None
    for k in ["timestamp", "time", "date", "datetime"]:
        if k in cols:
            ts_col = cols[k]
            break
    if ts_col is None:
        raise ValueError(f"Could not find timestamp column in {path}. Columns: {df.columns.tolist()}")

    # OHLC
    def pick(names):
        for n in names:
            if n in cols:
                return cols[n]
        return None

    o = pick(["open","o"])
    h = pick(["high","h"])
    l = pick(["low","l"])
    c = pick(["close","c"])
    v = pick(["volume","vol","tick_volume"])
    required = [o,h,l,c]
    if any(x is None for x in required):
        raise ValueError(f"Missing OHLC in {path}. Columns: {df.columns.tolist()}")

    out = df[[ts_col, o,h,l,c] + ([v] if v is not None else [])].copy()
    out.columns = ["timestamp","open","high","low","close"] + (["volume"] if v is not None else [])
    out["timestamp"] = pd.to_datetime(out["timestamp"], utc=True, errors="coerce")
    out = out.dropna(subset=["timestamp"]).sort_values("timestamp").reset_index(drop=True)

    # Make sure numeric
    for col in ["open","high","low","close"] + (["volume"] if "volume" in out.columns else []):
        out[col] = pd.to_numeric(out[col], errors="coerce")
    out = out.dropna(subset=["open","high","low","close"]).reset_index(drop=True)

    out.attrs["source_path"] = path
    return out

def load_all_timeframes(base_dir: str) -> dict:
    data = {}
    # We load only what this architecture uses; extend if you want.
    for tf in set([CFG.trade_tf, CFG.entry_tf, *CFG.macro_tfs]):
        data[tf] = load_ohlcv(base_dir, tf)
        print(f"Loaded {tf} from {data[tf].attrs.get('source_path')} rows={len(data[tf]):,}")
    return data

train_raw = load_all_timeframes(CFG.train_dir)
test_raw  = load_all_timeframes(CFG.test_dir)

Loaded 1m from ../data/train\EURUSD_m1.csv rows=3,354,904
Loaded 4h from ../data/train\EURUSD_h4.csv rows=14,498
Loaded 15m from ../data/train\EURUSD_m15.csv rows=224,382
Loaded 1h from ../data/train\EURUSD_h1.csv rows=56,098
Loaded 1m from ../data/test\EURUSD_m1.csv rows=743,476
Loaded 4h from ../data/test\EURUSD_h4.csv rows=3,220
Loaded 15m from ../data/test\EURUSD_m15.csv rows=49,807
Loaded 1h from ../data/test\EURUSD_h1.csv rows=12,454


In [6]:
# ===== 2) Technical Indicators (pure pandas, no TA-Lib) =====
def ema(s: pd.Series, span: int) -> pd.Series:
    return s.ewm(span=span, adjust=False).mean()

def rsi(close: pd.Series, period: int = 14) -> pd.Series:
    delta = close.diff()
    up = delta.clip(lower=0)
    down = -delta.clip(upper=0)
    ma_up = up.ewm(alpha=1/period, adjust=False).mean()
    ma_down = down.ewm(alpha=1/period, adjust=False).mean()
    rs = ma_up / (ma_down.replace(0, np.nan))
    return 100 - (100 / (1 + rs))

def true_range(df: pd.DataFrame) -> pd.Series:
    prev_close = df["close"].shift(1)
    tr = pd.concat([
        (df["high"] - df["low"]),
        (df["high"] - prev_close).abs(),
        (df["low"]  - prev_close).abs(),
    ], axis=1).max(axis=1)
    return tr

def atr(df: pd.DataFrame, period: int = 14) -> pd.Series:
    tr = true_range(df)
    return tr.ewm(alpha=1/period, adjust=False).mean()

def adx(df: pd.DataFrame, period: int = 14) -> pd.Series:
    high, low, close = df["high"], df["low"], df["close"]
    up_move = high.diff()
    down_move = -low.diff()
    plus_dm = np.where((up_move > down_move) & (up_move > 0), up_move, 0.0)
    minus_dm = np.where((down_move > up_move) & (down_move > 0), down_move, 0.0)
    tr = true_range(df)
    atr_ = tr.ewm(alpha=1/period, adjust=False).mean()
    plus_di = 100 * pd.Series(plus_dm, index=df.index).ewm(alpha=1/period, adjust=False).mean() / (atr_.replace(0, np.nan))
    minus_di = 100 * pd.Series(minus_dm, index=df.index).ewm(alpha=1/period, adjust=False).mean() / (atr_.replace(0, np.nan))
    dx = (100 * (plus_di - minus_di).abs() / ((plus_di + minus_di).replace(0, np.nan)))
    return dx.ewm(alpha=1/period, adjust=False).mean()

def macd_hist(close: pd.Series, fast: int=12, slow: int=26, signal: int=9) -> pd.Series:
    macd = ema(close, fast) - ema(close, slow)
    sig = ema(macd, signal)
    return macd - sig

def bb_width(close: pd.Series, period: int=20, n_std: float=2.0) -> pd.Series:
    ma = close.rolling(period).mean()
    sd = close.rolling(period).std()
    upper = ma + n_std * sd
    lower = ma - n_std * sd
    width = (upper - lower) / (ma.replace(0, np.nan))
    return width


In [7]:
# ===== 3) Feature Builders =====
def add_macro_features(df: pd.DataFrame, prefix: str) -> pd.DataFrame:
    out = df.copy()
    out[f"{prefix}_ema200"] = ema(out["close"], 200)
    out[f"{prefix}_ema50"]  = ema(out["close"], 50)
    out[f"{prefix}_ema200_slope"] = out[f"{prefix}_ema200"].diff(3)
    out[f"{prefix}_adx14"] = adx(out, 14)
    out[f"{prefix}_atr14"] = atr(out, CFG.atr_period)
    out[f"{prefix}_bb_width"] = bb_width(out["close"], 20, 2.0)
    # Normalize distances by ATR
    out[f"{prefix}_dist_ema200_atr"] = (out["close"] - out[f"{prefix}_ema200"]) / (out[f"{prefix}_atr14"].replace(0, np.nan))
    return out

def add_trade_features_15m(df15: pd.DataFrame) -> pd.DataFrame:
    out = df15.copy()
    out["ret1"] = np.log(out["close"]).diff()
    out["ret3"] = out["ret1"].rolling(3).sum()
    out["ret6"] = out["ret1"].rolling(6).sum()
    out["ret12"] = out["ret1"].rolling(12).sum()

    out["ema20"] = ema(out["close"], 20)
    out["ema50"] = ema(out["close"], 50)
    out["ema200"] = ema(out["close"], 200)
    out["ema50_slope"] = out["ema50"].diff(3)
    out["rsi14"] = rsi(out["close"], 14)
    out["macd_hist"] = macd_hist(out["close"], 12, 26, 9)
    out["atr14"] = atr(out, CFG.atr_period)
    out["bb_width"] = bb_width(out["close"], 20, 2.0)

    out["dist_ema50_atr"] = (out["close"] - out["ema50"]) / (out["atr14"].replace(0, np.nan))
    out["dist_ema200_atr"] = (out["close"] - out["ema200"]) / (out["atr14"].replace(0, np.nan))
    out["range_atr"] = (out["high"] - out["low"]) / (out["atr14"].replace(0, np.nan))
    out["body_atr"] = (out["close"] - out["open"]).abs() / (out["atr14"].replace(0, np.nan))
    return out

def build_entry_agg_1m(df1m: pd.DataFrame) -> pd.DataFrame:
    # Create 15m buckets based on timestamp; assume timestamp is candle open.
    d = df1m.copy().set_index("timestamp")
    # Aggregate within each 15m interval
    agg = pd.DataFrame({
        "m1_ret_mean": np.log(d["close"]).diff().resample("15min").mean(),
        "m1_ret_std":  np.log(d["close"]).diff().resample("15min").std(),
        "m1_range_mean": (d["high"]-d["low"]).resample("15min").mean(),
        "m1_range_max":  (d["high"]-d["low"]).resample("15min").max(),
        "m1_wick_down_mean": ((d[["open","close"]].min(axis=1) - d["low"]) / (d["high"]-d["low"]).replace(0,np.nan)).resample("15min").mean(),
        "m1_wick_up_mean":   ((d["high"] - d[["open","close"]].max(axis=1)) / (d["high"]-d["low"]).replace(0,np.nan)).resample("15min").mean(),
    }).reset_index()
    # IMPORTANT: shift by 1 bucket so 15m row at time t only uses completed previous 15m window
    for col in agg.columns:
        if col != "timestamp":
            agg[col] = agg[col].shift(1)
    return agg

def asof_merge_left(base: pd.DataFrame, other: pd.DataFrame, suffix: str, on: str="timestamp") -> pd.DataFrame:
    # Merge last known macro candle that started <= base timestamp
    left = base.sort_values(on).copy()
    right = other.sort_values(on).copy()
    merged = pd.merge_asof(left, right, on=on, direction="backward", suffixes=("", suffix))
    return merged

def build_dataset(raw: dict) -> pd.DataFrame:
    df15 = raw[CFG.trade_tf].copy()
    df15 = add_trade_features_15m(df15)

    # Macro merges
    for tf in CFG.macro_tfs:
        m = add_macro_features(raw[tf], prefix=tf)
        # Keep only macro feature columns + timestamp
        keep = ["timestamp"] + [c for c in m.columns if c.startswith(tf+"_")]
        df15 = asof_merge_left(df15, m[keep], suffix=f"_{tf}")

    # Entry aggregation
    entry_agg = build_entry_agg_1m(raw[CFG.entry_tf])
    df15 = pd.merge(df15, entry_agg, on="timestamp", how="left")

    # Basic cleanup
    df15 = df15.replace([np.inf, -np.inf], np.nan)
    return df15

train_df = build_dataset(train_raw)
test_df  = build_dataset(test_raw)

print("train_df", train_df.shape, "test_df", test_df.shape)
train_df.head(3)


train_df (224382, 42) test_df (49807, 42)


Unnamed: 0,timestamp,open,high,low,close,volume,ret1,ret3,ret6,ret12,ema20,ema50,ema200,ema50_slope,rsi14,macd_hist,atr14,bb_width,dist_ema50_atr,dist_ema200_atr,range_atr,body_atr,1h_ema200,1h_ema50,1h_ema200_slope,1h_adx14,1h_atr14,1h_bb_width,1h_dist_ema200_atr,4h_ema200,4h_ema50,4h_ema200_slope,4h_adx14,4h_atr14,4h_bb_width,4h_dist_ema200_atr,m1_ret_mean,m1_ret_std,m1_range_mean,m1_range_max,m1_wick_down_mean,m1_wick_up_mean
0,2015-01-01 22:00:00+00:00,1.21038,1.21064,1.21025,1.21035,244.3,,,,,1.21035,1.21035,1.21035,,,0.0,0.00039,,0.0,0.0,1.0,0.076923,1.21014,1.21014,,,0.00074,,0.0,1.20869,1.20869,,,0.0029,,0.0,,,,,,
1,2015-01-01 22:15:00+00:00,1.21038,1.21073,1.20999,1.21016,360.07,-0.000157,,,,1.210332,1.210343,1.210348,,0.0,-1.2e-05,0.000415,,-0.439877,-0.453276,1.783133,0.53012,1.21014,1.21014,,,0.00074,,0.0,1.20869,1.20869,,,0.0029,,0.0,-1e-06,4.5e-05,0.000113,0.00029,0.200998,0.195416
2,2015-01-01 22:30:00+00:00,1.2102,1.21039,1.2101,1.21039,238.65,0.00019,,,,1.210337,1.210344,1.210349,,8.518519,-4e-06,0.000406,,0.112271,0.102134,0.71416,0.467898,1.21014,1.21014,,,0.00074,,0.0,1.20869,1.20869,,,0.0029,,0.0,-1e-05,6e-05,0.000109,0.0004,0.151082,0.188161


In [8]:
# ===== 4) Macro Gate (Rule-based, PF-first) =====
def macro_gate(row) -> int:
    # Use 4H and 1H context; conservative defaults.
    ok = True

    # 4h trend & strength
    if not (row.get("4h_dist_ema200_atr") is not None):
        return 0

    ok &= (row["4h_dist_ema200_atr"] > 0)  # price above EMA200
    ok &= (row["4h_ema200_slope"] > 0)
    ok &= (row["4h_adx14"] > 18)

    # 1h confirmation
    ok &= (row["1h_dist_ema200_atr"] > 0)
    ok &= (row["1h_ema200_slope"] > 0)

    # Volatility sanity (avoid extreme squeeze/chaos)
    # ATR percentile proxy: use BB width + range_atr (15m)
    ok &= (row["bb_width"] > 0)  # avoid NaN early rows
    return int(bool(ok))

for df in [train_df, test_df]:
    df["macro_ok"] = df.apply(macro_gate, axis=1)

train_df[["timestamp","macro_ok"]].tail()


Unnamed: 0,timestamp,macro_ok
224377,2023-12-29 20:45:00+00:00,1
224378,2023-12-29 21:00:00+00:00,1
224379,2023-12-29 21:15:00+00:00,1
224380,2023-12-29 21:30:00+00:00,1
224381,2023-12-29 21:45:00+00:00,1


In [9]:
# ===== 5) Triple-Barrier Label on 15m (entry at next open) =====
def triple_barrier_labels(df: pd.DataFrame) -> pd.Series:
    # Label computed on the SAME dataframe order (15m)
    n = len(df)
    label = np.full(n, np.nan)

    atr_col = "atr14"
    for i in range(n - (CFG.max_hold_bars + 2)):
        if np.isnan(df.loc[i, atr_col]):
            continue

        entry_idx = i + 1
        entry = df.loc[entry_idx, "open"]
        atr_i = df.loc[i, atr_col]
        tp = entry + CFG.tp_atr * atr_i
        sl = entry - CFG.sl_atr * atr_i

        end = min(n-1, entry_idx + CFG.max_hold_bars)
        hit = 0  # default loss/time-out

        for j in range(entry_idx, end+1):
            hi = df.loc[j, "high"]
            lo = df.loc[j, "low"]
            # If both hit in same bar, assume worst-case for long (SL first)
            if (lo <= sl) and (hi >= tp):
                hit = 0
                break
            if lo <= sl:
                hit = 0
                break
            if hi >= tp:
                hit = 1
                break
        label[i] = hit

    return pd.Series(label, index=df.index, name="y_edge")

# Compute labels on train only (no leaking test)
train_df["y_edge"] = triple_barrier_labels(train_df)
train_df[["timestamp","y_edge","macro_ok"]].dropna().head()


Unnamed: 0,timestamp,y_edge,macro_ok
0,2015-01-01 22:00:00+00:00,0.0,0
1,2015-01-01 22:15:00+00:00,0.0,0
2,2015-01-01 22:30:00+00:00,0.0,0
3,2015-01-01 22:45:00+00:00,0.0,0
4,2015-01-01 23:00:00+00:00,0.0,0


In [10]:
# ===== 6) Entry Quality Labels (1m based) =====
# Label: after entry (next 15m open), within next 5 minutes,
# does price move against us more than X pips? (bad entry = 1)
# We'll model p_entry_good (good entry) => y_entry_good in {0,1}

def build_entry_quality_labels(df15: pd.DataFrame, df1m: pd.DataFrame, adverse_pips: float = 3.0, window_min: int = 5) -> pd.Series:
    d1 = df1m.copy().set_index("timestamp").sort_index()
    adverse = adverse_pips * CFG.pip_size

    y = np.full(len(df15), np.nan)
    for i in range(len(df15)-2):
        entry_time = df15.loc[i+1, "timestamp"]
        entry_price = df15.loc[i+1, "open"]
        t0 = entry_time
        t1 = entry_time + pd.Timedelta(minutes=window_min)
        window = d1.loc[t0:t1]
        if len(window) < 2:
            continue
        # worst adverse move for long: min low - entry
        min_low = window["low"].min()
        bad = (entry_price - min_low) >= adverse
        y[i] = 0 if bad else 1  # 1=good entry
    return pd.Series(y, index=df15.index, name="y_entry_good")

train_df["y_entry_good"] = build_entry_quality_labels(train_df, train_raw[CFG.entry_tf])
train_df[["timestamp","y_entry_good"]].dropna().head()


Unnamed: 0,timestamp,y_entry_good
0,2015-01-01 22:00:00+00:00,0.0
1,2015-01-01 22:15:00+00:00,1.0
2,2015-01-01 22:30:00+00:00,1.0
3,2015-01-01 22:45:00+00:00,1.0
4,2015-01-01 23:00:00+00:00,1.0


In [11]:
# ===== 7) Train/Validation Split (time-based) =====
def split_train_val(df: pd.DataFrame):
    df = df.copy()
    df = df.dropna(subset=["y_edge","y_entry_good"]).reset_index(drop=True)
    df["date"] = df["timestamp"].dt.date

    train_core = df[df["timestamp"] <= pd.Timestamp(CFG.train_core_end, tz="UTC")].copy()
    val = df[(df["timestamp"] >= pd.Timestamp(CFG.val_start, tz="UTC")) & (df["timestamp"] <= pd.Timestamp(CFG.val_end, tz="UTC"))].copy()
    return train_core, val

train_core, val = split_train_val(train_df)
print("train_core", train_core.shape, "val", val.shape)
print(train_core["timestamp"].min(), train_core["timestamp"].max())
print(val["timestamp"].min(), val["timestamp"].max())


train_core (174400, 46) val (49812, 46)
2015-01-01 22:00:00+00:00 2021-12-31 00:00:00+00:00
2022-01-02 22:00:00+00:00 2023-12-29 17:15:00+00:00


In [12]:
# ===== 8) Feature Sets =====
# We keep a compact but diverse set (PF-first, avoid redundancy).
EDGE_FEATURES = [
    # 15m core
    "ret1","ret3","ret6","ret12",
    "ema50_slope","rsi14","macd_hist","bb_width",
    "dist_ema50_atr","dist_ema200_atr","range_atr","body_atr",
    # macro context (1h/4h)
    "1h_dist_ema200_atr","1h_ema200_slope","1h_adx14","1h_bb_width",
    "4h_dist_ema200_atr","4h_ema200_slope","4h_adx14","4h_bb_width",
    # costs proxies / micro info
    "m1_ret_std","m1_range_max",
    "macro_ok",  # gating info as feature too
]

ENTRY_FEATURES = [
    "m1_ret_mean","m1_ret_std","m1_range_mean","m1_range_max",
    "m1_wick_down_mean","m1_wick_up_mean",
    "range_atr","bb_width",
]

def make_xy(df: pd.DataFrame, features: list, ycol: str):
    X = df[features].copy()
    y = df[ycol].astype(int).copy()
    # Replace inf/nan
    X = X.replace([np.inf, -np.inf], np.nan)
    X = X.fillna(method="ffill").fillna(0)
    return X, y

Xtr_edge, ytr_edge = make_xy(train_core, EDGE_FEATURES, "y_edge")
Xva_edge, yva_edge = make_xy(val, EDGE_FEATURES, "y_edge")

Xtr_entry, ytr_entry = make_xy(train_core, ENTRY_FEATURES, "y_entry_good")
Xva_entry, yva_entry = make_xy(val, ENTRY_FEATURES, "y_entry_good")

print(Xtr_edge.shape, Xtr_entry.shape)


(174400, 23) (174400, 8)


  X = X.fillna(method="ffill").fillna(0)
  X = X.fillna(method="ffill").fillna(0)
  X = X.fillna(method="ffill").fillna(0)
  X = X.fillna(method="ffill").fillna(0)


In [13]:
# ===== 9) Train Models =====
# 9.1 Edge model (XGBoost)
edge_model = xgb.XGBClassifier(
    n_estimators=600,
    max_depth=4,
    learning_rate=0.03,
    subsample=0.9,
    colsample_bytree=0.9,
    reg_lambda=1.0,
    min_child_weight=4,
    gamma=0.0,
    random_state=SEED,
    n_jobs=-1,
    eval_metric="logloss",
)
edge_model.fit(Xtr_edge, ytr_edge)

pva_edge = edge_model.predict_proba(Xva_edge)[:,1]
auc_edge = roc_auc_score(yva_edge, pva_edge)
print("Edge model AUC (val):", round(auc_edge, 4))

# 9.2 Entry model (Logistic Regression with scaling)
entry_model = Pipeline([
    ("scaler", StandardScaler(with_mean=False)),
    ("clf", LogisticRegression(max_iter=500, random_state=SEED))
])
entry_model.fit(Xtr_entry, ytr_entry)
pva_entry = entry_model.predict_proba(Xva_entry)[:,1]
auc_entry = roc_auc_score(yva_entry, pva_entry)
print("Entry model AUC (val):", round(auc_entry, 4))


Edge model AUC (val): 0.6658
Entry model AUC (val): 0.7488


In [14]:
# ===== 10) Threshold Tuning on Validation (PF-first) =====
# We grid-search thresholds to maximize PF subject to a DD constraint (simple approximation on validation).
# NOTE: Validation backtest is simplified; final backtest engine is used on test set.

def compute_trade_signals(df: pd.DataFrame, edge_probs: np.ndarray, entry_probs: np.ndarray, edge_th: float, entry_th: float) -> pd.Series:
    # signal at row i means we decide at close(i) and enter at open(i+1)
    sig = (df["macro_ok"].values == 1) & (edge_probs > edge_th) & (entry_probs > entry_th)
    return pd.Series(sig.astype(int), index=df.index, name="signal")

def quick_pf_proxy(df: pd.DataFrame, signal: pd.Series) -> float:
    # Proxy PF using 1R TP/SL outcomes from y_edge (already triple-barrier based)
    # Only evaluates where signal==1 and y_edge is available.
    mask = (signal == 1) & df["y_edge"].notna()
    if mask.sum() < 50:
        return 0.0
    y = df.loc[mask, "y_edge"].astype(int)
    # wins=1, losses=0; with TP:SL ratio approx tp_atr/sl_atr
    R_win = CFG.tp_atr / CFG.sl_atr
    wins = (y==1).sum()
    losses = (y==0).sum()
    gross_profit = wins * R_win
    gross_loss = losses * 1.0
    if gross_loss == 0:
        return float("inf")
    return gross_profit / gross_loss

# Prepare probs for validation set
pva_edge = edge_model.predict_proba(Xva_edge)[:,1]
pva_entry = entry_model.predict_proba(Xva_entry)[:,1]

edge_grid = np.round(np.linspace(0.55, 0.75, 9), 2)
entry_grid = np.round(np.linspace(0.50, 0.70, 9), 2)

best = (CFG.edge_th, CFG.entry_th, -1)
for eth in edge_grid:
    for ith in entry_grid:
        sig = compute_trade_signals(val, pva_edge, pva_entry, eth, ith)
        pf = quick_pf_proxy(val, sig)
        if pf > best[2]:
            best = (eth, ith, pf)
print("Best (proxy) thresholds on val:", best)

CFG.edge_th, CFG.entry_th = best[0], best[1]


Best (proxy) thresholds on val: (np.float64(0.62), np.float64(0.65), np.float64(3.111111111111111))


In [15]:
# ===== 11) Full Backtest Engine (next-open execution, spread/slippage, 1% risk sizing) =====
@dataclass
class Trade:
    entry_time: pd.Timestamp
    exit_time: pd.Timestamp
    entry_price: float
    exit_price: float
    sl: float
    tp: float
    lots: float
    pnl_usd: float
    pnl_pips: float
    outcome: str

def pip_to_price(pips: float) -> float:
    return pips * CFG.pip_size

def price_to_pips(delta_price: float) -> float:
    return delta_price / CFG.pip_size

def calc_lots_for_risk(equity: float, stop_pips: float) -> float:
    if stop_pips <= 0:
        return 0.0
    risk_usd = equity * CFG.risk_per_trade
    usd_per_pip = CFG.usd_per_pip_per_lot
    lots = risk_usd / (stop_pips * usd_per_pip)
    # avoid absurd leverage from tiny stops
    return float(max(0.0, min(lots, 20.0)))

def backtest(df: pd.DataFrame, edge_probs: np.ndarray, entry_probs: np.ndarray) -> tuple[list, pd.DataFrame]:
    df = df.reset_index(drop=True).copy()
    spread = pip_to_price(CFG.spread_pips)
    slip = pip_to_price(CFG.slippage_pips)

    in_pos = False
    trades: list[Trade] = []
    equity = CFG.starting_equity
    equity_curve = []

    entry_i = None
    entry_price = sl = tp = lots = None

    for i in range(len(df)-2):
        t = df.loc[i, "timestamp"]
        equity_curve.append((t, equity))

        # manage open position
        if in_pos:
            hi = df.loc[i, "high"]
            lo = df.loc[i, "low"]

            hit_sl = lo <= sl
            hit_tp = hi >= tp

            if hit_sl or hit_tp:
                # worst-case for long if both in same bar
                if hit_sl and hit_tp:
                    exit_price = sl
                    outcome = "SL(ambiguous)"
                elif hit_sl:
                    exit_price = sl
                    outcome = "SL"
                else:
                    exit_price = tp
                    outcome = "TP"

                # exit on bid (for long). Model bid = mid - spread/2, but we used prices as mid.
                # Approx: subtract half spread from exit (worse)
                exit_price = exit_price - spread/2 - slip

                pnl_price = exit_price - entry_price
                pnl_pips = price_to_pips(pnl_price)
                pnl_usd = pnl_pips * CFG.usd_per_pip_per_lot * lots

                # commission (round-trip)
                pnl_usd -= CFG.commission_per_lot_usd * lots

                equity += pnl_usd
                trades.append(Trade(
                    entry_time=df.loc[entry_i, "timestamp"],
                    exit_time=t,
                    entry_price=entry_price,
                    exit_price=exit_price,
                    sl=sl, tp=tp,
                    lots=lots,
                    pnl_usd=pnl_usd,
                    pnl_pips=pnl_pips,
                    outcome=outcome
                ))
                in_pos = False
                entry_i = None
            continue

        # no position: decide at close(i), enter at open(i+1)
        sig = (df.loc[i, "macro_ok"] == 1) and (edge_probs[i] > CFG.edge_th) and (entry_probs[i] > CFG.entry_th)
        if not sig:
            continue

        # compute entry next bar open with ask+slippage
        entry_i = i + 1
        raw_entry = df.loc[entry_i, "open"]
        entry_price = raw_entry + spread/2 + slip

        atr_i = df.loc[i, "atr14"]
        if not np.isfinite(atr_i) or atr_i <= 0:
            continue

        tp = entry_price + CFG.tp_atr * atr_i
        sl = entry_price - CFG.sl_atr * atr_i

        stop_pips = price_to_pips(entry_price - sl)
        lots = calc_lots_for_risk(equity, stop_pips)
        if lots <= 0:
            continue

        in_pos = True

    # final equity point
    if len(df) > 0:
        equity_curve.append((df.loc[len(df)-1, "timestamp"], equity))

    eq = pd.DataFrame(equity_curve, columns=["timestamp","equity"]).drop_duplicates("timestamp")
    eq["peak"] = eq["equity"].cummax()
    eq["dd"] = (eq["equity"] - eq["peak"]) / eq["peak"]
    return trades, eq

def perf_stats(trades: list[Trade], eq: pd.DataFrame) -> dict:
    if len(trades) == 0:
        return {"trades":0}
    pnl = np.array([t.pnl_usd for t in trades])
    gross_profit = pnl[pnl>0].sum()
    gross_loss = -pnl[pnl<0].sum()
    pf = gross_profit / gross_loss if gross_loss > 0 else float("inf")
    winrate = (pnl>0).mean()
    max_dd = float(eq["dd"].min()) if len(eq) else 0.0
    return {
        "trades": len(trades),
        "profit_factor": float(pf),
        "winrate": float(winrate),
        "net_profit_usd": float(pnl.sum()),
        "max_drawdown": float(max_dd),
        "avg_trade_usd": float(pnl.mean()),
    }


In [16]:
# ===== 12) Run Final Backtest on TEST (2024–2025) =====
# Prepare test features & probabilities
test_use = test_df.copy()
test_use = test_use.replace([np.inf, -np.inf], np.nan).fillna(method="ffill").fillna(0).reset_index(drop=True)

Xte_edge = test_use[EDGE_FEATURES].copy()
Xte_entry = test_use[ENTRY_FEATURES].copy()

pte_edge = edge_model.predict_proba(Xte_edge)[:,1]
pte_entry = entry_model.predict_proba(Xte_entry)[:,1]

trades, eq = backtest(test_use, pte_edge, pte_entry)
stats = perf_stats(trades, eq)
stats


  test_use = test_use.replace([np.inf, -np.inf], np.nan).fillna(method="ffill").fillna(0).reset_index(drop=True)


{'trades': 59,
 'profit_factor': 1.4817615163267073,
 'winrate': 0.5932203389830508,
 'net_profit_usd': 1467.6881177890225,
 'max_drawdown': -0.05799982894702798,
 'avg_trade_usd': 24.87606979303428}

In [17]:
# ===== 13) Inspect Trades & Equity Curve (basic) =====
trades_df = pd.DataFrame([t.__dict__ for t in trades]) if trades else pd.DataFrame()
print(trades_df.head(5))
print("\nLast trades:")
print(trades_df.tail(5))

print("\nEquity tail:")
display(eq.tail(10))

print("\nStats:")
for k,v in stats.items():
    print(f"{k}: {v}")


                 entry_time                 exit_time  entry_price  \
0 2024-02-26 08:15:00+00:00 2024-02-26 08:30:00+00:00      1.08286   
1 2024-04-04 07:00:00+00:00 2024-04-04 07:15:00+00:00      1.08407   
2 2024-04-09 19:15:00+00:00 2024-04-09 20:00:00+00:00      1.08500   
3 2024-05-09 00:15:00+00:00 2024-05-09 01:30:00+00:00      1.07440   
4 2024-05-13 08:30:00+00:00 2024-05-13 09:00:00+00:00      1.07777   

   exit_price        sl        tp      lots     pnl_usd  pnl_pips outcome  
0    1.083447  1.082360  1.083527  2.000381  117.330282  5.865396      TP  
1    1.084608  1.083606  1.084688  2.181057  117.449283  5.384971      TP  
2    1.085782  1.084353  1.085862  1.582403  123.804506  7.823831      TP  
3    1.074766  1.074066  1.074846  3.097325  113.335851  3.659152      TP  
4    1.078398  1.077239  1.078478  1.971838  123.850891  6.280986      TP  

Last trades:
                  entry_time                 exit_time  entry_price  \
54 2025-04-22 11:30:00+00:00 2025-04-2

Unnamed: 0,timestamp,equity,peak,dd
49796,2025-12-30 21:15:00+00:00,11467.688118,11596.176301,-0.01108
49797,2025-12-30 21:30:00+00:00,11467.688118,11596.176301,-0.01108
49798,2025-12-30 21:45:00+00:00,11467.688118,11596.176301,-0.01108
49799,2025-12-30 22:00:00+00:00,11467.688118,11596.176301,-0.01108
49800,2025-12-30 22:15:00+00:00,11467.688118,11596.176301,-0.01108
49801,2025-12-30 22:30:00+00:00,11467.688118,11596.176301,-0.01108
49802,2025-12-30 22:45:00+00:00,11467.688118,11596.176301,-0.01108
49803,2025-12-30 23:00:00+00:00,11467.688118,11596.176301,-0.01108
49804,2025-12-30 23:15:00+00:00,11467.688118,11596.176301,-0.01108
49805,2025-12-30 23:45:00+00:00,11467.688118,11596.176301,-0.01108



Stats:
trades: 59
profit_factor: 1.4817615163267073
winrate: 0.5932203389830508
net_profit_usd: 1467.6881177890225
max_drawdown: -0.05799982894702798
avg_trade_usd: 24.87606979303428


In [18]:
# ===== 14) Sanity Checks (anti-leakage + realism) =====
# 1) Ensure decision uses only past info: entry agg shifted, macro asof backward, entry at next open.
# 2) Costs included via spread/slippage.
# 3) Worst-case TP/SL ambiguity handling.

print("Entry agg shift check (should have NaN early, shifted by 1):")
print(test_use[["timestamp","m1_ret_mean","m1_ret_std"]].head(10))

print("\nMacro merge check (1h/4h features should be forward-filled, not future):")
print(test_use[["timestamp","1h_ema200_slope","4h_ema200_slope"]].head(10))

print("\nTrades per month (rough):")
if len(trades_df):
    trades_df["month"] = trades_df["entry_time"].dt.to_period("M").astype(str)
    display(trades_df.groupby("month").size().tail(24))


Entry agg shift check (should have NaN early, shifted by 1):
                  timestamp   m1_ret_mean  m1_ret_std
0 2024-01-01 22:00:00+00:00  0.000000e+00    0.000000
1 2024-01-01 22:15:00+00:00  1.293644e-06    0.000013
2 2024-01-01 22:30:00+00:00 -6.468192e-07    0.000030
3 2024-01-01 22:45:00+00:00  2.263847e-06    0.000008
4 2024-01-01 23:00:00+00:00  3.018353e-06    0.000033
5 2024-01-01 23:15:00+00:00 -3.320940e-05    0.000053
6 2024-01-01 23:30:00+00:00 -4.227838e-06    0.000030
7 2024-01-01 23:45:00+00:00 -5.436186e-06    0.000040
8 2024-01-02 00:00:00+00:00 -1.294394e-06    0.000041
9 2024-01-02 00:15:00+00:00  3.624239e-06    0.000036

Macro merge check (1h/4h features should be forward-filled, not future):
                  timestamp  1h_ema200_slope  4h_ema200_slope
0 2024-01-01 22:00:00+00:00              0.0              0.0
1 2024-01-01 22:15:00+00:00              0.0              0.0
2 2024-01-01 22:30:00+00:00              0.0              0.0
3 2024-01-01 22:45:00+0

  trades_df["month"] = trades_df["entry_time"].dt.to_period("M").astype(str)


month
2024-02     1
2024-04     2
2024-05     7
2024-06     4
2024-07    10
2024-08    19
2025-03     8
2025-04     4
2025-11     1
2025-12     3
dtype: int64

## Дараагийн алхамууд
- `CFG.spread_pips`, `CFG.slippage_pips`, `CFG.commission_per_lot_usd`-ийг өөрийн брокерын бодит утгаар тохируул.
- `tp_atr`, `sl_atr`, `max_hold_bars`, threshold-уудыг 2022–2023 validation дээр илүү нарийвчлан grid/random search хийж болно.
- Хэрвээ trade хэт цөөн гарч байвал macro gate-г бага зэрэг сулруул (ж: ADX босго, EMA200 slope).
