In [1]:
# ===== 1) Imports & Configuration (Improved for PF > 3) =====
import os, glob, math, warnings
import numpy as np
import pandas as pd
import re
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.calibration import CalibratedClassifierCV
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import roc_auc_score, precision_score

# Try import XGBoost
HAS_XGB = False
try:
    from xgboost import XGBClassifier
    HAS_XGB = True
except ImportError:
    pass

warnings.filterwarnings("ignore")

class CFG:
    # Adjust paths as needed
    train_dir = "../data/train"
    test_dir  = "../data/test"
    pair_hint = "EURUSD"

    # Execution Costs
    spread_pips: float = 1.0
    slippage_pips: float = 0.2
    commission_per_lot_usd: float = 0.0
    risk_per_trade: float = 0.01

    # Strategy Goals: PF > 3.0 requires High Reward:Risk or Ultra High Winrate.
    # We choose High Reward:Risk (2:1) + Moderate Winrate (>50%).
    # Target: Win 55-60% with 2:1 Reward => PF ~ 3.0
    
    tp_atr: float = 3.0   # Large Target (Trend following)
    sl_atr: float = 1.5   # Tighter Stop
    max_hold_bars: int = 24  # 6 hours on 15m (give trade room to breathe)
    
    # Filters
    min_volatility_pips: float = 5.0 # Skip dead markets
    
    # Split
    core_end = "2021-12-31"
    val_end  = "2023-12-31"

cfg = CFG()

# ===== Helper Functions (Data Loading) =====
def _guess_tf_from_name(name: str) -> str:
    n = name.lower()
    if "m15" in n or "15m" in n: return "15m"
    if "h1" in n or "1h" in n: return "1h"
    if "h4" in n or "4h" in n: return "4h"
    if "m1" in n or "1m" in n: return "1m"
    return ""

def _pick_files(data_dir: str, pair_hint: str):
    if not os.path.exists(data_dir):
        if os.path.exists(os.path.join("..", data_dir)):
             data_dir = os.path.join("..", data_dir)
             
    files = glob.glob(os.path.join(data_dir, "*.csv")) + glob.glob(os.path.join(data_dir, "*.parquet"))
    if not files:
        abs_path = os.path.abspath(data_dir)
        files = glob.glob(os.path.join(abs_path, "*.csv"))
    if not files: raise FileNotFoundError(f"No data in {data_dir}")

    hinted = [f for f in files if pair_hint.lower() in os.path.basename(f).lower()]
    if hinted: files = hinted
    
    by_tf = {}
    for f in files:
        tf = _guess_tf_from_name(os.path.basename(f))
        if tf and tf not in by_tf: by_tf[tf] = f
    return by_tf

def _standardize_ohlc(df: pd.DataFrame) -> pd.DataFrame:
    df.columns = [c.lower() for c in df.columns]
    rename_map = {}
    for c in df.columns:
        if "time" in c or "date" in c: rename_map[c] = "time"
        elif "open" in c: rename_map[c] = "open"
        elif "high" in c: rename_map[c] = "high"
        elif "low" in c: rename_map[c] = "low"
        elif "close" in c: rename_map[c] = "close"
        elif "vol" in c: rename_map[c] = "volume"
    df = df.rename(columns=rename_map)
    df["time"] = pd.to_datetime(df["time"], utc=True, errors='coerce')
    df = df.dropna(subset=["time"]).sort_values("time").set_index("time")
    cols = ["open","high","low","close","volume"]
    for c in cols:
        if c in df.columns: df[c] = df[c].astype(float)
    return df[ [c for c in cols if c in df.columns] ]

def load_tf(data_dir: str, pair_hint: str):
    by_tf = _pick_files(data_dir, pair_hint)
    out = {}
    for tf in ["1m","15m","1h","4h"]:
        if tf in by_tf:
            try:
                out[tf] = _standardize_ohlc(pd.read_csv(by_tf[tf]))
                print(f"Loaded {tf}: {len(out[tf])} rows")
            except Exception as e:
                print(f"Error loading {by_tf[tf]}: {e}")
    return out

In [2]:
# ===== 2) Advanced Feature Engineering (Trend + Volatility) =====
def ema(s: pd.Series, span: int) -> pd.Series:
    return s.ewm(span=span, adjust=False).mean()

def rsi(s: pd.Series, n: int=14) -> pd.Series:
    delta = s.diff()
    up = delta.clip(lower=0)
    dn = -1 * delta.clip(upper=0)
    ema_up = up.ewm(com=n-1, adjust=False).mean()
    ema_dn = dn.ewm(com=n-1, adjust=False).mean()
    rs = ema_up / (ema_dn + 1e-9)
    return 100 - (100 / (1 + rs))

def atr(df: pd.DataFrame, n: int=14) -> pd.Series:
    h,l,c = df["high"], df["low"], df["close"]
    prev_c = c.shift(1)
    tr = pd.concat([(h-l).abs(), (h-prev_c).abs(), (l-prev_c).abs()], axis=1).max(axis=1)
    return tr.rolling(n).mean()

def adx(df: pd.DataFrame, n: int=14) -> pd.Series:
    h,l,c = df["high"], df["low"], df["close"]
    up, dn = h.diff(), -l.diff()
    plus_dm = np.where((up > dn) & (up > 0), up, 0.0)
    minus_dm = np.where((dn > up) & (dn > 0), dn, 0.0)
    tr = pd.concat([(h-l).abs(), (h-c.shift(1)).abs(), (l-c.shift(1)).abs()], axis=1).max(axis=1)
    atr_n = tr.rolling(n).sum()
    plus_di = 100 * pd.Series(plus_dm, index=df.index).rolling(n).sum() / (atr_n+1e-9)
    minus_di= 100 * pd.Series(minus_dm, index=df.index).rolling(n).sum() / (atr_n+1e-9)
    dx = (100 * (plus_di - minus_di).abs() / (plus_di + minus_di)).replace([np.inf,-np.inf], 0)
    return dx.rolling(n).mean()

def add_features(df: pd.DataFrame, prefix: str) -> pd.DataFrame:
    out = pd.DataFrame(index=df.index)
    c = df["close"]
    
    # 1. Trend
    e50 = ema(c, 50)
    e200 = ema(c, 200)
    out[f"{prefix}trend_biase"] = np.where(e50 > e200, 1, -1)
    out[f"{prefix}dist_e200"] = (c - e200) / (e200 + 1e-9)
    out[f"{prefix}adx"] = adx(df, 14)
    
    # 2. Momentum
    out[f"{prefix}rsi"] = rsi(c, 14)
    out[f"{prefix}rsi_slope"] = out[f"{prefix}rsi"].diff(3)
    
    # 3. Volatility
    out[f"{prefix}atr"] = atr(df, 14)
    out[f"{prefix}atr_rel"] = out[f"{prefix}atr"] / c # Normalized ATR
    out[f"{prefix}bb_width"] = (df["close"].rolling(20).std() * 2 * 2) / df["close"].rolling(20).mean()
    
    # 4. Candlestick Anatomy
    out[f"{prefix}body_rel"] = (df["close"] - df["open"]).abs() / (df["high"] - df["low"] + 1e-9)
    
    # 5. Time
    if prefix == "15m_":
        out["hour"] = df.index.hour
        out["dayofweek"] = df.index.dayofweek
    
    return out

def build_master(bundle: dict) -> pd.DataFrame:
    m15 = bundle["15m"].copy()
    f15 = add_features(m15, "15m_")
    df = m15.join(f15)
    
    # Higher TF Context (Trend)
    for tf in ["1h", "4h"]:
        if tf in bundle:
            feat = add_features(bundle[tf], f"{tf}_")
            
            # CRITICAL FIX: Shift higher TF features by 1 to prevent future leakage.
            # 1h candle at 10:00 closes at 11:00. 
            # We shift it so that 10:00 index contains data from 09:00 (which is closed/known).
            feat = feat.shift(1)
            
            aligned = feat.reindex(df.index, method="ffill")
            df = df.join(aligned)
            
    # Returns (Lagged)
    for i in [1, 3, 6, 12]:
        df[f"ret{i}"] = df["close"].pct_change(i)
        
    return df

# Load & Build
train_tf = load_tf(cfg.train_dir, cfg.pair_hint)
master_df = build_master(train_tf)
master_df = master_df.dropna()
print("Master DF shape:", master_df.shape)

Loaded 1m: 3354904 rows
Loaded 15m: 224382 rows
Loaded 1h: 56098 rows
Loaded 4h: 14498 rows
Master DF shape: (223974, 38)


In [3]:
# ===== 3) Labeling (Triple Barrier) =====
def get_triple_barrier_labels(df: pd.DataFrame):
    # vectorized approximation is risky for exact hit-sequence, so we use loop
    # 0 = No Enter (Flat/Loss), 1 = Profit
    # Actually, let's target Direction.
    # But for "Entry" model, we want: "If I enter now, will I hit TP before SL?"
    
    n = len(df)
    y = np.zeros(n)
    
    closes = df["close"].values
    highs = df["high"].values
    lows = df["low"].values
    atrs = df["15m_atr"].values
    opens = df["open"].values # Fill on Next Open
    
    tp_mult = cfg.tp_atr
    sl_mult = cfg.sl_atr
    hold = cfg.max_hold_bars
    
    for i in range(n - hold - 1):
        # Trade Logic
        # Entry at Open[i+1]
        entry_px = opens[i+1]
        vol = atrs[i]
        
        # Volatility Filter applied in logic? No, label truth is independent of filter.
        # But if vol is NaN or 0, skip.
        if np.isnan(vol) or vol <= 0: continue
        
        tp = entry_px + vol * tp_mult # Long bias label
        sl = entry_px - vol * sl_mult
        
        outcome = 0
        for j in range(1, hold + 1):
            h = highs[i+j]
            l = lows[i+j]
            
            # Check SL First (Conservative)
            if l <= sl:
                outcome = 0
                break
            if h >= tp:
                outcome = 1
                break
        
        y[i] = outcome
        
    return y

master_df["y"] = get_triple_barrier_labels(master_df)
print("Target Balance:", master_df["y"].value_counts(normalize=True).to_dict())

# Features List
FEATURES = [
    "15m_rsi", "15m_rsi_slope", "15m_adx", "15m_bb_width", "15m_atr_rel",
    "15m_dist_e200", "15m_body_rel",
    "1h_trend_biase", "1h_adx", "1h_rsi",
    "4h_trend_biase",
    "ret1", "ret3", "ret6", "ret12",
    "hour"
]

# Clean
master_df = master_df.dropna(subset=FEATURES + ["y"])

# Split
train = master_df.loc[:cfg.core_end]
val   = master_df.loc[cfg.core_end:cfg.val_end]

print(f"Train size: {len(train)}, Val size: {len(val)}")


Target Balance: {0.0: 0.7557216462625126, 1.0: 0.24427835373748738}
Train size: 174115, Val size: 49947


In [4]:
# ===== 4) Train Ensemble (Primary) =====
X_train = train[FEATURES]
y_train = train["y"]
X_val   = val[FEATURES]
y_val   = val["y"]

# 1. XGB
if HAS_XGB:
    model_xgb = XGBClassifier(
        n_estimators=300, max_depth=5, learning_rate=0.03,
        subsample=0.8, colsample_bytree=0.8, random_state=42, n_jobs=-1
    )
    model_xgb.fit(X_train, y_train)
    p_xgb = model_xgb.predict_proba(X_val)[:, 1]
    p_xgb_train = model_xgb.predict_proba(X_train)[:, 1]
else:
    p_xgb = np.zeros(len(val))
    p_xgb_train = np.zeros(len(train))

# 2. RF
model_rf = RandomForestClassifier(
    n_estimators=200, max_depth=8, min_samples_leaf=50, 
    random_state=42, n_jobs=-1
)
model_rf.fit(X_train, y_train)
p_rf = model_rf.predict_proba(X_val)[:, 1]
p_rf_train = model_rf.predict_proba(X_train)[:, 1]

# 3. LR
scaler = StandardScaler()
X_train_s = scaler.fit_transform(X_train)
X_val_s   = scaler.transform(X_val)
model_lr = LogisticRegression(C=0.1, class_weight='balanced')
model_lr.fit(X_train_s, y_train)
p_lr = model_lr.predict_proba(X_val_s)[:, 1]
p_lr_train = model_lr.predict_proba(X_train_s)[:, 1]

# Ensemble
if HAS_XGB:
    p_ens_train = (p_xgb_train + p_rf_train + p_lr_train) / 3
    p_ens_val   = (p_xgb + p_rf + p_lr) / 3
else:
    p_ens_train = (p_rf_train + p_lr_train) / 2
    p_ens_val   = (p_rf + p_lr) / 2

print("Ensemble AUC (Val):", roc_auc_score(y_val, p_ens_val))


Ensemble AUC (Val): 0.6224954884559099


In [5]:
# ===== 5) Volatility Filter & Fast Backtest =====
def advanced_backtest(df, probs, prob_threshold):
    # Apply Volatility Filter
    # 5 pips = 0.0005
    min_vol = cfg.min_volatility_pips * 0.0001
    
    atrs = df["15m_atr"].values
    opens = df["open"].values
    highs = df["high"].values
    lows  = df["low"].values
    closes= df["close"].values
    
    hold = cfg.max_hold_bars
    tp_mult = cfg.tp_atr
    sl_mult = cfg.sl_atr
    
    trades_pnl = []
    
    i = 0
    end_idx = len(df) - hold - 1
    
    while i < end_idx:
        # 1. Filter Check
        curr_vol = atrs[i]
        if np.isnan(curr_vol) or curr_vol < min_vol:
            i += 1; continue
            
        # 2. Probability Check
        if probs[i] < prob_threshold:
            i += 1; continue
            
        # 3. Enter Trade
        entry_px = opens[i+1]
        tp_px = entry_px + curr_vol * tp_mult
        sl_px = entry_px - curr_vol * sl_mult
        
        outcome = "time"
        pnl_r = 0.0
        
        for j in range(1, hold + 1):
            h = highs[i+j]
            l = lows[i+j]
            
            hit_sl = l <= sl_px
            hit_tp = h >= tp_px
            
            if hit_sl and hit_tp:
                pnl_r = -1.0 # SL First assumption
                outcome = "sl"
                break
            elif hit_sl:
                pnl_r = -1.0
                outcome = "sl"
                break
            elif hit_tp:
                # Reward Value = TP Dist / Risk Dist = 2.0 / 1.0 = 2.0
                pnl_r = float(tp_mult / sl_mult)
                outcome = "tp"
                break
        
        if outcome == "time":
            exit_px = closes[i+hold]
            raw_pnl = exit_px - entry_px
            risk_dist = entry_px - sl_px
            pnl_r = raw_pnl / risk_dist
            
        trades_pnl.append(pnl_r)
        
        # Skip overlapping trades to simulate simple portfolio
        i += j
        
    trades_pnl = np.array(trades_pnl)
    if len(trades_pnl) == 0: return 0, 0.0, 0.0
    
    wins = trades_pnl[trades_pnl > 0]
    losses = trades_pnl[trades_pnl <= 0]
    
    n = len(trades_pnl)
    wr = len(wins) / n
    
    gross_win = wins.sum()
    gross_loss = -losses.sum()
    pf = gross_win / gross_loss if gross_loss > 0 else 999.0
    
    return n, wr, pf

# Sweep
print(f"{'Threshold':<10} | {'Trades':<8} | {'Winrate':<8} | {'PF':<8}")
print("-" * 50)
results = []
for th in [0.50, 0.52, 0.54, 0.55, 0.56, 0.58, 0.60]:
    n, wr, pf = advanced_backtest(val, p_ens_val, th)
    results.append({"Threshold": th, "Trades": n, "Winrate": wr, "PF": pf})
    print(f"{th:<10.2f} | {n:<8} | {wr:<8.1%} | {pf:<8.2f}")


Threshold  | Trades   | Winrate  | PF      
--------------------------------------------------
0.50       | 0        | 0.0%     | 0.00    
0.52       | 0        | 0.0%     | 0.00    
0.54       | 0        | 0.0%     | 0.00    
0.55       | 0        | 0.0%     | 0.00    
0.56       | 0        | 0.0%     | 0.00    
0.58       | 0        | 0.0%     | 0.00    
0.60       | 0        | 0.0%     | 0.00    
