In [9]:
import pandas as pd
import numpy as np
from pathlib import Path
import joblib
import warnings
warnings.filterwarnings('ignore')

from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score

BASE_DIR = Path.cwd()
DATA_DIR = BASE_DIR / 'data'
MODELS_DIR = BASE_DIR / 'models'

print("="*70)
print("üìä FOREX SIGNAL MODELS COMPARISON")
print("   V2, V3, V4, V5, V6, V7, V8, V9, V10")
print("="*70)

üìä FOREX SIGNAL MODELS COMPARISON
   V2, V3, V4, V5, V6, V7, V8, V9, V10


## 1. Load Test Data

In [10]:
# Load test data
test_df_raw = pd.read_csv(DATA_DIR / 'EUR_USD_test.csv')
if 'timestamp' in test_df_raw.columns:
    test_df_raw.rename(columns={'timestamp': 'time'}, inplace=True)
test_df_raw['time'] = pd.to_datetime(test_df_raw['time'])

print(f"Test data: {len(test_df_raw):,} rows")
print(f"Period: {test_df_raw['time'].min()} to {test_df_raw['time'].max()}")

Test data: 296,778 rows
Period: 2024-12-31 16:00:00+00:00 to 2025-10-17 06:11:00+00:00


## 2. Feature Engineering Functions

In [11]:
def add_features_v2_v3(df):
    """V2, V3 Features"""
    df = df.copy()
    df['hour'] = df['time'].dt.hour
    df['day_of_week'] = df['time'].dt.dayofweek
    
    for p in [5, 10, 20, 50]:
        df[f'sma_{p}'] = df['close'].rolling(p).mean()
        df[f'ema_{p}'] = df['close'].ewm(span=p).mean()
    
    delta = df['close'].diff()
    gain = delta.where(delta > 0, 0).rolling(14).mean()
    loss = (-delta.where(delta < 0, 0)).rolling(14).mean()
    df['rsi'] = 100 - (100 / (1 + gain / (loss + 1e-10)))
    
    ema12 = df['close'].ewm(span=12).mean()
    ema26 = df['close'].ewm(span=26).mean()
    df['macd'] = ema12 - ema26
    df['macd_signal'] = df['macd'].ewm(span=9).mean()
    df['macd_hist'] = df['macd'] - df['macd_signal']
    
    df['bb_mid'] = df['close'].rolling(20).mean()
    df['bb_std'] = df['close'].rolling(20).std()
    df['bb_upper'] = df['bb_mid'] + 2 * df['bb_std']
    df['bb_lower'] = df['bb_mid'] - 2 * df['bb_std']
    
    df['returns'] = df['close'].pct_change()
    df['volatility'] = df['returns'].rolling(20).std()
    
    return df

def add_features_v6_v7_v8_v9_v10(df):
    """V6, V7, V8, V9, V10 Features (extended)"""
    df = df.copy()
    df['hour'] = df['time'].dt.hour
    df['day_of_week'] = df['time'].dt.dayofweek
    df['is_london'] = ((df['hour'] >= 8) & (df['hour'] < 16)).astype(int)
    df['is_ny'] = ((df['hour'] >= 13) & (df['hour'] < 21)).astype(int)
    df['is_overlap'] = ((df['hour'] >= 13) & (df['hour'] < 16)).astype(int)
    
    for p in [5, 10, 20, 50, 200]:
        df[f'sma_{p}'] = df['close'].rolling(p).mean()
        df[f'ema_{p}'] = df['close'].ewm(span=p, adjust=False).mean()
    
    delta = df['close'].diff()
    gain = delta.where(delta > 0, 0).rolling(14).mean()
    loss = (-delta.where(delta < 0, 0)).rolling(14).mean()
    df['rsi'] = 100 - (100 / (1 + gain / (loss + 1e-10)))
    
    ema12 = df['close'].ewm(span=12).mean()
    ema26 = df['close'].ewm(span=26).mean()
    df['macd'] = ema12 - ema26
    df['macd_signal'] = df['macd'].ewm(span=9).mean()
    df['macd_hist'] = df['macd'] - df['macd_signal']
    df['macd_momentum'] = df['macd_hist'] - df['macd_hist'].shift(3)
    
    df['bb_mid'] = df['close'].rolling(20).mean()
    df['bb_std'] = df['close'].rolling(20).std()
    df['bb_upper'] = df['bb_mid'] + 2 * df['bb_std']
    df['bb_lower'] = df['bb_mid'] - 2 * df['bb_std']
    df['bb_width'] = (df['bb_upper'] - df['bb_lower']) / (df['bb_mid'] + 1e-10)
    
    # ADX
    df['tr0'] = abs(df['high'] - df['low'])
    df['tr1'] = abs(df['high'] - df['close'].shift())
    df['tr2'] = abs(df['low'] - df['close'].shift())
    df['tr'] = df[['tr0', 'tr1', 'tr2']].max(axis=1)
    df['up_move'] = df['high'] - df['high'].shift()
    df['down_move'] = df['low'].shift() - df['low']
    df['plus_dm'] = np.where((df['up_move'] > df['down_move']) & (df['up_move'] > 0), df['up_move'], 0)
    df['minus_dm'] = np.where((df['down_move'] > df['up_move']) & (df['down_move'] > 0), df['down_move'], 0)
    period = 14
    df['atr'] = df['tr'].rolling(period).mean()
    df['plus_di'] = 100 * (df['plus_dm'].rolling(period).mean() / (df['atr'] + 1e-10))
    df['minus_di'] = 100 * (df['minus_dm'].rolling(period).mean() / (df['atr'] + 1e-10))
    df['dx'] = 100 * abs(df['plus_di'] - df['minus_di']) / (df['plus_di'] + df['minus_di'] + 1e-10)
    df['adx'] = df['dx'].rolling(period).mean()
    
    # CCI - Optimized (avoid slow apply)
    tp = (df['high'] + df['low'] + df['close']) / 3
    sma_tp = tp.rolling(20).mean()
    # Use std as approximation for MAD (faster)
    std_tp = tp.rolling(20).std()
    df['cci'] = (tp - sma_tp) / (0.015 * std_tp + 1e-10)
    
    # Williams %R
    hh = df['high'].rolling(14).max()
    ll = df['low'].rolling(14).min()
    df['williams_r'] = -100 * (hh - df['close']) / (hh - ll + 1e-10)
    
    df['returns'] = df['close'].pct_change()
    df['volatility'] = df['returns'].rolling(20).std() * 100
    
    # Composite features
    df['rsi_x_adx'] = df['rsi'] * df['adx'] / 100
    df['momentum_score'] = (
        (df['rsi'] > 50).astype(int) + 
        (df['macd'] > df['macd_signal']).astype(int) + 
        (df['plus_di'] > df['minus_di']).astype(int)
    )
    df['price_position'] = (df['close'] - df['sma_50']) / (df['atr'] + 1e-10)
    df['trend_score'] = (
        (df['close'] > df['sma_20']).astype(int) +
        (df['sma_20'] > df['sma_50']).astype(int) +
        (df['sma_50'] > df['sma_200']).astype(int) +
        (df['adx'] > 25).astype(int)
    )
    df['rsi_zone'] = pd.cut(df['rsi'], bins=[0, 30, 45, 55, 70, 100], labels=[0, 1, 2, 3, 4]).astype(float)
    df['close_vs_high'] = (df['high'].rolling(20).max() - df['close']) / (df['atr'] + 1e-10)
    df['close_vs_low'] = (df['close'] - df['low'].rolling(20).min()) / (df['atr'] + 1e-10)
    df['volume_ratio'] = 1.0
    
    # Cleanup
    drop_cols = ['tr0', 'tr1', 'tr2', 'tr', 'up_move', 'down_move', 'plus_dm', 'minus_dm']
    df.drop(columns=[c for c in drop_cols if c in df.columns], inplace=True)
    
    return df

def add_features_v10_extended(df):
    """V10 Extended Features"""
    df = add_features_v6_v7_v8_v9_v10(df)
    
    # V10 specific features
    df['trend_strength'] = (
        (df['close'] > df['ema_5']).astype(int) +
        (df['ema_5'] > df['ema_10']).astype(int) +
        (df['ema_10'] > df['ema_20']).astype(int) +
        (df['ema_20'] > df['ema_50']).astype(int) +
        (df['adx'] > 20).astype(int)
    )
    
    df['momentum_alignment'] = (
        (df['rsi'] > 55).astype(int) +
        (df['macd_hist'] > 0).astype(int) +
        (df['cci'] > 50).astype(int) +
        (df['williams_r'] > -30).astype(int)
    )
    
    df['volatility_sma'] = df['volatility'].rolling(50).mean()
    df['volatility_state'] = np.where(
        df['volatility'] > df['volatility_sma'] * 1.5, 2,
        np.where(df['volatility'] < df['volatility_sma'] * 0.5, 0, 1)
    )
    
    df['body'] = df['close'] - df['open']
    df['upper_wick'] = df['high'] - df[['open', 'close']].max(axis=1)
    df['lower_wick'] = df[['open', 'close']].min(axis=1) - df['low']
    df['body_ratio'] = abs(df['body']) / (df['high'] - df['low'] + 1e-10)
    df['is_bullish'] = (df['close'] > df['open']).astype(int)
    df['bullish_streak'] = df['is_bullish'].rolling(5).sum()
    
    df['dist_to_high20'] = (df['high'].rolling(20).max() - df['close']) / (df['atr'] + 1e-10)
    df['dist_to_low20'] = (df['close'] - df['low'].rolling(20).min()) / (df['atr'] + 1e-10)
    
    df['rsi_5'] = df['rsi'].rolling(5).mean()
    df['rsi_20'] = df['rsi'].rolling(20).mean()
    df['rsi_trend'] = df['rsi_5'] - df['rsi_20']
    
    df['above_bb_upper'] = (df['close'] > df['bb_upper']).astype(int)
    df['below_bb_lower'] = (df['close'] < df['bb_lower']).astype(int)
    df['bb_breakout'] = df['above_bb_upper'] - df['below_bb_lower']
    
    df['price_change_5'] = (df['close'] - df['close'].shift(5)) / (df['atr'] + 1e-10)
    df['price_change_10'] = (df['close'] - df['close'].shift(10)) / (df['atr'] + 1e-10)
    df['price_change_20'] = (df['close'] - df['close'].shift(20)) / (df['atr'] + 1e-10)
    
    df['session_quality'] = df['is_london'].astype(int) + df['is_ny'].astype(int) + df['is_overlap'].astype(int) * 2
    
    return df

def create_labels(df, forward_periods=60, min_pips=15, ratio=1.5):
    """Create BUY/SELL labels"""
    df = df.copy()
    min_move = min_pips * 0.0001
    
    df['future_max'] = df['high'].rolling(forward_periods).max().shift(-forward_periods)
    df['future_min'] = df['low'].rolling(forward_periods).min().shift(-forward_periods)
    
    df['up_move'] = df['future_max'] - df['close']
    df['down_move'] = df['close'] - df['future_min']
    
    conditions = [
        (df['up_move'] >= min_move) & (df['up_move'] > df['down_move'] * ratio),
        (df['down_move'] >= min_move) & (df['down_move'] > df['up_move'] * ratio)
    ]
    choices = [1, 0]
    df['signal'] = np.select(conditions, choices, default=-1)
    
    df.drop(['future_max', 'future_min', 'up_move', 'down_move'], axis=1, inplace=True)
    return df

print("‚úì Feature functions defined")

‚úì Feature functions defined


## 3. Evaluate Each Model

In [12]:
# Store results
all_results = {}

def evaluate_model(version, test_df, feature_func, model_names, ensemble_type='weighted'):
    """
    Evaluate a model version
    """
    model_dir = MODELS_DIR / f'signal_generator_{version}'
    
    if not model_dir.exists():
        return None
    
    try:
        # Load models and preprocessing
        scaler = joblib.load(model_dir / f'scaler_{version}.joblib')
        feature_cols = joblib.load(model_dir / f'feature_cols_{version}.joblib')
        
        # Prepare test data
        test_prepared = feature_func(test_df.copy())
        test_prepared = create_labels(test_prepared)
        test_binary = test_prepared[test_prepared['signal'] != -1].copy()
        
        # Add missing columns
        for c in feature_cols:
            if c not in test_binary.columns:
                test_binary[c] = 0
        
        test_clean = test_binary.dropna(subset=feature_cols).copy()
        X_test = test_clean[feature_cols].values
        X_test_scaled = scaler.transform(X_test)
        y_test = test_clean['signal'].values
        
        # Load and run models
        models = {}
        predictions = {}
        probabilities = {}
        
        for name in model_names:
            model_path = model_dir / f'{name}_{version}.joblib'
            if model_path.exists():
                models[name] = joblib.load(model_path)
                predictions[name] = models[name].predict(X_test_scaled)
                probabilities[name] = models[name].predict_proba(X_test_scaled)
        
        if not models:
            return None
        
        # Load weights if available
        weights_path = model_dir / f'weights_{version}.joblib'
        if weights_path.exists():
            weights = joblib.load(weights_path)
        else:
            # Equal weights
            weights = {name: 1/len(models) for name in models.keys()}
        
        # Ensemble predictions
        first_model = list(models.keys())[0]
        final_proba = np.zeros_like(probabilities[first_model])
        for name in models.keys():
            w = weights.get(name, 1/len(models))
            final_proba += w * probabilities[name]
        
        buy_prob = final_proba[:, 1] * 100
        
        # Agreement bonus
        all_preds = np.array([predictions[name] for name in models.keys()])
        buy_votes = np.sum(all_preds == 1, axis=0)
        n_models = len(models)
        
        all_agree = buy_votes == n_models
        most_agree = buy_votes >= (n_models - 1)
        
        confidence = buy_prob.copy()
        confidence[all_agree] = np.minimum(confidence[all_agree] + 5, 100)
        confidence[most_agree & ~all_agree] = np.minimum(confidence[most_agree & ~all_agree] + 2, 100)
        
        # Calculate results at different thresholds
        results = {
            'version': version,
            'n_models': len(models),
            'total_signals': len(y_test),
            'thresholds': {}
        }
        
        for thresh in [50, 60, 70, 75, 80, 85, 90]:
            mask = confidence >= thresh
            if mask.sum() > 0:
                signals = mask.sum()
                correct = y_test[mask].sum()
                acc = correct / signals * 100
                results['thresholds'][thresh] = {
                    'signals': int(signals),
                    'correct': int(correct),
                    'accuracy': acc
                }
        
        return results
        
    except Exception as e:
        print(f"  Error evaluating {version}: {e}")
        return None

print("‚úì Evaluation function defined")

‚úì Evaluation function defined


## 4. Evaluate V2, V3

In [13]:
print("="*70)
print("Evaluating V2, V3...")
print("="*70)

# V2
result = evaluate_model('v2', test_df_raw, add_features_v2_v3, ['xgboost', 'lightgbm', 'rf'])
if result:
    all_results['V2'] = result
    print(f"‚úì V2: {result['n_models']} models")

# V3
result = evaluate_model('v3', test_df_raw, add_features_v2_v3, ['xgboost', 'lightgbm', 'rf'])
if result:
    all_results['V3'] = result
    print(f"‚úì V3: {result['n_models']} models")

Evaluating V2, V3...
‚úì V2: 3 models
‚úì V2: 3 models
‚úì V3: 3 models
‚úì V3: 3 models


## 5. Evaluate V4, V5

In [15]:
print("="*70)
print("Evaluating V4, V5...")
print("="*70)

# V4 - Stacking/Voting structure with ATR-based labels
def add_features_v4(df):
    """V4 Features with ATR"""
    df = df.copy()
    df['hour'] = df['time'].dt.hour
    df['day_of_week'] = df['time'].dt.dayofweek
    df['is_london'] = ((df['hour'] >= 7) & (df['hour'] < 16)).astype(int)
    df['is_ny'] = ((df['hour'] >= 12) & (df['hour'] < 21)).astype(int)
    df['is_overlap'] = ((df['hour'] >= 12) & (df['hour'] < 16)).astype(int)
    
    for period in [10, 20, 50, 200]:
        df[f'sma_{period}'] = df['close'].rolling(period).mean()
        df[f'ema_{period}'] = df['close'].ewm(span=period, adjust=False).mean()
    
    delta = df['close'].diff()
    gain = delta.where(delta > 0, 0).rolling(14).mean()
    loss = (-delta.where(delta < 0, 0)).rolling(14).mean()
    rs = gain / (loss + 1e-10)
    df['rsi_14'] = 100 - (100 / (1 + rs))
    
    ema12 = df['close'].ewm(span=12, adjust=False).mean()
    ema26 = df['close'].ewm(span=26, adjust=False).mean()
    df['macd'] = ema12 - ema26
    df['macd_signal'] = df['macd'].ewm(span=9, adjust=False).mean()
    df['macd_hist'] = df['macd'] - df['macd_signal']
    
    high_low = df['high'] - df['low']
    high_close = abs(df['high'] - df['close'].shift())
    low_close = abs(df['low'] - df['close'].shift())
    tr = pd.concat([high_low, high_close, low_close], axis=1).max(axis=1)
    df['atr_14'] = tr.rolling(14).mean()
    
    df['bb_middle'] = df['close'].rolling(20).mean()
    bb_std = df['close'].rolling(20).std()
    df['bb_upper'] = df['bb_middle'] + 2 * bb_std
    df['bb_lower'] = df['bb_middle'] - 2 * bb_std
    df['bb_width'] = (df['bb_upper'] - df['bb_lower']) / df['bb_middle']
    
    df['dist_sma50'] = (df['close'] - df['sma_50']) / df['sma_50'] * 100
    df['dist_sma200'] = (df['close'] - df['sma_200']) / df['sma_200'] * 100
    
    df['atr_50'] = tr.rolling(50).mean()
    df['volatility_ratio'] = df['atr_14'] / (df['atr_50'] + 1e-10)
    
    df['mom_10'] = df['close'].pct_change(10)
    df['mom_30'] = df['close'].pct_change(30)
    
    for col in ['rsi_14', 'macd', 'mom_10', 'volatility_ratio']:
        df[f'{col}_lag1'] = df[col].shift(1)
        df[f'{col}_lag2'] = df[col].shift(2)
        df[f'{col}_lag3'] = df[col].shift(3)
    
    return df

def create_labels_v4(df, forward_periods=60):
    """V4 ATR-based labels"""
    df = df.copy()
    closes = df['close'].values
    highs = df['high'].values
    lows = df['low'].values
    atrs = df['atr_14'].values
    n = len(df)
    labels = []
    
    for i in range(n):
        if i + forward_periods >= n or np.isnan(atrs[i]):
            labels.append(0)
            continue
        entry = closes[i]
        atr = atrs[i]
        tp_dist = atr * 1.5
        sl_dist = atr * 1.0
        tp_price = entry + tp_dist
        sl_price = entry - sl_dist
        future_highs = highs[i+1 : i+forward_periods+1]
        future_lows = lows[i+1 : i+forward_periods+1]
        sl_hit = np.where(future_lows <= sl_price)[0]
        first_sl = sl_hit[0] if len(sl_hit) > 0 else forward_periods + 1
        tp_hit = np.where(future_highs >= tp_price)[0]
        first_tp = tp_hit[0] if len(tp_hit) > 0 else forward_periods + 1
        labels.append(1 if first_tp < first_sl else 0)
    
    df['target'] = labels
    return df

v4_dir = MODELS_DIR / 'signal_generator_v4'
if v4_dir.exists():
    try:
        # Load V4 models
        v4_scaler = joblib.load(v4_dir / 'scaler_v4.joblib')
        v4_features = joblib.load(v4_dir / 'features_v4.joblib')  # Selected 20 features
        v4_selector = joblib.load(v4_dir / 'selector_v4.joblib')
        v4_stacking = joblib.load(v4_dir / 'stacking_v4.joblib')
        v4_voting = joblib.load(v4_dir / 'voting_v4.joblib')
        
        # Prepare test data with V4 features
        test_v4 = add_features_v4(test_df_raw.copy())
        test_v4 = create_labels_v4(test_v4)
        
        # Get ALL feature columns (not just selected) - same as training
        exclude_cols = ['time', 'target', 'open', 'high', 'low', 'close', 'tick_volume', 'volume']
        all_feature_cols = [c for c in test_v4.columns if c not in exclude_cols]
        
        test_v4_clean = test_v4.dropna().copy()
        
        # Scale ALL features first (scaler expects 40 features)
        X_test_v4_all = test_v4_clean[all_feature_cols].values
        X_test_v4_scaled = v4_scaler.transform(X_test_v4_all)
        
        # Then select features (selector reduces from 40 to 20)
        X_test_v4_selected = v4_selector.transform(X_test_v4_scaled)
        y_test_v4 = test_v4_clean['target'].values
        
        # Predict with voting model (more reliable than stacking)
        v4_proba = v4_voting.predict_proba(X_test_v4_selected)
        v4_confidence = v4_proba[:, 1] * 100
        
        # Calculate results
        v4_results = {'version': 'v4', 'n_models': 3, 'total_signals': len(y_test_v4), 'thresholds': {}}
        for thresh in [50, 60, 70, 75, 80, 85, 90]:
            mask = v4_confidence >= thresh
            if mask.sum() > 0:
                signals = mask.sum()
                correct = y_test_v4[mask].sum()
                v4_results['thresholds'][thresh] = {
                    'signals': int(signals),
                    'correct': int(correct),
                    'accuracy': correct / signals * 100
                }
        
        all_results['V4'] = v4_results
        print(f"‚úì V4: Voting ensemble (ATR-based labels)")
        
    except Exception as e:
        print(f"  V4 error: {e}")
        import traceback
        traceback.print_exc()

# V5
result = evaluate_model('v5', test_df_raw, add_features_v2_v3, ['xgboost', 'lightgbm', 'rf'])
if result:
    all_results['V5'] = result
    print(f"‚úì V5: {result['n_models']} models")

Evaluating V4, V5...
‚úì V4: Voting ensemble (ATR-based labels)
‚úì V5: 3 models


## 6. Evaluate V6, V7

In [16]:
print("="*70)
print("Evaluating V6, V7...")
print("="*70)

# V6 - Binary models
v6_dir = MODELS_DIR / 'signal_generator_v6'
if v6_dir.exists():
    try:
        scaler = joblib.load(v6_dir / 'scaler_v6_bin.joblib')
        feature_cols = joblib.load(v6_dir / 'feature_cols_v6.joblib')
        weights = joblib.load(v6_dir / 'weights_v6.joblib')
        
        test_prepared = add_features_v6_v7_v8_v9_v10(test_df_raw.copy())
        test_prepared = create_labels(test_prepared)
        test_binary = test_prepared[test_prepared['signal'] != -1].copy()
        
        for c in feature_cols:
            if c not in test_binary.columns:
                test_binary[c] = 0
        
        test_clean = test_binary.dropna(subset=feature_cols).copy()
        X_test = test_clean[feature_cols].values
        X_test_scaled = scaler.transform(X_test)
        y_test = test_clean['signal'].values
        
        v6_models = {}
        v6_preds = {}
        v6_proba = {}
        for name in ['xgb1', 'xgb2', 'lgb1', 'lgb2', 'cat']:
            path = v6_dir / f'{name}_v6_bin.joblib'
            if path.exists():
                v6_models[name] = joblib.load(path)
                v6_preds[name] = v6_models[name].predict(X_test_scaled)
                v6_proba[name] = v6_models[name].predict_proba(X_test_scaled)
        
        if v6_models:
            first = list(v6_models.keys())[0]
            final_proba = np.zeros_like(v6_proba[first])
            for name in v6_models.keys():
                w = weights.get(name, 1/len(v6_models))
                final_proba += w * v6_proba[name]
            
            buy_prob = final_proba[:, 1] * 100
            all_preds = np.array([v6_preds[name] for name in v6_models.keys()])
            buy_votes = np.sum(all_preds == 1, axis=0)
            all_agree = buy_votes == len(v6_models)
            most_agree = buy_votes >= (len(v6_models) - 1)
            
            confidence = buy_prob.copy()
            confidence[all_agree] = np.minimum(confidence[all_agree] + 5, 100)
            confidence[most_agree & ~all_agree] = np.minimum(confidence[most_agree & ~all_agree] + 2, 100)
            
            results = {'version': 'v6', 'n_models': len(v6_models), 'total_signals': len(y_test), 'thresholds': {}}
            for thresh in [50, 60, 70, 75, 80, 85, 90]:
                mask = confidence >= thresh
                if mask.sum() > 0:
                    signals = mask.sum()
                    correct = y_test[mask].sum()
                    results['thresholds'][thresh] = {'signals': int(signals), 'correct': int(correct), 'accuracy': correct/signals*100}
            
            all_results['V6'] = results
            print(f"‚úì V6: {len(v6_models)} models")
    except Exception as e:
        print(f"  V6 error: {e}")

# V7
result = evaluate_model('v7', test_df_raw, add_features_v6_v7_v8_v9_v10, ['xgb1', 'xgb2', 'lgb1', 'lgb2', 'cat'])
if result:
    all_results['V7'] = result
    print(f"‚úì V7: {result['n_models']} models")

Evaluating V6, V7...
‚úì V6: 5 models
‚úì V6: 5 models
‚úì V7: 5 models
‚úì V7: 5 models


## 7. Evaluate V8, V9

In [17]:
print("="*70)
print("Evaluating V8, V9...")
print("="*70)

# V8
result = evaluate_model('v8', test_df_raw, add_features_v6_v7_v8_v9_v10, ['xgb1', 'xgb2', 'lgb1', 'lgb2', 'cat'])
if result:
    all_results['V8'] = result
    print(f"‚úì V8: {result['n_models']} models")

# V9
result = evaluate_model('v9', test_df_raw, add_features_v6_v7_v8_v9_v10, ['xgb1', 'xgb2', 'lgb1', 'lgb2', 'cat'])
if result:
    all_results['V9'] = result
    print(f"‚úì V9: {result['n_models']} models")

Evaluating V8, V9...
‚úì V8: 5 models
‚úì V8: 5 models
‚úì V9: 5 models
‚úì V9: 5 models


## 8. Evaluate V10

In [18]:
print("="*70)
print("Evaluating V10...")
print("="*70)

v10_dir = MODELS_DIR / 'signal_generator_v10'
if v10_dir.exists():
    try:
        scaler = joblib.load(v10_dir / 'scaler_v10.joblib')
        feature_cols = joblib.load(v10_dir / 'feature_cols_v10.joblib')
        weights = joblib.load(v10_dir / 'weights_v10.joblib')
        
        test_prepared = add_features_v10_extended(test_df_raw.copy())
        test_prepared = create_labels(test_prepared)
        test_binary = test_prepared[test_prepared['signal'] != -1].copy()
        
        for c in feature_cols:
            if c not in test_binary.columns:
                test_binary[c] = 0
        
        test_clean = test_binary.dropna(subset=feature_cols).copy()
        X_test = test_clean[feature_cols].values
        X_test_scaled = scaler.transform(X_test)
        y_test = test_clean['signal'].values
        
        v10_models = {}
        v10_preds = {}
        v10_proba = {}
        for name in ['xgb1', 'xgb2', 'xgb3', 'lgb1', 'lgb2', 'cat1', 'cat2']:
            path = v10_dir / f'{name}_v10.joblib'
            if path.exists():
                v10_models[name] = joblib.load(path)
                v10_preds[name] = v10_models[name].predict(X_test_scaled)
                v10_proba[name] = v10_models[name].predict_proba(X_test_scaled)
        
        if v10_models:
            first = list(v10_models.keys())[0]
            final_proba = np.zeros_like(v10_proba[first])
            for name in v10_models.keys():
                w = weights.get(name, 1/len(v10_models))
                final_proba += w * v10_proba[name]
            
            buy_prob = final_proba[:, 1] * 100
            all_preds = np.array([v10_preds[name] for name in v10_models.keys()])
            buy_votes = np.sum(all_preds == 1, axis=0)
            
            # V10 uses 7 models
            all_agree = buy_votes == 7
            strong_agree = buy_votes >= 6
            most_agree = buy_votes >= 5
            
            confidence = buy_prob.copy()
            confidence[all_agree] = np.minimum(confidence[all_agree] + 7, 100)
            confidence[strong_agree & ~all_agree] = np.minimum(confidence[strong_agree & ~all_agree] + 4, 100)
            confidence[most_agree & ~strong_agree] = np.minimum(confidence[most_agree & ~strong_agree] + 2, 100)
            
            results = {'version': 'v10', 'n_models': len(v10_models), 'total_signals': len(y_test), 'thresholds': {}}
            for thresh in [50, 60, 70, 75, 80, 85, 90]:
                mask = confidence >= thresh
                if mask.sum() > 0:
                    signals = mask.sum()
                    correct = y_test[mask].sum()
                    results['thresholds'][thresh] = {'signals': int(signals), 'correct': int(correct), 'accuracy': correct/signals*100}
            
            all_results['V10'] = results
            print(f"‚úì V10: {len(v10_models)} models")
    except Exception as e:
        print(f"  V10 error: {e}")
        import traceback
        traceback.print_exc()

Evaluating V10...
‚úì V10: 7 models
‚úì V10: 7 models


## 9. üìä COMPARISON TABLE

In [19]:
print("="*100)
print("üìä ALL MODELS COMPARISON - 75% THRESHOLD")
print("="*100)

print(f"\n{'Model':>8} | {'Models':>6} | {'75%+ Sig':>10} | {'75%+ Acc':>10} | {'80%+ Sig':>10} | {'80%+ Acc':>10} | {'85%+ Sig':>10} | {'85%+ Acc':>10}")
print("-"*100)

comparison_data = []
for version, data in sorted(all_results.items()):
    row = {'version': version, 'n_models': data['n_models']}
    
    for thresh in [75, 80, 85]:
        if thresh in data['thresholds']:
            row[f'{thresh}_sig'] = data['thresholds'][thresh]['signals']
            row[f'{thresh}_acc'] = data['thresholds'][thresh]['accuracy']
        else:
            row[f'{thresh}_sig'] = 0
            row[f'{thresh}_acc'] = 0
    
    comparison_data.append(row)
    
    print(f"{version:>8} | {row['n_models']:>6} | {row['75_sig']:>10} | {row['75_acc']:>9.1f}% | {row['80_sig']:>10} | {row['80_acc']:>9.1f}% | {row['85_sig']:>10} | {row['85_acc']:>9.1f}%")

# Find best at each threshold
print("\n" + "="*100)

üìä ALL MODELS COMPARISON - 75% THRESHOLD

   Model | Models |   75%+ Sig |   75%+ Acc |   80%+ Sig |   80%+ Acc |   85%+ Sig |   85%+ Acc
----------------------------------------------------------------------------------------------------
     V10 |      7 |        826 |      60.7% |        255 |      71.8% |         64 |      96.9%
      V2 |      3 |         62 |      71.0% |          0 |       0.0% |          0 |       0.0%
      V3 |      3 |          0 |       0.0% |          0 |       0.0% |          0 |       0.0%
      V4 |      3 |          0 |       0.0% |          0 |       0.0% |          0 |       0.0%
      V5 |      3 |          0 |       0.0% |          0 |       0.0% |          0 |       0.0%
      V6 |      5 |       1391 |      56.9% |        409 |      57.9% |        104 |      63.5%
      V7 |      5 |       4243 |      50.3% |       1341 |      49.1% |        365 |      61.1%
      V8 |      5 |       4649 |      56.2% |       1299 |      54.7% |        298 |   

## 10. üèÜ BEST MODEL ANALYSIS

In [20]:
print("="*100)
print("üèÜ BEST MODEL ANALYSIS")
print("="*100)

# Find best at different thresholds
for thresh in [75, 80, 85, 90]:
    best_acc = 0
    best_model = None
    best_signals = 0
    
    for version, data in all_results.items():
        if thresh in data['thresholds']:
            acc = data['thresholds'][thresh]['accuracy']
            sig = data['thresholds'][thresh]['signals']
            if acc > best_acc and sig >= 10:  # Minimum 10 signals for reliability
                best_acc = acc
                best_model = version
                best_signals = sig
    
    if best_model:
        print(f"\nü•á Best at {thresh}%+: {best_model}")
        print(f"   Accuracy: {best_acc:.1f}%")
        print(f"   Signals: {best_signals}")

üèÜ BEST MODEL ANALYSIS

ü•á Best at 75%+: V2
   Accuracy: 71.0%
   Signals: 62

ü•á Best at 80%+: V10
   Accuracy: 71.8%
   Signals: 255

ü•á Best at 85%+: V10
   Accuracy: 96.9%
   Signals: 64

ü•á Best at 90%+: V10
   Accuracy: 100.0%
   Signals: 17


## 11. üìà DETAILED COMPARISON CHART

In [21]:
print("="*100)
print("üìà DETAILED ACCURACY BY THRESHOLD")
print("="*100)

# Create comparison dataframe
thresholds = [50, 60, 70, 75, 80, 85, 90]

print(f"\n{'Threshold':>10}", end="")
for version in sorted(all_results.keys()):
    print(f" | {version:>8}", end="")
print("")
print("-"*(12 + 11*len(all_results)))

for thresh in thresholds:
    print(f"{thresh:>8}%+", end="")
    for version in sorted(all_results.keys()):
        if thresh in all_results[version]['thresholds']:
            acc = all_results[version]['thresholds'][thresh]['accuracy']
            print(f" | {acc:>7.1f}%", end="")
        else:
            print(f" |      N/A", end="")
    print("")

print("\n" + "="*100)
print("SIGNAL COUNTS BY THRESHOLD")
print("="*100)

print(f"\n{'Threshold':>10}", end="")
for version in sorted(all_results.keys()):
    print(f" | {version:>8}", end="")
print("")
print("-"*(12 + 11*len(all_results)))

for thresh in thresholds:
    print(f"{thresh:>8}%+", end="")
    for version in sorted(all_results.keys()):
        if thresh in all_results[version]['thresholds']:
            sig = all_results[version]['thresholds'][thresh]['signals']
            print(f" | {sig:>8}", end="")
        else:
            print(f" |      N/A", end="")
    print("")

üìà DETAILED ACCURACY BY THRESHOLD

 Threshold |      V10 |       V2 |       V3 |       V4 |       V5 |       V6 |       V7 |       V8 |       V9
---------------------------------------------------------------------------------------------------------------
      50%+ |    52.5% |    52.0% |      N/A |    48.0% |      N/A |    51.7% |    54.1% |    51.8% |    50.5%
      60%+ |    53.6% |    54.4% |      N/A |    56.7% |      N/A |    52.4% |    54.0% |    51.9% |    51.3%
      70%+ |    57.0% |    55.3% |      N/A |   100.0% |      N/A |    55.3% |    50.9% |    53.3% |    59.1%
      75%+ |    60.7% |    71.0% |      N/A |      N/A |      N/A |    56.9% |    50.3% |    56.2% |    60.9%
      80%+ |    71.8% |      N/A |      N/A |      N/A |      N/A |    57.9% |    49.1% |    54.7% |    46.7%
      85%+ |    96.9% |      N/A |      N/A |      N/A |      N/A |    63.5% |    61.1% |    68.8% |    38.1%
      90%+ |   100.0% |      N/A |      N/A |      N/A |      N/A |    76.2% |   

## 12. üéØ FINAL RECOMMENDATION

In [22]:
print("="*100)
print("üéØ FINAL RECOMMENDATION")
print("="*100)

# Score each model based on accuracy at key thresholds
scores = {}
for version, data in all_results.items():
    score = 0
    
    # 75% threshold - weight 3
    if 75 in data['thresholds'] and data['thresholds'][75]['signals'] >= 100:
        score += data['thresholds'][75]['accuracy'] * 3
    
    # 80% threshold - weight 4
    if 80 in data['thresholds'] and data['thresholds'][80]['signals'] >= 50:
        score += data['thresholds'][80]['accuracy'] * 4
    
    # 85% threshold - weight 5
    if 85 in data['thresholds'] and data['thresholds'][85]['signals'] >= 20:
        score += data['thresholds'][85]['accuracy'] * 5
    
    scores[version] = score

# Sort by score
sorted_scores = sorted(scores.items(), key=lambda x: -x[1])

print("\nüìä MODEL RANKINGS (weighted score):")
print("-"*50)
for i, (version, score) in enumerate(sorted_scores):
    if score > 0:
        medal = "ü•á" if i == 0 else "ü•à" if i == 1 else "ü•â" if i == 2 else "  "
        print(f"{medal} {i+1}. {version}: {score:.1f} points")

# Best model
if sorted_scores:
    best = sorted_scores[0][0]
    print(f"\n" + "="*100)
    print(f"üèÜ RECOMMENDED MODEL: {best}")
    print("="*100)
    
    if best in all_results:
        data = all_results[best]
        print(f"\nüìã {best} DETAILS:")
        print(f"   Models in ensemble: {data['n_models']}")
        print(f"\n   Performance:")
        for thresh in [75, 80, 85, 90]:
            if thresh in data['thresholds']:
                t = data['thresholds'][thresh]
                print(f"   {thresh}%+: {t['accuracy']:.1f}% accuracy, {t['signals']} signals")

üéØ FINAL RECOMMENDATION

üìä MODEL RANKINGS (weighted score):
--------------------------------------------------
ü•á 1. V10: 953.4 points
ü•à 2. V8: 731.2 points
ü•â 3. V6: 719.7 points
   4. V7: 653.1 points
   5. V9: 560.2 points

üèÜ RECOMMENDED MODEL: V10

üìã V10 DETAILS:
   Models in ensemble: 7

   Performance:
   75%+: 60.7% accuracy, 826 signals
   80%+: 71.8% accuracy, 255 signals
   85%+: 96.9% accuracy, 64 signals
   90%+: 100.0% accuracy, 17 signals


## 13. Summary Table

In [23]:
# Create summary DataFrame
summary_data = []
for version, data in sorted(all_results.items()):
    row = {
        'Model': version,
        'Ensemble Size': data['n_models'],
        '75%+ Signals': data['thresholds'].get(75, {}).get('signals', 0),
        '75%+ Accuracy': data['thresholds'].get(75, {}).get('accuracy', 0),
        '80%+ Signals': data['thresholds'].get(80, {}).get('signals', 0),
        '80%+ Accuracy': data['thresholds'].get(80, {}).get('accuracy', 0),
        '85%+ Signals': data['thresholds'].get(85, {}).get('signals', 0),
        '85%+ Accuracy': data['thresholds'].get(85, {}).get('accuracy', 0),
    }
    summary_data.append(row)

summary_df = pd.DataFrame(summary_data)
print("\nüìä SUMMARY TABLE:")
print(summary_df.to_string(index=False))


üìä SUMMARY TABLE:
Model  Ensemble Size  75%+ Signals  75%+ Accuracy  80%+ Signals  80%+ Accuracy  85%+ Signals  85%+ Accuracy
  V10              7           826      60.653753           255      71.764706            64      96.875000
   V2              3            62      70.967742             0       0.000000             0       0.000000
   V3              3             0       0.000000             0       0.000000             0       0.000000
   V4              3             0       0.000000             0       0.000000             0       0.000000
   V5              3             0       0.000000             0       0.000000             0       0.000000
   V6              5          1391      56.865564           409      57.946210           104      63.461538
   V7              5          4243      50.341739          1341      49.142431           365      61.095890
   V8              5          4649      56.205636          1299      54.657429           298      68.791946
   V9  