In [4]:
import pandas as pd
import numpy as np
from pathlib import Path
import warnings
warnings.filterwarnings('ignore')

# ML
from sklearn.ensemble import RandomForestClassifier, StackingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.feature_selection import SelectFromModel
from sklearn.metrics import classification_report, accuracy_score, precision_score, recall_score, f1_score
import xgboost as xgb
import lightgbm as lgb
import joblib

# GPU check
import torch
GPU_AVAILABLE = torch.cuda.is_available()

# Paths
BASE_DIR = Path.cwd()
DATA_DIR = BASE_DIR / 'data'
MODEL_DIR = BASE_DIR / 'models' / 'signal_generator_v4'
MODEL_DIR.mkdir(parents=True, exist_ok=True)

print("="*60)
print("ðŸš€ FOREX SIGNAL GENERATOR V4 - DYNAMIC ATR & STACKING")
print("="*60)
print(f"âœ“ Libraries loaded")
print(f"âœ“ GPU Available: {GPU_AVAILABLE}")
print(f"âœ“ Models Directory: {MODEL_DIR}")

ðŸš€ FOREX SIGNAL GENERATOR V4 - DYNAMIC ATR & STACKING
âœ“ Libraries loaded
âœ“ GPU Available: True
âœ“ Models Directory: c:\Users\Acer\Desktop\Forex-Signal-App\models\signal_generator_v4


## 1. Data Loading & Feature Engineering

In [6]:
# Load data
train_df = pd.read_csv(DATA_DIR / 'EUR_USD_1min.csv')
test_df = pd.read_csv(DATA_DIR / 'EUR_USD_test.csv')

# Rename timestamp to time if needed
if 'timestamp' in train_df.columns:
    train_df.rename(columns={'timestamp': 'time'}, inplace=True)
if 'timestamp' in test_df.columns:
    test_df.rename(columns={'timestamp': 'time'}, inplace=True)

# Convert time to datetime
train_df['time'] = pd.to_datetime(train_df['time'])
test_df['time'] = pd.to_datetime(test_df['time'])

print(f"Train data: {len(train_df):,} rows")
print(f"Test data: {len(test_df):,} rows")

Train data: 1,859,492 rows
Test data: 296,778 rows


In [16]:
def add_features_v4(df):
    """Add technical indicators + Time features for V4."""
    df = df.copy()
    
    # --- 1. Time Features (NEW in v4) ---
    # Forex sessions matter. London/NY overlap is most volatile.
    df['hour'] = df['time'].dt.hour
    df['day_of_week'] = df['time'].dt.dayofweek
    
    # Session flags (Approximate UTC)
    # London: 07:00 - 16:00 UTC
    # NY: 12:00 - 21:00 UTC
    df['is_london'] = ((df['hour'] >= 7) & (df['hour'] < 16)).astype(int)
    df['is_ny'] = ((df['hour'] >= 12) & (df['hour'] < 21)).astype(int)
    df['is_overlap'] = ((df['hour'] >= 12) & (df['hour'] < 16)).astype(int)
    
    # --- 2. Standard Indicators ---
    # Moving Averages
    for period in [10, 20, 50, 200]:
        df[f'sma_{period}'] = df['close'].rolling(period).mean()
        df[f'ema_{period}'] = df['close'].ewm(span=period, adjust=False).mean()
    
    # RSI
    delta = df['close'].diff()
    gain = delta.where(delta > 0, 0).rolling(14).mean()
    loss = (-delta.where(delta < 0, 0)).rolling(14).mean()
    rs = gain / (loss + 1e-10)
    df['rsi_14'] = 100 - (100 / (1 + rs))
    
    # MACD
    ema12 = df['close'].ewm(span=12, adjust=False).mean()
    ema26 = df['close'].ewm(span=26, adjust=False).mean()
    df['macd'] = ema12 - ema26
    df['macd_signal'] = df['macd'].ewm(span=9, adjust=False).mean()
    df['macd_hist'] = df['macd'] - df['macd_signal']
    
    # ATR (Crucial for V4 Dynamic Labeling)
    high_low = df['high'] - df['low']
    high_close = abs(df['high'] - df['close'].shift())
    low_close = abs(df['low'] - df['close'].shift())
    tr = pd.concat([high_low, high_close, low_close], axis=1).max(axis=1)
    df['atr_14'] = tr.rolling(14).mean()
    
    # Bollinger Bands
    df['bb_middle'] = df['close'].rolling(20).mean()
    bb_std = df['close'].rolling(20).std()
    df['bb_upper'] = df['bb_middle'] + 2 * bb_std
    df['bb_lower'] = df['bb_middle'] - 2 * bb_std
    df['bb_width'] = (df['bb_upper'] - df['bb_lower']) / df['bb_middle']
    
    # --- 3. Advanced Features (NEW in v4) ---
    # Distance from MA (Mean Reversion)
    df['dist_sma50'] = (df['close'] - df['sma_50']) / df['sma_50'] * 100
    df['dist_sma200'] = (df['close'] - df['sma_200']) / df['sma_200'] * 100
    
    # Volatility Ratio (Current ATR vs Long term ATR)
    df['atr_50'] = tr.rolling(50).mean()
    df['volatility_ratio'] = df['atr_14'] / (df['atr_50'] + 1e-10)
    
    # Momentum
    df['mom_10'] = df['close'].pct_change(10)
    df['mom_30'] = df['close'].pct_change(30)

    # --- 4. Lag Features (NEW) ---
    # Capture trends by looking at previous values
    for col in ['rsi_14', 'macd', 'mom_10', 'volatility_ratio']:
        df[f'{col}_lag1'] = df[col].shift(1)
        df[f'{col}_lag2'] = df[col].shift(2)
        df[f'{col}_lag3'] = df[col].shift(3)
    
    return df

print("Adding V4 features (with Lags)...")
train_df = add_features_v4(train_df)
test_df = add_features_v4(test_df)
print(f"âœ“ Features added: {len(train_df.columns)}")

Adding V4 features (with Lags)...
âœ“ Features added: 47
âœ“ Features added: 47


## 2. Dynamic Labeling (ATR-Based)

**V4 Innovation:**
Instead of fixed 15 pips, we use **ATR-based targets**.
- **Target:** Price must rise by `2.0 * ATR` (Take Profit)
- **Stop:** Price must NOT fall by `1.5 * ATR` (Stop Loss)
- This adapts to market volatility. In quiet markets, targets are smaller. In volatile markets, targets are larger.

In [17]:
def create_dynamic_labels(df, forward_periods=60):
    """
    V4 Labeling: Dynamic ATR-based targets.
    TP = 1.5 * ATR
    SL = 1.0 * ATR
    Risk:Reward = 1:1.5
    """
    df = df.copy()
    print(f"Creating Dynamic Labels (TP=1.5*ATR, SL=1.0*ATR)...")
    
    labels = []
    
    # Vectorized approach for speed
    # We'll iterate but optimize
    closes = df['close'].values
    highs = df['high'].values
    lows = df['low'].values
    atrs = df['atr_14'].values
    
    n = len(df)
    
    for i in range(n):
        if i + forward_periods >= n or np.isnan(atrs[i]):
            labels.append(0)
            continue
            
        entry = closes[i]
        atr = atrs[i]
        
        # Adjusted Targets for better hit rate
        tp_dist = atr * 1.5
        sl_dist = atr * 1.0
        
        tp_price = entry + tp_dist
        sl_price = entry - sl_dist
        
        # Look forward
        future_highs = highs[i+1 : i+forward_periods+1]
        future_lows = lows[i+1 : i+forward_periods+1]
        
        # Check if SL hit first
        sl_hit_indices = np.where(future_lows <= sl_price)[0]
        first_sl_idx = sl_hit_indices[0] if len(sl_hit_indices) > 0 else forward_periods + 1
        
        # Check if TP hit first
        tp_hit_indices = np.where(future_highs >= tp_price)[0]
        first_tp_idx = tp_hit_indices[0] if len(tp_hit_indices) > 0 else forward_periods + 1
        
        if first_tp_idx < first_sl_idx:
            labels.append(1) # WIN
        else:
            labels.append(0) # LOSS or TIMEOUT
            
    df['target'] = labels
    return df

train_df = create_dynamic_labels(train_df)
test_df = create_dynamic_labels(test_df)

print(f"\nLabel Distribution (Train):")
print(train_df['target'].value_counts(normalize=True))

Creating Dynamic Labels (TP=1.5*ATR, SL=1.0*ATR)...
Creating Dynamic Labels (TP=1.5*ATR, SL=1.0*ATR)...
Creating Dynamic Labels (TP=1.5*ATR, SL=1.0*ATR)...

Label Distribution (Train):
target
0    0.587238
1    0.412762
Name: proportion, dtype: float64

Label Distribution (Train):
target
0    0.587238
1    0.412762
Name: proportion, dtype: float64


## 3. Feature Selection (Noise Reduction)

**V4 Innovation:**
Instead of using all features, we select the most important ones using a Random Forest selector. This reduces overfitting and noise.

In [18]:
# Prepare Data
exclude_cols = ['time', 'target', 'open', 'high', 'low', 'close', 'tick_volume', 'volume']
feature_cols = [c for c in train_df.columns if c not in exclude_cols]

train_clean = train_df.dropna()
test_clean = test_df.dropna()

X_train_raw = train_clean[feature_cols]
y_train = train_clean['target']
X_test_raw = test_clean[feature_cols]
y_test = test_clean['target']

# Scale
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train_raw)
X_test_scaled = scaler.transform(X_test_raw)

# Feature Selection
print("Selecting best features...")
selector = SelectFromModel(RandomForestClassifier(n_estimators=100, random_state=42, n_jobs=-1), threshold="median")
selector.fit(X_train_scaled, y_train)

selected_indices = selector.get_support(indices=True)
selected_features = [feature_cols[i] for i in selected_indices]

print(f"Original features: {len(feature_cols)}")
print(f"Selected features: {len(selected_features)}")
print("Top features:", selected_features[:10])

# Transform X
X_train = selector.transform(X_train_scaled)
X_test = selector.transform(X_test_scaled)

Selecting best features...
Original features: 40
Selected features: 20
Top features: ['rsi_14', 'macd_signal', 'macd_hist', 'atr_14', 'bb_width', 'dist_sma50', 'dist_sma200', 'atr_50', 'volatility_ratio', 'mom_10']


## 4. Stacking Ensemble Training

**V4 Innovation:**
Using `StackingClassifier`. 
- **Base Models:** XGBoost, LightGBM, Random Forest
- **Meta Model:** Logistic Regression (learns how to combine base models optimally)

In [19]:
from sklearn.ensemble import VotingClassifier

# Base Models - Increased Regularization & Reduced Complexity
xgb_clf = xgb.XGBClassifier(
    n_estimators=500, 
    max_depth=6, 
    learning_rate=0.05, 
    subsample=0.8,
    colsample_bytree=0.8,
    reg_alpha=0.5,
    reg_lambda=0.5,
    random_state=42,
    n_jobs=-1,
    device='cuda' if GPU_AVAILABLE else 'cpu'
)

lgb_clf = lgb.LGBMClassifier(
    n_estimators=500, 
    max_depth=6, 
    learning_rate=0.05,
    subsample=0.8,
    colsample_bytree=0.8,
    reg_alpha=0.5,
    reg_lambda=0.5,
    random_state=42,
    verbose=-1,
    n_jobs=-1
)

rf_clf = RandomForestClassifier(
    n_estimators=300, 
    max_depth=10,
    min_samples_split=10,
    min_samples_leaf=4,
    random_state=42,
    n_jobs=-1
)

# Voting Classifier (Soft Voting)
# We use Soft Voting to average the probabilities. This is often more robust than Stacking for noisy data.
print("Training Soft Voting Ensemble (Regularized)...")
voting_model = VotingClassifier(
    estimators=[
        ('xgb', xgb_clf),
        ('lgb', lgb_clf),
        ('rf', rf_clf)
    ],
    voting='soft',
    n_jobs=-1
)

voting_model.fit(X_train, y_train)

# Evaluate
y_pred = voting_model.predict(X_test)
y_proba = voting_model.predict_proba(X_test)[:, 1]

print("\n" + "="*40)
print("V4 VOTING RESULTS (IMPROVED)")
print("="*40)
print(f"Accuracy:  {accuracy_score(y_test, y_pred):.4f}")
print(f"Precision: {precision_score(y_test, y_pred):.4f}")
print(f"Recall:    {recall_score(y_test, y_pred):.4f}")
print(f"F1 Score:  {f1_score(y_test, y_pred):.4f}")

# Probability Distribution Check
print("\nProbability Distribution:")
print(f"Min: {y_proba.min():.4f}")
print(f"Max: {y_proba.max():.4f}")
print(f"Mean: {y_proba.mean():.4f}")
print(f"Std: {y_proba.std():.4f}")

# Check how many signals at different thresholds
for th in [0.5, 0.6, 0.7, 0.8]:
    count = (y_proba >= th).sum()
    print(f"Signals >= {th}: {count}")

Training Soft Voting Ensemble (Regularized)...

V4 VOTING RESULTS (IMPROVED)
Accuracy:  0.5882
Precision: 0.4800
Recall:    0.0038
F1 Score:  0.0076

Probability Distribution:
Min: 0.1962
Max: 0.7201
Mean: 0.4103
Std: 0.0218
Signals >= 0.5: 977
Signals >= 0.6: 67
Signals >= 0.7: 2
Signals >= 0.8: 0

V4 VOTING RESULTS (IMPROVED)
Accuracy:  0.5882
Precision: 0.4800
Recall:    0.0038
F1 Score:  0.0076

Probability Distribution:
Min: 0.1962
Max: 0.7201
Mean: 0.4103
Std: 0.0218
Signals >= 0.5: 977
Signals >= 0.6: 67
Signals >= 0.7: 2
Signals >= 0.8: 0


## 5. Backtest with Dynamic SL/TP

Since we trained with Dynamic Labels, the model should perform much better on the Dynamic SL/TP backtest.

In [20]:
def backtest_v4(df, probabilities, threshold=0.80):
    """Backtest with Dynamic SL/TP (1.0 SL, 1.5 TP)"""
    results = []
    
    # Filter by threshold
    indices = np.where(probabilities >= threshold)[0]
    
    print(f"Backtesting {len(indices)} signals (Conf >= {threshold*100:.0f}%)...")
    
    for idx in indices:
        if idx + 60 >= len(df): continue
        
        entry = df['close'].iloc[idx]
        atr = df['atr_14'].iloc[idx]
        
        # Dynamic SL/TP (Same as training logic)
        sl_pips = (atr * 1.0) * 10000
        tp_pips = (atr * 1.5) * 10000
        
        # Cap min/max for safety
        sl_pips = max(5, min(20, sl_pips))
        tp_pips = max(8, min(40, tp_pips))
        
        sl_price = entry - (sl_pips * 0.0001)
        tp_price = entry + (tp_pips * 0.0001)
        
        # Check outcome
        future = df.iloc[idx+1 : idx+61]
        result = 'TIMEOUT'
        pnl = 0
        
        for _, row in future.iterrows():
            if row['low'] <= sl_price:
                result = 'LOSS'
                pnl = -sl_pips
                break
            if row['high'] >= tp_price:
                result = 'WIN'
                pnl = tp_pips
                break
        
        if result == 'TIMEOUT':
            end_price = future['close'].iloc[-1]
            pnl = (end_price - entry) * 10000
            # result = 'WIN' if pnl > 0 else 'LOSS'
            
        results.append(pnl)
        
    return results

# Run Backtest
for conf in [0.60, 0.65, 0.70, 0.75, 0.80]:
    pnls = backtest_v4(test_clean, y_proba, threshold=conf)
    if len(pnls) > 0:
        wins = len([p for p in pnls if p > 0])
        total = len(pnls)
        wr = wins / total * 100
        total_pips = sum(pnls)
        
        # Profit Factor
        gross_profit = sum([p for p in pnls if p > 0])
        gross_loss = abs(sum([p for p in pnls if p < 0]))
        pf = gross_profit / gross_loss if gross_loss > 0 else 0
        
        print(f"Conf >= {conf*100:.0f}%: {total} signals | WR: {wr:.1f}% | Pips: {total_pips:.1f} | PF: {pf:.2f}")

Backtesting 67 signals (Conf >= 60%)...
Conf >= 60%: 67 signals | WR: 61.2% | Pips: 51.3 | PF: 1.20
Backtesting 12 signals (Conf >= 65%)...
Conf >= 65%: 12 signals | WR: 75.0% | Pips: 46.4 | PF: 3.43
Backtesting 2 signals (Conf >= 70%)...
Conf >= 70%: 2 signals | WR: 100.0% | Pips: 16.0 | PF: 0.00
Backtesting 0 signals (Conf >= 75%)...
Backtesting 0 signals (Conf >= 80%)...


In [21]:
# Save V4
print("Saving V4 models...")
joblib.dump(voting_model, MODEL_DIR / 'voting_v4.joblib') # Changed from stacking to voting
joblib.dump(scaler, MODEL_DIR / 'scaler_v4.joblib')
joblib.dump(selector, MODEL_DIR / 'selector_v4.joblib')
joblib.dump(selected_features, MODEL_DIR / 'features_v4.joblib')

print("âœ… V4 Models saved successfully!")

Saving V4 models...
âœ… V4 Models saved successfully!
âœ… V4 Models saved successfully!
