# Master Trading Notebook: GBP/USD M15 Project

## Context and Objectives

This master notebook consolidates the entire workflow of the GBP/USD trading project, from raw data ingestion to machine learning model evaluation. It unifies the following steps:

1.  **Data Import (T01)**: Loading and auditing raw M1 data (2022-2024).
2.  **Aggregation (T02)**: Aggregating 1-minute (M1) bars into 15-minute (M15) candles.
3.  **Cleaning (T03)**: Filtering incomplete candles and ensuring data integrity.
4.  **Feature Engineering (T05)**: Creating technical indicators (RSI, EMAs, ATR, etc.).
5.  **Machine Learning (T07)**: Training, validating, and backtesting predictive models.

---

## 1. Environment Setup

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import os
import sys
from pathlib import Path
from scipy import stats as sp_stats
from statsmodels.tsa.stattools import adfuller
from statsmodels.graphics.tsaplots import plot_acf, plot_pacf
from sklearn.model_selection import TimeSeriesSplit, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score, f1_score, roc_auc_score
import pickle
import json
from datetime import datetime

# --- Plot styling ---
# Apply seaborn's "whitegrid" theme first, then override matplotlib defaults
# with a beige/brown palette.
sns.set_theme(style="whitegrid")
plt.rcParams.update({
    "figure.facecolor": "#FAF0E6",
    "axes.facecolor": "#F5F5DC",
    "grid.color": "#E0D0C0",
    # Every text-like element shares the same brown.
    **dict.fromkeys(
        ("text.color", "axes.labelcolor", "xtick.color", "ytick.color"),
        "#5D4037",
    ),
    "axes.prop_cycle": plt.cycler(color=['#8D6E63', '#A1887F', '#D7CCC8']),
})

# --- Project layout ---
# All paths are derived from the current working directory.
PROJECT_ROOT = Path('.').resolve()
DATA_DIR = PROJECT_ROOT.joinpath("data")
M15_DIR = DATA_DIR.joinpath("m15")
CLEAN_DIR = M15_DIR.joinpath("clean")
FEATURES_DIR = DATA_DIR.joinpath("features")
MODELS_DIR = PROJECT_ROOT.joinpath("models", "v1")

# Create the output directories up front so later cells can write freely.
for d in (DATA_DIR, M15_DIR, CLEAN_DIR, FEATURES_DIR, MODELS_DIR):
    d.mkdir(parents=True, exist_ok=True)

# --- Per-year metadata ---
# One calendar year per dataset role, plus a plotting colour for each year.
YEARS = [2022, 2023, 2024]
LABELS = dict(zip(YEARS, ('Train', 'Validation', 'Test')))
COLORS = dict(zip(YEARS, ('#8D6E63', '#A1887F', '#BCAAA4')))

## 2. T01: Import M1 Data
Loading raw 1-minute data from CSV files.

In [None]:
def load_m1_data(year):
    """Load one year of raw M1 GBP/USD bars from CSV.

    The file ``DAT_MT_GBPUSD_M1_{year}.csv`` is looked up, in order, directly
    under ``DATA_DIR``, then inside a ``HISTDATA_COM_MT_GBPUSD_M1{year}``
    folder under ``DATA_DIR``, then inside the same folder under
    ``PROJECT_ROOT``. The first existing path wins.

    Args:
        year: Calendar year used to build the file and folder names.

    Returns:
        A DataFrame indexed by ``timestamp`` with columns ``open``, ``high``,
        ``low``, ``close`` and ``volume``, or ``None`` (after printing an
        error) when no candidate file exists.
    """
    filename = f"DAT_MT_GBPUSD_M1_{year}.csv"
    archive_dir = f"HISTDATA_COM_MT_GBPUSD_M1{year}"
    candidates = (
        DATA_DIR / filename,
        DATA_DIR / archive_dir / filename,
        PROJECT_ROOT / archive_dir / filename,
    )

    # First candidate that exists on disk, if any.
    path = next((candidate for candidate in candidates if candidate.exists()), None)
    if path is None:
        print(f"[ERROR] File for {year} not found.")
        return None

    print(f"Loading {year} from {path}...")
    # The CSV has no header row; columns are supplied explicitly.
    raw = pd.read_csv(
        path,
        names=['date', 'time', 'open', 'high', 'low', 'close', 'volume'],
    )

    # Merge the separate date and time columns into a single datetime index.
    stamps = pd.to_datetime(raw['date'] + ' ' + raw['time'])
    return (
        raw.assign(timestamp=stamps)
        .set_index('timestamp')
        .drop(['date', 'time'], axis=1)
    )

# Load every configured year; years whose file is missing are simply skipped.
dfs_m1 = {}
for year in YEARS:
    df = load_m1_data(year)
    if df is None:
        continue
    dfs_m1[year] = df
    print(f" -> Loaded {len(df):,} M1 rows.")

## 3. T02: Aggregation M1 -> M15
Converting high-frequency 1-minute data into 15-minute candles to reduce noise and align with the trading strategy.

In [None]:
def aggregate_m1_to_m15(df_m1):
    """Resample M1 bars into 15-minute candles.

    Args:
        df_m1: DataFrame with a datetime index and ``open``, ``high``,
            ``low``, ``close`` and ``volume`` columns, or ``None``.

    Returns:
        ``None`` when ``df_m1`` is ``None`` or empty. Otherwise a DataFrame
        with columns ``open_15m``, ``high_15m``, ``low_15m``, ``close_15m``,
        ``volume`` and ``tick_count`` (number of non-null M1 closes in the
        15-minute bin). Bins with no data are dropped.
    """
    if df_m1 is None or df_m1.empty:
        return None

    # Use the 'min' alias: the legacy 'T' alias is deprecated in recent pandas
    # releases and emits a FutureWarning.
    resampler = df_m1.resample('15min')

    # Open: first, High: max, Low: min, Close: last, Volume: sum
    df_m15 = resampler.agg({
        'open': 'first',
        'high': 'max',
        'low': 'min',
        'close': 'last',
        'volume': 'sum',
    })

    # Number of M1 bars contributing to each M15 bar. It comes from the same
    # resampler, so the bins line up with the aggregated frame.
    df_m15['tick_count'] = resampler['close'].count()

    # Empty bins (e.g. gaps in the M1 data) have NaN OHLC values; drop them.
    df_m15 = df_m15.dropna()

    return df_m15.rename(columns={
        'open': 'open_15m',
        'high': 'high_15m',
        'low': 'low_15m',
        'close': 'close_15m',
    })

# Aggregate every loaded year to M15 and persist the raw (uncleaned) candles.
dfs_m15 = {}
for year in YEARS:
    if year not in dfs_m1:
        continue

    print(f"Aggregating {year}...")
    df_m15 = aggregate_m1_to_m15(dfs_m1[year])
    if df_m15 is None:
        # aggregate_m1_to_m15 returns None for missing or empty input. Skip the
        # year instead of storing None and failing on .to_csv below.
        print(f"[ERROR] No M1 data to aggregate for {year}.")
        continue
    dfs_m15[year] = df_m15

    # Save raw M15
    out_path = M15_DIR / f"GBPUSD_M15_{year}.csv"
    df_m15.to_csv(out_path)
    print(f" -> Created {len(df_m15):,} M15 candles. Saved to {out_path}")

## 4. T03: Cleaning Data
Filtering out incomplete candles (fewer than 5 contributing M1 bars, i.e. `tick_count < 5`) and checking OHLC consistency.

In [None]:
def clean_data(df, year):
    """Drop incomplete and internally inconsistent M15 candles.

    Args:
        df: M15 DataFrame with ``open_15m``, ``high_15m``, ``low_15m``,
            ``close_15m`` and ``tick_count`` columns. It is not modified.
        year: Label used only in the printed summary.

    Returns:
        A new DataFrame containing only candles that have at least 5
        contributing M1 bars and a coherent OHLC tuple.
    """
    init_len = len(df)

    # 1. Filter incomplete candles
    # A proper 15m candle should have at least 5 minutes of activity
    df_clean = df[df['tick_count'] >= 5]
    dropped_ticks = init_len - len(df_clean)

    # 2. OHLC Consistency Check
    # high must be the top and low the bottom of the candle. Comparing only
    # high against low would let through candles whose open or close lies
    # outside the [low, high] range.
    body = df_clean[['open_15m', 'close_15m']]
    mask_coherence = (
        (df_clean['high_15m'] >= df_clean['low_15m'])
        & (df_clean['high_15m'] >= body.max(axis=1))
        & (df_clean['low_15m'] <= body.min(axis=1))
    )
    df_clean = df_clean[mask_coherence].copy()
    dropped_coherence = (init_len - dropped_ticks) - len(df_clean)

    print(f"[{year}] Cleaned. Dropped: {dropped_ticks} (low ticks), {dropped_coherence} (incoherent)")
    return df_clean

# Clean each aggregated year and write the result next to the raw M15 files.
cleaned_dfs = {}
for year in YEARS:
    if year not in dfs_m15:
        continue

    df_clean = clean_data(dfs_m15[year], year)
    cleaned_dfs[year] = df_clean

    out_path = CLEAN_DIR / f"GBPUSD_M15_{year}_clean.csv"
    df_clean.to_csv(out_path)
    print(f" -> Saved clean data to {out_path}")

## 5. T05: Feature Engineering
Adding technical indicators and derived features: returns, EMAs, RSI, ATR, ADX, MACD, rolling volatility, and candle-shape measures (range, body, wicks).

In [None]:
# --- Indicator Functions ---

def calculate_rsi(series, period=14):
    """Compute an exponentially smoothed RSI.

    Gains and losses are smoothed with ``ewm(alpha=1/period, adjust=False)``.

    Args:
        series: Price series.
        period: Smoothing period; the EWM alpha is ``1 / period``.

    Returns:
        Series of RSI values aligned with ``series``. Positions where both the
        smoothed gain and the smoothed loss are zero come out as NaN.
    """
    change = series.diff()

    # Split the bar-to-bar change into its positive and negative parts. The
    # undefined first change fails both comparisons and is replaced by 0.
    ups = change.where(change > 0, 0)
    downs = -change.where(change < 0, 0)

    smoothing = {'alpha': 1 / period, 'adjust': False}
    avg_gain = ups.ewm(**smoothing).mean()
    avg_loss = downs.ewm(**smoothing).mean()

    relative_strength = avg_gain / avg_loss
    return 100 - (100 / (1 + relative_strength))

def calculate_atr(df, period=14):
    """Compute the Average True Range of M15 candles.

    The true range of a bar is the largest of: high - low,
    |high - previous close| and |low - previous close|. It is smoothed with
    ``ewm(alpha=1/period, adjust=False)``.

    Args:
        df: DataFrame with ``high_15m``, ``low_15m`` and ``close_15m`` columns.
        period: Smoothing period; the EWM alpha is ``1 / period``.

    Returns:
        Series of ATR values aligned with ``df``.
    """
    prior_close = df['close_15m'].shift(1)

    # The row-wise max skips NaN, so the first bar (no previous close) falls
    # back to its plain high-low range.
    candidates = pd.DataFrame({
        'bar_range': df['high_15m'] - df['low_15m'],
        'high_gap': (df['high_15m'] - prior_close).abs(),
        'low_gap': (df['low_15m'] - prior_close).abs(),
    })
    true_range = candidates.max(axis=1)

    return true_range.ewm(alpha=1 / period, adjust=False).mean()

def calculate_adx(df, period=14):
    """Compute the Average Directional Index (ADX) of M15 candles.

    Directional movement follows Wilder's rule: on each bar only the larger of
    the up-move (high - previous high) and the down-move (previous low - low)
    counts, and only if it is positive. An outside bar therefore adds to one
    side only, never to both +DM and -DM.

    All smoothing uses ``ewm(alpha=1/period, adjust=False)``, the same scheme
    as ``calculate_atr``, so the DI ratios divide like-for-like averages.

    Args:
        df: DataFrame with ``high_15m``, ``low_15m`` and ``close_15m`` columns.
        period: Smoothing period; the EWM alpha is ``1 / period``.

    Returns:
        Series of ADX values aligned with ``df``. The first bar is NaN because
        it has no predecessor. Positions where +DI and -DI are both zero give
        an undefined DX (0/0) and inherit the previous smoothed value.
    """
    high = df['high_15m']
    low = df['low_15m']

    up_move = high.diff()
    down_move = -low.diff()

    # Keep the first bar undefined rather than feeding a fake zero into the
    # smoothing.
    has_prev = up_move.notna() & down_move.notna()
    plus_dm = up_move.where((up_move > down_move) & (up_move > 0), 0.0).where(has_prev)
    minus_dm = down_move.where((down_move > up_move) & (down_move > 0), 0.0).where(has_prev)

    atr = calculate_atr(df, period)

    smoothing = {'alpha': 1 / period, 'adjust': False}
    plus_di = 100 * (plus_dm.ewm(**smoothing).mean() / atr)
    minus_di = 100 * (minus_dm.ewm(**smoothing).mean() / atr)

    dx = ((plus_di - minus_di).abs() / (plus_di + minus_di)) * 100
    adx = dx.ewm(**smoothing).mean()
    return adx

def calculate_macd(series, fast=12, slow=26, signal=9):
    """Compute the MACD line and its signal line.

    Args:
        series: Price series.
        fast: Span of the fast EMA.
        slow: Span of the slow EMA.
        signal: Span of the EMA applied to the MACD line.

    Returns:
        Tuple ``(macd_line, signal_line)`` of Series aligned with ``series``.
    """
    def _ema(values, span):
        # Recursive (adjust=False) exponential moving average.
        return values.ewm(span=span, adjust=False).mean()

    macd_line = _ema(series, fast) - _ema(series, slow)
    return macd_line, _ema(macd_line, signal)

def generate_features(df):
    """Append the technical-indicator feature columns to a cleaned M15 frame.

    Columns are added in this order: ``return_1``, ``return_4``, ``ema_20``,
    ``ema_50``, ``ema_diff``, ``rsi_14``, ``rolling_std_20``, ``range_15m``,
    ``body``, ``upper_wick``, ``lower_wick``, ``ema_200``,
    ``distance_to_ema200``, ``slope_ema50``, ``atr_14``, ``rolling_std_100``,
    ``volatility_ratio``, ``adx_14``, ``macd``, ``macd_signal``.

    Args:
        df: DataFrame with ``open_15m``, ``high_15m``, ``low_15m`` and
            ``close_15m`` columns. It is not modified.

    Returns:
        A new DataFrame with the feature columns added and every row that
        contains a NaN (indicator warm-up) removed.
    """
    df_feat = df.copy()
    close = df_feat['close_15m']
    open_close = df_feat[['open_15m', 'close_15m']]

    # Momentum: simple percentage returns over 1 and 4 bars.
    for lag in (1, 4):
        df_feat[f'return_{lag}'] = close.pct_change(lag)

    # Short and medium EMAs and their spread.
    ema_20 = close.ewm(span=20, adjust=False).mean()
    ema_50 = close.ewm(span=50, adjust=False).mean()
    df_feat['ema_20'] = ema_20
    df_feat['ema_50'] = ema_50
    df_feat['ema_diff'] = ema_20 - ema_50

    df_feat['rsi_14'] = calculate_rsi(close, 14)

    # Short-window volatility of the close.
    std_20 = close.rolling(window=20).std()
    df_feat['rolling_std_20'] = std_20

    # Candle geometry.
    df_feat['range_15m'] = df_feat['high_15m'] - df_feat['low_15m']
    df_feat['body'] = (close - df_feat['open_15m']).abs()
    df_feat['upper_wick'] = df_feat['high_15m'] - open_close.max(axis=1)
    df_feat['lower_wick'] = open_close.min(axis=1) - df_feat['low_15m']

    # Longer-term trend context.
    ema_200 = close.ewm(span=200, adjust=False).mean()
    df_feat['ema_200'] = ema_200
    df_feat['distance_to_ema200'] = close - ema_200
    df_feat['slope_ema50'] = ema_50.diff(3)

    # Volatility regime: ATR plus the ratio of short to long rolling std.
    df_feat['atr_14'] = calculate_atr(df_feat, 14)
    std_100 = close.rolling(window=100).std()
    df_feat['rolling_std_100'] = std_100
    df_feat['volatility_ratio'] = std_20 / std_100

    df_feat['adx_14'] = calculate_adx(df_feat, 14)

    macd_line, macd_signal = calculate_macd(close)
    df_feat['macd'] = macd_line
    df_feat['macd_signal'] = macd_signal

    # Rolling windows and diffs leave NaNs at the start; drop those rows.
    return df_feat.dropna()

In [None]:
# Build and persist the feature table for every cleaned year.
feature_sets = {}
for year in YEARS:
    if year not in cleaned_dfs:
        continue

    print(f"Generating features for {year}...")
    df_feat = generate_features(cleaned_dfs[year])
    feature_sets[year] = df_feat

    out_path = FEATURES_DIR / f"GBPUSD_M15_{year}_features.csv"
    df_feat.to_csv(out_path)
    print(f" -> Saved {len(df_feat)} rows to {out_path}")

## 6. T07: Machine Learning
Training a Random Forest model on the 2022 data, evaluating it on 2023 (validation), and backtesting it on 2024 (test).

In [None]:
def prepare_modeling_data(df):
    """Attach the next-bar prediction target to a feature frame.

    Args:
        df: Feature DataFrame with a ``close_15m`` column. It is not modified.

    Returns:
        A copy of ``df`` with two extra columns: ``target_return`` (next close
        minus current close) and ``target`` (1 when that difference is
        strictly positive, else 0). Rows containing any NaN are dropped, which
        removes the last row because it has no next close.
    """
    next_move = df['close_15m'].shift(-1) - df['close_15m']
    labelled = df.assign(
        target_return=next_move,
        target=(next_move > 0).astype(int),
    )
    return labelled.dropna()

# End-to-end modelling cell: label the three yearly feature sets, fit a Random
# Forest on 2022, report validation metrics on 2023, persist the model and
# scaler, then plot a simple long/short backtest on 2024.
# It only runs when all three yearly feature sets are available.
if 2022 in feature_sets and 2023 in feature_sets and 2024 in feature_sets:
    df_train = prepare_modeling_data(feature_sets[2022])
    df_val = prepare_modeling_data(feature_sets[2023])
    df_test = prepare_modeling_data(feature_sets[2024])
    
    # Feature selection
    # Everything except the targets, raw OHLC, volume and tick_count is a feature.
    # NOTE(review): no visible code creates a 'volume_15m' column; listing it
    # here is harmless because drop_cols is only used as an exclusion filter.
    drop_cols = ['target', 'target_return', 'open_15m', 'high_15m', 'low_15m', 'close_15m', 'volume', 'volume_15m', 'tick_count']
    features_cols = [c for c in df_train.columns if c not in drop_cols]
    
    X_train = df_train[features_cols]
    y_train = df_train['target']
    
    X_val = df_val[features_cols]
    y_val = df_val['target']
    
    X_test = df_test[features_cols]
    y_test = df_test['target']
    
    # Scaling
    # The scaler is fitted on the training year only and then applied unchanged
    # to validation and test, so no statistics from later years leak in.
    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train)
    X_val_scaled = scaler.transform(X_val)
    X_test_scaled = scaler.transform(X_test)
    
    print(f"Training Data Shape: {X_train.shape}")
    
    # Model Training
    # Shallow forest (100 trees, depth 5) with balanced class weights and a
    # fixed random_state.
    model = RandomForestClassifier(n_estimators=100, max_depth=5, random_state=42, class_weight='balanced')
    print("Training Random Forest...")
    model.fit(X_train_scaled, y_train)
    
    # Evaluation
    # Metrics are reported on the 2023 validation set only; y_test is prepared
    # above but not scored in this cell.
    val_pred = model.predict(X_val_scaled)
    acc_val = accuracy_score(y_val, val_pred)
    print(f"Validation Accuracy: {acc_val:.4f}")
    print(classification_report(y_val, val_pred))
    
    # Save Model
    # The scaler is pickled alongside the model so both can be reloaded together.
    with open(MODELS_DIR / "rf_model.pkl", "wb") as f:
        pickle.dump(model, f)
    with open(MODELS_DIR / "scaler.pkl", "wb") as f:
        pickle.dump(scaler, f)
    print("Model saved.")
    
    # Backtest Visualization
    # Position is +1 (long) when the model predicts class 1 and -1 (short)
    # otherwise. PnL is the position times the next-bar price difference, so it
    # is in price units; no cost or spread term is applied here.
    preds_test = model.predict(X_test_scaled)
    signals = np.where(preds_test == 1, 1, -1)
    strategy_returns = signals * df_test['target_return']
    cumulative_returns = strategy_returns.cumsum()
    
    # The "Buy & Hold" benchmark is the running sum of the same next-bar price
    # differences with a constant long position.
    plt.figure(figsize=(12, 6))
    plt.plot(cumulative_returns.index, cumulative_returns, label='Strategy (RF)', color='#8D6E63')
    plt.plot(df_test.index, df_test['target_return'].cumsum(), label='Buy & Hold', color='gray', alpha=0.5, linestyle='--')
    plt.title("Backtest Result: 2024 Test Set", fontweight='bold')
    plt.ylabel("Cumulative PnL")
    plt.legend()
    plt.show()
else:
    print("Can't proceed with ML: Missing data for one of the years.")