# Crypto Tick-Level Micro-Prediction and Backtest Pipeline
# Expert-level implementation for scalping strategy development

# In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime, timedelta
import warnings
warnings.filterwarnings('ignore')

# ML Libraries
from sklearn.model_selection import train_test_split, RandomizedSearchCV, TimeSeriesSplit
from sklearn.metrics import classification_report, accuracy_score, precision_score, recall_score, roc_auc_score
from sklearn.preprocessing import StandardScaler
import xgboost as xgb

# SHAP for model explainability
import shap

# Progress bars and utilities
from tqdm import tqdm
import gzip
import os
from pathlib import Path

# Plotting setup
# Select the seaborn-flavoured Matplotlib style sheet; the 'seaborn-v0_8' name
# is the spelling used by recent Matplotlib releases (NOTE(review): older
# Matplotlib versions may only know it as 'seaborn' - confirm the target version).
plt.style.use('seaborn-v0_8')
# Default colour cycle for every subsequent seaborn/Matplotlib plot.
sns.set_palette("husl")

# Reaching this line means every import above succeeded.
print("📦 All libraries imported successfully")

# =============================================================================
# 📋 CONFIGURATION CELL - MODIFY THESE PARAMETERS
# =============================================================================

# In [None]:
# Central configuration read by the rest of the pipeline.
CONFIG = {
    # Data paths - modify according to your local setup
    'DATA_PATH': 'DATA',  # Change this to your actual file path
    'OUTPUT_DIR': './results/',

    # Time parameters
    # 'min' is the supported pandas alias for minutes; the older 'T' alias is
    # deprecated in recent pandas releases.
    'CANDLE_INTERVAL': '1min',  # 1-minute candles
    'TIMEZONE': 'UTC',

    # Feature engineering parameters
    'IMBALANCE_WINDOW': 10,  # seconds for tick imbalance calculation
    'VOLATILITY_WINDOW': 5,  # minutes for rolling volatility

    # ML parameters
    'TEST_SIZE': 0.1,  # 10% for testing
    'CV_FOLDS': 5,
    'RANDOM_STATE': 42,

    # Backtest parameters
    'INITIAL_CAPITAL': 10000,  # USD
    'POSITION_SIZE': 1000,     # USD per trade
}

# Create output directory (no error if it already exists)
Path(CONFIG['OUTPUT_DIR']).mkdir(parents=True, exist_ok=True)

print("⚙️ Configuration loaded")
print(f"📁 Data path: {CONFIG['DATA_PATH']}")
print(f"💾 Output directory: {CONFIG['OUTPUT_DIR']}")

# =============================================================================
# 📁 1. DATA LOAD & PREPROCESSING
# =============================================================================

# In [None]:
def load_tick_data(file_path):
    """
    Load tick data from a .csv, .csv.gz or .parquet file.

    Args:
        file_path: Path to the file, as a string or any ``os.PathLike``
            (e.g. ``pathlib.Path``). The extension is matched
            case-insensitively.

    Returns:
        DataFrame containing at least the columns
        ``timestamp``, ``price``, ``amount`` and ``side``.

    Raises:
        ValueError: If the extension is not supported or a required column
            is missing from the file.
    """
    import os  # local import keeps the function self-contained

    print(f"📥 Loading tick data from: {file_path}")

    # Normalise once so that Path objects and upper-case extensions both work.
    path_str = os.fspath(file_path)
    lowered = path_str.lower()

    # Determine file type and load accordingly
    if lowered.endswith('.parquet'):
        df = pd.read_parquet(path_str)
    elif lowered.endswith('.csv.gz'):
        df = pd.read_csv(path_str, compression='gzip')
    elif lowered.endswith('.csv'):
        df = pd.read_csv(path_str)
    else:
        raise ValueError("Unsupported file format. Use .csv, .csv.gz, or .parquet")

    # Fail early with a readable message instead of a KeyError further down.
    required = ('timestamp', 'price', 'amount', 'side')
    missing = [column for column in required if column not in df.columns]
    if missing:
        raise ValueError(
            f"Tick data is missing required column(s): {missing}. "
            f"Found columns: {list(df.columns)}"
        )

    print(f"✅ Loaded {len(df):,} tick records")
    print(f"📊 Columns: {list(df.columns)}")
    print(f"📅 Date range: {df['timestamp'].min()} to {df['timestamp'].max()}")

    return df

def preprocess_tick_data(df):
    """
    Clean and preprocess tick data without modifying the caller's frame.

    Steps:
        * ``timestamp`` -> timezone-aware UTC datetimes (numeric values are
          interpreted as Unix milliseconds, everything else is parsed).
        * rows sorted by ``timestamp`` with a stable sort, so ticks sharing a
          timestamp keep their original relative order.
        * ``price`` / ``amount`` coerced to numeric (unparseable -> NaN).
        * non-numeric ``side`` mapped to 1 (buy) / 0 (sell); matching is
          case- and whitespace-insensitive, unknown labels become NaN.
        * rows containing any NaN are dropped and the index is reset.

    Args:
        df: DataFrame with columns ``timestamp``, ``price``, ``amount``, ``side``.

    Returns:
        A new, cleaned DataFrame with a fresh 0..n-1 index.
    """
    print("🔧 Preprocessing tick data...")

    # Work on a copy: the column assignments below must not leak to the caller.
    df = df.copy()

    # Convert timestamp to datetime
    raw_ts = df['timestamp']
    if pd.api.types.is_numeric_dtype(raw_ts):
        # Assume Unix timestamp in milliseconds
        df['timestamp'] = pd.to_datetime(raw_ts, unit='ms')
    elif not pd.api.types.is_datetime64_any_dtype(raw_ts):
        # Strings / objects: parse straight to UTC (naive values are taken as UTC).
        df['timestamp'] = pd.to_datetime(raw_ts, utc=True)

    # Ensure UTC timezone
    if df['timestamp'].dt.tz is None:
        df['timestamp'] = df['timestamp'].dt.tz_localize('UTC')
    else:
        df['timestamp'] = df['timestamp'].dt.tz_convert('UTC')

    # Sort by timestamp; stable so same-timestamp ticks keep arrival order.
    df = df.sort_values('timestamp', kind='stable').reset_index(drop=True)

    # Convert price and amount to numeric
    df['price'] = pd.to_numeric(df['price'], errors='coerce')
    df['amount'] = pd.to_numeric(df['amount'], errors='coerce')

    # Clean side column (buy/sell or 1/0)
    if not pd.api.types.is_numeric_dtype(df['side']):
        side_codes = {'buy': 1, 'sell': 0, 'b': 1, 's': 0}
        normalised = df['side'].astype(str).str.strip().str.lower()
        df['side'] = normalised.map(side_codes)

    # Remove any NaN values
    initial_len = len(df)
    df = df.dropna().reset_index(drop=True)
    final_len = len(df)

    if initial_len != final_len:
        print(f"⚠️  Removed {initial_len - final_len:,} rows with NaN values")

    print(f"✅ Preprocessed {len(df):,} tick records")
    return df

# For demo purposes, we'll create synthetic data if the file doesn't exist
def create_synthetic_data():
    """
    Create one day of synthetic tick data for demonstration.

    Tick arrival gaps are drawn from an exponential distribution with a mean
    of 100 ms, prices follow a Gaussian random walk around 45,000, trade
    sizes are exponential and sides carry a slight buy bias.

    The draws use NumPy's global random state, so results are reproducible
    only if the caller seeds it beforehand.

    Returns:
        DataFrame with columns ``timestamp`` (UTC, non-decreasing, strictly
        before the end of the simulated day), ``price``, ``amount`` and
        ``side`` (1 = buy, 0 = sell).
    """
    print("🎭 Creating synthetic tick data for demonstration...")

    # Generate 1 day of synthetic tick data
    start_time = pd.Timestamp('2024-01-01 00:00:00', tz='UTC')
    end_time = start_time + pd.Timedelta(days=1)

    # Base price around 45000 USD
    base_price = 45000.0

    # Generate arrival offsets in bulk instead of one pd.Timedelta at a time.
    mean_gap_ms = 100  # Average 100ms between ticks
    horizon_ms = (end_time - start_time) / pd.Timedelta(milliseconds=1)

    offset_chunks = []
    elapsed_ms = 0.0
    while elapsed_ms < horizon_ms:
        # Slightly over-provision so one pass almost always covers the day;
        # the loop only repeats if the draw happened to fall short.
        n_draws = int((horizon_ms - elapsed_ms) / mean_gap_ms * 1.1) + 1000
        chunk = elapsed_ms + np.cumsum(np.random.exponential(mean_gap_ms, n_draws))
        offset_chunks.append(chunk)
        elapsed_ms = chunk[-1]

    offsets_ms = np.concatenate(offset_chunks)
    # Keep only ticks that fall inside the simulated day.
    offsets_ms = offsets_ms[offsets_ms < horizon_ms]
    timestamps = start_time + pd.to_timedelta(offsets_ms, unit='ms')

    n_ticks = len(timestamps)
    print(f"📊 Generated {n_ticks:,} synthetic ticks")

    # Generate prices as a plain Gaussian random walk
    price_changes = np.random.normal(0, 0.1, n_ticks)  # Small price changes
    prices = base_price + np.cumsum(price_changes)

    # Generate amounts (trade sizes)
    amounts = np.random.exponential(0.5, n_ticks)  # Exponential distribution for trade sizes

    # Generate sides (buy/sell) with slight bias
    sides = np.random.choice([0, 1], n_ticks, p=[0.48, 0.52])  # Slight buy bias

    # Create DataFrame
    synthetic_df = pd.DataFrame({
        'timestamp': timestamps,
        'price': prices,
        'amount': amounts,
        'side': sides
    })

    return synthetic_df

# Load data: use the configured file when it exists, otherwise (or on any
# loading error) fall back to generated demo ticks.
try:
    data_file_present = os.path.exists(CONFIG['DATA_PATH'])
    if not data_file_present:
        print(f"⚠️  File not found: {CONFIG['DATA_PATH']}")
        print("🎭 Using synthetic data for demonstration")
    tick_data = (
        load_tick_data(CONFIG['DATA_PATH'])
        if data_file_present
        else create_synthetic_data()
    )
except Exception as e:
    # Deliberate best-effort: the demo should still run when loading fails.
    print(f"❌ Error loading data: {e}")
    print("🎭 Using synthetic data for demonstration")
    tick_data = create_synthetic_data()

# Preprocess the data
tick_data = preprocess_tick_data(tick_data)

# Display basic statistics
price_low, price_high = tick_data['price'].min(), tick_data['price'].max()
mean_amount = tick_data['amount'].mean()
buy_share = tick_data['side'].mean()  # mean of the 1/0 side flag

print("\n📈 Basic Statistics:")
print(f"Price range: ${price_low:.2f} - ${price_high:.2f}")
print(f"Average tick size: ${mean_amount:.4f}")
print(f"Buy/Sell ratio: {buy_share:.3f}")


# =============================================================================
# 📊 2. COMPUTE PER-MINUTE HIGH/LOW TICK PATHS
# =============================================================================

# In [None]:
def compute_minute_candles(tick_data):
    """
    Aggregate ticks into OHLCV candles at CONFIG['CANDLE_INTERVAL'].

    Returns a DataFrame with columns ``minute`` (candle start), ``open``,
    ``high``, ``low``, ``close``, ``volume`` (summed amount) and
    ``tick_count``. Intervals without any tick are dropped.
    """
    print("🕐 Computing 1-minute candle opens...")

    # Bucket ticks by candle interval using the timestamp as the index.
    buckets = tick_data.set_index('timestamp').groupby(
        pd.Grouper(freq=CONFIG['CANDLE_INTERVAL'])
    )

    # Named aggregation yields the flat column names directly.
    ohlcv = buckets.agg(
        open=('price', 'first'),
        high=('price', 'max'),
        low=('price', 'min'),
        close=('price', 'last'),
        volume=('amount', 'sum'),
        tick_count=('side', 'count'),
    ).dropna()  # empty intervals have NaN prices and are removed here

    minute_candles = ohlcv.reset_index().rename(columns={'timestamp': 'minute'})

    print(f"✅ Generated {len(minute_candles)} 1-minute candles")
    return minute_candles

def compute_tick_paths(tick_data, candles):
    """
    For each candle, find when the minute's high and low are first touched.

    The ticks of a candle are those with
    ``minute <= timestamp < minute + 1 minute``. Ticks are located with a
    binary search on the (sorted) timestamps instead of a full-table mask per
    candle, and the first touch is found with vectorised comparisons.

    Args:
        tick_data: DataFrame with ``timestamp``, ``price`` and ``amount``.
            If it is not sorted by ``timestamp`` it is stably sorted first.
        candles: DataFrame with ``minute``, ``open``, ``high`` and ``low``.

    Returns:
        DataFrame with one row per candle that has ticks, with columns
        ``minute``, ``P0`` (open), ``high``, ``low``, ``min_d_high`` /
        ``min_d_low`` (absolute distance from open to high / low),
        ``t_high`` / ``t_low`` (seconds from minute start to the first touch;
        60.0 if never touched), ``up_first`` (1 if the high was touched
        strictly before the low, else 0), ``tick_count`` and ``volume``.
        The columns are present even when the result is empty.
    """
    print("🛤️  Computing tick paths for each minute...")

    result_columns = [
        'minute', 'P0', 'high', 'low', 'min_d_high', 'min_d_low',
        't_high', 't_low', 'up_first', 'tick_count', 'volume',
    ]

    # Nothing to do: still return a frame with the expected schema so that
    # downstream column lookups do not fail with KeyError.
    if len(candles) == 0 or len(tick_data) == 0:
        print("✅ Computed tick paths for 0 minutes")
        return pd.DataFrame(columns=result_columns)

    # Binary search below requires time-ordered ticks.
    ticks = tick_data
    if not ticks['timestamp'].is_monotonic_increasing:
        ticks = ticks.sort_values('timestamp', kind='stable')

    tick_times = pd.DatetimeIndex(ticks['timestamp'])
    tick_prices = ticks['price'].to_numpy()
    tick_amounts = ticks['amount']
    one_minute = pd.Timedelta(minutes=1)

    results = []

    candle_rows = zip(candles['minute'], candles['open'], candles['high'], candles['low'])
    for minute_start, P0, high_price, low_price in candle_rows:
        # Positional slice [lo, hi) of the ticks inside this minute.
        lo = tick_times.searchsorted(minute_start, side='left')
        hi = tick_times.searchsorted(minute_start + one_minute, side='left')
        if hi <= lo:
            continue

        window_prices = tick_prices[lo:hi]
        high_hits = np.flatnonzero(window_prices >= high_price)
        low_hits = np.flatnonzero(window_prices <= low_price)
        high_reached = high_hits.size > 0
        low_reached = low_hits.size > 0

        if not (high_reached or low_reached):
            # Neither extreme touched inside the window (candle and ticks disagree).
            continue

        # Seconds from the start of the minute to the first touch;
        # 60.0 (end of minute) when the level was never touched.
        if high_reached:
            t_high = (tick_times[lo + high_hits[0]] - minute_start).total_seconds()
        else:
            t_high = 60.0
        if low_reached:
            t_low = (tick_times[lo + low_hits[0]] - minute_start).total_seconds()
        else:
            t_low = 60.0

        # High counts as "first" only when it is touched strictly earlier
        # than the low (ties resolve to 0), or when the low is never touched.
        if high_reached and low_reached:
            up_first = 1 if t_high < t_low else 0
        else:
            up_first = 1 if high_reached else 0

        results.append({
            'minute': minute_start,
            'P0': P0,
            'high': high_price,
            'low': low_price,
            'min_d_high': abs(high_price - P0),
            'min_d_low': abs(low_price - P0),
            't_high': t_high,
            't_low': t_low,
            'up_first': up_first,
            'tick_count': int(hi - lo),
            'volume': tick_amounts.iloc[lo:hi].sum()
        })

    results_df = pd.DataFrame(results, columns=result_columns)
    print(f"✅ Computed tick paths for {len(results_df)} minutes")

    return results_df

# Build the per-minute candles, then the tick path table derived from them.
candles = compute_minute_candles(tick_data)
tick_paths = compute_tick_paths(tick_data, candles)

# Summary statistics of the tick path table
print("\n📊 Tick Path Statistics:")
print(f"Up first ratio: {tick_paths['up_first'].mean():.3f}")
print(f"Average time to high: {tick_paths['t_high'].mean():.1f}s")
print(f"Average time to low: {tick_paths['t_low'].mean():.1f}s")
print(f"Average distance to high: ${tick_paths['min_d_high'].mean():.2f}")
print(f"Average distance to low: ${tick_paths['min_d_low'].mean():.2f}")

# 2x2 overview figure
fig, axes = plt.subplots(2, 2, figsize=(15, 10))
ax_dist_high, ax_dist_low = axes[0]
ax_timing, ax_hourly = axes[1]

# Top row: histograms of the distance from the open to the high / low
histogram_specs = (
    (ax_dist_high, 'min_d_high', 'green', 'Distribution of Distance to High'),
    (ax_dist_low, 'min_d_low', 'red', 'Distribution of Distance to Low'),
)
for panel, column, colour, heading in histogram_specs:
    panel.hist(tick_paths[column], bins=50, alpha=0.7, color=colour)
    panel.set_title(heading)
    panel.set_xlabel('Distance (USD)')
    panel.set_ylabel('Frequency')

# Bottom left: time to high against time to low, with the diagonal as reference
ax_timing.scatter(tick_paths['t_high'], tick_paths['t_low'], alpha=0.5)
ax_timing.set_title('Time to High vs Time to Low')
ax_timing.set_xlabel('Time to High (seconds)')
ax_timing.set_ylabel('Time to Low (seconds)')
ax_timing.plot([0, 60], [0, 60], 'r--', alpha=0.5)

# Bottom right: share of "high first" minutes per hour of day.
# Note: this adds an 'hour' column to tick_paths.
tick_paths['hour'] = tick_paths['minute'].dt.hour
hourly_up_first = tick_paths.groupby('hour')['up_first'].mean()
ax_hourly.bar(hourly_up_first.index, hourly_up_first.values, alpha=0.7)
ax_hourly.set_title('Up First Ratio by Hour')
ax_hourly.set_xlabel('Hour of Day')
ax_hourly.set_ylabel('Up First Ratio')

plt.tight_layout()
plt.show()

# =============================================================================
# 🧠 3. FEATURE ENGINEERING
# =============================================================================

# In [None]:
def engineer_features(tick_data, tick_paths, imbalance_window=None, volatility_window=None):
    """
    Create features for the machine learning model.

    Added columns:
        * time encodings: ``hour``, ``minute_of_hour``, ``day_of_week`` and
          their sin/cos pairs;
        * early order-flow, measured on ticks from the minute start up to and
          including ``imbalance_window`` seconds: ``tick_imbalance``,
          ``volume_imbalance``, ``early_tick_count``;
        * whole-minute descriptive statistics: ``avg_tick_size`` and
          ``intra_minute_volatility``. These cover the full minute and are
          therefore only known once the minute has ended;
        * ``rolling_volatility``, ``rolling_volume``, ``rolling_tick_count``:
          means over the ``volatility_window`` minutes *preceding* each row
          (the current minute is excluded so the value is available at the
          open);
        * ``price_change_1``, ``price_change_5``, ``price_position`` built from
          the open price ``P0``.

    Args:
        tick_data: DataFrame with ``timestamp``, ``price``, ``amount``,
            ``side`` (1 = buy, 0 = sell). Stably sorted by ``timestamp`` first
            if it is not already ordered.
        tick_paths: DataFrame with at least ``minute``, ``P0``, ``volume`` and
            ``tick_count``; its index is ignored.
        imbalance_window: Seconds of early ticks used for the imbalance
            features. Defaults to ``CONFIG['IMBALANCE_WINDOW']``.
        volatility_window: Number of minutes in the rolling means. Defaults to
            ``CONFIG['VOLATILITY_WINDOW']``.

    Returns:
        A new DataFrame sorted by ``minute`` with a 0..n-1 index and all
        remaining NaN values replaced by 0.

    Raises:
        ValueError: If ``tick_paths`` has no rows.
    """
    print("🔬 Engineering features...")

    if len(tick_paths) == 0:
        raise ValueError("tick_paths is empty; there are no minutes to build features for")

    if imbalance_window is None:
        imbalance_window = CONFIG['IMBALANCE_WINDOW']
    if volatility_window is None:
        volatility_window = CONFIG['VOLATILITY_WINDOW']

    # Fresh 0..n-1 index so the per-minute features computed below line up
    # row-for-row; reset_index returns a new frame, the input is untouched.
    features_df = tick_paths.reset_index(drop=True)

    # Time-based features
    features_df['hour'] = features_df['minute'].dt.hour
    features_df['minute_of_hour'] = features_df['minute'].dt.minute
    features_df['day_of_week'] = features_df['minute'].dt.dayofweek

    # Cyclical encoding for time features
    features_df['hour_sin'] = np.sin(2 * np.pi * features_df['hour'] / 24)
    features_df['hour_cos'] = np.cos(2 * np.pi * features_df['hour'] / 24)
    features_df['minute_sin'] = np.sin(2 * np.pi * features_df['minute_of_hour'] / 60)
    features_df['minute_cos'] = np.cos(2 * np.pi * features_df['minute_of_hour'] / 60)
    features_df['dow_sin'] = np.sin(2 * np.pi * features_df['day_of_week'] / 7)
    features_df['dow_cos'] = np.cos(2 * np.pi * features_df['day_of_week'] / 7)

    print("⏰ Added time-based features")

    # Tick imbalance features
    print("⚖️  Computing tick imbalance features...")

    # Time-ordered ticks allow O(log n) window lookups instead of a
    # full-table boolean mask for every minute.
    ticks = tick_data
    if not ticks['timestamp'].is_monotonic_increasing:
        ticks = ticks.sort_values('timestamp', kind='stable')
    tick_times = pd.DatetimeIndex(ticks['timestamp'])

    one_minute = pd.Timedelta(minutes=1)
    early_span = pd.Timedelta(seconds=imbalance_window)

    imbalance_features = []

    for minute_start in features_df['minute']:
        # Ticks of this minute: [minute_start, minute_start + 1 minute)
        lo = tick_times.searchsorted(minute_start, side='left')
        hi = tick_times.searchsorted(minute_start + one_minute, side='left')
        minute_ticks = ticks.iloc[lo:hi]

        # Early ticks: timestamp <= minute_start + window (inclusive),
        # never reaching beyond the minute itself.
        early_hi = min(tick_times.searchsorted(minute_start + early_span, side='right'), hi)
        early_ticks = ticks.iloc[lo:early_hi]

        total_ticks = len(early_ticks)
        if total_ticks > 0:
            is_buy = early_ticks['side'] == 1
            is_sell = early_ticks['side'] == 0

            imbalance_ratio = (is_buy.sum() - is_sell.sum()) / total_ticks

            buy_volume = early_ticks.loc[is_buy, 'amount'].sum()
            sell_volume = early_ticks.loc[is_sell, 'amount'].sum()
            traded_volume = buy_volume + sell_volume
            volume_imbalance = (buy_volume - sell_volume) / traded_volume if traded_volume > 0 else 0
        else:
            imbalance_ratio = 0
            volume_imbalance = 0

        # Average tick size over the whole minute
        avg_tick_size = minute_ticks['amount'].mean() if len(minute_ticks) > 0 else 0

        # Price volatility within the minute (sample std needs >= 2 ticks)
        price_std = minute_ticks['price'].std() if len(minute_ticks) > 1 else 0

        imbalance_features.append({
            'tick_imbalance': imbalance_ratio,
            'volume_imbalance': volume_imbalance,
            'early_tick_count': total_ticks,
            'avg_tick_size': avg_tick_size,
            'intra_minute_volatility': price_std
        })

    # Same index as features_df, so the concat is a pure column append.
    imbalance_df = pd.DataFrame(imbalance_features, index=features_df.index)
    features_df = pd.concat([features_df, imbalance_df], axis=1)

    print("✅ Added tick imbalance features")

    # Rolling features
    print("📊 Computing rolling features...")

    # Sort by time for rolling calculations
    features_df = features_df.sort_values('minute').reset_index(drop=True)

    # shift(1) drops the current minute from every window: whole-minute
    # volatility / volume / tick count are not known at the open, so including
    # them would leak future information into the features.
    def _lagged_rolling_mean(column):
        return features_df[column].shift(1).rolling(
            window=volatility_window, min_periods=1
        ).mean()

    # Rolling volatility
    features_df['rolling_volatility'] = _lagged_rolling_mean('intra_minute_volatility')

    # Rolling volume
    features_df['rolling_volume'] = _lagged_rolling_mean('volume')

    # Rolling tick count
    features_df['rolling_tick_count'] = _lagged_rolling_mean('tick_count')

    # Price momentum features (open-to-open, so known at the open)
    features_df['price_change_1'] = features_df['P0'].pct_change(1)
    features_df['price_change_5'] = features_df['P0'].pct_change(5)

    # Relative position within recent range; a flat range (0/0) maps to 0.5
    recent_low = features_df['P0'].rolling(10, min_periods=1).min()
    recent_high = features_df['P0'].rolling(10, min_periods=1).max()
    features_df['price_position'] = (
        (features_df['P0'] - recent_low) / (recent_high - recent_low)
    ).fillna(0.5)

    print("✅ Added rolling features")

    # Fill any remaining NaN values (e.g. the first rows of lagged features)
    features_df = features_df.fillna(0)

    print(f"🎯 Feature engineering complete. Shape: {features_df.shape}")
    print(f"📋 Features: {features_df.columns.tolist()}")

    return features_df

# Engineer features
features_df = engineer_features(tick_data, tick_paths)

# Define feature columns (exclude target and metadata).
#
# Only columns that do not depend on how the predicted minute finishes may be
# used as predictors. The following columns stay in features_df (the backtest
# reads some of them) but are deliberately NOT model inputs:
#   * 't_high', 't_low'         - the target 'up_first' is derived from comparing
#                                 these two, so they give the answer away;
#   * 'min_d_high', 'min_d_low' - distances to the minute's realised high / low;
#   * 'tick_count', 'volume',
#     'avg_tick_size',
#     'intra_minute_volatility' - statistics over the whole minute, only known
#                                 after the minute has ended.
# NOTE(review): the rolling_* columns are valid predictors only if they are
# built from earlier minutes; confirm that in engineer_features.
feature_columns = [
    'hour_sin', 'hour_cos', 'minute_sin', 'minute_cos', 'dow_sin', 'dow_cos',
    'tick_imbalance', 'volume_imbalance', 'early_tick_count',
    'rolling_volatility', 'rolling_volume', 'rolling_tick_count',
    'price_change_1', 'price_change_5', 'price_position',
]

X = features_df[feature_columns]
y = features_df['up_first']

print(f"🎯 Features shape: {X.shape}")
print(f"🏷️  Target distribution: {y.value_counts().to_dict()}")

# =============================================================================
# 🤖 4. MACHINE LEARNING CLASSIFIER
# =============================================================================

# In [None]:
def train_xgboost_model(X, y):
    """
    Tune and fit an XGBoost classifier on a chronological train/test split.

    The last CONFIG['TEST_SIZE'] fraction of rows is held out as the test set.
    Hyperparameters are chosen by a 20-draw randomized search scored by ROC AUC
    with TimeSeriesSplit cross-validation on the training part only.

    Returns:
        Tuple ``(best_model, X_train, X_test, y_train, y_test, y_test_pred,
        y_test_proba)`` where ``y_test_proba`` is the predicted probability of
        class 1 on the test set.
    """
    print("🚀 Training XGBoost model...")

    # Cross-validation splitter that keeps temporal order
    cv_splitter = TimeSeriesSplit(n_splits=CONFIG['CV_FOLDS'])

    # Chronological hold-out: everything from `cutoff` onwards is the test set
    cutoff = int(len(X) * (1 - CONFIG['TEST_SIZE']))
    X_train, y_train = X.iloc[:cutoff], y.iloc[:cutoff]
    X_test, y_test = X.iloc[cutoff:], y.iloc[cutoff:]

    print(f"📊 Train set: {X_train.shape[0]} samples")
    print(f"📊 Test set: {X_test.shape[0]} samples")

    # Candidate values sampled by the randomized search
    search_space = dict(
        n_estimators=[100, 200, 300],
        max_depth=[3, 5, 7],
        learning_rate=[0.01, 0.1, 0.2],
        subsample=[0.8, 0.9, 1.0],
        colsample_bytree=[0.8, 0.9, 1.0],
        min_child_weight=[1, 3, 5],
    )

    # Untuned base estimator
    base_classifier = xgb.XGBClassifier(
        random_state=CONFIG['RANDOM_STATE'],
        eval_metric='logloss',
        use_label_encoder=False
    )

    print("🔍 Performing hyperparameter tuning...")

    search = RandomizedSearchCV(
        estimator=base_classifier,
        param_distributions=search_space,
        n_iter=20,  # Number of parameter settings sampled
        cv=cv_splitter,
        scoring='roc_auc',
        n_jobs=-1,
        random_state=CONFIG['RANDOM_STATE'],
        verbose=1
    )
    search.fit(X_train, y_train)

    # Estimator refit with the best parameter set
    best_model = search.best_estimator_

    print(f"🏆 Best parameters: {search.best_params_}")
    print(f"🎯 Best CV score: {search.best_score_:.4f}")

    # Predictions on both partitions
    train_labels_pred = best_model.predict(X_train)
    y_test_pred = best_model.predict(X_test)
    y_test_proba = best_model.predict_proba(X_test)[:, 1]

    print("\n📈 Model Performance:")
    print("=" * 50)

    # In-sample accuracy, for comparison with the hold-out numbers
    print(f"Training Accuracy: {accuracy_score(y_train, train_labels_pred):.4f}")

    # Hold-out metrics (all computed before any of them is printed)
    holdout_scores = (
        ("Test Accuracy", accuracy_score(y_test, y_test_pred)),
        ("Test Precision", precision_score(y_test, y_test_pred)),
        ("Test Recall", recall_score(y_test, y_test_pred)),
        ("Test AUC", roc_auc_score(y_test, y_test_proba)),
    )
    for label, score in holdout_scores:
        print(f"{label}: {score:.4f}")

    print("\n📋 Detailed Classification Report:")
    print(classification_report(y_test, y_test_pred))

    return best_model, X_train, X_test, y_train, y_test, y_test_pred, y_test_proba

# Fit the classifier and keep the split and the test-set predictions around
model, X_train, X_test, y_train, y_test, y_test_pred, y_test_proba = train_xgboost_model(X, y)

# Rank features by the importance scores exposed by the fitted model
feature_importance = pd.DataFrame(
    {'feature': feature_columns, 'importance': model.feature_importances_}
).sort_values('importance', ascending=False)

print("\n🔝 Top 10 Most Important Features:")
print(feature_importance.head(10).to_string(index=False))

# Horizontal bar chart of the 15 highest-ranked features, strongest on top
top_features = feature_importance.head(15)
bar_positions = range(len(top_features))

plt.figure(figsize=(12, 8))
plt.barh(bar_positions, top_features['importance'])
plt.yticks(bar_positions, top_features['feature'])
plt.xlabel('Feature Importance')
plt.title('Top 15 Feature Importance (XGBoost)')
plt.gca().invert_yaxis()  # first (most important) row at the top
plt.tight_layout()
plt.show()


# =============================================================================
# 📈 5. SHAP ANALYSIS
# =============================================================================

# In [None]:
def perform_shap_analysis(model, X_train, X_test, feature_columns):
    """
    Perform SHAP analysis for model explainability.

    Draws a SHAP summary plot, a SHAP bar plot and dependence plots for the
    three features ranked highest by ``model.feature_importances_``.

    Args:
        model: Fitted tree model accepted by ``shap.TreeExplainer`` and
            exposing ``feature_importances_``.
        X_train: Unused; kept so existing call sites keep working.
        X_test: Feature frame to explain; at most 1000 rows are sampled
            (seeded with CONFIG['RANDOM_STATE']).
        feature_columns: Feature names, in the column order the model was
            trained on.

    Returns:
        Tuple ``(shap_values, explainer)`` for the sampled rows.
    """
    print("🔍 Performing SHAP analysis...")

    # Create SHAP explainer
    explainer = shap.TreeExplainer(model)

    # Calculate SHAP values for test set (sample if too large)
    sample_size = min(1000, len(X_test))
    X_sample = X_test.sample(n=sample_size, random_state=CONFIG['RANDOM_STATE'])
    shap_values = explainer.shap_values(X_sample)

    print(f"📊 SHAP values calculated for {sample_size} samples")

    # SHAP summary plot
    plt.figure(figsize=(12, 8))
    shap.summary_plot(shap_values, X_sample, feature_names=feature_columns, show=False)
    plt.title('SHAP Summary Plot - Feature Impact on Predictions')
    plt.tight_layout()
    plt.show()

    # SHAP feature importance
    plt.figure(figsize=(10, 8))
    shap.summary_plot(shap_values, X_sample, feature_names=feature_columns, plot_type="bar", show=False)
    plt.title('SHAP Feature Importance')
    plt.tight_layout()
    plt.show()

    # Feature interaction analysis (top features only).
    # The ranking comes from the model passed in, not from a module-level
    # table, so the function also works with any other fitted model.
    importances = np.asarray(model.feature_importances_)
    ranked = np.argsort(-importances, kind='stable')
    top_5_indices = [int(i) for i in ranked[:5]]
    top_5_features = [feature_columns[i] for i in top_5_indices]

    print(f"🔗 Analyzing interactions for top 5 features: {top_5_features}")

    for feat_idx in top_5_indices[:3]:  # Show top 3 to avoid clutter
        plt.figure(figsize=(10, 6))
        shap.dependence_plot(feat_idx, shap_values, X_sample, feature_names=feature_columns, show=False)
        plt.title(f'SHAP Dependence Plot - {feature_columns[feat_idx]}')
        plt.tight_layout()
        plt.show()

    return shap_values, explainer

# Perform SHAP analysis on the fitted model and bind the returned SHAP values
# and explainer to module-level names.
shap_values, explainer = perform_shap_analysis(model, X_train, X_test, feature_columns)

# =============================================================================
# 💰 6. BACKTEST STRATEGY
# =============================================================================

# In [None]:
def backtest_strategy(features_df, model, X, confidence_threshold=0.55):
    """
    Backtest the trading strategy.

    For every row after the first CONFIG['VOLATILITY_WINDOW'] rows whose
    predicted probability is at least ``confidence_threshold - 0.5`` away from
    0.5, one trade of CONFIG['POSITION_SIZE'] USD is opened at ``P0``: long
    when the model predicts "high first", short otherwise. The trade is booked
    as take-profit when the prediction matches ``up_first`` and as stop-loss
    otherwise, with a price move of ``min_d_high`` (high first) or
    ``min_d_low`` (low first).

    NOTE(review): ``min_d_high`` / ``min_d_low`` appear to be the realised
    distances of the very minute being traded, which would not be known at
    entry - treat the resulting PnL as optimistic and confirm with the data
    preparation code.

    Args:
        features_df: Frame with ``P0``, ``up_first``, ``min_d_high`` and
            ``min_d_low``; its index is ignored (rows are used in order).
        model: Fitted classifier with ``predict`` and ``predict_proba``.
        X: Feature matrix with one row per row of ``features_df``.
        confidence_threshold: Minimum confidence needed to trade, expressed
            as a probability above 0.5 (default 0.55, applied symmetrically).

    Returns:
        Tuple ``(backtest_df, metrics)``. ``backtest_df`` is a copy of
        ``features_df`` with a 0..n-1 index plus prediction and trade columns
        (untraded rows keep position 0, zeros and an empty outcome).
        ``metrics`` is a dict with the keys ``total_trades``,
        ``winning_trades``, ``losing_trades``, ``win_rate``, ``total_pnl``,
        ``max_drawdown``, ``avg_win``, ``avg_loss``, ``profit_factor``
        (gross profit / gross loss), ``sharpe_ratio`` (computed over executed
        trades only) and ``return_on_capital``.
    """
    print("💰 Running backtest...")

    # Positional frame: a fresh 0..n-1 index keeps row order and column
    # assignments consistent regardless of the index of the input.
    backtest_df = features_df.reset_index(drop=True).copy()
    backtest_df['predicted_up_first'] = model.predict(X)
    backtest_df['prediction_proba'] = model.predict_proba(X)[:, 1]

    # Backtest parameters
    initial_capital = CONFIG['INITIAL_CAPITAL']
    position_size = CONFIG['POSITION_SIZE']
    warmup_rows = CONFIG['VOLATILITY_WINDOW']

    print("🔄 Simulating trades...")

    n_rows = len(backtest_df)
    proba = backtest_df['prediction_proba'].to_numpy(dtype=float)
    predicted_up = backtest_df['predicted_up_first'].to_numpy() == 1
    actual_up = backtest_df['up_first'].to_numpy() == 1
    entry = backtest_df['P0'].to_numpy(dtype=float)
    d_high = backtest_df['min_d_high'].to_numpy(dtype=float)
    d_low = backtest_df['min_d_low'].to_numpy(dtype=float)

    # A row is traded once the warm-up rows are past and the probability is
    # far enough from 0.5 in either direction.
    min_edge = confidence_threshold - 0.5
    traded = (np.arange(n_rows) >= warmup_rows) & ~(np.abs(proba - 0.5) < min_edge)

    # Long (+1) when "high first" is predicted, short (-1) otherwise.
    side = np.where(predicted_up, 1, -1)
    tp_level = np.where(predicted_up, entry + d_high, entry - d_low)
    sl_level = np.where(predicted_up, entry - d_low, entry + d_high)

    # The move that is booked is the one that actually happened first; it is
    # a gain when the prediction was right and a loss when it was wrong.
    realised_move = np.where(actual_up, d_high, d_low)
    hit_tp = predicted_up == actual_up
    units = position_size / entry
    pnl = np.where(hit_tp, realised_move, -realised_move) * units

    trade_pnl = np.where(traded, pnl, 0.0)
    running_pnl = np.cumsum(trade_pnl)

    backtest_df['position'] = np.where(traded, side, 0)  # 1 long, -1 short, 0 none
    backtest_df['entry_price'] = np.where(traded, entry, 0.0)
    backtest_df['tp_level'] = np.where(traded, tp_level, 0.0)
    backtest_df['sl_level'] = np.where(traded, sl_level, 0.0)
    backtest_df['trade_pnl'] = trade_pnl
    # Cumulative PnL is recorded on traded rows only; other rows stay 0.
    backtest_df['cumulative_pnl'] = np.where(traded, running_pnl, 0.0)
    backtest_df['trade_outcome'] = np.where(traded, np.where(hit_tp, 'TP', 'SL'), '')

    # Trade statistics (a trade with zero PnL counts as losing)
    executed = trade_pnl[traded]
    total_trades = int(traded.sum())
    winning_trades = int((executed > 0).sum())
    losing_trades = total_trades - winning_trades
    cumulative_pnl = float(running_pnl[-1]) if n_rows > 0 else 0.0

    # Max drawdown relative to the running peak, which starts at 0
    equity = running_pnl[traded]
    if equity.size > 0:
        running_peak = np.maximum(np.maximum.accumulate(equity), 0.0)
        max_drawdown = float(np.max(running_peak - equity))
    else:
        max_drawdown = 0.0

    # Calculate final metrics
    gains = executed[executed > 0]
    losses = executed[executed < 0]
    win_rate = winning_trades / total_trades if total_trades > 0 else 0
    avg_win = gains.mean() if gains.size > 0 else np.nan
    avg_loss = losses.mean() if losses.size > 0 else np.nan

    # Gross profit over gross loss; independent of how zero-PnL trades are counted.
    gross_profit = gains.sum()
    gross_loss = -losses.sum()
    profit_factor = gross_profit / gross_loss if gross_loss > 0 else float('inf')

    # Sharpe ratio over executed trades only (untraded rows would dilute it).
    # Simplified: per-trade PnL scaled by sqrt(252) as if trades were daily returns.
    trade_returns = pd.Series(executed)
    return_spread = trade_returns.std()
    sharpe_ratio = trade_returns.mean() / return_spread * np.sqrt(252) if return_spread > 0 else 0

    print("\n📊 Backtest Results:")
    print("=" * 50)
    print(f"Total Trades: {total_trades:,}")
    print(f"Winning Trades: {winning_trades:,}")
    print(f"Losing Trades: {losing_trades:,}")
    print(f"Win Rate: {win_rate:.2%}")
    print(f"Total PnL: ${cumulative_pnl:.2f}")
    print(f"Max Drawdown: ${max_drawdown:.2f}")
    print(f"Average Win: ${avg_win:.2f}" if not pd.isna(avg_win) else "Average Win: N/A")
    print(f"Average Loss: ${avg_loss:.2f}" if not pd.isna(avg_loss) else "Average Loss: N/A")
    print(f"Profit Factor: {profit_factor:.2f}" if profit_factor != float('inf') else "Profit Factor: ∞")
    print(f"Sharpe Ratio: {sharpe_ratio:.2f}")
    print(f"Return on Capital: {(cumulative_pnl / initial_capital) * 100:.2f}%")

    return backtest_df, {
        'total_trades': total_trades,
        'winning_trades': winning_trades,
        'losing_trades': losing_trades,
        'win_rate': win_rate,
        'total_pnl': cumulative_pnl,
        'max_drawdown': max_drawdown,
        'avg_win': avg_win,
        'avg_loss': avg_loss,
        'profit_factor': profit_factor,
        'sharpe_ratio': sharpe_ratio,
        'return_on_capital': (cumulative_pnl / initial_capital) * 100
    }

# Run backtest on the held-out test rows only: predictions on rows the model
# was fitted on would make the results look better than they are.
# Both frames get a fresh 0..n-1 index so rows line up by position.
holdout_features = features_df.loc[X_test.index].reset_index(drop=True)
backtest_results, metrics = backtest_strategy(holdout_features, model, X_test.reset_index(drop=True))

# Plot backtest results
fig, axes = plt.subplots(3, 2, figsize=(18, 15))

# Rows where a trade was actually opened. Selecting on 'position' (rather than
# on a non-zero PnL) keeps executed trades that closed with exactly zero PnL,
# so the plots agree with the trade counts reported by the backtest.
traded_results = backtest_results[backtest_results['position'] != 0].copy()

# 1. Cumulative PnL curve
if len(traded_results) > 0:
    axes[0, 0].plot(traded_results['minute'], traded_results['cumulative_pnl'], linewidth=2)
    axes[0, 0].set_title('Cumulative PnL Over Time')
    axes[0, 0].set_xlabel('Time')
    axes[0, 0].set_ylabel('Cumulative PnL ($)')
    axes[0, 0].grid(True, alpha=0.3)
    axes[0, 0].axhline(y=0, color='black', linestyle='--', alpha=0.5)

# 2. Trade PnL distribution
trade_pnls = traded_results['trade_pnl'].dropna()
if len(trade_pnls) > 0:
    axes[0, 1].hist(trade_pnls, bins=50, alpha=0.7, color='skyblue', edgecolor='black')
    axes[0, 1].axvline(x=0, color='red', linestyle='--', alpha=0.7)
    axes[0, 1].set_title('Distribution of Trade PnL')
    axes[0, 1].set_xlabel('Trade PnL ($)')
    axes[0, 1].set_ylabel('Frequency')

# 3. Win/Loss by prediction confidence (zero-PnL trades are shown as losses)
if len(traded_results) > 0:
    win_trades = traded_results[traded_results['trade_pnl'] > 0]
    loss_trades = traded_results[traded_results['trade_pnl'] <= 0]

    axes[1, 0].scatter(win_trades['prediction_proba'], win_trades['trade_pnl'],
                      color='green', alpha=0.6, label='Wins', s=20)
    axes[1, 0].scatter(loss_trades['prediction_proba'], loss_trades['trade_pnl'],
                      color='red', alpha=0.6, label='Losses', s=20)
    axes[1, 0].set_title('Trade Outcome vs Prediction Confidence')
    axes[1, 0].set_xlabel('Prediction Probability')
    axes[1, 0].set_ylabel('Trade PnL ($)')
    axes[1, 0].legend()
    axes[1, 0].grid(True, alpha=0.3)
    axes[1, 0].axhline(y=0, color='black', linestyle='--', alpha=0.5)

# 4. Monthly returns heatmap (if we have enough data)
if len(traded_results) > 0:
    # Periods carry no timezone; drop it explicitly before converting.
    traded_results['year_month'] = (
        traded_results['minute'].dt.tz_localize(None).dt.to_period('M')
    )
    monthly_returns = traded_results.groupby('year_month')['trade_pnl'].sum()

    if len(monthly_returns) > 1:
        monthly_df = monthly_returns.reset_index()
        monthly_df['year'] = monthly_df['year_month'].dt.year
        monthly_df['month'] = monthly_df['year_month'].dt.month

        pivot_table = monthly_df.pivot(index='year', columns='month', values='trade_pnl')

        sns.heatmap(pivot_table, annot=True, cmap='RdYlGn', center=0,
                   fmt='.0f', ax=axes[1, 1])
        axes[1, 1].set_title('Monthly Returns Heatmap')
    else:
        axes[1, 1].text(0.5, 0.5, 'Insufficient data\nfor monthly analysis',
                       ha='center', va='center', transform=axes[1, 1].transAxes)
        axes[1, 1].set_title('Monthly Returns Heatmap')

# 5. Drawdown curve (distance below the running maximum of cumulative PnL)
if len(traded_results) > 0:
    traded_results['running_max'] = traded_results['cumulative_pnl'].expanding().max()
    traded_results['drawdown'] = traded_results['cumulative_pnl'] - traded_results['running_max']

    axes[2, 0].fill_between(traded_results['minute'], traded_results['drawdown'], 0,
                           color='red', alpha=0.3)
    axes[2, 0].plot(traded_results['minute'], traded_results['drawdown'], color='red', linewidth=1)
    axes[2, 0].set_title('Drawdown Over Time')
    axes[2, 0].set_xlabel('Time')
    axes[2, 0].set_ylabel('Drawdown ($)')
    axes[2, 0].grid(True, alpha=0.3)

# 6. Rolling win rate
if len(traded_results) > 0:
    window_size = max(50, len(traded_results) // 20)  # Adaptive window size
    traded_results['rolling_wins'] = (traded_results['trade_pnl'] > 0).rolling(
        window=window_size, min_periods=10
    ).mean()

    axes[2, 1].plot(traded_results['minute'], traded_results['rolling_wins'], linewidth=2)
    axes[2, 1].axhline(y=0.5, color='black', linestyle='--', alpha=0.5)
    axes[2, 1].set_title(f'Rolling Win Rate ({window_size} trades)')
    axes[2, 1].set_xlabel('Time')
    axes[2, 1].set_ylabel('Win Rate')
    axes[2, 1].set_ylim(0, 1)
    axes[2, 1].grid(True, alpha=0.3)

plt.tight_layout()
plt.show()

# Per-hour breakdown of the executed trades: count, total / mean PnL,
# mean predicted probability and share of profitable trades.
if len(traded_results) > 0:
    hourly_analysis = (
        traded_results
        .groupby(traded_results['minute'].dt.hour)
        .agg({
            'trade_pnl': ['count', 'sum', 'mean'],
            'prediction_proba': 'mean'
        })
        .round(3)
    )
    hourly_analysis.columns = ['Trade_Count', 'Total_PnL', 'Avg_PnL', 'Avg_Confidence']

    # Win rate = mean of the "PnL > 0" flag within each hour
    is_win = traded_results['trade_pnl'] > 0
    hourly_analysis['Win_Rate'] = (
        is_win.groupby(traded_results['minute'].dt.hour).mean().round(3)
    )

    print("\n🕐 Performance by Hour of Day:")
    print(hourly_analysis)


# =============================================================================
# 💾 7. EXPORT & SAVE RESULTS
# =============================================================================

# In [None]:
def _json_safe(value):
    """
    Convert one metric value into something json.dump can write
    """
    # Missing values are tested first: np.float64('nan') is also an
    # np.floating, so testing the numpy type first lets NaN through and
    # json.dump then emits the bare token NaN, which is not valid JSON.
    # The is_scalar guard keeps pd.isna from returning an array (whose truth
    # value is ambiguous) when a metric holds a list or an ndarray.
    if pd.api.types.is_scalar(value) and pd.isna(value):
        return None
    if isinstance(value, np.bool_):
        return bool(value)
    if isinstance(value, np.integer):
        return int(value)  # keep counts as integers instead of 42.0
    if isinstance(value, np.floating):
        return float(value)
    return value


def save_results(backtest_results, metrics, model, feature_importance):
    """
    Save all results to disk

    Writes five files into CONFIG['OUTPUT_DIR'], all sharing one timestamp
    suffix: the backtest results (parquet), the metrics (JSON), the feature
    importance table (CSV), the model (via model.save_model) and a
    plain-text summary report.

    Args:
        backtest_results: DataFrame with a 'minute' column; written with
            to_parquet and used for the reported data period.
        metrics: dict of metric name -> value. The summary report reads the
            keys total_trades, win_rate, total_pnl, max_drawdown,
            sharpe_ratio and return_on_capital.
        model: object exposing save_model(path).
        feature_importance: DataFrame with 'feature' and 'importance'
            columns; the first 10 rows go into the summary report.

    Returns:
        dict with the Path of every file written, under the keys
        backtest_file, metrics_file, feature_importance_file, model_file
        and summary_file.
    """
    import json

    print("💾 Saving results to disk...")

    output_dir = Path(CONFIG['OUTPUT_DIR'])
    now = datetime.now()
    timestamp = now.strftime("%Y%m%d_%H%M%S")

    # Save backtest results
    backtest_file = output_dir / f"backtest_results_{timestamp}.parquet"
    backtest_results.to_parquet(backtest_file)
    print(f"📊 Backtest results saved to: {backtest_file}")

    # Save model metrics (numpy scalars and missing values made JSON-safe)
    metrics_file = output_dir / f"model_metrics_{timestamp}.json"
    metrics_json = {key: _json_safe(value) for key, value in metrics.items()}
    with open(metrics_file, 'w', encoding='utf-8') as f:
        json.dump(metrics_json, f, indent=2)
    print(f"📈 Model metrics saved to: {metrics_file}")

    # Save feature importance
    feature_importance_file = output_dir / f"feature_importance_{timestamp}.csv"
    feature_importance.to_csv(feature_importance_file, index=False)
    print(f"🔝 Feature importance saved to: {feature_importance_file}")

    # Save model
    model_file = output_dir / f"xgboost_model_{timestamp}.json"
    model.save_model(str(model_file))
    print(f"🤖 Model saved to: {model_file}")

    # Create summary report. All lines are formatted before the file is
    # opened, so a missing metric key cannot leave a truncated report behind.
    summary_file = output_dir / f"trading_strategy_summary_{timestamp}.txt"

    report_lines = [
        "BTC/USDT Tick-Level Micro-Prediction Strategy Report\n",
        "=" * 60 + "\n\n",
        f"Generated on: {now.strftime('%Y-%m-%d %H:%M:%S')}\n",
        f"Data period: {backtest_results['minute'].min()} to {backtest_results['minute'].max()}\n\n",
        "BACKTEST PERFORMANCE SUMMARY:\n",
        "-" * 30 + "\n",
        f"Total Trades: {metrics['total_trades']:,}\n",
        f"Win Rate: {metrics['win_rate']:.2%}\n",
        f"Total PnL: ${metrics['total_pnl']:.2f}\n",
        f"Max Drawdown: ${metrics['max_drawdown']:.2f}\n",
        f"Sharpe Ratio: {metrics['sharpe_ratio']:.2f}\n",
        f"Return on Capital: {metrics['return_on_capital']:.2f}%\n\n",
        "TOP 10 MOST IMPORTANT FEATURES:\n",
        "-" * 35 + "\n",
    ]
    report_lines.extend(
        f"{row['feature']}: {row['importance']:.4f}\n"
        for _, row in feature_importance.head(10).iterrows()
    )
    report_lines.append("\nSTRATEGY CONFIGURATION:\n")
    report_lines.append("-" * 25 + "\n")
    report_lines.extend(f"{key}: {value}\n" for key, value in CONFIG.items())

    with open(summary_file, 'w', encoding='utf-8') as f:
        f.writelines(report_lines)

    print(f"📋 Summary report saved to: {summary_file}")

    print("\n✅ All results saved successfully!")

    return {
        'backtest_file': backtest_file,
        'metrics_file': metrics_file,
        'feature_importance_file': feature_importance_file,
        'model_file': model_file,
        'summary_file': summary_file
    }

# Write every artifact of this run to disk and keep the returned file paths
saved_files = save_results(
    backtest_results=backtest_results,
    metrics=metrics,
    model=model,
    feature_importance=feature_importance,
)

# =============================================================================
# 🎯 FINAL SUMMARY AND RECOMMENDATIONS
# =============================================================================

In [None]:
# Console recap of the whole run: data volume, model quality, backtest
# outcome, suggested follow-ups and where the artifacts were written.
print("\n" + "="*80)
print("🎯 STRATEGY ANALYSIS COMPLETE")
print("="*80)

print("\n📊 PERFORMANCE OVERVIEW:")
print(f"   • Processed {len(tick_data):,} ticks into {len(features_df):,} 1-minute candles")
print(f"   • Trained model on {len(X_train):,} samples, tested on {len(X_test):,} samples")
print(f"   • Executed {metrics['total_trades']:,} trades with {metrics['win_rate']:.1%} win rate")
print(f"   • Generated ${metrics['total_pnl']:.2f} total PnL ({metrics['return_on_capital']:.1f}% return)")

print("\n🔑 KEY INSIGHTS:")
# First row of feature_importance is reported as the top feature
# (assumes the table is sorted by importance, descending - TODO confirm)
most_important_feature = feature_importance.iloc[0]['feature']
print(f"   • Most predictive feature: {most_important_feature}")
print(f"   • Model AUC: {roc_auc_score(y_test, y_test_proba):.3f}")

if metrics['total_trades'] > 0:
    avg_trade_duration = "~1 minute"  # Since we trade every minute
    print(f"   • Average trade duration: {avg_trade_duration}")

    # Edge is judged on realised PnL rather than on win rate: a strategy can
    # win more than half of its trades and still lose money when the losing
    # trades are larger than the winning ones (and vice versa).
    if metrics['total_pnl'] > 0:
        print("   • ✅ Strategy shows positive edge")
    else:
        print("   • ⚠️  Strategy shows negative edge - consider refinement")

print("\n🚀 NEXT STEPS:")
print("   • Test on out-of-sample data from different time periods")
print("   • Implement position sizing based on prediction confidence")
print("   • Add transaction costs and slippage modeling")
print("   • Consider ensemble methods or alternative algorithms")
print("   • Implement live trading with proper risk management")

print(f"\n📁 All results saved to: {CONFIG['OUTPUT_DIR']}")

print("\n" + "="*80)
print("🏁 PIPELINE EXECUTION COMPLETE")
print("="*80)