# LSTM Pipeline for BTC Price Movement Prediction

## Pipeline Overview:
1. **Data Preprocessing**: Feature scaling, correlation analysis, PCA
2. **Sequence Construction**: Sliding windows for temporal context
3. **LSTM Architecture**: Regularized model with dropout and L2
4. **Training Strategy**: Time-series CV, early stopping, LR scheduling
5. **Evaluation**: AUC, Precision, Recall, F1-Score

## 1. Import Libraries and Load Data

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.decomposition import PCA
from sklearn.model_selection import TimeSeriesSplit
from sklearn.metrics import (accuracy_score, precision_score, recall_score, 
                             f1_score, roc_auc_score, classification_report, 
                             confusion_matrix)
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers, regularizers
from tensorflow.keras.callbacks import EarlyStopping, ReduceLROnPlateau, ModelCheckpoint
import warnings
warnings.filterwarnings('ignore')

# Set random seeds for reproducibility
np.random.seed(42)
tf.random.set_seed(42)

print(f"TensorFlow version: {tf.__version__}")
print(f"GPU Available: {len(tf.config.list_physical_devices('GPU')) > 0}")

TensorFlow version: 2.19.0
GPU Available: False


In [2]:
# Load data
df = pd.read_csv('BTC_Cleaned_Data.csv')
df['date'] = pd.to_datetime(df['date'])
df = df.sort_values('date').reset_index(drop=True)

print(f"Dataset shape: {df.shape}")
print(f"Date range: {df['date'].min()} to {df['date'].max()}")
print(f"\nTarget distribution:")
print(df['Target'].value_counts(normalize=True))
df.head()

Dataset shape: (3377, 24)
Date range: 2015-01-31 00:00:00 to 2024-04-29 00:00:00

Target distribution:


KeyError: 'Target'

## 2. Data Preprocessing

### 2.1 Feature Engineering & Selection

In [None]:
# Separate features and target
# Exclude date and target columns
feature_cols = [col for col in df.columns if col not in ['date', 'Target']]
X = df[feature_cols].values
y = df['Target'].values

print(f"Features: {len(feature_cols)}")
print(f"Feature names: {feature_cols}")
print(f"\nFeature shape: {X.shape}")
print(f"Target shape: {y.shape}")

In [None]:
# Check for correlation to identify redundant features
correlation_matrix = df[feature_cols].corr()

# Find highly correlated features (threshold > 0.9)
high_corr_pairs = []
for i in range(len(correlation_matrix.columns)):
    for j in range(i+1, len(correlation_matrix.columns)):
        if abs(correlation_matrix.iloc[i, j]) > 0.9:
            high_corr_pairs.append((
                correlation_matrix.columns[i], 
                correlation_matrix.columns[j], 
                correlation_matrix.iloc[i, j]
            ))

print(f"Highly correlated feature pairs (|corr| > 0.9): {len(high_corr_pairs)}")
for feat1, feat2, corr in high_corr_pairs[:10]:  # Show first 10
    print(f"  {feat1} <-> {feat2}: {corr:.3f}")
    
# Visualize correlation matrix
plt.figure(figsize=(12, 10))
sns.heatmap(correlation_matrix, cmap='coolwarm', center=0, 
            square=True, linewidths=0.5, cbar_kws={"shrink": 0.8})
plt.title('Feature Correlation Matrix')
plt.tight_layout()
plt.show()

### 2.2 Feature Scaling

In [None]:
# Use StandardScaler for LSTM (better for features with outliers)
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

print("Feature scaling complete")
print(f"Scaled data shape: {X_scaled.shape}")
print(f"Sample statistics after scaling:")
print(f"  Mean: {X_scaled.mean(axis=0)[:5]}")  # Show first 5
print(f"  Std: {X_scaled.std(axis=0)[:5]}")    # Show first 5

### 2.3 PCA for Dimensionality Reduction (Optional)

In [None]:
# Apply PCA to reduce dimensionality while retaining 95% variance
USE_PCA = True  # Set to False to skip PCA

if USE_PCA:
    pca = PCA(n_components=0.95, random_state=42)
    X_processed = pca.fit_transform(X_scaled)
    
    print(f"PCA applied:")
    print(f"  Original features: {X_scaled.shape[1]}")
    print(f"  Reduced features: {X_processed.shape[1]}")
    print(f"  Explained variance ratio: {pca.explained_variance_ratio_.sum():.4f}")
    print(f"  Components retained: {pca.n_components_}")
    
    # Plot cumulative explained variance
    plt.figure(figsize=(10, 5))
    plt.plot(np.cumsum(pca.explained_variance_ratio_))
    plt.xlabel('Number of Components')
    plt.ylabel('Cumulative Explained Variance')
    plt.title('PCA - Cumulative Explained Variance')
    plt.grid(True)
    plt.axhline(y=0.95, color='r', linestyle='--', label='95% threshold')
    plt.legend()
    plt.show()
else:
    X_processed = X_scaled
    print("PCA skipped - using all features")

## 3. Sequence Construction (Sliding Windows)

Create temporal sequences for LSTM input. Each sequence contains multiple days of historical data.

In [None]:
def create_sequences(X, y, sequence_length=30):
    """
    Create sequences for LSTM input.
    
    Args:
        X: Feature array (samples, features)
        y: Target array (samples,)
        sequence_length: Number of time steps to look back
        
    Returns:
        X_seq: Sequences (samples, sequence_length, features)
        y_seq: Target values for each sequence
    """
    X_seq, y_seq = [], []
    
    for i in range(sequence_length, len(X)):
        X_seq.append(X[i-sequence_length:i])
        y_seq.append(y[i])
    
    return np.array(X_seq), np.array(y_seq)

# Create sequences with configurable window size
SEQUENCE_LENGTH = 30  # Use 30 days of history (adjustable: 7, 14, 30, 60)

X_sequences, y_sequences = create_sequences(X_processed, y, SEQUENCE_LENGTH)

print(f"Sequence creation complete:")
print(f"  Sequence length: {SEQUENCE_LENGTH} days")
print(f"  Original samples: {len(X_processed)}")
print(f"  Sequence samples: {len(X_sequences)}")
print(f"  Sequence shape: {X_sequences.shape}")
print(f"  Target shape: {y_sequences.shape}")
print(f"\nTarget distribution in sequences:")
print(pd.Series(y_sequences).value_counts(normalize=True))

## 4. Time-Series Train/Validation/Test Split

In [None]:
# Time-series split: 70% train, 15% validation, 15% test
# IMPORTANT: No shuffling to maintain temporal order
train_size = int(0.7 * len(X_sequences))
val_size = int(0.15 * len(X_sequences))

X_train = X_sequences[:train_size]
y_train = y_sequences[:train_size]

X_val = X_sequences[train_size:train_size+val_size]
y_val = y_sequences[train_size:train_size+val_size]

X_test = X_sequences[train_size+val_size:]
y_test = y_sequences[train_size+val_size:]

print("Time-series split completed:")
print(f"  Training samples: {len(X_train)} ({len(X_train)/len(X_sequences)*100:.1f}%)")
print(f"  Validation samples: {len(X_val)} ({len(X_val)/len(X_sequences)*100:.1f}%)")
print(f"  Test samples: {len(X_test)} ({len(X_test)/len(X_sequences)*100:.1f}%)")
print(f"\nTarget distributions:")
print(f"  Train - Up: {y_train.mean():.3f}, Down: {1-y_train.mean():.3f}")
print(f"  Val   - Up: {y_val.mean():.3f}, Down: {1-y_val.mean():.3f}")
print(f"  Test  - Up: {y_test.mean():.3f}, Down: {1-y_test.mean():.3f}")

## 5. Build LSTM Model with Regularization

Architecture:
- **Input**: Sequences of shape (sequence_length, n_features)
- **LSTM Layers**: 1-2 layers with 32-64 units
- **Dropout**: 0.3-0.5 for both input and recurrent connections
- **L2 Regularization**: On LSTM and Dense layers
- **Output**: Binary classification (sigmoid activation)

In [None]:
def build_lstm_model(sequence_length, n_features, 
                     lstm_units=[64, 32], 
                     dropout_rate=0.3,
                     recurrent_dropout=0.2,
                     l2_reg=0.01,
                     learning_rate=0.0005):
    """
    Build LSTM model with regularization.
    
    Args:
        sequence_length: Number of time steps
        n_features: Number of features per time step
        lstm_units: List of units for each LSTM layer
        dropout_rate: Dropout rate for LSTM layers
        recurrent_dropout: Recurrent dropout rate
        l2_reg: L2 regularization coefficient
        learning_rate: Learning rate for Adam optimizer
    """
    model = keras.Sequential(name='LSTM_BTC_Predictor')
    
    # First LSTM layer
    model.add(layers.LSTM(
        units=lstm_units[0],
        return_sequences=len(lstm_units) > 1,  # Return sequences if more layers
        input_shape=(sequence_length, n_features),
        dropout=dropout_rate,
        recurrent_dropout=recurrent_dropout,
        kernel_regularizer=regularizers.l2(l2_reg),
        recurrent_regularizer=regularizers.l2(l2_reg),
        name='lstm_1'
    ))
    
    # Additional LSTM layers
    for i, units in enumerate(lstm_units[1:], start=2):
        return_seq = i < len(lstm_units)  # Return sequences if not last layer
        model.add(layers.LSTM(
            units=units,
            return_sequences=return_seq,
            dropout=dropout_rate,
            recurrent_dropout=recurrent_dropout,
            kernel_regularizer=regularizers.l2(l2_reg),
            recurrent_regularizer=regularizers.l2(l2_reg),
            name=f'lstm_{i}'
        ))
    
    # Dropout layer before output
    model.add(layers.Dropout(dropout_rate, name='dropout_final'))
    
    # Output layer (binary classification)
    model.add(layers.Dense(
        1, 
        activation='sigmoid',
        kernel_regularizer=regularizers.l2(l2_reg),
        name='output'
    ))
    
    # Compile model
    optimizer = keras.optimizers.Adam(learning_rate=learning_rate)
    model.compile(
        optimizer=optimizer,
        loss='binary_crossentropy',
        metrics=[
            'accuracy',
            keras.metrics.AUC(name='auc'),
            keras.metrics.Precision(name='precision'),
            keras.metrics.Recall(name='recall')
        ]
    )
    
    return model

# Build model
n_features = X_train.shape[2]
model = build_lstm_model(
    sequence_length=SEQUENCE_LENGTH,
    n_features=n_features,
    lstm_units=[64, 32],      # 2-layer LSTM
    dropout_rate=0.3,          # 30% dropout
    recurrent_dropout=0.2,     # 20% recurrent dropout
    l2_reg=0.01,               # L2 regularization
    learning_rate=0.0005       # Conservative learning rate
)

print("Model architecture:")
model.summary()
print(f"\nTotal parameters: {model.count_params():,}")

## 6. Training Strategy with Callbacks

Callbacks:
- **EarlyStopping**: Stop training when val_loss stops improving
- **ReduceLROnPlateau**: Reduce learning rate when val_loss plateaus
- **ModelCheckpoint**: Save best model based on val_auc

In [None]:
# Define callbacks
callbacks = [
    # Early stopping: stop if val_loss doesn't improve for 10 epochs
    EarlyStopping(
        monitor='val_loss',
        patience=10,
        restore_best_weights=True,
        verbose=1,
        mode='min'
    ),
    
    # Reduce learning rate when val_loss plateaus
    ReduceLROnPlateau(
        monitor='val_loss',
        factor=0.5,          # Reduce LR by half
        patience=5,          # Wait 5 epochs before reducing
        min_lr=1e-7,         # Minimum learning rate
        verbose=1,
        mode='min'
    ),
    
    # Save best model based on validation AUC
    ModelCheckpoint(
        filepath='btc_lstm_best_model.h5',
        monitor='val_auc',
        save_best_only=True,
        verbose=1,
        mode='max'
    )
]

print("Callbacks configured:")
print(f"  - Early Stopping (patience=10)")
print(f"  - Reduce LR on Plateau (factor=0.5, patience=5)")
print(f"  - Model Checkpoint (best val_auc)")

In [None]:
# Train model
print("Starting model training...")
print(f"Batch size: 32")
print(f"Max epochs: 100")
print("=" * 60)

history = model.fit(
    X_train, y_train,
    validation_data=(X_val, y_val),
    epochs=100,
    batch_size=32,           # Small batch size for stability
    callbacks=callbacks,
    verbose=1
)

print("\n" + "=" * 60)
print("Training complete!")

### 6.1 Training History Visualization

In [None]:
# Plot training history
fig, axes = plt.subplots(2, 2, figsize=(15, 10))

# Loss
axes[0, 0].plot(history.history['loss'], label='Train Loss', linewidth=2)
axes[0, 0].plot(history.history['val_loss'], label='Val Loss', linewidth=2)
axes[0, 0].set_title('Model Loss', fontsize=14, fontweight='bold')
axes[0, 0].set_xlabel('Epoch')
axes[0, 0].set_ylabel('Loss')
axes[0, 0].legend()
axes[0, 0].grid(True, alpha=0.3)

# Accuracy
axes[0, 1].plot(history.history['accuracy'], label='Train Accuracy', linewidth=2)
axes[0, 1].plot(history.history['val_accuracy'], label='Val Accuracy', linewidth=2)
axes[0, 1].set_title('Model Accuracy', fontsize=14, fontweight='bold')
axes[0, 1].set_xlabel('Epoch')
axes[0, 1].set_ylabel('Accuracy')
axes[0, 1].legend()
axes[0, 1].grid(True, alpha=0.3)

# AUC
axes[1, 0].plot(history.history['auc'], label='Train AUC', linewidth=2)
axes[1, 0].plot(history.history['val_auc'], label='Val AUC', linewidth=2)
axes[1, 0].set_title('Model AUC', fontsize=14, fontweight='bold')
axes[1, 0].set_xlabel('Epoch')
axes[1, 0].set_ylabel('AUC')
axes[1, 0].legend()
axes[1, 0].grid(True, alpha=0.3)

# Precision & Recall
axes[1, 1].plot(history.history['precision'], label='Train Precision', linewidth=2)
axes[1, 1].plot(history.history['val_precision'], label='Val Precision', linewidth=2, linestyle='--')
axes[1, 1].plot(history.history['recall'], label='Train Recall', linewidth=2)
axes[1, 1].plot(history.history['val_recall'], label='Val Recall', linewidth=2, linestyle='--')
axes[1, 1].set_title('Precision & Recall', fontsize=14, fontweight='bold')
axes[1, 1].set_xlabel('Epoch')
axes[1, 1].set_ylabel('Score')
axes[1, 1].legend()
axes[1, 1].grid(True, alpha=0.3)

plt.tight_layout()
plt.show()

# Print final metrics
print("\nFinal Training Metrics:")
print(f"  Train Loss: {history.history['loss'][-1]:.4f}")
print(f"  Val Loss: {history.history['val_loss'][-1]:.4f}")
print(f"  Train Accuracy: {history.history['accuracy'][-1]:.4f}")
print(f"  Val Accuracy: {history.history['val_accuracy'][-1]:.4f}")
print(f"  Train AUC: {history.history['auc'][-1]:.4f}")
print(f"  Val AUC: {history.history['val_auc'][-1]:.4f}")

## 7. Model Evaluation

Evaluate on test set with comprehensive metrics:
- Accuracy
- AUC (Area Under ROC Curve)
- Precision, Recall, F1-Score
- Confusion Matrix