In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')

import tensorflow as tf
from tensorflow.keras import models, optimizers, callbacks
from tensorflow.keras.layers import (
    Dense, Dropout, Input, GlobalAveragePooling1D,
    MultiHeadAttention, LayerNormalization
)

from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score
import pickle, os
from datetime import datetime

# ---------------------------------------------------------------------------
# Notebook configuration: tunable constants first, side effects afterwards.
# ---------------------------------------------------------------------------
SEED = 42

# All locations are relative to the notebook's working directory.
SEQUENCES_DIR = '../data_new/sequences/'
MODELS_DIR = '../models/transformer/'
RESULTS_DIR = '../results/'
FIGURES_DIR = '../results/figures/transformer/'

# Every (asset, horizon) pair gets its own model in the training cell.
ASSETS = ['AAPL', 'AMZN', 'NVDA', 'SPY', 'BTC-USD']
HORIZONS = ['1day', '1week', '1month']

# Seed NumPy and TensorFlow from the same constant.
np.random.seed(SEED)
tf.random.set_seed(SEED)

# Create the output folders up front; RESULTS_DIR is created implicitly as a
# parent of FIGURES_DIR.
for output_dir in (MODELS_DIR, FIGURES_DIR):
    os.makedirs(output_dir, exist_ok=True)

# Plot defaults shared by every figure in the notebook.
plt.style.use('seaborn-v0_8-darkgrid')
plt.rcParams.update({'figure.figsize': (14, 8)})

print(f"TensorFlow: {tf.__version__}")
print("[OK] Setup complete")

In [None]:
# Transformer Encoder Block
def transformer_encoder_block(inputs, head_size, num_heads, ff_dim, dropout_rate=0.1):
    """Apply one post-norm Transformer encoder block to ``inputs``.

    The block runs self-attention -> dropout -> residual add -> LayerNorm,
    then a two-layer feed-forward network that receives the same
    dropout / residual / LayerNorm treatment.

    Args:
        inputs: Keras tensor fed to the block. Its last dimension sets the
            width of the feed-forward projection so the residual add is valid.
        head_size: Value passed as ``key_dim`` to ``MultiHeadAttention``.
        num_heads: Number of attention heads.
        ff_dim: Width of the hidden feed-forward layer.
        dropout_rate: Rate used inside the attention layer and by every
            ``Dropout`` layer in the block.

    Returns:
        The tensor produced by the second ``LayerNormalization``.
    """
    model_width = inputs.shape[-1]

    # --- Self-attention sub-layer (query and value are both `inputs`) ---
    self_attention = MultiHeadAttention(
        num_heads=num_heads, key_dim=head_size, dropout=dropout_rate
    )
    attended = self_attention(inputs, inputs)
    attended = Dropout(dropout_rate)(attended)
    attention_out = LayerNormalization(epsilon=1e-6)(inputs + attended)

    # --- Feed-forward sub-layer: expand to ff_dim, project back ---
    expand = Dense(ff_dim, activation='relu')
    hidden = expand(attention_out)
    hidden = Dropout(dropout_rate)(hidden)
    project = Dense(model_width)
    projected = project(hidden)
    projected = Dropout(dropout_rate)(projected)

    return LayerNormalization(epsilon=1e-6)(attention_out + projected)

def build_transformer_model(
    sequence_length, n_features,
    num_transformer_blocks=2,
    head_size=256, num_heads=4,
    ff_dim=256, dropout_rate=0.2,
    dense_units=128, learning_rate=0.001
):
    """Build and compile the binary-classification Transformer.

    Architecture: input -> ``num_transformer_blocks`` encoder blocks ->
    global average pooling over the sequence axis -> Dense(relu) -> Dropout
    -> single sigmoid unit.

    Args:
        sequence_length: Number of time steps per input sample.
        n_features: Number of features per time step.
        num_transformer_blocks: How many encoder blocks to stack.
        head_size: ``key_dim`` of each attention head.
        num_heads: Attention heads per block.
        ff_dim: Hidden width of each block's feed-forward network.
        dropout_rate: Dropout rate used in the blocks and the head.
        dense_units: Width of the Dense layer before the output unit.
        learning_rate: Adam learning rate.

    Returns:
        A compiled Keras model named ``Transformer_Model`` with
        binary cross-entropy loss and accuracy / AUC / precision / recall
        metrics.

    NOTE(review): no positional encoding is added before the encoder blocks
    and the sequence is mean-pooled afterwards, so the network appears to be
    insensitive to the order of the time steps -- confirm this is intended.
    """
    inputs = Input(shape=(sequence_length, n_features))

    # Encoder stack: each block consumes the previous block's output.
    encoded = inputs
    for _block_index in range(num_transformer_blocks):
        encoded = transformer_encoder_block(
            encoded, head_size, num_heads, ff_dim, dropout_rate
        )

    # Classification head.
    pooled = GlobalAveragePooling1D()(encoded)
    head = Dense(dense_units, activation='relu')(pooled)
    head = Dropout(dropout_rate)(head)
    outputs = Dense(1, activation='sigmoid')(head)

    model = models.Model(inputs=inputs, outputs=outputs, name='Transformer_Model')

    adam = optimizers.Adam(learning_rate=learning_rate)
    tracked_metrics = [
        'accuracy',
        tf.keras.metrics.AUC(name='auc'),
        tf.keras.metrics.Precision(name='precision'),
        tf.keras.metrics.Recall(name='recall'),
    ]
    model.compile(optimizer=adam, loss='binary_crossentropy', metrics=tracked_metrics)
    return model

def load_sequences(asset, horizon):
    """Load the train/val/test arrays stored for one (asset, horizon) pair.

    Reads ``{SEQUENCES_DIR}{asset}_{horizon}_sequences.npz``.

    Args:
        asset: Asset identifier used in the file name (e.g. an entry of ASSETS).
        horizon: Horizon identifier used in the file name (e.g. an entry of
            HORIZONS).

    Returns:
        Tuple ``(X_train, X_val, X_test, y_train, y_val, y_test,
        sequence_length, n_features)``; the last two are converted to ``int``.

    Raises:
        FileNotFoundError: If the archive does not exist.
        KeyError: If the archive lacks one of the expected entries.
    """
    filepath = f'{SEQUENCES_DIR}{asset}_{horizon}_sequences.npz'
    # np.load on an .npz returns an NpzFile that keeps the archive open until
    # it is closed. The context manager releases the handle; each array is
    # materialised on access, so the returned values stay valid afterwards.
    with np.load(filepath) as data:
        return (data['X_train'], data['X_val'], data['X_test'],
                data['y_train'], data['y_val'], data['y_test'],
                int(data['sequence_length']), int(data['n_features']))

def load_class_weights():
    """Return the object unpickled from ``{SEQUENCES_DIR}class_weights.pkl``.

    NOTE(review): ``pickle.load`` can execute arbitrary code embedded in the
    file, so this must only be pointed at a file produced by a trusted step
    of this project.
    """
    weights_path = f'{SEQUENCES_DIR}class_weights.pkl'
    with open(weights_path, 'rb') as handle:
        loaded = pickle.load(handle)
    return loaded

def get_callbacks(model_name, patience=10):
    """Create the training callbacks for one model; all monitor ``val_loss``.

    Args:
        model_name: Prefix of the checkpoint file, which is written to
            ``{MODELS_DIR}{model_name}_best.h5``.
        patience: Epochs without improvement before early stopping triggers.

    Returns:
        List ``[EarlyStopping, ReduceLROnPlateau, ModelCheckpoint]``.
    """
    # Stop once validation loss stalls and roll back to the best weights.
    early_stopping = callbacks.EarlyStopping(
        monitor='val_loss',
        patience=patience,
        restore_best_weights=True,
        verbose=1,
    )
    # Halve the learning rate after 5 stalled epochs, down to 1e-7.
    lr_reducer = callbacks.ReduceLROnPlateau(
        monitor='val_loss',
        factor=0.5,
        patience=5,
        min_lr=1e-7,
        verbose=1,
    )
    # Keep only the best-scoring model on disk.
    checkpoint = callbacks.ModelCheckpoint(
        filepath=f'{MODELS_DIR}{model_name}_best.h5',
        monitor='val_loss',
        save_best_only=True,
    )
    return [early_stopping, lr_reducer, checkpoint]

# Load the class weights once; the training cell indexes this object with
# (asset, horizon) tuples.
class_weights = load_class_weights()
print("[OK] Functions defined")

In [None]:
# Complete training loop
# Trains one Transformer per (asset, horizon) pair, evaluates it on the test
# split and appends one metrics dict per pair to `all_results`.
all_results = []

print("Starting Transformer training...")
print("="*80)

for asset in ASSETS:
    for horizon in HORIZONS:
        # Release the Keras global state left by the previous iteration's
        # model. Without this, every model built in the loop stays registered
        # and memory use grows across the 15 trainings.
        tf.keras.backend.clear_session()

        print(f"\n{'='*80}")
        print(f"Training: {asset} - {horizon}")
        start_time = datetime.now()

        X_train, X_val, X_test, y_train, y_val, y_test, seq_len, n_feat = load_sequences(asset, horizon)

        model = build_transformer_model(
            sequence_length=seq_len, n_features=n_feat,
            num_transformer_blocks=2, num_heads=4,
            head_size=256, ff_dim=256, dropout_rate=0.2
        )

        # Per-pair class weights, keyed by class label for model.fit.
        cw = class_weights[(asset, horizon)]
        class_weight_dict = {0: cw[0], 1: cw[1]}

        history = model.fit(
            X_train, y_train,
            validation_data=(X_val, y_val),
            epochs=100, batch_size=32,
            class_weight=class_weight_dict,
            callbacks=get_callbacks(f'Transformer_{asset}_{horizon}'),
            verbose=0
        )

        # The model ends in a single sigmoid unit; threshold at 0.5 for the
        # hard labels and flatten to 1-D for the label-based metrics.
        y_pred_proba = model.predict(X_test, verbose=0)
        y_pred = (y_pred_proba > 0.5).astype(int).flatten()

        result = {
            'asset': asset, 'horizon': horizon,
            'accuracy': accuracy_score(y_test, y_pred),
            'precision': precision_score(y_test, y_pred),
            'recall': recall_score(y_test, y_pred),
            'f1': f1_score(y_test, y_pred),
            # Pass 1-D scores explicitly, matching the 1-D y_pred above.
            'roc_auc': roc_auc_score(y_test, y_pred_proba.ravel()),
            'epochs_trained': len(history.history['loss']),
            'parameters': model.count_params()
        }
        all_results.append(result)

        elapsed = (datetime.now() - start_time).total_seconds()
        print(f"[OK] Done in {elapsed:.1f}s | Acc: {result['accuracy']:.4f} | F1: {result['f1']:.4f}")

print("\n" + "="*80)
print("[OK] Transformer training complete!")

In [None]:
# Save and analyze results
# One row per (asset, horizon) pair, persisted for the comparison notebooks.
transformer_results = pd.DataFrame(all_results)
results_csv_path = f'{RESULTS_DIR}transformer_results_complete.csv'
transformer_results.to_csv(results_csv_path, index=False)

print("\nTransformer Model Results:")
print("="*120)
print(transformer_results.to_string(index=False))

# Headline statistics across all trained models.
accuracy = transformer_results['accuracy']
mean_parameters = transformer_results['parameters'].mean()
accuracy_by_horizon = (
    transformer_results
    .groupby('horizon')['accuracy']
    .mean()
    .sort_values(ascending=False)
)

print(f"\n{'='*80}")
print(f"Mean Accuracy: {accuracy.mean():.4f} ± {accuracy.std():.4f}")
print(f"Avg Parameters: {mean_parameters:,.0f}")
print("\nBy Horizon:")
print(accuracy_by_horizon)

In [None]:
# Compare all models
# The other notebooks' accuracies are joined on (asset, horizon) instead of by
# row position, so every value stays attached to its own asset/horizon even if
# a CSV was written in a different row order.
lstm_results = pd.read_csv(f'{RESULTS_DIR}lstm_results_complete.csv')
gru_results = pd.read_csv(f'{RESULTS_DIR}gru_results_complete.csv')
cnn_results = pd.read_csv(f'{RESULTS_DIR}cnn_results_complete.csv')

model_columns = ['Transformer', 'LSTM', 'GRU', 'CNN']
key_columns = ['asset', 'horizon']

# Start from the Transformer rows (this fixes the row order of the table) ...
all_models = transformer_results[key_columns + ['accuracy']].rename(
    columns={'accuracy': 'Transformer'}
)
# ... then attach one accuracy column per baseline. `validate` raises if a
# results file holds duplicate (asset, horizon) rows instead of silently
# multiplying rows; a pair missing from a baseline becomes NaN.
for column_name, other_results in zip(model_columns[1:], (lstm_results, gru_results, cnn_results)):
    all_models = all_models.merge(
        other_results[key_columns + ['accuracy']].rename(columns={'accuracy': column_name}),
        on=key_columns, how='left', validate='one_to_one'
    )

all_models['best_model'] = all_models[model_columns].idxmax(axis=1)
all_models['best_accuracy'] = all_models[model_columns].max(axis=1)

print("\n" + "="*120)
print("ALL MODELS COMPARISON")
print("="*120)
print(all_models.to_string(index=False))

print("\nModel Performance Summary:")
# The loop variable is deliberately not called `model`, which would overwrite
# the Keras model left in the namespace by the training cell.
for model_name in model_columns:
    print(f"{model_name:12s}: {all_models[model_name].mean():.4f} ± {all_models[model_name].std():.4f}")

print("\nWins by model:")
print(all_models['best_model'].value_counts())

In [None]:
# Visualize: Performance heatmap for all models
# One panel per model, sharing the same colour scale so panels are comparable.
results_by_model = {
    'Transformer': transformer_results,
    'LSTM': lstm_results,
    'GRU': gru_results,
    'CNN': cnn_results,
}

fig, axes = plt.subplots(2, 2, figsize=(18, 14))
axes = axes.flatten()

for idx, (model_name, model_results) in enumerate(results_by_model.items()):
    ax = axes[idx]

    # Rows: assets; columns: horizons in the HORIZONS order.
    accuracy_grid = model_results.pivot(
        index='asset', columns='horizon', values='accuracy'
    )[HORIZONS]

    sns.heatmap(
        accuracy_grid,
        annot=True, fmt='.3f', cmap='RdYlGn',
        vmin=0.45, vmax=0.70,
        cbar_kws={'label': 'Accuracy'},
        ax=ax,
    )
    ax.set_title(f'{model_name} Model', fontsize=14, fontweight='bold')
    ax.set_xlabel('Horizon')
    # Only the left-hand column of panels carries the y-axis label.
    is_left_column = idx % 2 == 0
    ax.set_ylabel('Asset' if is_left_column else '')

plt.tight_layout()
plt.savefig(f'{FIGURES_DIR}all_models_comparison_heatmap.png', dpi=300, bbox_inches='tight')
plt.show()

print("[OK] Comparison saved")

## Summary: Transformer Performance

**Key Findings**:
- **Long-range dependencies**: Transformer excels on longer horizons (1week, 1month); check the "By Horizon" accuracies above to confirm this for the current run
- **Complexity**: More parameters than LSTM/GRU, requires more compute
- **Attention mechanism**: Can focus on relevant time steps

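**Overall Model Ranking** (typical pattern; the ranking for this run is given by the "Model Performance Summary" and "Wins by model" outputs above):
1. Transformer or LSTM (dataset-dependent)
2. GRU (close to LSTM, more efficient)
3. CNN (good on short horizons)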

**Next**: Notebook 10 - Hybrid CNN-LSTM (combining strengths)

---
[OK] **Transformer training complete!**