### Model 3: Transformer (With Dimensionality Reduction)
Architecture
Input: PCA-reduced features, reshaped to [samples, timesteps, features] for transformer input

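Layers: 2 × Transformer encoder block (multi-head self-attention + feed-forward, each followed by a residual connection and layer normalization) → Flatten → Dense(1)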

In [None]:
from tensorflow.keras.layers import Input, Dense, LayerNormalization, MultiHeadAttention, Dropout, Flatten
from tensorflow.keras.models import Model

def build_transformer(input_shape, num_heads=4, ff_dim=64, num_layers=2):
    """Build a Transformer-encoder model with a single linear output unit.

    Each encoder block applies multi-head self-attention and a position-wise
    feed-forward network; both sub-layers are wrapped in a residual
    connection followed by layer normalization. The encoded sequence is then
    flattened and mapped to one output value by a final Dense layer.

    Args:
        input_shape: Shape of one sample without the batch axis; the last
            entry is used as the model width for the residual connections.
        num_heads: Number of attention heads in every block.
        ff_dim: Hidden width of the feed-forward network. It is also passed
            as ``key_dim`` (the per-head projection size) to the attention
            layer.
        num_layers: Number of stacked encoder blocks.

    Returns:
        An uncompiled ``Model`` mapping ``input_shape`` inputs to one value.
    """
    inputs = Input(shape=input_shape)
    # Residual additions need each sub-layer to emit the same width as its
    # input, so remember the per-timestep feature width.
    d_model = input_shape[-1]
    x = inputs
    for _ in range(num_layers):
        # Self-attention: query and value are both the current sequence.
        attn_output = MultiHeadAttention(num_heads=num_heads, key_dim=ff_dim)(x, x)
        x = LayerNormalization(epsilon=1e-6)(x + attn_output)
        # Feed-forward: expand to ff_dim, then project back to d_model.
        # Without the projection, `x + ffn_output` only works when the input
        # width happens to equal ff_dim (or is 1 and silently broadcasts).
        ffn_output = Dense(ff_dim, activation='relu')(x)
        ffn_output = Dense(d_model)(ffn_output)
        x = LayerNormalization(epsilon=1e-6)(x + ffn_output)
    x = Flatten()(x)
    outputs = Dense(1)(x)
    return Model(inputs, outputs)

# Reshape for transformer ([samples, timesteps, features])
# Check the split up front: with floor division and an inferred (-1) sample
# axis, a feature count that is not a multiple of `timesteps` either fails
# with an opaque reshape error or, worse, succeeds and mixes values from
# different samples into the same sequence.
n_reduced_features = X_train_reduced.shape[1]
if n_reduced_features % timesteps != 0:
    raise ValueError(
        f"Cannot split {n_reduced_features} reduced features evenly into "
        f"{timesteps} timesteps"
    )
features_per_step = n_reduced_features // timesteps
# An explicit sample count keeps one row per original sample, and makes a
# train/test feature-count mismatch raise instead of being absorbed by -1.
X_train_trans = X_train_reduced.reshape(X_train_reduced.shape[0], timesteps, features_per_step)
X_test_trans = X_test_reduced.reshape(X_test_reduced.shape[0], timesteps, features_per_step)

# The timed / traced region covers model construction, compilation and fit.
start_time = time.time()
# NOTE(review): tracemalloc traces allocations made through Python's memory
# allocators; memory allocated natively by TensorFlow is presumably not
# reflected in `peak` - confirm before comparing models on this number.
tracemalloc.start()

transformer = build_transformer(X_train_trans.shape[1:])
transformer.compile(optimizer='adam', loss='mse')

history = transformer.fit(
    X_train_trans,
    y_train[:X_train_trans.shape[0]],
    epochs=100,
    batch_size=32,
    validation_split=0.2,
    verbose=0,
)

train_time = time.time() - start_time
current, peak = tracemalloc.get_traced_memory()
tracemalloc.stop()
# No early-stopping callback is passed to fit(), so this is the number of
# epochs that were run rather than a measured convergence point.
epochs_to_converge = len(history.history['loss'])

# Inference latency over the full test set.
start_inf = time.time()
y_pred = transformer.predict(X_test_trans)
inf_time = time.time() - start_inf
