# Validate Time-Series Model

This notebook validates the Temporal Fusion Transformer model for time-series forecasting.

In [None]:
import glob
import os
import sys
from datetime import datetime, timedelta

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import pytorch_lightning as pl
import seaborn as sns
import torch
from pytorch_forecasting import TimeSeriesDataSet, TemporalFusionTransformer
from pytorch_forecasting.data import GroupNormalizer
from pytorch_forecasting.metrics import QuantileLoss, SMAPE
from torch.utils.data import DataLoader

# Add src directory to path
sys.path.append('../src')

# Import local modules
from models.ts_model import TFTModel

# Plot styling: seaborn's whitegrid theme plus a larger default figure size
sns.set_style('whitegrid')
plt.rcParams.update({'figure.figsize': (12, 8)})

## 1. Load Prepared Data

First, let's load the prepared time-series data.

In [None]:
# Path to the prepared data
data_path = '../data/features/batch/technical_with_timeidx.parquet'

# Fall back to alternative inputs when the prepared file is missing:
# first any other parquet file in the batch feature directory, then the
# processed training data. Only if none of these exist is an error raised.
if not os.path.exists(data_path):
    print(f"Data file {data_path} not found. Please run the prepare_ts_data.ipynb notebook first.")
    # Sort the matches: glob returns them in a filesystem-dependent order, so
    # taking element 0 of the raw result could select a different file on
    # different machines (or between runs).
    parquet_files = sorted(glob.glob('../data/features/batch/*.parquet'))
    if parquet_files:
        print(f"Found alternative parquet files: {parquet_files}")
        data_path = parquet_files[0]
        print(f"Using {data_path} instead")
    else:
        print("No alternative parquet files found. Please run the prepare_ts_data.ipynb notebook first.")
        # Try to use the processed data
        processed_path = '../data/processed/training_data.parquet'
        if os.path.exists(processed_path):
            print(f"Using processed data from {processed_path} instead")
            data_path = processed_path
        else:
            raise FileNotFoundError(f"Data file {data_path} not found and no alternatives available")

# Load the data
df = pd.read_parquet(data_path)
print(f"Loaded {len(df)} records from {data_path}")

In [None]:
# Preview the first five rows of the loaded frame
df.head(n=5)

In [None]:
# Check if time_idx column exists
if 'time_idx' not in df.columns:
    print("time_idx column not found. Adding it now...")

    # Sorting and the minute offset below need both columns, so fail with a
    # clear message instead of an opaque KeyError raised inside pandas.
    missing_cols = [col for col in ('symbol', 'timestamp') if col not in df.columns]
    if missing_cols:
        raise KeyError(f"Cannot add time_idx: required column(s) {missing_cols} not found")

    # Ensure timestamp is datetime
    df['timestamp'] = pd.to_datetime(df['timestamp'])

    # Sort by symbol and timestamp
    df = df.sort_values(['symbol', 'timestamp'])

    # Add time_idx: whole minutes elapsed since the earliest timestamp in the
    # frame (a single origin shared by all symbols); fractional minutes are
    # truncated by the integer cast.
    min_timestamp = df['timestamp'].min()
    df['time_idx'] = ((df['timestamp'] - min_timestamp).dt.total_seconds() / 60).astype(int)

    print(f"Added time_idx column with range {df['time_idx'].min()} to {df['time_idx'].max()}")

## 2. Initialize and Train the TFT Model

Now, let's initialize and train the Temporal Fusion Transformer model.

In [None]:
# Seed the Python, NumPy and PyTorch random number generators before any
# model or dataset object is created, so that repeated Restart & Run All
# executions of this notebook give comparable training and evaluation results.
pl.seed_everything(42)

# Initialize the TFT model
model = TFTModel(
    data_path=None,  # We'll provide the data directly
    max_encoder_length=60,  # Look back 60 time steps
    max_prediction_length=1,  # Predict 1 time step ahead
    batch_size=64,
    max_epochs=10,  # Use fewer epochs for demonstration
    learning_rate=0.03,
    hidden_size=16,
    attention_head_size=4,
    dropout=0.1,
    hidden_continuous_size=8,
    output_size=7,  # 7 quantiles
    log_dir="../logs/ts",
    model_dir="../models",
    model_name="tft_validation.ckpt"
)

In [None]:
# Prepare the data
# Passes the in-memory frame to the wrapper, which hands back three dataset
# objects (training / validation / test, per the names they are unpacked into).
training_dataset, validation_dataset, test_dataset = model.prepare_data(
    df=df,
    target="close",  # column passed as the forecasting target
    group_ids=["symbol"],  # presumably one series per symbol — confirm in TFTModel.prepare_data
    static_categoricals=["symbol"],
    time_varying_known_reals=["time_idx"],
    # Let the model automatically determine time_varying_unknown_reals
    # NOTE(review): confirm that the wrapper includes "close" among them.
    time_varying_unknown_reals=None,
    train_val_test_split=(0.7, 0.15, 0.15)  # presumably train/val/test fractions — TODO confirm
)

In [None]:
# Create dataloaders
# No arguments are passed, so the wrapper presumably builds these from the
# datasets created by the preceding prepare_data call — TODO confirm.
train_dataloader, val_dataloader, test_dataloader = model.create_dataloaders()

In [None]:
# Build the model
# Keep the returned object: later cells call plot_prediction and
# interpret_output on `tft_model` directly rather than on the wrapper.
tft_model = model.build_model()

In [None]:
# Train the model
# Training runs through the wrapper; the returned object is kept in `trainer`
# but is not referenced by the later cells shown in this notebook.
# NOTE(review): together with max_epochs=10 on the constructor, the batch limit
# below makes this a short demonstration run — presumably the limit applies
# per epoch; confirm in TFTModel.train.
trainer = model.train(
    train_dataloader=train_dataloader,
    val_dataloader=val_dataloader,
    limit_train_batches=10  # Use fewer batches for demonstration
)

In [None]:
# Save the model
# save_model() takes no arguments here; the destination presumably comes from
# the model_dir / model_name given to the TFTModel constructor — TODO confirm.
model_path = model.save_model()
print(f"Model saved to {model_path}")

## 3. Evaluate the Model

Let's evaluate the model on the test dataset.

In [None]:
# Score the trained model on the test dataloader and list every metric returned
test_metrics = model.evaluate(test_dataloader)
print("Test metrics:")
for metric_name, metric_value in test_metrics.items():
    print(f"{metric_name}: {metric_value}")

In [None]:
# Make predictions on the test dataset
# NOTE(review): this repeats the model.evaluate(test_dataloader) call from the
# previous cell and rebinds `test_metrics`; with return_predictions=True the
# result is unpacked into a (metrics, predictions) pair.
test_metrics, predictions = model.evaluate(test_dataloader, return_predictions=True)

# Plot predictions
# NOTE(review): pytorch_forecasting's plot_prediction takes the network input
# `x` and the network output `out` as separate arguments — confirm that
# `tft_model` accepts a single `predictions` object here.
# idx=0 presumably selects the first sample — TODO confirm.
tft_model.plot_prediction(predictions, idx=0)
plt.show()

## 4. Analyze Feature Importance

Let's analyze the feature importance from the trained model.

In [None]:
# Get feature importance
# NOTE(review): pytorch_forecasting's own TemporalFusionTransformer.interpret_output
# returns a dict of tensors, whereas the plotting code below needs a DataFrame
# with "feature" and "importance" columns — confirm what `tft_model` returns.
feature_importance = tft_model.interpret_output(predictions, reduction="sum")

# Plot feature importance
# Create the axes explicitly and hand them to DataFrame.plot: without `ax`,
# pandas opens a figure of its own, and a separate plt.figure() call is left
# behind as an empty canvas in the cell output.
fig, ax = plt.subplots(figsize=(10, 8))
feature_importance.plot(x="feature", y="importance", kind="bar", ax=ax)
ax.set_title("Feature Importance")
ax.set_xlabel("Feature")
ax.set_ylabel("Importance")
# Rotate the feature names so long labels do not overlap
plt.setp(ax.get_xticklabels(), rotation=45, ha="right")
fig.tight_layout()
plt.show()

## 5. Make Predictions on New Data

Let's make predictions on new data using the trained model.

In [None]:
# Get a sample of the test data
test_sample = df.iloc[-100:].copy()

# Make predictions
# NOTE(review): this rebinds `predictions`, replacing the evaluation output
# that the plotting and feature-importance cells above were given.
predictions = model.predict(test_sample)

# Convert predictions to DataFrame
# The predictions are averaged over dimension 1 to get one value per row, then
# detached from the autograd graph and moved to host memory before conversion:
# Tensor.numpy() raises for tensors that require grad or live on a GPU.
# NOTE(review): assumes model.predict returns one row per row of test_sample;
# otherwise the DataFrame constructor fails on mismatched lengths — confirm
# against TFTModel.predict.
pred_df = pd.DataFrame({
    "symbol": test_sample["symbol"],
    "timestamp": test_sample["timestamp"],
    "actual": test_sample["close"],
    "predicted": predictions.mean(dim=1).detach().cpu().numpy()
})

# Display predictions
pred_df.head()

In [None]:
# Overlay actual (solid) and predicted (dashed) close prices, one pair of lines per symbol
fig, ax = plt.subplots(figsize=(14, 8))

for ticker in pred_df["symbol"].unique():
    ticker_rows = pred_df.loc[pred_df["symbol"] == ticker]
    ax.plot(ticker_rows["timestamp"], ticker_rows["actual"], label=f"{ticker} (Actual)")
    ax.plot(ticker_rows["timestamp"], ticker_rows["predicted"], linestyle="--", label=f"{ticker} (Predicted)")

ax.set_title("Actual vs Predicted Close Price")
ax.set_xlabel("Timestamp")
ax.set_ylabel("Close Price")
ax.legend()
ax.grid(True)
plt.show()

## 6. Analyze Prediction Errors

Let's analyze the prediction errors to understand the model's performance.

In [None]:
# Derive the signed, absolute and percentage error of every prediction
pred_df["error"] = pred_df["actual"].sub(pred_df["predicted"])
pred_df["abs_error"] = pred_df["error"].abs()
pred_df["pct_error"] = pred_df["error"].div(pred_df["actual"]).mul(100)

# Report MAE, mean (signed) percentage error and RMSE, one per line
print(
    "Error statistics:",
    f"Mean Absolute Error: {pred_df['abs_error'].mean():.4f}",
    f"Mean Percentage Error: {pred_df['pct_error'].mean():.4f}%",
    f"Root Mean Squared Error: {np.sqrt(pred_df['error'].pow(2).mean()):.4f}",
    sep="\n",
)

In [None]:
# Histogram of the signed prediction errors
fig, ax = plt.subplots(figsize=(12, 6))
ax.hist(pred_df["error"], bins=50)
ax.set_title("Prediction Error Distribution")
ax.set_xlabel("Error")
ax.set_ylabel("Frequency")
ax.grid(True)
plt.show()

In [None]:
# Box plot of the signed prediction errors, one box per symbol
fig, ax = plt.subplots(figsize=(12, 6))
sns.boxplot(data=pred_df, x="symbol", y="error", ax=ax)
ax.set_title("Prediction Errors by Symbol")
ax.set_xlabel("Symbol")
ax.set_ylabel("Error")
ax.grid(True)
plt.show()

## 7. Summary and Next Steps

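We have trained and validated a Temporal Fusion Transformer model for time-series forecasting. On the sample analyzed in Section 6, the model has a mean absolute error of X and a root mean squared error of Y (**TODO:** replace X and Y with the values printed in Section 6 before drawing conclusions about performance). Note that training in this notebook was deliberately shortened for demonstration (10 epochs, with the number of training batches limited to 10), so these numbers should be read as a sanity check rather than as the performance of a fully trained model.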

Next steps:
1. Fine-tune the model hyperparameters to improve performance
2. Incorporate more features, such as sentiment scores from news articles
3. Experiment with different prediction horizons
4. Deploy the model for real-time predictions
5. Integrate the model with the trading strategy