Approach
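- We first create a synthetic time series: a sine wave with small Gaussian noise, in which a random 5% of the points are perturbed by a uniform offset in [-3, 3] to act as outliers.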
The model consists of:
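- A linear embedding layer that projects each scalar observation into a higher-dimensional feature space.
- Stacked Transformer encoder layers whose multi-head self-attention lets the model focus on important (non-outlier) values.
- A position-wise feedforward network inside each encoder layer for feature extraction, followed by a linear output head.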
- The model is trained with a mean-squared-error loss to predict the value that follows each 20-step input window, with the aim of learning to reduce the influence of anomalous inputs.

In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
import numpy as np
import matplotlib.pyplot as plt

# Generate synthetic time series data with outliers
def generate_time_series(seq_length=1000, outlier_fraction=0.05, *, rng=None):
    """Return a noisy sine wave with randomly injected outliers.

    The clean signal is ``sin(t)`` sampled at ``seq_length`` evenly spaced
    points on ``[0, 40]``, plus Gaussian noise with standard deviation 0.1.
    A random subset of ``int(seq_length * outlier_fraction)`` points is then
    shifted by an offset drawn uniformly from ``[-3, 3]``.

    Args:
        seq_length: Number of samples in the returned series.
        outlier_fraction: Fraction of samples that receive an outlier offset.
        rng: Optional ``numpy.random.Generator`` (or any object exposing
            ``standard_normal``, ``choice`` and ``uniform``) used for all
            random draws. When ``None``, NumPy's global random state is
            used, so ``np.random.seed`` still controls the output.

    Returns:
        A 1-D ``numpy.ndarray`` of length ``seq_length``.

    Raises:
        ValueError: If the requested number of outliers is negative or
            exceeds ``seq_length``.
    """
    # Falling back to the ``np.random`` module keeps the draws identical to
    # the legacy global-state behavior for callers that do not pass ``rng``.
    random_source = np.random if rng is None else rng

    num_outliers = int(seq_length * outlier_fraction)
    if not 0 <= num_outliers <= seq_length:
        raise ValueError(
            f"outlier_fraction={outlier_fraction} yields {num_outliers} outliers "
            f"for a series of length {seq_length}"
        )

    t = np.linspace(0, 40, seq_length)
    series = np.sin(t) + 0.1 * random_source.standard_normal(seq_length)

    # Inject outliers at distinct positions (sampling without replacement).
    outlier_indices = random_source.choice(seq_length, num_outliers, replace=False)
    series[outlier_indices] += random_source.uniform(-3, 3, num_outliers)

    return series

# Transformer-based attention model for outlier suppression
class AttentionOutlierSuppression(nn.Module):
    """Transformer-encoder model that maps a sequence to per-step scalar outputs.

    Each time step is linearly embedded, passed through a stack of
    Transformer encoder layers (self-attention + feedforward), and projected
    back to a single value per time step.

    Inputs are batch-first: ``(batch, seq_len, input_dim)``. Self-attention
    therefore operates across the time steps of each sequence, and sequences
    in the same batch do not influence each other.

    NOTE: no positional encoding is added to the embeddings.

    Args:
        input_dim: Number of features per time step.
        embed_dim: Width of the embedding / Transformer model dimension.
        n_heads: Number of attention heads (must divide ``embed_dim``).
        hidden_dim: Width of the feedforward sub-layer in each encoder layer.
        num_layers: Number of stacked encoder layers.
    """

    def __init__(self, input_dim=1, embed_dim=16, n_heads=4, hidden_dim=32, num_layers=2):
        super().__init__()
        self.embedding = nn.Linear(input_dim, embed_dim)
        # batch_first=True: without it the layer interprets dim 0 as the
        # sequence axis and would attend across different samples.
        # dim_feedforward=hidden_dim: previously hidden_dim was silently
        # ignored and the library default width was used instead.
        self.attention = nn.TransformerEncoderLayer(
            d_model=embed_dim,
            nhead=n_heads,
            dim_feedforward=hidden_dim,
            batch_first=True,
        )
        # TransformerEncoder builds its own copies of the layer passed in;
        # ``self.attention`` is kept as an attribute for compatibility.
        self.encoder = nn.TransformerEncoder(self.attention, num_layers=num_layers)
        self.fc = nn.Linear(embed_dim, 1)

    def forward(self, x):
        """Return one scalar per time step.

        Args:
            x: Tensor of shape ``(batch, seq_len, input_dim)``.

        Returns:
            Tensor of shape ``(batch, seq_len, 1)``. For next-step
            forecasting, callers typically use the last time step,
            ``out[:, -1, 0]``.
        """
        x = self.embedding(x)
        x = self.encoder(x)
        x = self.fc(x)
        return x

# Data preparation: cut the series into overlapping windows of length
# seq_length, each paired with the single value that immediately follows it.
series = generate_time_series()
seq_length = 20
window_starts = range(len(series) - seq_length)
windows = np.array([series[start:start + seq_length] for start in window_starts])
targets = np.array([series[start + seq_length] for start in window_starts])
# Append a trailing feature axis of size 1 so each time step is a 1-D vector.
X = torch.tensor(windows, dtype=torch.float32).unsqueeze(-1)
y = torch.tensor(targets, dtype=torch.float32)

# Model training: full-batch gradient descent on next-step prediction.
model = AttentionOutlierSuppression()
criterion = nn.MSELoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

for epoch in range(100):
    optimizer.zero_grad()
    output = model(X)
    # The model emits one value per input time step, i.e. a tensor shaped
    # (num_windows, seq_length, 1), while y holds one target per window.
    # Use the prediction at the last time step so both sides are (num_windows,);
    # comparing the squeezed (num_windows, seq_length) tensor against y does
    # not broadcast and raises.
    next_step_pred = output[:, -1, 0]
    loss = criterion(next_step_pred, y)
    loss.backward()
    optimizer.step()
    if epoch % 10 == 0:
        print(f"Epoch {epoch}, Loss: {loss.item():.4f}")

# Predict and visualize
# Switch to evaluation mode so dropout inside the encoder layers is disabled
# and predictions are deterministic.
model.eval()
with torch.no_grad():
    # Keep only the last-time-step output of each window so that
    # predictions is 1-D and aligned element-wise with y.
    predictions = model(X)[:, -1, 0].numpy()
plt.plot(y.numpy(), label="True Values")
plt.plot(predictions, label="Predictions")
plt.legend()
plt.show()
