# Synthetic Time Series Generation with GutenTAG

This notebook generates a 10-dimensional time series dataset (5,000 points) using [GutenTAG](https://github.com/TimeEval/GutenTAG).
**Characteristics:**
- **Dimensions:** 10
- **Length:** 5,000 points
- **Main Anomaly:** Spikes
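- **Special Condition:** In 3 randomly selected dimensions, the anomalies in the first half (indices 0–2499) differ significantly from those in the second half (indices 2500–4999): small, frequent local extrema in the first half versus fewer, much larger mean-shift spikes in the second half.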

In [5]:
import os
import sys
from typing import Any, Dict, Tuple

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

# Make the GutenTAG checkout under <project_root>/tools importable.
# The project root is taken to be two levels above the current working directory,
# so this cell must be run with the notebook's own directory as the cwd.
project_root = os.path.abspath("../../")
gutentag_path = os.path.join(project_root, "tools", "GutenTAG")
if gutentag_path not in sys.path:
    sys.path.append(gutentag_path)

# NOTE(review): a failed import is only reported here, not re-raised; any later
# cell that uses `GutenTAG` will then fail with a NameError.
try:
    from gutenTAG import GutenTAG
except ImportError as import_error:
    print(f"Error importing GutenTAG: {import_error}")
    print("Please ensure GutenTAG submodule is initialized.")
else:
    print("GutenTAG imported successfully.")

# Shape of the generated dataset: number of channels and number of time points.
DIMS, LENGTH = 10, 5000

GutenTAG imported successfully.


In [6]:
def _exact_anomaly(channel: int, position: int, length: int, kind: Dict[str, Any]) -> Dict[str, Any]:
    """Build one anomaly entry pinned to an exact position on a single channel."""
    return {
        "position": "middle",  # Use valid enum value, exact-position overrides it
        "exact-position": int(position),
        "length": length,
        "channel": int(channel),
        "kinds": [kind],
    }


def create_config(
    dimensions: int = DIMS, length: int = LENGTH
) -> Tuple[Dict[str, Any], np.ndarray]:
    """Build a GutenTAG config dict for a multi-channel sine series with spike anomalies.

    Three randomly chosen "drift" channels get two different anomaly styles:
    10 length-1 local extrema in the first half and 5 length-3 mean shifts
    (offset 5.0) in the second half. Every other channel gets 8 length-1 mean
    shifts (offset 3.0), all placed in the second half.

    All random draws use NumPy's global RNG (``np.random``); call
    ``np.random.seed(...)`` beforehand for reproducible output.

    Args:
        dimensions: Number of channels (base oscillations). Must be at least 3.
        length: Number of time points. Must be at least 402 so that the
            100-point margins around each half leave room for anomalies.

    Returns:
        A ``(config, drift_dims)`` tuple: the config dict with a single
        ``"timeseries"`` entry, and the array of the 3 drift channel indices.

    Raises:
        ValueError: If ``dimensions`` or ``length`` is too small.
    """
    n_drift = 3
    edge_margin = 100  # keep drift-channel anomalies away from the ends of each half
    half_length = length // 2

    # Fail early with a clear message instead of an opaque NumPy sampling error.
    if dimensions < n_drift:
        raise ValueError(
            f"dimensions must be at least {n_drift} to pick drift channels, got {dimensions}"
        )
    if half_length <= 2 * edge_margin:
        raise ValueError(
            f"length must be at least {2 * (2 * edge_margin + 1)} to place anomalies, got {length}"
        )

    # 1. Randomly select the channels whose anomaly style changes between halves.
    drift_dims = np.random.choice(range(dimensions), n_drift, replace=False)
    drift_channels = {int(d) for d in drift_dims}

    # 2. One noisy sine base oscillation per channel, with randomized parameters.
    base_oscillations = [
        {
            "kind": "sine",
            "frequency": np.random.uniform(0.005, 0.1),  # Broader range
            "amplitude": np.random.uniform(0.5, 2.0),
            "variance": np.random.uniform(0.1, 0.3),  # Increased noise/irregularity
        }
        for _ in range(dimensions)
    ]

    # 3. Anomalies, channel by channel (draw order matters for seeded reproducibility).
    anomalies_config = []
    for dim in range(dimensions):
        if dim in drift_channels:
            # === DRIFT DIMENSION ===
            # First half: 10 small local extrema (subtle spikes).
            for _ in range(10):
                pos = np.random.randint(edge_margin, half_length - edge_margin)
                anomalies_config.append(_exact_anomaly(
                    dim, pos, 1,
                    {"kind": "extremum", "min": False, "local": True, "context_window": 5},
                ))

            # Second half: fewer, slightly wider mean shifts; the 5.0 offset is
            # large relative to the 0.5-2.0 amplitude range.
            for _ in range(5):
                pos = np.random.randint(half_length + edge_margin, length - edge_margin)
                anomalies_config.append(_exact_anomaly(
                    dim, pos, 3, {"kind": "mean", "offset": 5.0},
                ))
        else:
            # === NORMAL DIMENSION ===
            # Spikes only in the second half, keeping the first ('train') half
            # clean for these channels.
            for _ in range(8):
                pos = np.random.randint(half_length, length - 10)
                anomalies_config.append(_exact_anomaly(
                    dim, pos, 1, {"kind": "mean", "offset": 3.0},
                ))

    config = {
        "timeseries": [
            {
                "name": "synthetic_spikes",
                "length": length,
                "base-oscillations": base_oscillations,
                "anomalies": anomalies_config,
            }
        ]
    }

    return config, drift_dims

In [7]:
# Build the configuration. NumPy's global RNG is seeded first because
# create_config draws its random parameters from np.random.
np.random.seed(42)
config, drift_dims = create_config(dimensions=DIMS, length=LENGTH)

print(f"Drift Dimensions (Significant First/Second Half Diff): {drift_dims}")

# Generate the series in memory; only one time series is configured.
gutentag = GutenTAG(seed=42)
gutentag.load_config_dict(config)
datasets = gutentag.generate(return_timeseries=True)
df = datasets[0].timeseries

# Prepend a one-minute-resolution timestamp column (built in one vectorized call).
start_date = pd.Timestamp("2023-01-01")
timestamps = pd.date_range(start=start_date, periods=len(df), freq="min")
df.insert(0, "timestamp", timestamps)

# Rename the label column once, then cut into train (first half) / test (second half).
train_len = LENGTH // 2
label_col = "is_anomaly"

labelled = df.rename(columns={label_col: "label"})
train_df = labelled.iloc[:train_len].copy()
test_df = labelled.iloc[train_len:].copy()

# Write each split to <output_dir>/<split>/<split>.csv
output_dir = "../../datasets/synthetic"

splits = {"train": train_df, "test": test_df}
for name, data in splits.items():
    save_dir = os.path.join(output_dir, name)
    os.makedirs(save_dir, exist_ok=True)
    data.to_csv(os.path.join(save_dir, f"{name}.csv"), index=False)

print("Generation Complete.")
print(f"Train: {train_df.shape}, Test: {test_df.shape}")

Drift Dimensions (Significant First/Second Half Diff): [8 1 5]


Initializing addons: 0it [00:00, ?it/s]
Generating datasets:   0%|          | 0/1 [00:00<?, ?it/s]
Finalizing addons: 0it [00:00, ?it/s]

Generation Complete.
Train: (2500, 12), Test: (2500, 12)



