In [15]:
# Import necessary libraries
import logging
import matplotlib.pyplot as plt
import os
import polars as pl
import seaborn as sns
import warnings
import tarfile

# Blanket-suppress Python warnings for the rest of the session.
warnings.filterwarnings("ignore")

# Let INFO-level records from the root logger show up in the cell output.
logging.basicConfig(level=logging.INFO)

# Shared figure style. The seaborn theme is applied first so that the
# matplotlib overrides below are not reset by it.
sns.set_theme(style="whitegrid")
plt.rcParams.update(
    {
        # Canvas
        "figure.dpi": 120,
        # Axes frame
        "axes.edgecolor": "0.3",
        "axes.linewidth": 0.8,
        # Text sizes and weights
        "font.size": 12,
        "axes.labelsize": 12,
        "axes.titlesize": 14,
        "axes.titleweight": "bold",
        # Legend
        "legend.fontsize": 10,
        "legend.frameon": False,
    }
)

In [16]:
# Unpack the raw Yahoo .tgz archive, unless the target folder is already there.
raw_file_path = "../../datasets/Yahoo/raw/dataset.tgz"
extract_path = "../../datasets/Yahoo/"
unzip_dir = "../../datasets/Yahoo/data"
# Folder that appears under `extract_path` after extraction; it is renamed to
# `unzip_dir` once extraction has finished.
extracted_dir = os.path.join(extract_path, "ydata-labeled-time-series-anomalies-v1_0")

if not os.path.exists(unzip_dir):
    # `unzip_dir` is deliberately NOT created up front:
    #   * if extraction fails, no empty `data/` folder is left behind that
    #     would make every later run skip extraction;
    #   * os.rename() onto an existing directory fails on Windows.
    with tarfile.open(raw_file_path, "r:gz") as tar:
        if hasattr(tarfile, "data_filter"):
            # Extraction filters are available: refuse entries that would
            # escape `extract_path` (absolute paths, `..`, unsafe links).
            tar.extractall(path=extract_path, filter="data")
        else:
            # Older interpreter without extraction filters.
            tar.extractall(path=extract_path)
    logging.info(f"Extracted {raw_file_path} to {extract_path}")
    os.rename(extracted_dir, unzip_dir)
else:
    logging.info(f"Directory {unzip_dir} already exists. Skipping extraction.")

INFO:root:Directory ../../datasets/Yahoo/data already exists. Skipping extraction.


In [17]:
# Yahoo A1Benchmark
data_dir = "../../datasets/Yahoo/data/A1Benchmark"

# Normalise file names and the label column: real_XX.csv -> XX.csv, with the
# `is_anomaly` column renamed to `label`.
# This step is idempotent: it only touches files that still carry the `real_`
# prefix, so it does nothing once the folder has been converted. Keeping it
# active lets the notebook run top-to-bottom after a fresh extraction.
for raw_name in sorted(os.listdir(data_dir)):
    if raw_name.startswith("real_") and raw_name.endswith(".csv"):
        raw_path = os.path.join(data_dir, raw_name)
        raw_df = pl.read_csv(raw_path)
        raw_df.rename({"is_anomaly": "label"}).write_csv(
            os.path.join(data_dir, raw_name.replace("real_", ""))
        )
        # The original file is dropped only after the converted copy is written.
        os.remove(raw_path)
        logging.info(f"Processed {raw_name}.")

# Read a sample file
sample_file = os.path.join(data_dir, "1.csv")
sample = pl.read_csv(
    sample_file,
    schema={"timestamp": pl.UInt64, "value": pl.Float64, "label": pl.Int8},
)
print(f"Sample file: {os.path.basename(sample_file)}")
print(f"Shape: {sample.shape}")
print(f"Columns: {sample.columns}")
sample.head()

Sample file: 1.csv
Shape: (1420, 3)
Columns: ['timestamp', 'value', 'label']


timestamp,value,label
u64,f64,i8
1,0.0,0
2,0.091758,0
3,0.172297,0
4,0.226219,0
5,0.176358,0


In [18]:
# plotting function
def plot_yahoo_timeseries(
    df: pl.DataFrame, split_at: float = 0.5, figsize=(14, 6), title=None
) -> plt.Figure:
    """
    Plot a single Yahoo time series, marking anomalies and distinguishing train/test sets

    The first ``int(len(df) * split_at)`` rows determine the cutoff timestamp:
    rows with a timestamp up to and including the cutoff are drawn as the
    train segment, later rows as the test segment. When that row count is 0
    the whole series is drawn as test data and no split line is shown.

    Args:
        df: DataFrame containing 'timestamp', 'value', and 'label' columns
        split_at: fraction to split train/test sets, within [0, 1] (default: 0.5)
        figsize: figure size (default: (14, 6))
        title: custom title for the plot (optional); when omitted, a summary of
            point and anomaly counts is used

    Returns:
        fig: matplotlib figure object

    Raises:
        ValueError: if ``split_at`` is outside [0, 1]
    """
    if not 0.0 <= split_at <= 1.0:
        raise ValueError(f"split_at must be within [0, 1], got {split_at!r}")

    fig, ax = plt.subplots(1, 1, figsize=figsize)

    # Extract data
    timestamps = df["timestamp"]
    values = df["value"]
    labels = df["label"]
    columns = (timestamps, values, labels)

    # Split train and test sets.
    # With zero train rows the index `n_train - 1` would be -1 and silently
    # pick the LAST timestamp (everything would become "train"), so that case
    # is handled explicitly.
    n_train = int(len(df) * split_at)
    if n_train > 0:
        train_cutoff = timestamps[n_train - 1]
        train_mask = timestamps <= train_cutoff
        test_mask = timestamps > train_cutoff
        train_part = tuple(col.filter(train_mask) for col in columns)
        test_part = tuple(col.filter(test_mask) for col in columns)
    else:
        train_cutoff = None
        train_part = tuple(col.head(0) for col in columns)
        test_part = columns

    train_timestamps, train_values, train_labels = train_part
    test_timestamps, test_values, test_labels = test_part

    segments = (
        ("Train", "#2E86AB", train_part),  # blue
        ("Test", "#06A77D", test_part),  # green
    )

    # Lines first, so the legend reads Train, Test, Anomaly.
    for name, color, (seg_ts, seg_vals, _) in segments:
        if len(seg_ts) > 0:
            ax.plot(
                seg_ts,
                seg_vals,
                color=color,
                linewidth=1.5,
                alpha=0.8,
                label=name,
            )

    # Anomaly markers. Exactly one scatter call carries the legend label, so
    # "Anomaly" appears once whether the anomalies sit in the train half, the
    # test half, or both.
    anomaly_in_legend = False
    for _, _, (seg_ts, seg_vals, seg_labels) in segments:
        if len(seg_ts) == 0:
            continue
        anomaly_mask = seg_labels == 1
        if anomaly_mask.sum() > 0:
            marker_kwargs = dict(
                color="#E63946",
                s=40,
                marker="o",
                zorder=5,  # keep markers above both line segments
                edgecolors="darkred",
                linewidths=1.2,
            )
            if not anomaly_in_legend:
                marker_kwargs["label"] = "Anomaly"
                anomaly_in_legend = True
            ax.scatter(
                seg_ts.filter(anomaly_mask),
                seg_vals.filter(anomaly_mask),
                **marker_kwargs,
            )

    # Add vertical line marking train/test split
    if train_cutoff is not None:
        ax.axvline(
            x=train_cutoff, color="gray", linestyle="--", linewidth=2, alpha=0.5
        )

    # Set title
    if title is None:
        train_anomalies = (train_labels == 1).sum()
        test_anomalies = (test_labels == 1).sum()
        title = (
            f"Total: {len(values)} pts | "
            f"Train: {len(train_values)} pts ({train_anomalies} anomalies) | "
            f"Test: {len(test_values)} pts ({test_anomalies} anomalies)"
        )

    ax.set_title(
        title,
        fontsize=13,
        fontweight="bold",
        pad=10,
    )

    ax.set_xlabel("Timestamp (hours)", fontsize=12, fontweight="bold")
    ax.set_ylabel("Value", fontsize=12, fontweight="bold")

    # Beautify plot
    ax.grid(True, alpha=0.3, linestyle="--", linewidth=1)

    # Only build a legend when something was actually drawn with a label
    # (an empty frame produces no labelled artists).
    if ax.get_legend_handles_labels()[0]:
        ax.legend(
            loc="upper left",
            bbox_to_anchor=(0, 1),
            fontsize=10,
            framealpha=0.95,
            edgecolor="gray",
            fancybox=True,
        )

    # Remove top and right spines
    sns.despine(ax=ax)

    fig.tight_layout()
    return fig

In [19]:
# IDs 1..67 with series 7 left out; `ids` is reused by the cells below.
ids = [series_id for series_id in range(1, 68) if series_id != 7]

# Series 7 is also deleted from disk so no later step can pick it up.
excluded_file = os.path.join(data_dir, "7.csv")
if os.path.exists(excluded_file):
    os.remove(excluded_file)
    logging.info("Removed file 7.csv due to known data issues.")

# One PNG per remaining series.
figure_dir = "../../figures/datasets/YaHoo"
os.makedirs(figure_dir, exist_ok=True)
for series_id in ids:  # not `id`: that would shadow the builtin
    file_path = os.path.join(data_dir, f"{series_id}.csv")
    df = pl.read_csv(
        file_path,
        schema={"timestamp": pl.UInt64, "value": pl.Float64, "label": pl.Int8},
    )

    fig = plot_yahoo_timeseries(
        df,
        figsize=(14, 6),
        title=f"Yahoo Time Series - {series_id}",
    )
    fig.savefig(f"{figure_dir}/A1-{series_id}.png", dpi=150, bbox_inches="tight")
    # Close explicitly: figures created in a loop otherwise accumulate in memory.
    plt.close(fig)
    logging.info(f"Saved plot for File {series_id} as A1-{series_id}.png")

INFO:root:Saved plot for File 1 as A1-1.png
INFO:root:Saved plot for File 2 as A1-2.png
INFO:root:Saved plot for File 3 as A1-3.png
INFO:root:Saved plot for File 4 as A1-4.png
INFO:root:Saved plot for File 5 as A1-5.png
INFO:root:Saved plot for File 6 as A1-6.png
INFO:root:Saved plot for File 8 as A1-8.png
INFO:root:Saved plot for File 9 as A1-9.png
INFO:root:Saved plot for File 10 as A1-10.png
INFO:root:Saved plot for File 11 as A1-11.png
INFO:root:Saved plot for File 12 as A1-12.png
INFO:root:Saved plot for File 13 as A1-13.png
INFO:root:Saved plot for File 14 as A1-14.png
INFO:root:Saved plot for File 15 as A1-15.png
INFO:root:Saved plot for File 16 as A1-16.png
INFO:root:Saved plot for File 17 as A1-17.png
INFO:root:Saved plot for File 18 as A1-18.png
INFO:root:Saved plot for File 19 as A1-19.png
INFO:root:Saved plot for File 20 as A1-20.png
INFO:root:Saved plot for File 21 as A1-21.png
INFO:root:Saved plot for File 22 as A1-22.png
INFO:root:Saved plot for File 23 as A1-23.png
INFO

In [20]:
# Write the train/test halves of every series to separate CSV folders.
train_dir = "../../datasets/Yahoo/train/A1"
test_dir = "../../datasets/Yahoo/test/A1"

os.makedirs(train_dir, exist_ok=True)
os.makedirs(test_dir, exist_ok=True)

# Fraction of rows (from the start of each series) that defines the train set.
split_at = 0.5

for series_id in ids:  # not `id`: that would shadow the builtin
    file_path = os.path.join(data_dir, f"{series_id}.csv")
    df = pl.read_csv(
        file_path,
        schema={"timestamp": pl.UInt64, "value": pl.Float64, "label": pl.Int8},
    )

    # The timestamp of the last train row is the cutoff. With zero train rows
    # the index would be -1 and wrap around to the LAST row, so that case is
    # handled explicitly: empty train set, everything goes to test.
    n_train = int(len(df) * split_at)
    if n_train > 0:
        cutoff = df["timestamp"][n_train - 1]
        train_df = df.filter(pl.col("timestamp") <= cutoff)
        test_df = df.filter(pl.col("timestamp") > cutoff)
    else:
        train_df = df.head(0)
        test_df = df

    # Every row must land in exactly one partition (a null timestamp would
    # match neither filter and be dropped silently).
    assert len(train_df) + len(test_df) == len(df), (
        f"File {series_id}: split lost rows "
        f"({len(train_df)} + {len(test_df)} != {len(df)})"
    )

    train_df.write_csv(os.path.join(train_dir, f"{series_id}.csv"))
    test_df.write_csv(os.path.join(test_dir, f"{series_id}.csv"))

    print(
        f"Saved File {series_id}: Train ({train_df.shape[0]} pts), Test ({test_df.shape[0]} pts)"
    )

Saved File 1: Train (710 pts), Test (710 pts)
Saved File 2: Train (719 pts), Test (720 pts)
Saved File 3: Train (730 pts), Test (731 pts)
Saved File 4: Train (711 pts), Test (712 pts)
Saved File 5: Train (719 pts), Test (720 pts)
Saved File 6: Train (719 pts), Test (720 pts)
Saved File 8: Train (710 pts), Test (710 pts)
Saved File 9: Train (730 pts), Test (731 pts)
Saved File 10: Train (719 pts), Test (720 pts)
Saved File 11: Train (719 pts), Test (720 pts)
Saved File 12: Train (719 pts), Test (720 pts)
Saved File 13: Train (719 pts), Test (720 pts)
Saved File 14: Train (719 pts), Test (720 pts)
Saved File 15: Train (719 pts), Test (720 pts)
Saved File 16: Train (730 pts), Test (731 pts)
Saved File 17: Train (712 pts), Test (712 pts)
Saved File 18: Train (730 pts), Test (731 pts)
Saved File 19: Train (712 pts), Test (712 pts)
Saved File 20: Train (711 pts), Test (711 pts)
Saved File 21: Train (710 pts), Test (710 pts)
Saved File 22: Train (710 pts), Test (710 pts)
Saved File 23: Train 

In [21]:
# Per-series point/anomaly counts for the full series and both partitions.
def _rate_pct(anomalies, points):
    """Anomaly rate in percent; NaN when the partition holds no points."""
    return anomalies / points * 100 if points else float("nan")


stats = []

for series_id in ids:  # not `id`: that would shadow the builtin
    file_path = os.path.join(data_dir, f"{series_id}.csv")
    df = pl.read_csv(
        file_path,
        schema={"timestamp": pl.UInt64, "value": pl.Float64, "label": pl.Int8},
    )

    total_points = len(df)
    total_anomalies = (df["label"] == 1).sum()

    # The timestamp of the last train row is the cutoff. With zero train rows
    # the index would be -1 and wrap around to the LAST row, so that case is
    # handled explicitly: empty train set, everything counts as test.
    n_train = int(total_points * split_at)
    if n_train > 0:
        cutoff = df["timestamp"][n_train - 1]
        train_df = df.filter(pl.col("timestamp") <= cutoff)
        test_df = df.filter(pl.col("timestamp") > cutoff)
    else:
        train_df = df.head(0)
        test_df = df

    train_points = len(train_df)
    train_anomalies = (train_df["label"] == 1).sum()

    test_points = len(test_df)
    test_anomalies = (test_df["label"] == 1).sum()

    stats.append(
        {
            "ID": series_id,
            "Total Points": total_points,
            "Total Anomalies": total_anomalies,
            "Anomaly Rate (%)": _rate_pct(total_anomalies, total_points),
            "Train Points": train_points,
            "Train Anomalies": train_anomalies,
            "Train Anomaly Rate (%)": _rate_pct(train_anomalies, train_points),
            "Test Points": test_points,
            "Test Anomalies": test_anomalies,
            "Test Anomaly Rate (%)": _rate_pct(test_anomalies, test_points),
        }
    )

# Build the frame once from the collected records.
stats_df = pl.DataFrame(stats)
print("Yahoo Time Series Statistics Summary")
stats_df

Yahoo Time Series Statistics Summary


ID,Total Points,Total Anomalies,Anomaly Rate (%),Train Points,Train Anomalies,Train Anomaly Rate (%),Test Points,Test Anomalies,Test Anomaly Rate (%)
i64,i64,i64,f64,i64,i64,f64,i64,i64,f64
1,1420,2,0.140845,710,0,0.0,710,2,0.28169
2,1439,12,0.833912,719,0,0.0,720,12,1.666667
3,1461,14,0.958248,730,0,0.0,731,14,1.915185
4,1423,10,0.702741,711,1,0.140647,712,9,1.264045
5,1439,2,0.138985,719,2,0.278164,720,0,0.0
…,…,…,…,…,…,…,…,…,…
63,1439,8,0.555942,719,0,0.0,720,8,1.111111
64,1441,0,0.0,720,0,0.0,721,0,0.0
65,1424,17,1.19382,712,0,0.0,712,17,2.38764
66,1424,21,1.474719,712,0,0.0,712,21,2.949438
