In [None]:
import datetime
import os

import pandas as pd
import torch

from config import CONFIG

# Input folder scanned for CSV files, and the path the processed dataset is saved to.
save_path = CONFIG["processed_dataset_path"]
folder_path = CONFIG["train_data_folder"]

# Whether the first CSV column is parsed as a datetime index when loading.
use_datetime_index = CONFIG["use_datetime_index"]

# Sliding-window geometry: each window spans history_len + predict_len rows.
predict_len = CONFIG["predict_len"]
history_len = CONFIG["history_len"]

# Normalization settings; the percentile falls back to 95 when the key is absent.
percentile_value = CONFIG.get("percentile_value", 95)
normalization_mode = CONFIG["normalization_mode"]


In [None]:
# This cell defines helpers that load multiple CSV files from a specified folder and stack them into a single tensor, compute and apply normalization, and create sliding windows for time series data.
def load_and_stack_csvs(folder_path, use_datetime_index=False):
    """
    Load every ``.csv`` file in a folder and stack them into one tensor.

    Files are processed in sorted filename order, which fixes the order of
    the features along the last dimension of the result. Every file must
    contribute exactly one feature column and share the same index as the
    first file.

    Args:
        folder_path: Directory whose entries ending in ".csv" are loaded
            (sub-directories are not walked).
        use_datetime_index: If True, the first column of each file is parsed
            with ``pd.to_datetime`` and becomes the index, and the remaining
            column is the feature. If False, rows get a ``RangeIndex`` and
            the file must contain only the feature column.

    Returns:
        Tuple ``(stacked, index_ref)`` where ``stacked`` is a float32 tensor
        of shape ``[T, n_features]`` and ``index_ref`` is the index of the
        first file loaded.

    Raises:
        ValueError: If the folder contains no CSV files, if a file's index
            differs from the first file's index, or if a file does not have
            exactly one feature column.
    """
    csv_names = sorted(name for name in os.listdir(folder_path) if name.endswith(".csv"))
    if not csv_names:
        # Fail early with a clear message; torch.stack on an empty list
        # would otherwise raise an opaque RuntimeError at the end.
        raise ValueError(f"No CSV files found in folder: {folder_path}")

    feature_list = []
    index_ref = None

    for filename in csv_names:
        df = pd.read_csv(os.path.join(folder_path, filename))

        if use_datetime_index:
            # Build the index straight from the parsed timestamps. Writing
            # them back with ``df.iloc[:, 0] = ...`` is an in-place write on
            # recent pandas and is not guaranteed to change the column dtype.
            timestamps = pd.to_datetime(df.iloc[:, 0])
            df = df.iloc[:, 1:]
            df.index = pd.Index(timestamps)
        else:
            # Plain positional index 0..T-1
            df.index = pd.RangeIndex(len(df))

        # The first file defines the reference index; all others must match it.
        if index_ref is None:
            index_ref = df.index
        elif not df.index.equals(index_ref):
            raise ValueError(f"Index mismatch between files! {filename}")

        # Exactly one feature column must remain once the index is set.
        if df.shape[1] != 1:
            raise ValueError(f"Expected 1 feature column in {filename}, found {df.shape[1]}.")

        feature_list.append(torch.tensor(df.iloc[:, 0].values, dtype=torch.float32))  # shape [T]

    # Stack features along the last dimension
    stacked = torch.stack(feature_list, dim=-1)  # [T, n_features]

    print(f"✅ Loaded {len(feature_list)} features, stacked shape: {stacked.shape}")

    return stacked, index_ref

def create_sliding_windows(stacked_tensor, history_len, predict_len):
    """
    Cut a time-major tensor into overlapping windows with stride 1.

    Args:
        stacked_tensor: Tensor whose first dimension is time, e.g.
            ``[T, n_features]``.
        history_len: Number of leading rows of each window (>= 0).
        predict_len: Number of trailing rows of each window (>= 0).

    Returns:
        A new contiguous tensor of shape
        ``[T - (history_len + predict_len) + 1, history_len + predict_len, ...]``
        where window ``i`` equals ``stacked_tensor[i : i + history_len + predict_len]``.
        The result does not share memory with ``stacked_tensor``.

    Raises:
        ValueError: If either length is negative, if their sum is zero, or
            if the tensor has fewer rows than one full window.
    """
    if history_len < 0 or predict_len < 0:
        raise ValueError(
            f"history_len and predict_len must be non-negative, got {history_len} and {predict_len}."
        )

    total_seq_len = history_len + predict_len
    if total_seq_len < 1:
        raise ValueError("history_len + predict_len must be at least 1.")

    num_rows = stacked_tensor.shape[0]
    if num_rows < total_seq_len:
        # Without this guard an empty window list reaches torch.stack and
        # surfaces as an unrelated-looking RuntimeError.
        raise ValueError(
            f"Not enough rows for a single window: need {total_seq_len}, got {num_rows}."
        )

    # unfold appends the window dimension last ([N, ..., L]); move it next to
    # the sample dimension to get [N, L, ...]. The clone materializes the
    # overlapping view into independent, contiguous memory.
    windows = (
        stacked_tensor.unfold(0, total_seq_len, 1)
        .movedim(-1, 1)
        .clone(memory_format=torch.contiguous_format)
    )  # [num_samples, total_seq_len, n_features]

    print(f"✅ Created {windows.shape[0]} sliding windows.")

    return windows

def calculate_normalization_params(stacked_tensor, mode="max", percentile_value=95):
    """
    Derive the statistics needed by the selected normalization mode.

    All statistics are reduced over dimension 0 of ``stacked_tensor``.

    Args:
        stacked_tensor: Tensor to compute statistics from.
        mode: One of "max", "standard", "percentile", "exp" or "none".
        percentile_value: Percentile in percent, used only by "percentile".

    Returns:
        Dict with a "type" entry naming the mode, plus:
        "values" (max of absolute values) for "max";
        "mean" and "std" (std already offset by 1e-8) for "standard";
        "percentile" (the requested quantile) for "percentile".
        "exp" and "none" carry no statistics.

    Raises:
        ValueError: If ``mode`` is not one of the supported modes.
    """
    if mode not in ("max", "standard", "percentile", "exp", "none"):
        raise ValueError(f"Unknown normalization mode: {mode}")

    if mode == "max":
        peak = stacked_tensor.abs().max(dim=0).values
        params = {"type": "max", "values": peak}
    elif mode == "standard":
        params = {
            "type": "standard",
            "mean": stacked_tensor.mean(dim=0),
            "std": stacked_tensor.std(dim=0) + 1e-8,
        }
    elif mode == "percentile":
        quantile = torch.quantile(stacked_tensor, percentile_value / 100.0, dim=0)
        params = {"type": "percentile", "percentile": quantile}
    else:
        # "exp" and "none" need no data-dependent statistics.
        params = {"type": "exp"} if mode == "exp" else {"type": "none"}

    print(f"🔵 Calculated '{mode}' normalization parameters.")
    return params


def apply_normalization(stacked_tensor, norm_params, mode="max"):
    """
    Normalize a tensor using previously calculated parameters.

    Args:
        stacked_tensor: Tensor to normalize.
        norm_params: Dict holding the statistics for ``mode``: "values" for
            "max", "mean" and "std" for "standard", "percentile" for
            "percentile". Not read for "exp" and "none".
        mode: One of "max", "standard", "percentile", "exp" or "none".

    Returns:
        The normalized tensor. For "none" the input tensor itself is
        returned unchanged. "exp" returns ``log(x + 1e-8)``.

    Raises:
        ValueError: If ``mode`` is unknown, or if ``mode`` is "exp" and the
            input contains values for which ``x + 1e-8`` is not positive
            (the logarithm would silently produce NaN or -inf).
        KeyError: If ``norm_params`` lacks the entry required by ``mode``.
    """
    if mode == "max":
        norm_values = norm_params["values"]
        normalized = stacked_tensor / (norm_values + 1e-8)

    elif mode == "standard":
        mean = norm_params["mean"]
        std = norm_params["std"]
        normalized = (stacked_tensor - mean) / (std + 1e-8)

    elif mode == "percentile":
        norm_values = norm_params["percentile"]
        normalized = stacked_tensor / (norm_values + 1e-8)

    elif mode == "exp":
        shifted = stacked_tensor + 1e-8
        # log of a non-positive number yields NaN/-inf without any error,
        # which would be saved into the dataset unnoticed; fail loudly.
        # NaNs already present in the input compare False and pass through.
        if bool((shifted <= 0).any()):
            raise ValueError(
                "'exp' normalization applies a logarithm and requires non-negative values, "
                "but the input contains negative entries."
            )
        normalized = torch.log(shifted)

    elif mode == "none":
        normalized = stacked_tensor

    else:
        raise ValueError(f"Unknown normalization mode: {mode}")

    print(f"🔵 Applied '{mode}' normalization.")
    return normalized

# Example usage:
# stacked, index_ref = load_and_stack_csvs("./training_data", use_datetime_index=True)
# windows = create_sliding_windows(stacked, history_len=50, predict_len=20)


In [None]:
# Step 1: Load and stack features
stacked_tensor, index_ref = load_and_stack_csvs(folder_path, use_datetime_index)

# Step 2: Calculate normalization parameters
norm_params = calculate_normalization_params(stacked_tensor, mode=normalization_mode, percentile_value=percentile_value)

# Step 3: Apply normalization
stacked_tensor = apply_normalization(stacked_tensor, norm_params, mode=normalization_mode)

# Step 4: Preroll into sliding windows
windows = create_sliding_windows(stacked_tensor, history_len, predict_len)

# Step 5: Save dataset
# The normalization parameters and mode are stored next to the windows,
# together with a local-time creation timestamp.
save_dict = {
    "windows": windows,
    "norm_params": norm_params,
    "normalization_mode": normalization_mode,
    "timestamp": datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S"),
}
# os.path.dirname returns "" for a bare filename and os.makedirs("") raises,
# so only create the directory when the path actually has one.
save_dir = os.path.dirname(save_path)
if save_dir:
    os.makedirs(save_dir, exist_ok=True)
torch.save(save_dict, save_path)

print(f"✅ Saved processed dataset at: {save_path}")