In [13]:
# Clear the Kaggle working directory
import os
import shutil

# The directory to clear
folder = '/kaggle/working/'

# Loop through everything in the folder and delete it
for filename in os.listdir(folder):
    file_path = os.path.join(folder, filename)
    try:
        if os.path.isfile(file_path) or os.path.islink(file_path):
            os.unlink(file_path)
        elif os.path.isdir(file_path):
            shutil.rmtree(file_path)
    except Exception as e:
        print(f'Failed to delete {file_path}. Reason: {e}')

print("✅ /kaggle/working/ directory has been cleared.")

✅ /kaggle/working/ directory has been cleared.


## VAR+GRU

In [21]:
"""
VAR-GRU Hybrid (VRT Style) - KAGGLE VERSION
-------------------------------------------
Architecture:
1. Linear Stream: VAR (Vector Autoregression) for baseline trend.
2. Non-Linear Stream: GRU (Gated Recurrent Unit) for residual correction.
3. Fusion: Forecast = VAR_Baseline + GRU_Correction.
"""

import os
import glob
import time
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import tensorflow as tf
from tensorflow.keras import layers, models, optimizers, callbacks
from sklearn.preprocessing import StandardScaler
from statsmodels.tsa.api import VAR

# ==========================================
# 0. Kaggle Configuration & Setup
# ==========================================
# Suppress TF warnings
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '2'

# GPU Memory Growth
gpus = tf.config.list_physical_devices('GPU')
if gpus:
    try:
        for gpu in gpus:
            tf.config.experimental.set_memory_growth(gpu, True)
        print("[GPU] Acceleration Enabled")
    except RuntimeError as e:
        print(e)

CONFIG = {
    # Path will be auto-detected
    'dataset_path': None, 
    
    # Data params
    'train_split': 0.7,
    'window_size': 24,       
    'forecast_horizon': 1,   
    
    # VAR params
    'var_max_lags': 15,
    
    # GRU Hyperparameters
    'gru_units': 64,         
    'dense_units': 32,
    'dropout': 0.1,
    'learning_rate': 1e-3,
    'batch_size': 32,
    'epochs': 100
}

# Reproducibility
np.random.seed(42)
tf.random.set_seed(42)

# ==========================================
# 1. Advanced Data Loading & Engineering
# ==========================================
def find_dataset_file():
    """Auto-detects the dataset file in Kaggle input directory"""
    search_path = '/kaggle/input'
    print(f"[SEARCH] Looking for dataset in {search_path}...")
    
    for root, dirs, files in os.walk(search_path):
        for file in files:
            if file.endswith(".csv") or file.endswith(".txt") or file.endswith(".dat"):
                full_path = os.path.join(root, file)
                print(f"[FOUND] Dataset located: {full_path}")
                return full_path
    
    raise FileNotFoundError("Could not find a dataset file in /kaggle/input")

def load_and_engineer_features(filepath):
    print(f"\n[IO] Loading raw data: {filepath}")
    
    try:
        # Try reading as standard CSV first
        try:
            df = pd.read_csv(filepath)
            if df.shape[1] < 2 or '::' in str(df.iloc[0,0]):
                raise ValueError("Likely raw format")
        except:
            print("  > Detected raw format (parsing '::')...")
            df = pd.read_csv(filepath, sep='\t', header=None, names=['slice_timestamp', 'bytes', 'packets'])
            
            if df.shape[1] == 1:
                 df = pd.read_csv(filepath, sep=',', header=None, names=['slice_timestamp', 'bytes', 'packets'])

            split_data = df['slice_timestamp'].str.split('::', expand=True)
            df['slice_type'] = split_data[0]
            df['timestamp'] = pd.to_numeric(split_data[1])
            df['bytes'] = pd.to_numeric(df['bytes'], errors='coerce')
            df['packets'] = pd.to_numeric(df['packets'], errors='coerce')

        if 'slice_label' in df.columns: df.rename(columns={'slice_label': 'slice_type'}, inplace=True)
        
        processed_slices = {}
        
        group_col = 'slice_type' if 'slice_type' in df.columns else 'slice_id'
        if group_col not in df.columns:
            df['slice_type'] = 'Default_Slice'
            group_col = 'slice_type'

        for slice_id in df[group_col].unique():
            print(f"  > Processing Slice: {slice_id}")
            slice_df = df[df[group_col] == slice_id].sort_values('timestamp').copy()
            
            if len(slice_df) < 500: continue 

            # 1. Map Columns
            if 'sum_bytes' in slice_df.columns: slice_df['throughput'] = slice_df['sum_bytes']
            elif 'bytes' in slice_df.columns: slice_df['throughput'] = slice_df['bytes']
            
            if 'sum_packets' in slice_df.columns: slice_df['packet_rate'] = slice_df['sum_packets']
            elif 'packets' in slice_df.columns: slice_df['packet_rate'] = slice_df['packets']
            
            # 2. Engineering: Velocity (Diff) & Volatility (Std)
            slice_df['throughput_diff'] = slice_df['throughput'].diff()
            slice_df['packet_diff'] = slice_df['packet_rate'].diff()
            slice_df['volatility'] = slice_df['throughput'].rolling(5).std().fillna(0)
            
            # 3. Select Features
            cols = ['throughput', 'packet_rate', 'throughput_diff', 'packet_diff', 'volatility']
            final_df = slice_df[cols].dropna()
            
            # 4. Clip Outliers
            p99 = final_df.quantile(0.99)
            final_df = final_df.clip(upper=p99, axis=1)
            
            processed_slices[slice_id] = final_df
            
        return processed_slices

    except Exception as e:
        print(f"[ERROR] Loading failed: {e}")
        return {}

# ==========================================
# 2. Strict Leakage-Free VAR Baseline
# ==========================================
def get_var_residuals(train_df, test_df, maxlags):
    # Noise injection for constant columns
    for col in train_df.columns:
        if train_df[col].nunique() <= 1:
            train_df[col] += np.random.normal(0, 1e-6, size=len(train_df))

    # 1. Fit VAR on Train
    model = VAR(train_df)
    try:
        lag_order_res = model.select_order(maxlags=maxlags)
        lag_order = lag_order_res.aic
        if lag_order < 1: lag_order = 10
    except:
        lag_order = 10
        
    var_results = model.fit(lag_order)
    print(f"  [VAR] Fitted with Lag Order: {lag_order}")
    
    # 2. Train Residuals
    train_pred = var_results.fittedvalues
    train_actual = train_df.iloc[lag_order:]
    train_residuals = train_actual - train_pred

    # 3. Test Baseline (Rolling Forecast)
    coefs = var_results.coefs
    intercept = var_results.intercept
    
    history = pd.concat([train_df.iloc[-lag_order:], test_df])
    history_values = history.values
    
    test_preds = []
    
    for i in range(lag_order, len(history_values)):
        window = history_values[i-lag_order : i]
        window_reversed = window[::-1]
        
        pred = intercept.copy()
        for l in range(lag_order):
            pred += np.dot(coefs[l], window_reversed[l])
            
        test_preds.append(pred)
        
    test_pred_df = pd.DataFrame(test_preds, index=test_df.index, columns=test_df.columns)
    
    # 4. Test Residuals
    test_residuals = test_df - test_pred_df
    
    return train_residuals, test_residuals, test_pred_df

# ==========================================
# 3. GRU Model Architecture (VRT Style)
# ==========================================
def build_gru_model(input_shape, output_dim, config):
    inputs = layers.Input(shape=input_shape)
    
    # --- GRU Layer ---
    # Using return_sequences=True followed by Pooling is often more stable
    x = layers.GRU(config['gru_units'], return_sequences=True, activation='tanh', name="GRU_Layer")(inputs)
    x = layers.Dropout(config['dropout'])(x)
    
    # --- Decoding ---
    x = layers.GlobalAveragePooling1D()(x)
    x = layers.Dense(config['dense_units'], activation='relu')(x)
    
    # Final Linear Projection
    outputs = layers.Dense(output_dim, activation='linear', name="Residual_Output")(x)
    
    model = models.Model(inputs=inputs, outputs=outputs, name="VAR_GRU_Hybrid")
    
    optimizer = optimizers.Adam(learning_rate=config['learning_rate'])
    # Using Huber loss for robustness (Standard VRT Feature)
    model.compile(optimizer=optimizer, loss=tf.keras.losses.Huber(), metrics=['mae'])
    
    return model

# ==========================================
# 4. Helpers
# ==========================================
def create_windows(data, window_size):
    X, y = [], []
    for i in range(len(data) - window_size):
        X.append(data[i : i + window_size])
        y.append(data[i + window_size])
    return np.array(X), np.array(y)

# ==========================================
# 5. Main Execution Pipeline
# ==========================================
def train_evaluate_slice(slice_name, df):
    print(f"\n{'='*40}\n PROCESSING: {slice_name}\n{'='*40}")
    
    # A. Split Data
    split_idx = int(len(df) * CONFIG['train_split'])
    train_raw = df.iloc[:split_idx]
    test_raw = df.iloc[split_idx:]
    
    if len(test_raw) < CONFIG['window_size'] + 50:
        print("  [SKIP] Not enough data for testing.")
        return None

    # B. The Linear Baseline (VAR)
    print("  [1/4] Calculating VAR Residuals...")
    try:
        train_res, test_res, test_var_baseline = get_var_residuals(
            train_raw, test_raw, CONFIG['var_max_lags']
        )
    except Exception as e:
        print(f"  [VAR FAILED] {e}. Skipping slice.")
        return None
    
    # C. Preprocessing for Deep Learning
    print("  [2/4] Scaling & Windowing...")
    scaler = StandardScaler()
    train_res_scaled = scaler.fit_transform(train_res)
    test_res_scaled = scaler.transform(test_res)
    
    X_train, y_train = create_windows(train_res_scaled, CONFIG['window_size'])
    X_test, y_test = create_windows(test_res_scaled, CONFIG['window_size'])
    
    if len(X_train) == 0: return None

    # D. Train GRU Model
    print(f"  [3/4] Training GRU ({len(X_train)} samples)...")
    model = build_gru_model(
        input_shape=(CONFIG['window_size'], X_train.shape[2]),
        output_dim=y_train.shape[1],
        config=CONFIG
    )
    
    early_stop = callbacks.EarlyStopping(monitor='val_loss', patience=15, restore_best_weights=True)
    reduce_lr = callbacks.ReduceLROnPlateau(monitor='val_loss', factor=0.5, patience=5)
    
    history = model.fit(
        X_train, y_train,
        validation_split=0.2,
        epochs=CONFIG['epochs'],
        batch_size=CONFIG['batch_size'],
        callbacks=[early_stop, reduce_lr],
        verbose=0
    )
    print(f"      > Final Val Loss: {history.history['val_loss'][-1]:.5f}")
    
    # E. Final Prediction & Fusion
    print("  [4/4] Forecasting & Fusion...")
    
    # 1. Predict Scaled Residuals
    pred_res_scaled = model.predict(X_test, verbose=0)
    
    # 2. Inverse Scale -> Real Residuals
    pred_res = scaler.inverse_transform(pred_res_scaled)
    
    # 3. Align Baseline
    baseline_aligned = test_var_baseline.iloc[CONFIG['window_size']:].values
    y_true_aligned = test_raw.iloc[CONFIG['window_size']:].values
    
    # Truncate to matching lengths
    min_len = min(len(baseline_aligned), len(pred_res))
    baseline_aligned = baseline_aligned[:min_len]
    y_true_aligned = y_true_aligned[:min_len]
    pred_res = pred_res[:min_len]
    
    # 4. FUSION
    final_forecast = baseline_aligned + pred_res
    final_forecast = np.maximum(final_forecast, 0)
    
    # Metrics
    mse = np.mean((y_true_aligned - final_forecast)**2, axis=0)
    rmse = np.sqrt(mse)
    mae = np.mean(np.abs(y_true_aligned - final_forecast), axis=0)
    
    # Plotting
    plt.figure(figsize=(12, 4))
    idx = 0 # Throughput
    plt.plot(y_true_aligned[:, idx], label='Actual', color='black', alpha=0.6)
    plt.plot(baseline_aligned[:, idx], label='VAR Only', color='orange', linestyle='--', alpha=0.7)
    plt.plot(final_forecast[:, idx], label='VAR+GRU (Final)', color='green', linewidth=1.5)
    plt.title(f"Slice: {slice_name} | RMSE: {rmse[idx]:.2f}")
    plt.legend()
    plt.grid(True, alpha=0.3)
    plt.show()
    
    return {
        'slice': slice_name,
        'rmse_throughput': rmse[0],
        'mae_throughput': mae[0]
    }

def main():
    try:
        # Auto-detect file
        filepath = find_dataset_file()
        CONFIG['dataset_path'] = filepath
        
        slices_data = load_and_engineer_features(filepath)
        
        results = []
        for name, data in slices_data.items():
            res = train_evaluate_slice(name, data)
            if res: results.append(res)
            
        if results:
            res_df = pd.DataFrame(results)
            print("\nFinal Results Summary (VAR + GRU):")
            print(res_df)
    except Exception as e:
        print(f"Critical Error: {e}")

if __name__ == "__main__":
    main()

Physical devices cannot be modified after being initialized
[SEARCH] Looking for dataset in /kaggle/input...
[FOUND] Dataset located: /kaggle/input/feature-extracted/part-00000.csv

[IO] Loading raw data: /kaggle/input/feature-extracted/part-00000.csv
  > Processing Slice: MMTC
  > Processing Slice: Naver
  > Processing Slice: Youtube

 PROCESSING: MMTC
  [1/4] Calculating VAR Residuals...


  self._init_dates(dates, freq)


  [VAR] Fitted with Lag Order: 10
  [2/4] Scaling & Windowing...
  [3/4] Training GRU (1379 samples)...
      > Final Val Loss: 0.15343
  [4/4] Forecasting & Fusion...

 PROCESSING: Naver
  [1/4] Calculating VAR Residuals...


  self._init_dates(dates, freq)


  [VAR] Fitted with Lag Order: 15
  [2/4] Scaling & Windowing...
  [3/4] Training GRU (2765 samples)...
      > Final Val Loss: 0.19457
  [4/4] Forecasting & Fusion...

 PROCESSING: Youtube
  [1/4] Calculating VAR Residuals...


  self._init_dates(dates, freq)


  [VAR] Fitted with Lag Order: 15
  [2/4] Scaling & Windowing...
  [3/4] Training GRU (10483 samples)...
      > Final Val Loss: 0.47434
  [4/4] Forecasting & Fusion...

Final Results Summary (VAR + GRU):
     slice  rmse_throughput  mae_throughput
0     MMTC      5983.788178     1085.507324
1    Naver    899320.757439   654366.283389
2  Youtube    964825.491253   779503.070344


## VAR+GRU+TFT

In [20]:
"""
Hybrid VAR-GRU-TFT (Temporal Fusion Transformer) - KAGGLE VERSION
-----------------------------------------------------------------
Architecture:
1. Linear Stream: VAR (Vector Autoregression) for baseline trend.
2. Non-Linear Stream: GRU + Multi-Head Attention for residual correction.
3. Fusion: Forecast = VAR_Baseline + (GRU+TFT)_Correction.
"""

import os
import glob
import time
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import tensorflow as tf
from tensorflow.keras import layers, models, optimizers, callbacks
from tensorflow.keras.layers import Input, GRU, Dense, Dropout, MultiHeadAttention, LayerNormalization, Add, GlobalAveragePooling1D
from sklearn.preprocessing import StandardScaler
from statsmodels.tsa.api import VAR

# ==========================================
# 0. Kaggle Configuration & Setup
# ==========================================
# Suppress TF warnings
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '2'

# GPU Memory Growth
gpus = tf.config.list_physical_devices('GPU')
if gpus:
    try:
        for gpu in gpus:
            tf.config.experimental.set_memory_growth(gpu, True)
        print("[GPU] Acceleration Enabled")
    except RuntimeError as e:
        print(e)

CONFIG = {
    # Path will be auto-detected
    'dataset_path': None, 
    
    # Data params
    'train_split': 0.7,
    'window_size': 24,       
    'forecast_horizon': 1,   
    
    # VAR params
    'var_max_lags': 15,
    
    # Model Hyperparameters (GRU + TFT)
    'gru_units': 64,         # Size of GRU hidden state
    'head_size': 64,         # Key dimension for Attention
    'num_heads': 4,          # Number of Attention Heads
    'dropout': 0.15,
    'learning_rate': 1e-3,
    'batch_size': 32,
    'epochs': 150
}

# Reproducibility
np.random.seed(42)
tf.random.set_seed(42)

# ==========================================
# 1. Advanced Data Loading & Engineering
# ==========================================
def find_dataset_file():
    """Auto-detects the dataset file in Kaggle input directory"""
    search_path = '/kaggle/input'
    print(f"[SEARCH] Looking for dataset in {search_path}...")
    
    for root, dirs, files in os.walk(search_path):
        for file in files:
            if file.endswith(".csv") or file.endswith(".txt") or file.endswith(".dat"):
                full_path = os.path.join(root, file)
                print(f"[FOUND] Dataset located: {full_path}")
                return full_path
    
    raise FileNotFoundError("Could not find a dataset file in /kaggle/input")

def load_and_engineer_features(filepath):
    print(f"\n[IO] Loading raw data: {filepath}")
    
    try:
        # Try reading as standard CSV first
        try:
            df = pd.read_csv(filepath)
            if df.shape[1] < 2 or '::' in str(df.iloc[0,0]):
                raise ValueError("Likely raw format")
        except:
            print("  > Detected raw format (parsing '::')...")
            df = pd.read_csv(filepath, sep='\t', header=None, names=['slice_timestamp', 'bytes', 'packets'])
            
            if df.shape[1] == 1:
                 df = pd.read_csv(filepath, sep=',', header=None, names=['slice_timestamp', 'bytes', 'packets'])

            split_data = df['slice_timestamp'].str.split('::', expand=True)
            df['slice_type'] = split_data[0]
            df['timestamp'] = pd.to_numeric(split_data[1])
            df['bytes'] = pd.to_numeric(df['bytes'], errors='coerce')
            df['packets'] = pd.to_numeric(df['packets'], errors='coerce')

        if 'slice_label' in df.columns: df.rename(columns={'slice_label': 'slice_type'}, inplace=True)
        
        processed_slices = {}
        
        group_col = 'slice_type' if 'slice_type' in df.columns else 'slice_id'
        if group_col not in df.columns:
            df['slice_type'] = 'Default_Slice'
            group_col = 'slice_type'

        for slice_id in df[group_col].unique():
            print(f"  > Processing Slice: {slice_id}")
            slice_df = df[df[group_col] == slice_id].sort_values('timestamp').copy()
            
            if len(slice_df) < 500: continue 

            # 1. Map Columns
            if 'sum_bytes' in slice_df.columns: slice_df['throughput'] = slice_df['sum_bytes']
            elif 'bytes' in slice_df.columns: slice_df['throughput'] = slice_df['bytes']
            
            if 'sum_packets' in slice_df.columns: slice_df['packet_rate'] = slice_df['sum_packets']
            elif 'packets' in slice_df.columns: slice_df['packet_rate'] = slice_df['packets']
            
            # 2. Engineering: Velocity (Diff) & Volatility (Std)
            slice_df['throughput_diff'] = slice_df['throughput'].diff()
            slice_df['packet_diff'] = slice_df['packet_rate'].diff()
            slice_df['volatility'] = slice_df['throughput'].rolling(5).std().fillna(0)
            
            # 3. Select Features
            cols = ['throughput', 'packet_rate', 'throughput_diff', 'packet_diff', 'volatility']
            final_df = slice_df[cols].dropna()
            
            # 4. Clip Outliers
            p99 = final_df.quantile(0.99)
            final_df = final_df.clip(upper=p99, axis=1)
            
            processed_slices[slice_id] = final_df
            
        return processed_slices

    except Exception as e:
        print(f"[ERROR] Loading failed: {e}")
        return {}

# ==========================================
# 2. Strict Leakage-Free VAR Baseline
# ==========================================
def get_var_residuals(train_df, test_df, maxlags):
    # Noise injection for constant columns
    for col in train_df.columns:
        if train_df[col].nunique() <= 1:
            train_df[col] += np.random.normal(0, 1e-6, size=len(train_df))

    # 1. Fit VAR on Train
    model = VAR(train_df)
    try:
        lag_order_res = model.select_order(maxlags=maxlags)
        lag_order = lag_order_res.aic
        if lag_order < 1: lag_order = 10
    except:
        lag_order = 10
        
    var_results = model.fit(lag_order)
    print(f"  [VAR] Fitted with Lag Order: {lag_order}")
    
    # 2. Train Residuals
    train_pred = var_results.fittedvalues
    train_actual = train_df.iloc[lag_order:]
    train_residuals = train_actual - train_pred

    # 3. Test Baseline (Rolling Forecast)
    coefs = var_results.coefs
    intercept = var_results.intercept
    
    history = pd.concat([train_df.iloc[-lag_order:], test_df])
    history_values = history.values
    
    test_preds = []
    
    for i in range(lag_order, len(history_values)):
        window = history_values[i-lag_order : i]
        window_reversed = window[::-1]
        
        pred = intercept.copy()
        for l in range(lag_order):
            pred += np.dot(coefs[l], window_reversed[l])
            
        test_preds.append(pred)
        
    test_pred_df = pd.DataFrame(test_preds, index=test_df.index, columns=test_df.columns)
    
    # 4. Test Residuals
    test_residuals = test_df - test_pred_df
    
    return train_residuals, test_residuals, test_pred_df

# ==========================================
# 3. GRU -> TFT Model Architecture
# ==========================================
def build_tft_hybrid_model(input_shape, output_dim, config):
    inputs = Input(shape=input_shape)
    
    # --- LAYER 1: GRU (Sequence Processing) ---
    # return_sequences=True is critical for Attention to work
    x = GRU(config['gru_units'], return_sequences=True, activation='tanh', name="GRU_Seq")(inputs)
    x = Dropout(config['dropout'])(x)
    
    # --- LAYER 2: TFT / Attention Block ---
    # Self-Attention on the GRU output
    attn_out = MultiHeadAttention(
        num_heads=config['num_heads'], 
        key_dim=config['head_size'], 
        name="TFT_Attention"
    )(x, x)
    
    # --- LAYER 3: Residual Connection & Norm ---
    # Combine Sequential (GRU) + Global Context (Attention)
    x = Add(name="Skip_Connection")([x, attn_out])
    x = LayerNormalization(epsilon=1e-6, name="TFT_Norm")(x)
    
    # --- LAYER 4: Decoding ---
    x = GlobalAveragePooling1D()(x) # Flatten time dimension
    x = Dropout(config['dropout'])(x)
    
    # Final Linear Projection
    outputs = Dense(output_dim, activation='linear', name="Residual_Output")(x)
    
    model = models.Model(inputs=inputs, outputs=outputs, name="VAR_GRU_TFT")
    
    optimizer = optimizers.Adam(learning_rate=config['learning_rate'])
    # Using Huber loss (VRT style) for robustness against bursts
    model.compile(optimizer=optimizer, loss=tf.keras.losses.Huber(), metrics=['mae'])
    
    return model

# ==========================================
# 4. Helpers
# ==========================================
def create_windows(data, window_size):
    X, y = [], []
    for i in range(len(data) - window_size):
        X.append(data[i : i + window_size])
        y.append(data[i + window_size])
    return np.array(X), np.array(y)

# ==========================================
# 5. Main Execution Pipeline
# ==========================================
def train_evaluate_slice(slice_name, df):
    print(f"\n{'='*40}\n PROCESSING: {slice_name}\n{'='*40}")
    
    # A. Split Data
    split_idx = int(len(df) * CONFIG['train_split'])
    train_raw = df.iloc[:split_idx]
    test_raw = df.iloc[split_idx:]
    
    if len(test_raw) < CONFIG['window_size'] + 50:
        print("  [SKIP] Not enough data for testing.")
        return None

    # B. The Linear Baseline (VAR)
    print("  [1/4] Calculating VAR Residuals...")
    try:
        train_res, test_res, test_var_baseline = get_var_residuals(
            train_raw, test_raw, CONFIG['var_max_lags']
        )
    except Exception as e:
        print(f"  [VAR FAILED] {e}. Skipping slice.")
        return None
    
    # C. Preprocessing for Deep Learning
    print("  [2/4] Scaling & Windowing...")
    scaler = StandardScaler()
    train_res_scaled = scaler.fit_transform(train_res)
    test_res_scaled = scaler.transform(test_res)
    
    X_train, y_train = create_windows(train_res_scaled, CONFIG['window_size'])
    X_test, y_test = create_windows(test_res_scaled, CONFIG['window_size'])
    
    if len(X_train) == 0: return None

    # D. Train GRU-TFT Model
    print(f"  [3/4] Training GRU-TFT ({len(X_train)} samples)...")
    model = build_tft_hybrid_model(
        input_shape=(CONFIG['window_size'], X_train.shape[2]),
        output_dim=y_train.shape[1],
        config=CONFIG
    )
    
    early_stop = callbacks.EarlyStopping(monitor='val_loss', patience=15, restore_best_weights=True)
    reduce_lr = callbacks.ReduceLROnPlateau(monitor='val_loss', factor=0.5, patience=5)
    
    history = model.fit(
        X_train, y_train,
        validation_split=0.2,
        epochs=CONFIG['epochs'],
        batch_size=CONFIG['batch_size'],
        callbacks=[early_stop, reduce_lr],
        verbose=0
    )
    print(f"      > Final Val Loss: {history.history['val_loss'][-1]:.5f}")
    
    # E. Final Prediction & Fusion
    print("  [4/4] Forecasting & Fusion...")
    
    # 1. Predict Scaled Residuals
    pred_res_scaled = model.predict(X_test, verbose=0)
    
    # 2. Inverse Scale -> Real Residuals
    pred_res = scaler.inverse_transform(pred_res_scaled)
    
    # 3. Align Baseline
    baseline_aligned = test_var_baseline.iloc[CONFIG['window_size']:].values
    y_true_aligned = test_raw.iloc[CONFIG['window_size']:].values
    
    # Truncate to matching lengths
    min_len = min(len(baseline_aligned), len(pred_res))
    baseline_aligned = baseline_aligned[:min_len]
    y_true_aligned = y_true_aligned[:min_len]
    pred_res = pred_res[:min_len]
    
    # 4. FUSION
    final_forecast = baseline_aligned + pred_res
    final_forecast = np.maximum(final_forecast, 0)
    
    # Metrics
    mse = np.mean((y_true_aligned - final_forecast)**2, axis=0)
    rmse = np.sqrt(mse)
    mae = np.mean(np.abs(y_true_aligned - final_forecast), axis=0)
    
    # Plotting
    plt.figure(figsize=(12, 4))
    idx = 0 # Throughput
    plt.plot(y_true_aligned[:, idx], label='Actual', color='black', alpha=0.6)
    plt.plot(baseline_aligned[:, idx], label='VAR Only', color='orange', linestyle='--', alpha=0.7)
    plt.plot(final_forecast[:, idx], label='VAR+GRU+TFT (Final)', color='blue', linewidth=1.5)
    plt.title(f"Slice: {slice_name} | RMSE: {rmse[idx]:.2f}")
    plt.legend()
    plt.grid(True, alpha=0.3)
    plt.show()
    
    return {
        'slice': slice_name,
        'rmse_throughput': rmse[0],
        'mae_throughput': mae[0]
    }

def main():
    try:
        # Auto-detect file
        filepath = find_dataset_file()
        CONFIG['dataset_path'] = filepath
        
        slices_data = load_and_engineer_features(filepath)
        
        results = []
        for name, data in slices_data.items():
            res = train_evaluate_slice(name, data)
            if res: results.append(res)
            
        if results:
            res_df = pd.DataFrame(results)
            print("\nFinal Results Summary (VAR + GRU + TFT):")
            print(res_df)
    except Exception as e:
        print(f"Critical Error: {e}")

if __name__ == "__main__":
    main()

Physical devices cannot be modified after being initialized
[SEARCH] Looking for dataset in /kaggle/input...
[FOUND] Dataset located: /kaggle/input/feature-extracted/part-00000.csv

[IO] Loading raw data: /kaggle/input/feature-extracted/part-00000.csv
  > Processing Slice: MMTC
  > Processing Slice: Naver
  > Processing Slice: Youtube

 PROCESSING: MMTC
  [1/4] Calculating VAR Residuals...


  self._init_dates(dates, freq)


  [VAR] Fitted with Lag Order: 10
  [2/4] Scaling & Windowing...
  [3/4] Training GRU-TFT (1379 samples)...
      > Final Val Loss: 0.14843
  [4/4] Forecasting & Fusion...

 PROCESSING: Naver
  [1/4] Calculating VAR Residuals...


  self._init_dates(dates, freq)


  [VAR] Fitted with Lag Order: 15
  [2/4] Scaling & Windowing...
  [3/4] Training GRU-TFT (2765 samples)...
      > Final Val Loss: 0.19390
  [4/4] Forecasting & Fusion...

 PROCESSING: Youtube
  [1/4] Calculating VAR Residuals...


  self._init_dates(dates, freq)


  [VAR] Fitted with Lag Order: 15
  [2/4] Scaling & Windowing...
  [3/4] Training GRU-TFT (10483 samples)...
      > Final Val Loss: 0.46931
  [4/4] Forecasting & Fusion...

Final Results Summary (VAR + GRU + TFT):
     slice  rmse_throughput  mae_throughput
0     MMTC      5894.143230     1154.303040
1    Naver    916700.750547   682464.718177
2  Youtube    956520.490395   775075.237329


## VAR+GRU+TCN

In [22]:
"""
Hybrid VAR-GRU-TCN (Temporal Convolutional Network) - KAGGLE VERSION
--------------------------------------------------------------------
Architecture:
1. Linear Stream: VAR (Vector Autoregression) for baseline trend.
2. Non-Linear Stream: GRU + Causal TCN for residual correction.
3. Fusion: Forecast = VAR_Baseline + (GRU+TCN)_Correction.
"""

import os
import glob
import time
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import tensorflow as tf
from tensorflow.keras import layers, models, optimizers, callbacks
from tensorflow.keras.layers import Input, GRU, Dense, Dropout, Conv1D, LayerNormalization, Add, GlobalAveragePooling1D
from sklearn.preprocessing import StandardScaler
from statsmodels.tsa.api import VAR

# ==========================================
# 0. Kaggle Configuration & Setup
# ==========================================
# Suppress TF warnings
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '2'

# GPU Memory Growth
gpus = tf.config.list_physical_devices('GPU')
if gpus:
    try:
        for gpu in gpus:
            tf.config.experimental.set_memory_growth(gpu, True)
        print("[GPU] Acceleration Enabled")
    except RuntimeError as e:
        print(e)

CONFIG = {
    # Path will be auto-detected
    'dataset_path': None, 
    
    # Data params
    'train_split': 0.7,
    'window_size': 24,       
    'forecast_horizon': 1,   
    
    # VAR params
    'var_max_lags': 15,
    
    # Model Hyperparameters (GRU + TCN)
    'gru_units': 64,         
    'tcn_filters': 64,       # Must match GRU units for Add() layer
    'tcn_kernel': 3,         # Convolution window size
    'dropout': 0.15,
    'learning_rate': 1e-3,
    'batch_size': 32,
    'epochs': 150
}

# Reproducibility
np.random.seed(42)
tf.random.set_seed(42)

# ==========================================
# 1. Advanced Data Loading & Engineering
# ==========================================
def find_dataset_file():
    """Auto-detects the dataset file in Kaggle input directory"""
    search_path = '/kaggle/input'
    print(f"[SEARCH] Looking for dataset in {search_path}...")
    
    for root, dirs, files in os.walk(search_path):
        for file in files:
            if file.endswith(".csv") or file.endswith(".txt") or file.endswith(".dat"):
                full_path = os.path.join(root, file)
                print(f"[FOUND] Dataset located: {full_path}")
                return full_path
    
    raise FileNotFoundError("Could not find a dataset file in /kaggle/input")

def load_and_engineer_features(filepath):
    print(f"\n[IO] Loading raw data: {filepath}")
    
    try:
        # Try reading as standard CSV first
        try:
            df = pd.read_csv(filepath)
            if df.shape[1] < 2 or '::' in str(df.iloc[0,0]):
                raise ValueError("Likely raw format")
        except:
            print("  > Detected raw format (parsing '::')...")
            df = pd.read_csv(filepath, sep='\t', header=None, names=['slice_timestamp', 'bytes', 'packets'])
            
            if df.shape[1] == 1:
                 df = pd.read_csv(filepath, sep=',', header=None, names=['slice_timestamp', 'bytes', 'packets'])

            split_data = df['slice_timestamp'].str.split('::', expand=True)
            df['slice_type'] = split_data[0]
            df['timestamp'] = pd.to_numeric(split_data[1])
            df['bytes'] = pd.to_numeric(df['bytes'], errors='coerce')
            df['packets'] = pd.to_numeric(df['packets'], errors='coerce')

        if 'slice_label' in df.columns: df.rename(columns={'slice_label': 'slice_type'}, inplace=True)
        
        processed_slices = {}
        
        group_col = 'slice_type' if 'slice_type' in df.columns else 'slice_id'
        if group_col not in df.columns:
            df['slice_type'] = 'Default_Slice'
            group_col = 'slice_type'

        for slice_id in df[group_col].unique():
            print(f"  > Processing Slice: {slice_id}")
            slice_df = df[df[group_col] == slice_id].sort_values('timestamp').copy()
            
            if len(slice_df) < 500: continue 

            # 1. Map Columns
            if 'sum_bytes' in slice_df.columns: slice_df['throughput'] = slice_df['sum_bytes']
            elif 'bytes' in slice_df.columns: slice_df['throughput'] = slice_df['bytes']
            
            if 'sum_packets' in slice_df.columns: slice_df['packet_rate'] = slice_df['sum_packets']
            elif 'packets' in slice_df.columns: slice_df['packet_rate'] = slice_df['packets']
            
            # 2. Engineering: Velocity (Diff) & Volatility (Std)
            slice_df['throughput_diff'] = slice_df['throughput'].diff()
            slice_df['packet_diff'] = slice_df['packet_rate'].diff()
            slice_df['volatility'] = slice_df['throughput'].rolling(5).std().fillna(0)
            
            # 3. Select Features
            cols = ['throughput', 'packet_rate', 'throughput_diff', 'packet_diff', 'volatility']
            final_df = slice_df[cols].dropna()
            
            # 4. Clip Outliers
            p99 = final_df.quantile(0.99)
            final_df = final_df.clip(upper=p99, axis=1)
            
            processed_slices[slice_id] = final_df
            
        return processed_slices

    except Exception as e:
        print(f"[ERROR] Loading failed: {e}")
        return {}

# ==========================================
# 2. Strict Leakage-Free VAR Baseline
# ==========================================
def get_var_residuals(train_df, test_df, maxlags):
    # Noise injection for constant columns
    for col in train_df.columns:
        if train_df[col].nunique() <= 1:
            train_df[col] += np.random.normal(0, 1e-6, size=len(train_df))

    # 1. Fit VAR on Train
    model = VAR(train_df)
    try:
        lag_order_res = model.select_order(maxlags=maxlags)
        lag_order = lag_order_res.aic
        if lag_order < 1: lag_order = 10
    except:
        lag_order = 10
        
    var_results = model.fit(lag_order)
    print(f"  [VAR] Fitted with Lag Order: {lag_order}")
    
    # 2. Train Residuals
    train_pred = var_results.fittedvalues
    train_actual = train_df.iloc[lag_order:]
    train_residuals = train_actual - train_pred

    # 3. Test Baseline (Rolling Forecast)
    coefs = var_results.coefs
    intercept = var_results.intercept
    
    history = pd.concat([train_df.iloc[-lag_order:], test_df])
    history_values = history.values
    
    test_preds = []
    
    for i in range(lag_order, len(history_values)):
        window = history_values[i-lag_order : i]
        window_reversed = window[::-1]
        
        pred = intercept.copy()
        for l in range(lag_order):
            pred += np.dot(coefs[l], window_reversed[l])
            
        test_preds.append(pred)
        
    test_pred_df = pd.DataFrame(test_preds, index=test_df.index, columns=test_df.columns)
    
    # 4. Test Residuals
    test_residuals = test_df - test_pred_df
    
    return train_residuals, test_residuals, test_pred_df

# ==========================================
# 3. GRU -> TCN Model Architecture
# ==========================================
def build_tcn_hybrid_model(input_shape, output_dim, config):
    inputs = Input(shape=input_shape)
    
    # --- LAYER 1: GRU (Sequence Processing) ---
    # return_sequences=True allows TCN to see the full time history
    x = GRU(config['gru_units'], return_sequences=True, activation='tanh', name="GRU_Seq")(inputs)
    x = Dropout(config['dropout'])(x)
    
    # --- LAYER 2: TCN (Causal Convolution) ---
    # 1st Conv Block - Extracts local bursts
    tcn_out = Conv1D(filters=config['tcn_filters'], 
                     kernel_size=config['tcn_kernel'], 
                     padding='causal', 
                     activation='relu', 
                     name="TCN_Block_1")(x)
    
    # 2nd Conv Block - Refines patterns
    tcn_out = Conv1D(filters=config['tcn_filters'], 
                     kernel_size=config['tcn_kernel'], 
                     padding='causal', 
                     activation='relu', 
                     name="TCN_Block_2")(tcn_out)
    
    # --- LAYER 3: Residual Connection ---
    # Combine Time Dynamics (GRU) + Local Patterns (TCN)
    x = Add(name="Skip_Connection")([x, tcn_out])
    x = LayerNormalization(epsilon=1e-6, name="TCN_Norm")(x)
    
    # --- LAYER 4: Decoding ---
    x = GlobalAveragePooling1D()(x) # Flatten time dimension
    x = Dense(32, activation='relu')(x)
    x = Dropout(config['dropout'])(x)
    
    # Final Linear Projection
    outputs = Dense(output_dim, activation='linear', name="Residual_Output")(x)
    
    model = models.Model(inputs=inputs, outputs=outputs, name="VAR_GRU_TCN")
    
    optimizer = optimizers.Adam(learning_rate=config['learning_rate'])
    # Huber Loss for robustness
    model.compile(optimizer=optimizer, loss=tf.keras.losses.Huber(), metrics=['mae'])
    
    return model

# ==========================================
# 4. Helpers
# ==========================================
def create_windows(data, window_size):
    X, y = [], []
    for i in range(len(data) - window_size):
        X.append(data[i : i + window_size])
        y.append(data[i + window_size])
    return np.array(X), np.array(y)

# ==========================================
# 5. Main Execution Pipeline
# ==========================================
def train_evaluate_slice(slice_name, df):
    print(f"\n{'='*40}\n PROCESSING: {slice_name}\n{'='*40}")
    
    # A. Split Data
    split_idx = int(len(df) * CONFIG['train_split'])
    train_raw = df.iloc[:split_idx]
    test_raw = df.iloc[split_idx:]
    
    if len(test_raw) < CONFIG['window_size'] + 50:
        print("  [SKIP] Not enough data for testing.")
        return None

    # B. The Linear Baseline (VAR)
    print("  [1/4] Calculating VAR Residuals...")
    try:
        train_res, test_res, test_var_baseline = get_var_residuals(
            train_raw, test_raw, CONFIG['var_max_lags']
        )
    except Exception as e:
        print(f"  [VAR FAILED] {e}. Skipping slice.")
        return None
    
    # C. Preprocessing for Deep Learning
    print("  [2/4] Scaling & Windowing...")
    scaler = StandardScaler()
    train_res_scaled = scaler.fit_transform(train_res)
    test_res_scaled = scaler.transform(test_res)
    
    X_train, y_train = create_windows(train_res_scaled, CONFIG['window_size'])
    X_test, y_test = create_windows(test_res_scaled, CONFIG['window_size'])
    
    if len(X_train) == 0: return None

    # D. Train GRU-TCN Model
    print(f"  [3/4] Training GRU-TCN ({len(X_train)} samples)...")
    model = build_tcn_hybrid_model(
        input_shape=(CONFIG['window_size'], X_train.shape[2]),
        output_dim=y_train.shape[1],
        config=CONFIG
    )
    
    early_stop = callbacks.EarlyStopping(monitor='val_loss', patience=15, restore_best_weights=True)
    reduce_lr = callbacks.ReduceLROnPlateau(monitor='val_loss', factor=0.5, patience=5)
    
    history = model.fit(
        X_train, y_train,
        validation_split=0.2,
        epochs=CONFIG['epochs'],
        batch_size=CONFIG['batch_size'],
        callbacks=[early_stop, reduce_lr],
        verbose=0
    )
    print(f"      > Final Val Loss: {history.history['val_loss'][-1]:.5f}")
    
    # E. Final Prediction & Fusion
    print("  [4/4] Forecasting & Fusion...")
    
    # 1. Predict Scaled Residuals
    pred_res_scaled = model.predict(X_test, verbose=0)
    
    # 2. Inverse Scale -> Real Residuals
    pred_res = scaler.inverse_transform(pred_res_scaled)
    
    # 3. Align Baseline
    baseline_aligned = test_var_baseline.iloc[CONFIG['window_size']:].values
    y_true_aligned = test_raw.iloc[CONFIG['window_size']:].values
    
    # Truncate to matching lengths
    min_len = min(len(baseline_aligned), len(pred_res))
    baseline_aligned = baseline_aligned[:min_len]
    y_true_aligned = y_true_aligned[:min_len]
    pred_res = pred_res[:min_len]
    
    # 4. FUSION
    final_forecast = baseline_aligned + pred_res
    final_forecast = np.maximum(final_forecast, 0)
    
    # Metrics
    mse = np.mean((y_true_aligned - final_forecast)**2, axis=0)
    rmse = np.sqrt(mse)
    mae = np.mean(np.abs(y_true_aligned - final_forecast), axis=0)
    
    # Plotting
    plt.figure(figsize=(12, 4))
    idx = 0 # Throughput
    plt.plot(y_true_aligned[:, idx], label='Actual', color='black', alpha=0.6)
    plt.plot(baseline_aligned[:, idx], label='VAR Only', color='orange', linestyle='--', alpha=0.7)
    plt.plot(final_forecast[:, idx], label='VAR+GRU+TCN (Final)', color='red', linewidth=1.5)
    plt.title(f"Slice: {slice_name} | RMSE: {rmse[idx]:.2f}")
    plt.legend()
    plt.grid(True, alpha=0.3)
    plt.show()
    
    return {
        'slice': slice_name,
        'rmse_throughput': rmse[0],
        'mae_throughput': mae[0]
    }

def main():
    try:
        # Auto-detect file
        filepath = find_dataset_file()
        CONFIG['dataset_path'] = filepath
        
        slices_data = load_and_engineer_features(filepath)
        
        results = []
        for name, data in slices_data.items():
            res = train_evaluate_slice(name, data)
            if res: results.append(res)
            
        if results:
            res_df = pd.DataFrame(results)
            print("\nFinal Results Summary (VAR + GRU + TCN):")
            print(res_df)
    except Exception as e:
        print(f"Critical Error: {e}")

if __name__ == "__main__":
    main()

Physical devices cannot be modified after being initialized
[SEARCH] Looking for dataset in /kaggle/input...
[FOUND] Dataset located: /kaggle/input/feature-extracted/part-00000.csv

[IO] Loading raw data: /kaggle/input/feature-extracted/part-00000.csv
  > Processing Slice: MMTC
  > Processing Slice: Naver
  > Processing Slice: Youtube

 PROCESSING: MMTC
  [1/4] Calculating VAR Residuals...


  self._init_dates(dates, freq)


  [VAR] Fitted with Lag Order: 10
  [2/4] Scaling & Windowing...
  [3/4] Training GRU-TCN (1379 samples)...
      > Final Val Loss: 0.15890
  [4/4] Forecasting & Fusion...

 PROCESSING: Naver
  [1/4] Calculating VAR Residuals...


  self._init_dates(dates, freq)


  [VAR] Fitted with Lag Order: 15
  [2/4] Scaling & Windowing...
  [3/4] Training GRU-TCN (2765 samples)...
      > Final Val Loss: 0.21138
  [4/4] Forecasting & Fusion...

 PROCESSING: Youtube
  [1/4] Calculating VAR Residuals...


  self._init_dates(dates, freq)


  [VAR] Fitted with Lag Order: 15
  [2/4] Scaling & Windowing...
  [3/4] Training GRU-TCN (10483 samples)...
      > Final Val Loss: 0.48476
  [4/4] Forecasting & Fusion...


  plt.figure(figsize=(12, 4))



Final Results Summary (VAR + GRU + TCN):
     slice  rmse_throughput  mae_throughput
0     MMTC      6063.935700     1112.649430
1    Naver    897424.585549   653295.107602
2  Youtube    934596.363458   761275.439305


## VRT - VAR-Residual-Transformer

In [19]:
"""
VAR-Residual-Transformer (VRT) for 5G Slice Forecasting - KAGGLE VERSION
------------------------------------------------------------------------
Architecture:
1. Linear Stream: VAR (Vector Autoregression) for baseline trend.
2. Non-Linear Stream: Transformer Encoder for residual (error) correction.
3. Fusion: Forecast = VAR_Baseline + Transformer_Correction.
"""

import os
import glob
import time
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import tensorflow as tf
from tensorflow.keras import layers, models, optimizers, callbacks
from sklearn.preprocessing import StandardScaler
from statsmodels.tsa.api import VAR

# ==========================================
# 0. Kaggle Configuration & Setup
# ==========================================
# Suppress TF warnings
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '2'

# GPU Memory Growth (Important for Kaggle Shared GPUs)
gpus = tf.config.list_physical_devices('GPU')
if gpus:
    try:
        for gpu in gpus:
            tf.config.experimental.set_memory_growth(gpu, True)
        print("[GPU] Acceleration Enabled")
    except RuntimeError as e:
        print(e)

CONFIG = {
    # Path will be auto-detected
    'dataset_path': None, 
    
    # Data params
    'train_split': 0.7,
    'window_size': 24,       
    'forecast_horizon': 1,   
    
    # VAR params
    'var_max_lags': 15,
    
    # Transformer params
    'head_size': 64,         
    'num_heads': 4,          
    'ff_dim': 128,           
    'num_transformer_blocks': 3,
    'dropout': 0.15,
    'learning_rate': 1e-3,
    'batch_size': 32,
    'epochs': 150
}

# Reproducibility
np.random.seed(42)
tf.random.set_seed(42)

# ==========================================
# 1. Advanced Data Loading & Engineering
# ==========================================
def find_dataset_file():
    """Auto-detects the dataset file in Kaggle input directory"""
    search_path = '/kaggle/input'
    print(f"[SEARCH] Looking for dataset in {search_path}...")
    
    # Recursively find any CSV or TXT file
    for root, dirs, files in os.walk(search_path):
        for file in files:
            if file.endswith(".csv") or file.endswith(".txt") or file.endswith(".dat"):
                full_path = os.path.join(root, file)
                print(f"[FOUND] Dataset located: {full_path}")
                return full_path
    
    raise FileNotFoundError("Could not find a dataset file in /kaggle/input")

def load_and_engineer_features(filepath):
    print(f"\n[IO] Loading raw data: {filepath}")
    
    try:
        # Try reading as standard CSV first (in case you uploaded a clean version)
        try:
            df = pd.read_csv(filepath)
            # If it's the raw format, the first column usually contains '::'
            if df.shape[1] < 2 or '::' in str(df.iloc[0,0]):
                raise ValueError("Likely raw format")
        except:
            # Fallback to the specific parsing logic provided
            print("  > Detected raw format (parsing '::')...")
            df = pd.read_csv(filepath, sep='\t', header=None, names=['slice_timestamp', 'bytes', 'packets'])
            
            # Handle potential single column issues if sep isn't tab
            if df.shape[1] == 1:
                 df = pd.read_csv(filepath, sep=',', header=None, names=['slice_timestamp', 'bytes', 'packets'])

            split_data = df['slice_timestamp'].str.split('::', expand=True)
            df['slice_type'] = split_data[0]
            df['timestamp'] = pd.to_numeric(split_data[1])
            df['bytes'] = pd.to_numeric(df['bytes'], errors='coerce')
            df['packets'] = pd.to_numeric(df['packets'], errors='coerce')

        # Standardize column names if reading a clean CSV
        if 'slice_label' in df.columns: df.rename(columns={'slice_label': 'slice_type'}, inplace=True)
        
        processed_slices = {}
        
        # Determine the grouping column
        group_col = 'slice_type' if 'slice_type' in df.columns else 'slice_id'
        if group_col not in df.columns:
            # If no slice column, assume whole file is one slice
            df['slice_type'] = 'Default_Slice'
            group_col = 'slice_type'

        for slice_id in df[group_col].unique():
            print(f"  > Processing Slice: {slice_id}")
            slice_df = df[df[group_col] == slice_id].sort_values('timestamp').copy()
            
            if len(slice_df) < 500: continue 

            # 2. Key Performance Indicators (KPIs)
            # Map columns flexibly
            if 'sum_bytes' in slice_df.columns: slice_df['throughput'] = slice_df['sum_bytes']
            elif 'bytes' in slice_df.columns: slice_df['throughput'] = slice_df['bytes']
            
            if 'sum_packets' in slice_df.columns: slice_df['packet_rate'] = slice_df['sum_packets']
            elif 'packets' in slice_df.columns: slice_df['packet_rate'] = slice_df['packets']
            
            # 3. Velocity/Acceleration
            slice_df['throughput_diff'] = slice_df['throughput'].diff()
            slice_df['packet_diff'] = slice_df['packet_rate'].diff()
            
            # 4. Volatility
            slice_df['volatility'] = slice_df['throughput'].rolling(5).std().fillna(0)
            
            # Features
            cols = ['throughput', 'packet_rate', 'throughput_diff', 'packet_diff', 'volatility']
            final_df = slice_df[cols].dropna()
            
            # Robustness
            p99 = final_df.quantile(0.99)
            final_df = final_df.clip(upper=p99, axis=1)
            
            processed_slices[slice_id] = final_df
            
        return processed_slices

    except Exception as e:
        print(f"[ERROR] Loading failed: {e}")
        import traceback
        traceback.print_exc()
        return {}

# ==========================================
# 2. Strict Leakage-Free VAR Baseline
# ==========================================
def get_var_residuals(train_df, test_df, maxlags):
    # Handle constant columns to prevent VAR crash
    for col in train_df.columns:
        if train_df[col].nunique() <= 1:
            train_df[col] += np.random.normal(0, 1e-6, size=len(train_df))

    # 1. Fit VAR on Train
    model = VAR(train_df)
    try:
        lag_order_res = model.select_order(maxlags=maxlags)
        lag_order = lag_order_res.aic
        if lag_order < 1: lag_order = 10
    except:
        lag_order = 10
        
    var_results = model.fit(lag_order)
    print(f"  [VAR] Fitted with Lag Order: {lag_order}")
    
    # 2. Train Residuals
    train_pred = var_results.fittedvalues
    train_actual = train_df.iloc[lag_order:]
    train_residuals = train_actual - train_pred

    # 3. Test Baseline (Rolling Forecast)
    coefs = var_results.coefs
    intercept = var_results.intercept
    
    history = pd.concat([train_df.iloc[-lag_order:], test_df])
    history_values = history.values
    
    test_preds = []
    
    for i in range(lag_order, len(history_values)):
        window = history_values[i-lag_order : i]
        window_reversed = window[::-1]
        
        pred = intercept.copy()
        for l in range(lag_order):
            pred += np.dot(coefs[l], window_reversed[l])
            
        test_preds.append(pred)
        
    test_pred_df = pd.DataFrame(test_preds, index=test_df.index, columns=test_df.columns)
    
    # 4. Test Residuals
    test_residuals = test_df - test_pred_df
    
    return train_residuals, test_residuals, test_pred_df

# ==========================================
# 3. Transformer Model Components
# ==========================================
class TransformerBlock(layers.Layer):
    def __init__(self, embed_dim, num_heads, ff_dim, dropout=0.1):
        super(TransformerBlock, self).__init__()
        self.att = layers.MultiHeadAttention(num_heads=num_heads, key_dim=embed_dim)
        self.ffn = models.Sequential([
            layers.Dense(ff_dim, activation="relu"),
            layers.Dense(embed_dim),
        ])
        self.layernorm1 = layers.LayerNormalization(epsilon=1e-6)
        self.layernorm2 = layers.LayerNormalization(epsilon=1e-6)
        self.dropout1 = layers.Dropout(dropout)
        self.dropout2 = layers.Dropout(dropout)

    def call(self, inputs, training=False):
        attn_output = self.att(inputs, inputs)
        attn_output = self.dropout1(attn_output, training=training)
        out1 = self.layernorm1(inputs + attn_output)
        
        ffn_output = self.ffn(out1)
        ffn_output = self.dropout2(ffn_output, training=training)
        return self.layernorm2(out1 + ffn_output)

def build_transformer_model(input_shape, output_dim, config):
    inputs = layers.Input(shape=input_shape)
    
    # 1. Learnable Positional Embedding
    x = layers.Conv1D(filters=config['head_size'], kernel_size=1)(inputs)
    
    # 2. Transformer Blocks
    for _ in range(config['num_transformer_blocks']):
        x = TransformerBlock(
            embed_dim=config['head_size'],
            num_heads=config['num_heads'],
            ff_dim=config['ff_dim'],
            dropout=config['dropout']
        )(x)
        
    # 3. Global Pooling
    x = layers.GlobalAveragePooling1D()(x)
    x = layers.Dropout(config['dropout'])(x)
    
    # 4. Output
    outputs = layers.Dense(output_dim, activation='linear')(x)
    
    model = models.Model(inputs=inputs, outputs=outputs, name="Residual_Transformer")
    
    optimizer = optimizers.Adam(learning_rate=config['learning_rate'])
    model.compile(optimizer=optimizer, loss=tf.keras.losses.Huber(), metrics=['mae'])
    
    return model

# ==========================================
# 4. Helpers: Windowing
# ==========================================
def create_windows(data, window_size):
    X, y = [], []
    for i in range(len(data) - window_size):
        X.append(data[i : i + window_size])
        y.append(data[i + window_size])
    return np.array(X), np.array(y)

# ==========================================
# 5. Main Execution Pipeline
# ==========================================
def train_evaluate_slice(slice_name, df):
    print(f"\n{'='*40}\n PROCESSING: {slice_name}\n{'='*40}")
    
    # A. Split Data
    split_idx = int(len(df) * CONFIG['train_split'])
    train_raw = df.iloc[:split_idx]
    test_raw = df.iloc[split_idx:]
    
    if len(test_raw) < CONFIG['window_size'] + 50:
        print("  [SKIP] Not enough data for testing.")
        return None

    # B. The Linear Baseline (VAR)
    print("  [1/4] Calculating VAR Residuals...")
    try:
        train_res, test_res, test_var_baseline = get_var_residuals(
            train_raw, test_raw, CONFIG['var_max_lags']
        )
    except Exception as e:
        print(f"  [VAR FAILED] {e}. Skipping slice.")
        return None
    
    # C. Preprocessing for Deep Learning
    print("  [2/4] Scaling & Windowing...")
    scaler = StandardScaler()
    train_res_scaled = scaler.fit_transform(train_res)
    test_res_scaled = scaler.transform(test_res)
    
    X_train, y_train = create_windows(train_res_scaled, CONFIG['window_size'])
    X_test, y_test = create_windows(test_res_scaled, CONFIG['window_size'])
    
    if len(X_train) == 0: return None

    # D. Train Transformer
    print(f"  [3/4] Training Transformer ({len(X_train)} samples)...")
    model = build_transformer_model(
        input_shape=(CONFIG['window_size'], X_train.shape[2]),
        output_dim=y_train.shape[1],
        config=CONFIG
    )
    
    early_stop = callbacks.EarlyStopping(monitor='val_loss', patience=15, restore_best_weights=True)
    reduce_lr = callbacks.ReduceLROnPlateau(monitor='val_loss', factor=0.5, patience=5)
    
    history = model.fit(
        X_train, y_train,
        validation_split=0.2,
        epochs=CONFIG['epochs'],
        batch_size=CONFIG['batch_size'],
        callbacks=[early_stop, reduce_lr],
        verbose=0
    )
    print(f"      > Final Val Loss: {history.history['val_loss'][-1]:.5f}")
    
    # E. Final Prediction & Fusion
    print("  [4/4] Forecasting & Fusion...")
    
    # 1. Predict Scaled Residuals
    pred_res_scaled = model.predict(X_test, verbose=0)
    
    # 2. Inverse Scale -> Real Residuals
    pred_res = scaler.inverse_transform(pred_res_scaled)
    
    # 3. Align Baseline
    baseline_aligned = test_var_baseline.iloc[CONFIG['window_size']:].values
    y_true_aligned = test_raw.iloc[CONFIG['window_size']:].values
    
    # Truncate to matching lengths
    min_len = min(len(baseline_aligned), len(pred_res))
    baseline_aligned = baseline_aligned[:min_len]
    y_true_aligned = y_true_aligned[:min_len]
    pred_res = pred_res[:min_len]
    
    # 4. FUSION
    final_forecast = baseline_aligned + pred_res
    final_forecast = np.maximum(final_forecast, 0)
    
    # Metrics
    mse = np.mean((y_true_aligned - final_forecast)**2, axis=0)
    rmse = np.sqrt(mse)
    mae = np.mean(np.abs(y_true_aligned - final_forecast), axis=0)
    
    # Plotting
    plt.figure(figsize=(12, 4))
    idx = 0 # Throughput
    plt.plot(y_true_aligned[:, idx], label='Actual', color='black', alpha=0.6)
    plt.plot(baseline_aligned[:, idx], label='VAR Only', color='orange', linestyle='--', alpha=0.7)
    plt.plot(final_forecast[:, idx], label='VRT (Final)', color='blue', linewidth=1.5)
    plt.title(f"Slice: {slice_name} | RMSE: {rmse[idx]:.2f}")
    plt.legend()
    plt.grid(True, alpha=0.3)
    plt.show()
    
    return {
        'slice': slice_name,
        'rmse_throughput': rmse[0],
        'mae_throughput': mae[0]
    }

def main():
    try:
        # Auto-detect file
        filepath = find_dataset_file()
        CONFIG['dataset_path'] = filepath
        
        slices_data = load_and_engineer_features(filepath)
        
        results = []
        for name, data in slices_data.items():
            res = train_evaluate_slice(name, data)
            if res: results.append(res)
            
        if results:
            res_df = pd.DataFrame(results)
            print("\nFinal Results Summary:")
            print(res_df)
    except Exception as e:
        print(f"Critical Error: {e}")

if __name__ == "__main__":
    main()

Physical devices cannot be modified after being initialized
[SEARCH] Looking for dataset in /kaggle/input...
[FOUND] Dataset located: /kaggle/input/feature-extracted/part-00000.csv

[IO] Loading raw data: /kaggle/input/feature-extracted/part-00000.csv
  > Processing Slice: MMTC
  > Processing Slice: Naver
  > Processing Slice: Youtube

 PROCESSING: MMTC
  [1/4] Calculating VAR Residuals...


  self._init_dates(dates, freq)


  [VAR] Fitted with Lag Order: 10
  [2/4] Scaling & Windowing...
  [3/4] Training Transformer (1379 samples)...


I0000 00:00:1767630821.962506     132 service.cc:152] XLA service 0x795f581c67c0 initialized for platform CUDA (this does not guarantee that XLA will be used). Devices:
I0000 00:00:1767630821.962557     132 service.cc:160]   StreamExecutor device (0): Tesla T4, Compute Capability 7.5
I0000 00:00:1767630821.962561     132 service.cc:160]   StreamExecutor device (1): Tesla T4, Compute Capability 7.5
I0000 00:00:1767630829.592317     132 device_compiler.h:188] Compiled cluster using XLA!  This line is logged at most once for the lifetime of the process.


      > Final Val Loss: 0.13456
  [4/4] Forecasting & Fusion...

 PROCESSING: Naver
  [1/4] Calculating VAR Residuals...


  self._init_dates(dates, freq)


  [VAR] Fitted with Lag Order: 15
  [2/4] Scaling & Windowing...
  [3/4] Training Transformer (2765 samples)...
      > Final Val Loss: 0.26480
  [4/4] Forecasting & Fusion...

 PROCESSING: Youtube
  [1/4] Calculating VAR Residuals...


  self._init_dates(dates, freq)


  [VAR] Fitted with Lag Order: 15
  [2/4] Scaling & Windowing...
  [3/4] Training Transformer (10483 samples)...
      > Final Val Loss: 0.48421
  [4/4] Forecasting & Fusion...

Final Results Summary:
     slice  rmse_throughput  mae_throughput
0     MMTC      5935.675836     1170.855953
1    Naver    901047.049856   657102.529425
2  Youtube    933798.719192   762432.662301
