In [3]:
import pandas as pd
import numpy as np

# Read the cleaned VeReMi Extension CSV and order the rows so that each
# sender's messages are contiguous and ascending by messageID.
# NOTE(review): this was previously described as a timestamp sort, but the
# secondary key is `messageID` -- confirm that messageID increases with time.
df = (
    pd.read_csv("../datasets/VeReMi_Extension/mixalldata_clean.csv")
    .sort_values(["sender", "messageID"])
)

In [4]:
from sklearn.preprocessing import StandardScaler

# Columns fed to the model (presumably position / speed / acceleration /
# heading components plus their `_n` counterparts -- verify against the CSV).
# NOTE(review): 'posy_n' is absent although every other x/y pair lists both
# `_n` columns -- confirm whether that omission is intentional.
features = [
    'posx', 'posy', 'posx_n',
    'spdx', 'spdy', 'spdx_n', 'spdy_n',
    'aclx', 'acly', 'aclx_n', 'acly_n',
    'hedx', 'hedy', 'hedx_n', 'hedy_n',
]

label_col = "class"

# Standardise every feature column, overwriting the raw values in `df`.
# NOTE(review): the scaler is fitted on the whole frame, i.e. before any
# train/test split, so rows that later land in the test set influence the
# fitted mean and variance.
scaler = StandardScaler()
scaler.fit(df[features])
df[features] = scaler.transform(df[features])

# Swap the raw class values for their integer category codes.
class_codes = df[label_col].astype("category").cat.codes
df[label_col] = class_codes

# One group of rows per sender.
grouped = df.groupby("sender")


  if not hasattr(array, "sparse") and array.dtypes.apply(is_sparse).any():
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if not hasattr(array, "sparse") and array.dtypes.apply(is_sparse).any():
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):


In [18]:
sequence_length = 10  # Number of consecutive rows in each input window


def create_time_series(group, feature_cols=None, label=None, window=None):
    """Build sliding-window samples from one sender's rows.

    Window ``i`` holds the feature values of rows ``i .. i + window - 1`` and
    is paired with the label of row ``i + window`` (the row immediately after
    the window), so a group of ``n`` rows yields ``n - window`` samples.

    Parameters
    ----------
    group : pandas.DataFrame
        Rows belonging to a single sender, already in the desired order.
    feature_cols : list of str, optional
        Feature column names. Defaults to the notebook-level ``features``.
    label : str, optional
        Label column name. Defaults to the notebook-level ``label_col``.
    window : int, optional
        Window length. Defaults to the notebook-level ``sequence_length``.

    Returns
    -------
    tuple of numpy.ndarray, or None
        ``(X, y)`` where ``X`` has shape ``(n - window, window, n_features)``
        and ``y`` has shape ``(n - window,)``. ``None`` when the group has
        ``window`` rows or fewer and therefore cannot produce a single
        (window, following-label) pair.

    Raises
    ------
    ValueError
        If ``window`` is smaller than 1.
    """
    # Fall back to the notebook globals so `create_time_series(group)` keeps
    # working exactly as before.
    if feature_cols is None:
        feature_cols = features
    if label is None:
        label = label_col
    if window is None:
        window = sequence_length
    if window < 1:
        raise ValueError("`window` must be a positive integer.")

    # Features and label share one array; the label is the last column.
    data = group[list(feature_cols) + [label]].to_numpy()

    # `<=` rather than `<`: exactly `window` rows leave no following row to
    # take a label from, so no sample can be formed.
    if len(data) <= window:
        return None

    # The final row can only ever serve as a label, so exclude it from the
    # windows. The view has shape (n - window, n_features, window); swap the
    # last two axes to get (samples, time steps, features).
    views = np.lib.stride_tricks.sliding_window_view(
        data[:-1, :-1], window, axis=0
    )
    X = np.ascontiguousarray(views.transpose(0, 2, 1))  # own, writable copy
    y = data[window:, -1].copy()

    return X, y

# Build (X, y) for every sender group and drop the groups for which
# create_time_series returned None (too few rows to form a sample).
X_y_pairs = [
    pair
    for pair in (create_time_series(group) for _, group in grouped)
    if pair is not None
]

if not X_y_pairs:
    raise ValueError("No valid sequences found! Reduce `sequence_length`.")

# Stack the per-sender arrays along the sample axis.
X_list, y_list = zip(*X_y_pairs)
X = np.concatenate(X_list, axis=0)
y = np.concatenate(y_list, axis=0)

from tensorflow.keras.utils import to_categorical

# Size the one-hot encoding from the largest label code, not from the number
# of distinct codes present: if some code never appears among the windowed
# labels, the distinct count is smaller than the largest remaining code and
# to_categorical would be handed too few classes.
y = to_categorical(y, num_classes=int(y.max()) + 1)

print("Shape of X:", X.shape)  # (samples, time steps, features)
print("Shape of y:", y.shape)  # (samples, classes) after one-hot encoding

Shape of X: (2949997, 10, 15)
Shape of y: (2949997, 20)


In [19]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the windows, stratified on the label matrix, with a fixed
# seed so the partition is repeatable.
# NOTE(review): `y` is one-hot at this point, so stratification works on whole
# rows; also, windows are split individually, so overlapping windows from the
# same sender can end up on both sides of the split -- confirm this is intended.
split_options = dict(test_size=0.2, random_state=42, stratify=y)
X_train, X_test, y_train, y_test = train_test_split(X, y, **split_options)

In [22]:
from tensorflow.keras.layers import LSTM, Dense, Dropout, Bidirectional, GlobalAveragePooling1D
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.models import Sequential

# Width of the softmax layer follows the one-hot label matrix instead of a
# hard-coded 20, so the output layer cannot drift out of step with the label
# encoding.
num_classes = y_train.shape[1]

model = Sequential([
    # `input_shape` is given to the Bidirectional wrapper (the actual first
    # layer of the Sequential model) rather than to the wrapped LSTM, so the
    # model is built at construction time instead of on the first batch.
    Bidirectional(
        LSTM(64, activation='tanh', return_sequences=True),
        input_shape=(sequence_length, len(features)),
    ),
    # NOTE(review): two identical Dropout layers back to back -- kept as in the
    # original to preserve behaviour; confirm this is not a copy-paste leftover.
    Dropout(0.2),
    Dropout(0.2),
    # The LSTM returns the full sequence, so this Dense layer is applied to
    # every time step before the time axis is averaged away.
    Dense(24, activation='relu'),
    GlobalAveragePooling1D(),
    Dense(num_classes, activation='softmax'),
])

model.compile(
    optimizer=Adam(learning_rate=0.0001),
    loss='categorical_crossentropy',
    metrics=['accuracy'],
)

In [23]:
# Train for 20 epochs in mini-batches of 32 and keep the per-epoch metrics.
# NOTE(review): the test split is also used as validation data here, so it is
# not an untouched hold-out set once training decisions depend on it.
history = model.fit(
    X_train,
    y_train,
    validation_data=(X_test, y_test),
    epochs=20,
    batch_size=32,
)

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


In [None]:
import os

# Make sure the target directory exists before saving, so a missing folder
# cannot make the save fail after training has already finished.
MODEL_PATH = "../models/centralized_veremi_extension.keras"
os.makedirs(os.path.dirname(MODEL_PATH), exist_ok=True)
model.save(MODEL_PATH)