# Implementing the model based on the paper \[[1](https://ieeexplore.ieee.org/stamp/stamp.jsp?arnumber=10404993&casa_token=bdWYvtDCImwAAAAA:dEnW6U7EsqgOzYXmVphWYMTQ7XySynoAB-mU9Hq0NVxpyvBfB_ML1mlJEmmeIfK2b59TxZsoxg&tag=1)\]

[1] [Zhong, Yongchao, et al. "Sybil Attack Detection in VANETs: An LSTM-Based BiGAN Approach." 2023 International Conference on Data Security and Privacy Protection (DSPP). IEEE, 2023.](https://ieeexplore.ieee.org/stamp/stamp.jsp?arnumber=10404993&casa_token=bdWYvtDCImwAAAAA:dEnW6U7EsqgOzYXmVphWYMTQ7XySynoAB-mU9Hq0NVxpyvBfB_ML1mlJEmmeIfK2b59TxZsoxg&tag=1)

In [2]:
import tensorflow as tf
import pandas as pd 
import numpy as np
import seaborn as sns
import scipy.io as scio

from tensorflow.keras.layers import LSTM, Dense
from tensorflow.keras.models import Sequential

from sys import path 
path.append("../utils")
from dataset_operations import *
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler

2024-07-17 11:51:46.006073: I tensorflow/core/util/port.cc:113] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2024-07-17 11:51:46.161341: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 AVX_VNNI FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


# Load Dataset \[[2](https://github.com/stevenso8/WiSec_DataModifiedVeremi_Dataset)\]

[2] [WiSec_DataModifiedVeremi_Dataset](https://github.com/stevenso8/WiSec_DataModifiedVeremi_Dataset)

In [3]:
# Read the five labelled .mat files of the modified VeReMi dataset, one per
# attack file, in the same order as before (16, 1, 2, 4, 8).
dataset_1, dataset_2, dataset_3, dataset_4, dataset_5 = (
    scio.loadmat(mat_path)
    for mat_path in (
        '../datasets/Modified_VeReMi/WiSec_DataModifiedVeremi_Dataset/attack16withlabels.mat',
        '../datasets/Modified_VeReMi/WiSec_DataModifiedVeremi_Dataset/attack1withlabels.mat',
        '../datasets/Modified_VeReMi/WiSec_DataModifiedVeremi_Dataset/attack2withlabels.mat',
        '../datasets/Modified_VeReMi/WiSec_DataModifiedVeremi_Dataset/attack4withlabels.mat',
        '../datasets/Modified_VeReMi/WiSec_DataModifiedVeremi_Dataset/attack8withlabels.mat',
    )
)

# Dataset Visualization

In [7]:
# Labels applied positionally to the 17 columns of every loaded array.
header = [
    "type", "timeReceiver", "receiverID",
    "receiverXposition", "receiverYposition", "receiverZposition",
    "timeTransmitted", "senderID", "messageID",
    "senderXposition", "senderYposition", "senderZposition",
    "senderXvelocity", "senderYvelocity", "senderZvelocity",
    "rssi", "class",
]

# Each loaded file keeps its records under a key named after the file.
# The five arrays are stacked row-wise; every part keeps its own 0-based row
# index, so index labels repeat across the parts of the combined frame.
df_dataset = pd.concat(
    [
        pd.DataFrame(mat_contents[mat_key], columns=header)
        for mat_contents, mat_key in (
            (dataset_1, 'attack16withlabels'),
            (dataset_2, 'attack1withlabels'),
            (dataset_3, 'attack2withlabels'),
            (dataset_4, 'attack4withlabels'),
            (dataset_5, 'attack8withlabels'),
        )
    ]
)

# Overview of the combined frame: column dtypes and memory footprint.
df_dataset.info()

<class 'pandas.core.frame.DataFrame'>
Index: 2126976 entries, 0 to 424809
Data columns (total 17 columns):
 #   Column             Dtype  
---  ------             -----  
 0   type               float64
 1   timeReceiver       float64
 2   receiverID         float64
 3   receiverXposition  float64
 4   receiverYposition  float64
 5   receiverZposition  float64
 6   timeTransmitted    float64
 7   senderID           float64
 8   messageID          float64
 9   senderXposition    float64
 10  senderYposition    float64
 11  senderZposition    float64
 12  senderXvelocity    float64
 13  senderYvelocity    float64
 14  senderZvelocity    float64
 15  rssi               float64
 16  class              float64
dtypes: float64(17)
memory usage: 292.1 MB


# Feature Pre-Processing

 1 - Removing identifiers\
 2 - Handling missing values\
 3 - Pearson correlation\
 4 - Feature normalization (minmax scaler)

### 1 - Removing identifiers

In [None]:
 df_dataset = df_dataset.drop(['receiverID','senderID', 'messageID'], axis=1)

df_dataset

### 2 - Handling missing values

In [None]:
# Step 2: keep only the rows in which every remaining column has a value.
df_dataset = df_dataset.dropna(axis='index', how='any')

# Trailing expression so the notebook renders the cleaned frame.
df_dataset

### 3 - Pearson Correlation

In [None]:
# Columns removed before the correlation heatmap is drawn.
# NOTE(review): the list name suggests these columns produced NaN Pearson
# correlations (as constant-valued columns do) — confirm against the data.
features_nan_corr = [
    "receiverZposition",
    "senderZposition",
    "type",
    "senderZvelocity",
    "timeReceiver",
]

# errors='ignore' makes the cell safe to run again: once the columns have been
# removed, a plain drop would raise KeyError.
df_dataset = df_dataset.drop(columns=features_nan_corr, errors='ignore')

In [None]:
# Step 3: annotated heatmap of the pairwise correlations between the remaining
# columns (the `class` label is still part of the frame at this point).
# The figure is sized through the Axes that seaborn returns, because no import
# in this notebook binds the name `plt` explicitly, so `plt.figure(...)` would
# raise NameError on a fresh kernel.
corr_ax = sns.heatmap(df_dataset.corr(), annot=True, cmap='Blues')
corr_ax.figure.set_size_inches(14, 8)
corr_ax.set_title('Pearson correlation of the remaining columns');

### 4 - Feature normalization

In [None]:
# Feature matrix: every column of the pre-processed frame except the label.
X = df_dataset.drop(labels='class', axis='columns')

In [None]:
# Step 4: min-max scaling of every feature column.
# NOTE(review): the scaler is fitted on the whole feature matrix, i.e. before
# the train/test split performed further down, so rows that later end up in
# the test set contribute to the fitted minima and maxima.
columns_names = X.columns
scaler = MinMaxScaler().fit(X)

# Rebuild a labelled frame from the scaled values; the new frame is created
# without the old row index, so it gets a fresh default one.
X = pd.DataFrame(scaler.transform(X), columns=columns_names)
X

# Generating Labels One-Hot Encoding

In [None]:
# One-hot encode the class label: one indicator column per distinct class value.
# The dtype is pinned to float32 because pandas >= 2.0 otherwise returns boolean
# indicator columns, while these targets are compared against float softmax
# outputs by the categorical cross-entropy loss.
# The former `columns=['class']` argument is dropped: get_dummies only consults
# `columns` for DataFrame input, and the input here is a Series.
y = pd.get_dummies(df_dataset['class'], dtype=np.float32)
y.shape

# Format data

In [None]:
# Hold-out split with a fixed seed.
# NOTE(review): test_size=0.8 keeps only 20% of the rows for training and
# evaluates on the other 80% — confirm this is intended and not a swapped 0.2.
x_train, x_test, y_train, y_test = train_test_split(X, y, test_size=0.8, random_state=42)

# The LSTM input is 3-D (samples, timesteps, features); each row becomes a
# sequence with a single timestep. expand_dims only inserts the new axis,
# whereas np.resize would silently repeat or truncate the data if the requested
# size ever stopped matching the input size.
x_train = np.expand_dims(np.asarray(x_train), axis=1)
x_test = np.expand_dims(np.asarray(x_test), axis=1)

# Model Implementation

In [None]:
# Two stacked LSTM layers followed by a 6-unit softmax classifier.
# The first LSTM returns its full output sequence so that the second LSTM
# receives sequence input; the input shape leaves the number of timesteps open
# and takes the feature count from the scaled feature matrix.
model = Sequential([
    LSTM(100, activation='tanh', return_sequences=True,
         input_shape=(None, X.shape[1])),
    LSTM(49, activation='tanh'),
    Dense(6, activation='softmax'),
])

# Multi-class setup: categorical cross-entropy on the one-hot targets,
# RMSprop optimizer, accuracy reported as the metric.
model.compile(loss='categorical_crossentropy', optimizer="rmsprop", metrics=['accuracy'])

# Model Training

In [None]:
# Train on the training split: 500 epochs, mini-batches of 64 samples.
model.fit(x=x_train, y=y_train, batch_size=64, epochs=500)

# Model Test

In [None]:
# Loss and accuracy of the trained model on the held-out split.
model.evaluate(x=x_test, y=y_test)

# Model Parameters

In [None]:
# Print the layer-by-layer summary of the model defined above.
model.summary()