In [31]:
import torch
import torch.nn as nn
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler, LabelEncoder, OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
import os

In [32]:
# Load the dataset from the Parquet file
df = pd.read_parquet("hf://datasets/19kmunz/iot-23-preprocessed/data/train-00000-of-00001-ad1ef30cd88c8d29.parquet")

# Splitting the dataset into train and test sets (80/20 split)
train_df, test_df = train_test_split(df, test_size=0.2, random_state=42)

# Create a directory named 'data' if it doesn't exist
data_folder = 'data'
os.makedirs(data_folder, exist_ok=True)

# Define file paths for train and test CSV files within the 'data' folder
train_csv_path = os.path.join(data_folder, 'train.csv')
test_csv_path = os.path.join(data_folder, 'test.csv')

# Save the train and test sets to CSV files in the 'data' folder
train_df.to_csv(train_csv_path, index=False)
test_df.to_csv(test_csv_path, index=False)

# Calculate sizes
total_size = len(train_df) + len(test_df)
# Print the total size and percentages
print(f"Total data size: {total_size}")
# 80% train, 20% test
print(f"Training data size: {len(train_df)}")
print(f"Testing data size: {len(test_df)}")

train_df['label'].value_counts()

Total data size: 819024
Training data size: 655219
Testing data size: 163805


label
Malicious    434384
Benign       220835
Name: count, dtype: int64

In [33]:
# Prepare data for training
# Define feature columns and target column
numerical_features = ['id.orig_p', 'id.resp_p', 'duration', 'orig_bytes', 'resp_bytes', 'missed_bytes', 'orig_pkts', 'orig_ip_bytes', 'resp_pkts', 'resp_ip_bytes']
categorical_features = ['proto', 'service', 'conn_state', 'history']
target_column = 'label'

In [34]:
train_df = train_df.dropna()
print(train_df.to_string())
print(f"Training data size: {len(train_df)}")

        id.orig_p  id.resp_p proto service      duration  orig_bytes   resp_bytes conn_state  missed_bytes       history  orig_pkts  orig_ip_bytes  resp_pkts  resp_ip_bytes      label
11746       56380       6667   tcp     irc     33.165579        62.0        269.0         S3             0       ShAdDaf          7            434          6            589  Malicious
3355        49286         53   udp     dns      0.000492        45.0         45.0         SF             0            Dd          1             73          1             73     Benign
817344      51998         53   udp     dns      0.121426        34.0        297.0         SF             0            Dd          1             62          1            325     Benign
761959      40936         81   tcp    http      6.084291       184.0        723.0         SF             0      ShADadFf         11            780          6           1043     Benign
191542      56165         53   udp     dns      5.005128        78.0          0.

In [35]:
test_df = test_df.dropna()
print(test_df.to_string())
print(f"Testing data size: {len(test_df)}")

        id.orig_p  id.resp_p proto service     duration  orig_bytes  resp_bytes conn_state  missed_bytes       history  orig_pkts  orig_ip_bytes  resp_pkts  resp_ip_bytes      label
5188        49550       6667   tcp     irc     2.105545        74.0       242.0         S3             0       ShAdDaf          9            558          6            562  Malicious
307114      58999         53   udp     dns     0.000499        29.0        45.0         SF             0            Dd          1             57          1             73     Benign
305897      46143         53   udp     dns     0.000496        29.0        45.0         SF             0            Dd          1             57          1             73     Benign
818128      44635         53   udp     dns     0.010835        37.0        53.0         SF             0            Dd          1             65          1             81     Benign
306307      59264         53   udp     dns     0.003005        29.0        45.0         SF

In [36]:
train_df['label'].value_counts()

label
Benign       4640
Malicious    1430
Name: count, dtype: int64

In [37]:
# Extract features and target
X_train_num = train_df[numerical_features].values
X_train_cat = train_df[categorical_features].values
y_train = train_df[target_column].values
X_test_num = test_df[numerical_features].values
X_test_cat = test_df[categorical_features].values
y_test = test_df[target_column].values

# Print the first few rows of training data
print("Training Data:")
print("Features in num:")
print(X_train_num[:5])  # Print first 5 rows of features
print("Features in cat:")
print(X_train_cat[:5])  # Print first 5 rows of features
print("Labels:")
print(y_train[:5])  # Print first 5 labels

# Print the first few rows of test data
print("\nTest Data:")
print("Features in num:")
print(X_test_num[:5])  # Print first 5 rows of features
print("Features in cat:")
print(X_test_cat[:5])  # Print first 5 rows of features
print("Labels:")
print(y_test[:5])  # Print first 5 labels

Training Data:
Features in num:
[[5.6380000e+04 6.6670000e+03 3.3165579e+01 6.2000000e+01 2.6900000e+02
  0.0000000e+00 7.0000000e+00 4.3400000e+02 6.0000000e+00 5.8900000e+02]
 [4.9286000e+04 5.3000000e+01 4.9200000e-04 4.5000000e+01 4.5000000e+01
  0.0000000e+00 1.0000000e+00 7.3000000e+01 1.0000000e+00 7.3000000e+01]
 [5.1998000e+04 5.3000000e+01 1.2142600e-01 3.4000000e+01 2.9700000e+02
  0.0000000e+00 1.0000000e+00 6.2000000e+01 1.0000000e+00 3.2500000e+02]
 [4.0936000e+04 8.1000000e+01 6.0842910e+00 1.8400000e+02 7.2300000e+02
  0.0000000e+00 1.1000000e+01 7.8000000e+02 6.0000000e+00 1.0430000e+03]
 [5.6165000e+04 5.3000000e+01 5.0051280e+00 7.8000000e+01 0.0000000e+00
  0.0000000e+00 2.0000000e+00 1.3400000e+02 0.0000000e+00 0.0000000e+00]]
Features in cat:
[['tcp' 'irc' 'S3' 'ShAdDaf']
 ['udp' 'dns' 'SF' 'Dd']
 ['udp' 'dns' 'SF' 'Dd']
 ['tcp' 'http' 'SF' 'ShADadFf']
 ['udp' 'dns' 'S0' 'D']]
Labels:
['Malicious' 'Benign' 'Benign' 'Benign' 'Benign']

Test Data:
Features in num:
[

In [38]:
# Encode target labels
label_encoder = LabelEncoder()
y_train = label_encoder.fit_transform(y_train)
y_test = label_encoder.transform(y_test)

# Print the first few rows of training data
print("Training Data:")
print("Labels:")
print(y_train[:5])  # Print first 5 labels

# Print the first few rows of test data
print("\nTest Data:")
print("Labels:")
print(y_test[:5])  # Print first 5 labels

Training Data:
Labels:
[1 0 0 0 0]

Test Data:
Labels:
[1 0 0 0 0]


In [39]:
# Handle missing values in numerical features
imputer = SimpleImputer(strategy='mean')
X_train_num = imputer.fit_transform(X_train_num)
X_test_num = imputer.transform(X_test_num)

# Print the first few rows of training data
print("Training Data:")
print("Features:")
print(X_train_num[:5])  # Print first 5 rows of features

# Print the first few rows of test data
print("\nTest Data:")
print("Features:")
print(X_test_num[:5])  # Print first 5 rows of features

Training Data:
Features:
[[5.6380000e+04 6.6670000e+03 3.3165579e+01 6.2000000e+01 2.6900000e+02
  0.0000000e+00 7.0000000e+00 4.3400000e+02 6.0000000e+00 5.8900000e+02]
 [4.9286000e+04 5.3000000e+01 4.9200000e-04 4.5000000e+01 4.5000000e+01
  0.0000000e+00 1.0000000e+00 7.3000000e+01 1.0000000e+00 7.3000000e+01]
 [5.1998000e+04 5.3000000e+01 1.2142600e-01 3.4000000e+01 2.9700000e+02
  0.0000000e+00 1.0000000e+00 6.2000000e+01 1.0000000e+00 3.2500000e+02]
 [4.0936000e+04 8.1000000e+01 6.0842910e+00 1.8400000e+02 7.2300000e+02
  0.0000000e+00 1.1000000e+01 7.8000000e+02 6.0000000e+00 1.0430000e+03]
 [5.6165000e+04 5.3000000e+01 5.0051280e+00 7.8000000e+01 0.0000000e+00
  0.0000000e+00 2.0000000e+00 1.3400000e+02 0.0000000e+00 0.0000000e+00]]

Test Data:
Features:
[[4.955000e+04 6.667000e+03 2.105545e+00 7.400000e+01 2.420000e+02
  0.000000e+00 9.000000e+00 5.580000e+02 6.000000e+00 5.620000e+02]
 [5.899900e+04 5.300000e+01 4.990000e-04 2.900000e+01 4.500000e+01
  0.000000e+00 1.000000e+

In [40]:
# Scale numerical features
scaler = StandardScaler()
X_train_num = scaler.fit_transform(X_train_num)
X_test_num = scaler.transform(X_test_num)

# Print the first few rows of training data
print("Training Data:")
print("Features:")
print(X_train_num[:5])  # Print first 5 rows of features

# Print the first few rows of test data
print("\nTest Data:")
print("Features:")
print(X_test_num[:5])  # Print first 5 rows of features

Training Data:
Features:
[[ 0.82484712  1.77135877  0.00995192 -0.04510805 -0.01371861 -0.06725929
  -0.01276814 -0.01639658 -0.01293644 -0.01378705]
 [ 0.272654   -0.57889136 -0.03770718 -0.04751786 -0.01377045 -0.06725929
  -0.01654277 -0.02067212 -0.01456203 -0.01390197]
 [ 0.48375462 -0.57889136 -0.0375334  -0.04907715 -0.01371214 -0.06725929
  -0.01654277 -0.0208024  -0.01456203 -0.01384585]
 [-0.37730548 -0.56894171 -0.0289646  -0.0278141  -0.01361356 -0.06725929
  -0.01025172 -0.01229868 -0.01293644 -0.01368594]
 [ 0.80811164 -0.57889136 -0.03051539 -0.04283999 -0.01378086 -0.06725929
  -0.01591366 -0.01994966 -0.01488715 -0.01391823]]

Test Data:
Features:
[[ 0.29320362  1.77135877 -0.03468217 -0.043407   -0.01372486 -0.06725929
  -0.01150993 -0.01492797 -0.01293644 -0.01379306]
 [ 1.02870866 -0.57889136 -0.03770717 -0.04978592 -0.01377045 -0.06725929
  -0.01654277 -0.02086162 -0.01456203 -0.01390197]
 [ 0.02800459 -0.57889136 -0.03770718 -0.04978592 -0.01377045 -0.06725929
  -

In [41]:
# One-hot encode categorical features
one_hot_encoder = OneHotEncoder(handle_unknown='ignore')
X_train_cat = one_hot_encoder.fit_transform(X_train_cat).toarray()
X_test_cat = one_hot_encoder.transform(X_test_cat).toarray()

# Print the first few rows of training data
print("Training Data:")
print("Features:")
print(X_test_cat[:5])  # Print first 5 rows of features

# Print the first few rows of test data
print("\nTest Data:")
print("Features:")
print(X_test_cat[:5])  # Print first 5 rows of features


Training Data:
Features:
[[1. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
  0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
  0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0.
  0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 1. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 1. 0. 0. 0. 0. 0.
  0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
  0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
  0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 1. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 1. 0. 0. 0. 0. 0.
  0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
  0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
  0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 1. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 1. 0. 0. 0. 0. 0.
  0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
  0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 

In [42]:
# Combine numerical and categorical features
X_train = np.hstack((X_train_num, X_train_cat))
X_test = np.hstack((X_test_num, X_test_cat))


In [43]:
# Convert to PyTorch tensors
X_train = torch.from_numpy(X_train.astype(np.float32))
X_test = torch.from_numpy(X_test.astype(np.float32))
y_train = torch.from_numpy(y_train.astype(np.float32)).view(-1, 1)
y_test = torch.from_numpy(y_test.astype(np.float32)).view(-1, 1)

# Print the first few rows of training data
print("Training Data:")
print("Features:")
print(X_train[:5])  # Print first 5 rows of features
print("Labels:")
print(y_train[:5])  # Print first 5 labels

# Print the first few rows of test data
print("\nTest Data:")
print("Features:")
print(X_test[:5])  # Print first 5 rows of features
print("Labels:")
print(y_test[:5])  # Print first 5 labels

Training Data:
Features:
tensor([[ 0.8248,  1.7714,  0.0100, -0.0451, -0.0137, -0.0673, -0.0128, -0.0164,
         -0.0129, -0.0138,  1.0000,  0.0000,  0.0000,  0.0000,  0.0000,  1.0000,
          0.0000,  0.0000,  0.0000,  0.0000,  0.0000,  0.0000,  0.0000,  1.0000,
          0.0000,  0.0000,  0.0000,  0.0000,  0.0000,  0.0000,  0.0000,  0.0000,
          0.0000,  0.0000,  0.0000,  0.0000,  0.0000,  0.0000,  0.0000,  0.0000,
          0.0000,  0.0000,  0.0000,  0.0000,  0.0000,  0.0000,  0.0000,  0.0000,
          0.0000,  0.0000,  0.0000,  0.0000,  0.0000,  0.0000,  0.0000,  0.0000,
          0.0000,  0.0000,  0.0000,  0.0000,  0.0000,  0.0000,  0.0000,  0.0000,
          0.0000,  0.0000,  0.0000,  0.0000,  0.0000,  0.0000,  0.0000,  0.0000,
          0.0000,  0.0000,  0.0000,  0.0000,  0.0000,  0.0000,  0.0000,  0.0000,
          1.0000,  0.0000,  0.0000,  0.0000,  0.0000,  0.0000,  0.0000,  0.0000,
          0.0000,  0.0000,  0.0000],
        [ 0.2727, -0.5789, -0.0377, -0.0475, -0

In [44]:
class LogisticRegression(nn.Module):

    def __init__(self, input_size):
        super(LogisticRegression, self).__init__()
        self.linear = nn.Linear(input_size, 1)
        # self.linear2 = nn.Linear(8, 4)
        # self.linear3 = nn.Linear(4, 1)
        nn.init.xavier_uniform_(self.linear.weight)  # Initialize weights

    def forward(self, x):
        logits = self.linear(x)
        # logits = self.linear2(logits)
        # logits = self.linear3(logits)
        y_predicted = torch.sigmoid(logits)
        return y_predicted
    
# Initialize model
n_features = X_train.shape[1]
model = LogisticRegression(n_features)

# Loss and optimizer
criterion = nn.BCELoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)

In [45]:
# Training loop
num_epochs = 100

for epoch in range(num_epochs):
    # Forward pass and loss
    y_predicted = model(X_train)
    loss = criterion(y_predicted, y_train)
    
    # Backward pass
    loss.backward()
    
    # Updates
    optimizer.step()
    
    # Zero gradients
    optimizer.zero_grad()
    
    if (epoch+1) % 10 == 0:
        y_predicted = model(X_test)
        # Evaluate model
        with torch.no_grad():
            y_predicted_cls = torch.round(y_predicted)  # Round off to nearest class
            accuracy = (y_predicted_cls == y_test).sum().item() / len(y_test)
            print(f'Epoch [{epoch+1}/{num_epochs}], Loss: {loss.item():.4f}, Accuracy: {accuracy:.4f}')


Epoch [10/100], Loss: 0.6393, Accuracy: 0.7668
Epoch [20/100], Loss: 0.6141, Accuracy: 0.7793
Epoch [30/100], Loss: 0.5900, Accuracy: 0.7800
Epoch [40/100], Loss: 0.5670, Accuracy: 0.7856
Epoch [50/100], Loss: 0.5451, Accuracy: 0.7925
Epoch [60/100], Loss: 0.5241, Accuracy: 0.7974
Epoch [70/100], Loss: 0.5042, Accuracy: 0.7988
Epoch [80/100], Loss: 0.4852, Accuracy: 0.7994
Epoch [90/100], Loss: 0.4672, Accuracy: 0.8008
Epoch [100/100], Loss: 0.4500, Accuracy: 0.8154
