In [41]:
# Make sure to run this cell to use torchmetrics. If you cannot use pip install to install the torchmetrics, you can use sklearn.
!pip install torchmetrics

Defaulting to user installation because normal site-packages is not writeable

[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m24.2[0m[39;49m -> [0m[32;49m24.3.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpython3 -m pip install --upgrade pip[0m


In [42]:
# Import required libraries
import pandas as pd
from sklearn.preprocessing import StandardScaler
import torch
import torch.nn as nn
import torch.nn.functional as functional
from torch.utils.data import DataLoader, TensorDataset
import torch.optim as optim
from torchmetrics import Accuracy
# from sklearn.metrics import accuracy_score  # uncomment to use sklearn

In [43]:
# Load preprocessed data
train_df = pd.read_csv('labelled_train.csv')
val_df = pd.read_csv('labelled_validation.csv')

# View the first 5 rows of training set
train_df.head()

Unnamed: 0,processId,threadId,parentProcessId,userId,mountNamespace,argsNum,returnValue,sus_label
0,381,7337,1,100,4026532231,5,0,1
1,381,7337,1,100,4026532231,1,0,1
2,381,7337,1,100,4026532231,0,0,1
3,7347,7347,7341,0,4026531840,2,-2,1
4,7347,7347,7341,0,4026531840,4,0,1


In [44]:
# Extract x (features) and y (labels)
x_train = train_df.drop(columns=['sus_label'])
y_train = train_df['sus_label']

x_val = val_df.drop(columns=['sus_label'])
y_val = val_df['sus_label']

In [45]:
# Standardize x using StandardScaler
scaler = StandardScaler()
x_train_scaled = scaler.fit_transform(x_train)
x_val_scaled = scaler.transform(x_val)

In [46]:
# Convert to PyTorch tensors
x_train_tensor = torch.tensor(x_train_scaled, dtype=torch.float32)
y_train_tensor = torch.tensor(y_train.values, dtype=torch.long)

x_val_tensor = torch.tensor(x_val_scaled, dtype=torch.float32)
y_val_tensor = torch.tensor(y_val.values, dtype=torch.long)

In [47]:
# Define the model
class BinaryClassificationModel(nn.Module):
    def __init__(self, input_dim):
        super(BinaryClassificationModel, self).__init__()
        self.fc1 = nn.Linear(input_dim, 64)
        self.fc2 = nn.Linear(64, 32)
        self.dropout = nn.Dropout(0.5)
        self.fc3 = nn.Linear(32, 2)
        self.sigmoid = nn.Sigmoid()
        self.relu = nn.ReLU()

    def forward(self, x):
        x = self.relu(self.fc1(x))
        x = self.relu(self.fc2(x))
        x = self.dropout(x)
        x = self.sigmoid(self.fc3(x))
        return x


In [48]:
# Initialize the model
input_dim = x_train_tensor.shape[1]
model = BinaryClassificationModel(input_dim)

# Define loss function and optimizer
loss_function = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)


In [49]:
# Training loop
epochs = 10
for epoch in range(epochs):
    model.train()
    optimizer.zero_grad()
    
    # Forward pass for the entire dataset
    outputs = model(x_train_tensor)
    loss = loss_function(outputs, y_train_tensor)
    
    # Backward pass and optimization
    loss.backward()
    optimizer.step()
    
    print(f"Epoch [{epoch+1}/{epochs}], Loss: {loss.item():.4f}")


Epoch [1/10], Loss: 0.7135
Epoch [2/10], Loss: 0.7075
Epoch [3/10], Loss: 0.7016
Epoch [4/10], Loss: 0.6957
Epoch [5/10], Loss: 0.6900
Epoch [6/10], Loss: 0.6843
Epoch [7/10], Loss: 0.6787
Epoch [8/10], Loss: 0.6732
Epoch [9/10], Loss: 0.6677
Epoch [10/10], Loss: 0.6622


In [50]:
# Evaluate model
model.eval()

# Predictions and accuracy for train, validation, and test
with torch.no_grad():
    y_train_pred = model(x_train_tensor).argmax(dim=1)
    y_val_pred = model(x_val_tensor).argmax(dim=1)

    train_accuracy = (y_train_pred == y_train_tensor).float().mean().item()
    val_accuracy = (y_val_pred == y_val_tensor).float().mean().item()
    
# Print accuracy
print(f"Training Accuracy: {train_accuracy:.4f}")
print(f"Validation Accuracy: {val_accuracy:.4f}")

Training Accuracy: 0.9983
Validation Accuracy: 0.9958
