In [4]:
import sys
import torch
print("Python version:", sys.version)
print("Torch version:", torch.__version__)
print("CUDA available?", torch.cuda.is_available())
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.neighbors import NearestNeighbors
from torch_geometric.data import Data
import torch
import torch.nn.functional as F
from torch_geometric.nn import GCNConv

Python version: 3.9.21 | packaged by conda-forge | (main, Dec  5 2024, 13:41:22) [MSC v.1929 64 bit (AMD64)]
Torch version: 2.5.1
CUDA available? True


In [7]:
# Paths
TRAIN_PATH = "UNSW_NB15_training-set.csv"
TEST_PATH  = "UNSW_NB15_testing-set.csv"

# Read CSVs
train_df = pd.read_csv(TRAIN_PATH)
test_df = pd.read_csv(TEST_PATH)

# NaN in attack_cat
train_df = train_df.assign(attack_cat=train_df['attack_cat'].fillna('Normal'))
test_df = test_df.assign(attack_cat=test_df['attack_cat'].fillna('Normal'))

# Encode 'attack_cat' as the label
attack_cat_encoder = LabelEncoder()
train_df['attack_cat'] = attack_cat_encoder.fit_transform(train_df['attack_cat'])
test_df['attack_cat'] = attack_cat_encoder.transform(test_df['attack_cat'])

# Handling cols better
cat_cols = ['proto', 'service', 'state']
for col in cat_cols:
    # Get unique values from both train and test
    combined_categories = pd.concat([train_df[col], test_df[col]]).unique()
    le = LabelEncoder()
    le.fit(combined_categories.astype(str))
    
    # Transform separately
    train_df[col] = le.transform(train_df[col].astype(str))
    test_df[col] = le.transform(test_df[col].astype(str))

# Drop unused cols
drop_cols = ['id', 'label']
for c in drop_cols:
    if c in train_df.columns:
        train_df.drop(columns=c, inplace=True)
    if c in test_df.columns:
        test_df.drop(columns=c, inplace=True)

feature_cols = [c for c in train_df.columns if c != 'attack_cat']
X_train = train_df[feature_cols].values
y_train = train_df['attack_cat'].values
X_test = test_df[feature_cols].values
y_test = test_df['attack_cat'].values

scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

print("Train X shape:", X_train.shape, "y shape:", y_train.shape)
print("Test  X shape:", X_test.shape,  "y shape:", y_test.shape)


Train X shape: (175341, 42) y shape: (175341,)
Test  X shape: (82332, 42) y shape: (82332,)


In [8]:
k = 5

knn = NearestNeighbors(n_neighbors=k)
knn.fit(X_train)
# This gives us the indices of the neighbors
neighbors = knn.kneighbors(X_train, return_distance=False)

# Build edge list from the neighbors array
edge_source = []
edge_target = []
for i in range(len(neighbors)):
    for j in neighbors[i]:
        # skip self-loops if you want
        if i != j:
            edge_source.append(i)
            edge_target.append(j)

edge_index = np.vstack((edge_source, edge_target))
edge_index = torch.tensor(edge_index, dtype=torch.long)

print("edge_index shape:", edge_index.shape)


edge_index shape: torch.Size([2, 752187])


In [9]:
# Create the train data object
x_train_tensor = torch.tensor(X_train, dtype=torch.float)
y_train_tensor = torch.tensor(y_train, dtype=torch.long)
train_data = Data(x=x_train_tensor, y=y_train_tensor, edge_index=edge_index)

print(train_data)
print("Train data: #nodes =", train_data.num_nodes, 
      "#edges =", train_data.num_edges, 
      "num_features =", train_data.num_features)

# For the test set, similarly create a separate graph:
knn_test = NearestNeighbors(n_neighbors=k)
knn_test.fit(X_test)
neighbors_test = knn_test.kneighbors(X_test, return_distance=False)

edge_source_test = []
edge_target_test = []
for i in range(len(neighbors_test)):
    for j in neighbors_test[i]:
        if i != j:
            edge_source_test.append(i)
            edge_target_test.append(j)

edge_index_test = np.vstack((edge_source_test, edge_target_test))
edge_index_test = torch.tensor(edge_index_test, dtype=torch.long)

x_test_tensor = torch.tensor(X_test, dtype=torch.float)
y_test_tensor = torch.tensor(y_test, dtype=torch.long)
test_data = Data(x=x_test_tensor, y=y_test_tensor, edge_index=edge_index_test)

print(test_data)
print("Test data: #nodes =", test_data.num_nodes, 
      "#edges =", test_data.num_edges, 
      "num_features =", test_data.num_features)


Data(x=[175341, 42], edge_index=[2, 752187], y=[175341])
Train data: #nodes = 175341 #edges = 752187 num_features = 42
Data(x=[82332, 42], edge_index=[2, 343691], y=[82332])
Test data: #nodes = 82332 #edges = 343691 num_features = 42


In [13]:
# class GCN(torch.nn.Module):
#     def __init__(self, num_features, hidden_channels, num_classes):
#         super().__init__()
#         self.conv1 = GCNConv(num_features, hidden_channels)
#         self.conv2 = GCNConv(hidden_channels, num_classes)
        
#     def forward(self, x, edge_index):
#         # 1) First layer
#         x = self.conv1(x, edge_index)
#         x = F.relu(x)
#         # 2) Second layer (logits)
#         x = self.conv2(x, edge_index)
#         return x

class GCN(torch.nn.Module):
    def __init__(self, num_features, hidden_channels, num_classes):
        super().__init__()
        self.conv1 = GCNConv(num_features, hidden_channels)
        self.conv2 = GCNConv(hidden_channels, hidden_channels)
        self.conv3 = GCNConv(hidden_channels, num_classes)
        
    def forward(self, x, edge_index):
        x = self.conv1(x, edge_index)
        x = F.relu(x)
        x = self.conv2(x, edge_index)
        x = F.relu(x)
        x = self.conv3(x, edge_index)
        return x


In [14]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print("Using device:", device)

# Create model
num_features = train_data.num_features
num_classes = len(np.unique(y_train))  # number of distinct attack_cat classes
hidden_channels = 64

model = GCN(num_features, hidden_channels, num_classes).to(device)
train_data = train_data.to(device)

optimizer = torch.optim.Adam(model.parameters(), lr=0.01, weight_decay=5e-4)

def train():
    model.train()
    optimizer.zero_grad()
    out = model(train_data.x, train_data.edge_index)
    loss = F.cross_entropy(out, train_data.y)
    loss.backward()
    optimizer.step()
    return loss.item()

epochs = 10
for epoch in range(1, epochs+1):
    loss_val = train()
    print(f"Epoch: {epoch:02d}, Loss: {loss_val:.4f}")


Using device: cuda
Epoch: 01, Loss: 2.5330
Epoch: 02, Loss: 1.7841
Epoch: 03, Loss: 1.4590
Epoch: 04, Loss: 1.2525
Epoch: 05, Loss: 1.1291
Epoch: 06, Loss: 1.0703
Epoch: 07, Loss: 1.0292
Epoch: 08, Loss: 0.9823
Epoch: 09, Loss: 0.9447
Epoch: 10, Loss: 0.9238


In [15]:
# Move test data to device
test_data = test_data.to(device)

model.eval()
with torch.no_grad():
    logits = model(test_data.x, test_data.edge_index)
    pred = logits.argmax(dim=1)
    correct = (pred == test_data.y).sum()
    acc = int(correct) / test_data.num_nodes

print(f"Test Accuracy: {acc:.4f}")


Test Accuracy: 0.5634
