In [9]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score, precision_score, recall_score, f1_score, roc_auc_score

import torch
import torch.nn as nn
import pandas as pd
import numpy as np
import torch.nn.functional as F
import matplotlib.pyplot as plt
from torch import Tensor
from torchsummary import summary
import random
from numpy import load
from tqdm import tqdm, trange
from torch.nn import CrossEntropyLoss
from torch.optim import Adam
from torch.utils.data import Dataset, DataLoader
from collections import Counter
import pickle
import os

os.environ['CUDA_LAUNCH_BLOCKING'] = "1"
%run './Attention_based_model.ipynb'

Tue Jul  9 13:27:22 2024       
+---------------------------------------------------------------------------------------+
| NVIDIA-SMI 535.183.01             Driver Version: 535.183.01   CUDA Version: 12.2     |
|-----------------------------------------+----------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |         Memory-Usage | GPU-Util  Compute M. |
|                                         |                      |               MIG M. |
|   0  NVIDIA GeForce RTX 4080        Off | 00000000:01:00.0  On |                  N/A |
|  0%   42C    P8              13W / 320W |   1639MiB / 16376MiB |      0%      Default |
|                                         |                      |                  N/A |
+-----------------------------------------+----------------------+----------------------+
                                                                    

In [10]:
# check the availability of cuda
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("Using device: ", device, f"({torch.cuda.get_device_name(device)})" if torch.cuda.is_available() else "")
with open('CIC2018-dataset-all-new.pkl', 'rb') as f:
    train_data = pickle.load(f)
train_loader = DataLoader(train_data, batch_size=8, shuffle=True, pin_memory=True)

Using device:  cuda (NVIDIA GeForce RTX 4080)


In [11]:
data_points = []
labels = []

with torch.no_grad():
    # progress bar
    data_iter = tqdm.tqdm(
        enumerate(train_loader),
        total=len(train_loader),
        bar_format="{l_bar}{r_bar}"
    )
    for i, data in data_iter:  # Assuming you have an inference data loader
        data = {key: value.to(device) for key, value in data.items()}
        inputs, label = data["netformer_input"], data["sequence_label"]
        
        inputs = inputs.cpu().numpy()
        label = label.cpu().numpy()
        
        # Collect the data points and labels
        for j in range(inputs.shape[0]):
            data_points.append(inputs[j])
            labels.append(label[j])

# Convert lists to numpy arrays
data_points_np = np.array(data_points)
labels_np = np.array(labels)

# Print shapes to verify
print('Data points shape:', data_points_np.shape)
print('Labels shape:', labels_np.shape)

print(Counter(labels_np))


100%|| 1281/1281 [00:03<00:00, 384.69it/s]

Data points shape: (10246, 2000)
Labels shape: (10246,)
Counter({0.0: 4715, 1.0: 4236, 2.0: 862, 4.0: 430, 3.0: 2, 5.0: 1})





In [12]:
# Select data points with class [0, 1, 2, 4]
selected_classes = [0, 1, 2, 4]
relabel_map = {0: 0, 1: 1, 2: 2, 4: 3}

selected_indices = np.where(np.isin(labels_np, selected_classes))[0]

selected_data = data_points_np[selected_indices]
selected_labels = labels_np[selected_indices]

# Relabel
for old_label, new_label in relabel_map.items():
    selected_labels[selected_labels == old_label] = new_label

# Ensure 125 examples for each class

sample_numeber_list = [10,20, 30, 50, 80, 125, 200, 300, 400]
for sample in sample_numeber_list:
    final_data = []
    final_labels = []
    for class_label in relabel_map.values():
        class_indices = np.where(selected_labels == class_label)[0]
        chosen_indices = np.random.choice(class_indices, sample, replace=False)
        final_data.append(selected_data[chosen_indices])
        final_labels.append(selected_labels[chosen_indices])
    
    final_data = np.concatenate(final_data)
    final_labels = np.concatenate(final_labels)
    
    # Final data and labels shapes
    print("Final Data points shape:", final_data.shape)
    print("Final Labels shape:", final_labels.shape)
    print("Final Counter:", Counter(final_labels))

    data_points = final_data 
    labels = final_labels
    
    # Data preprocessing
    scaler = StandardScaler()
    data_points = scaler.fit_transform(data_points)
    
    # Split the data into training and test sets
    X_train, X_test, y_train, y_test = train_test_split(data_points, labels, test_size=0.2, random_state=42)
    
    # Define the Random Forest model
    rf_model = RandomForestClassifier(n_estimators=100, random_state=42)
    
    # Training the model
    rf_model.fit(X_train, y_train)
    
    # Evaluating the model
    y_pred = rf_model.predict(X_test)
    y_prob = rf_model.predict_proba(X_test)
    
    # Calculate metrics
    conf_matrix = confusion_matrix(y_test, y_pred)
    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred, average='weighted')
    recall = recall_score(y_test, y_pred, average='weighted')
    f1 = f1_score(y_test, y_pred, average='weighted')
    
    # Since roc_auc_score is not directly applicable to multi-class without specifying an average method, it's typically used for binary classification. For multi-class, we might use other metrics or one-vs-rest strategy.
    # Here we will use the average='macro' method to calculate AUC for each class and average them
    auc = roc_auc_score(y_test, y_prob, multi_class='ovo', average='macro')
    
    print(f'Confusion Matrix:\n{conf_matrix}')
    print(f'Accuracy: {accuracy:.4f}')
    print(f'Precision: {precision:.4f}')
    print(f'Recall: {recall:.4f}')
    print(f'F1 Score: {f1:.4f}')
    print(f'AUC: {auc:.4f}')
    print(classification_report(y_test, y_pred))


Final Data points shape: (40, 2000)
Final Labels shape: (40,)
Final Counter: Counter({0.0: 10, 1.0: 10, 2.0: 10, 3.0: 10})
Confusion Matrix:
[[1 0 0 0]
 [3 1 0 0]
 [0 1 1 0]
 [0 0 0 1]]
Accuracy: 0.5000
Precision: 0.6562
Recall: 0.5000
F1 Score: 0.5083
AUC: 0.7917
              precision    recall  f1-score   support

         0.0       0.25      1.00      0.40         1
         1.0       0.50      0.25      0.33         4
         2.0       1.00      0.50      0.67         2
         3.0       1.00      1.00      1.00         1

    accuracy                           0.50         8
   macro avg       0.69      0.69      0.60         8
weighted avg       0.66      0.50      0.51         8

Final Data points shape: (80, 2000)
Final Labels shape: (80,)
Final Counter: Counter({0.0: 20, 1.0: 20, 2.0: 20, 3.0: 20})
Confusion Matrix:
[[4 1 0 0]
 [0 5 0 1]
 [0 0 2 0]
 [0 1 1 1]]
Accuracy: 0.7500
Precision: 0.7574
Recall: 0.7500
F1 Score: 0.7412
AUC: 0.8792
              precision    recall  

Confusion Matrix:
[[25  5  1  0]
 [ 0 15  0  0]
 [ 0  0 24  0]
 [ 0  0  0 30]]
Accuracy: 0.9400
Precision: 0.9529
Recall: 0.9400
F1 Score: 0.9405
AUC: 0.9995
              precision    recall  f1-score   support

         0.0       1.00      0.81      0.89        31
         1.0       0.75      1.00      0.86        15
         2.0       0.96      1.00      0.98        24
         3.0       1.00      1.00      1.00        30

    accuracy                           0.94       100
   macro avg       0.93      0.95      0.93       100
weighted avg       0.95      0.94      0.94       100

