<a href="https://colab.research.google.com/github/CSID-DGU/2024-02-CSC4004-3-2-DeepGuard/blob/dev_jb/bart.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [16]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


# Importing Libraries

In [17]:
import numpy as np  # for array
import pandas as pd  # for csv files and dataframe
import matplotlib.pyplot as plt  # for plotting
import seaborn as sns  # plotting
from scipy import stats

import pickle  # To load data int disk
from prettytable import PrettyTable  # To print in tabular format

import warnings
warnings.filterwarnings("ignore")

from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import LabelEncoder, OneHotEncoder

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.svm import LinearSVC
from sklearn.naive_bayes import MultinomialNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
import xgboost as xgb

from sklearn.metrics import accuracy_score, confusion_matrix, make_scorer
from sklearn.metrics import auc, f1_score, roc_curve
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from sklearn.model_selection import cross_validate, cross_val_predict

%matplotlib inline

# Reading data

In [18]:
# Creating a empty dict, where I will save all parameters required for test data transformation

saved_dict = {}

In [19]:
# Reading datasets
dfs = []
for i in range(1,5):
    path = '/content/UNSW_NB15_training-set.csv'  # There are 4 input csv files
    dfs.append(pd.read_csv(path.format(i), header = None))
all_data = pd.concat(dfs).reset_index(drop=True)  # Concat all to a single df

In [20]:
# This csv file contains names of all the features
df_col = pd.read_csv('/content/NUSW-NB15_features.csv', encoding='ISO-8859-1')

In [21]:
# Making column names lower case, removing spaces
df_col['Name'] = df_col['Name'].apply(lambda x: x.strip().replace(' ', '').lower())

In [22]:
# Renaming our dataframe with proper column names
all_data.columns = df_col['Name'][:45]

In [23]:
# Saving useful info, later this will be used to transform raw test data
saved_dict['columns'] = df_col['Name'][df_col['Name']!='label'].tolist()

In [24]:
del df_col

In [25]:
all_data.shape

(329332, 45)

In [26]:
all_data.head()

Name,srcip,sport,dstip,dsport,proto,state,dur,sbytes,dbytes,sttl,...,is_sm_ips_ports,ct_state_ttl,ct_flw_http_mthd,is_ftp_login,ct_ftp_cmd,ct_srv_src,ct_srv_dst,ct_dst_ltm,ct_src_ltm,ct_src_dport_ltm
0,id,dur,proto,service,state,spkts,dpkts,sbytes,dbytes,rate,...,ct_dst_sport_ltm,ct_dst_src_ltm,is_ftp_login,ct_ftp_cmd,ct_flw_http_mthd,ct_src_ltm,ct_srv_dst,is_sm_ips_ports,attack_cat,label
1,1,0.000011,udp,-,INT,2,0,496,0,90909.0902,...,1,2,0,0,0,1,2,0,Normal,0
2,2,0.000008,udp,-,INT,2,0,1762,0,125000.0003,...,1,2,0,0,0,1,2,0,Normal,0
3,3,0.000005,udp,-,INT,2,0,1068,0,200000.0051,...,1,3,0,0,0,1,3,0,Normal,0
4,4,0.000006,udp,-,INT,2,0,900,0,166666.6608,...,1,3,0,0,0,2,3,0,Normal,0


# 데이터 로드 및 변환

In [27]:
path = '/content/UNSW_NB15_training-set.csv'
df = pd.read_csv(path)

# 각 행의 모든 열 데이터를 공백으로 구분된 문자열로 변환
train_data = df.apply(lambda row: ' '.join(row.astype(str)), axis=1)

# 변환된 데이터 출력
print("Train Data:")
print(train_data)

# 파일로 저장하고 싶은 경우
train_data.to_csv('train_data.csv', index=False, header=False)

if 'text' not in df.columns:
    df['text'] = df.iloc[:,0]
#데이터 분리
X = df['text']
if 'attack_cat' in df.columns:
    y = df['attack_cat']
else:
  y = pd.Series([0] * len(df), index=df.index)

  # 데이터셋을 훈련 세트와 테스트 세트로 나누기
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=42)

#데이터 분리
print("Training set size:", len(X_train))
print("Test set size:", len(X_test))

Train Data:
0        1 1.1e-05 udp - INT 2 0 496 0 90909.0902 254 0...
1        2 8e-06 udp - INT 2 0 1762 0 125000.0003 254 0...
2        3 5e-06 udp - INT 2 0 1068 0 200000.0051 254 0...
3        4 6e-06 udp - INT 2 0 900 0 166666.6608 254 0 ...
4        5 1e-05 udp - INT 2 0 2126 0 100000.0025 254 0...
                               ...                        
82327    82328 5e-06 udp - INT 2 0 104 0 200000.0051 25...
82328    82329 1.106101 tcp - FIN 20 8 18062 354 24.410...
82329    82330 0.0 arp - INT 1 0 46 0 0.0 0 0 0.0 0.0 0...
82330    82331 0.0 arp - INT 1 0 46 0 0.0 0 0 0.0 0.0 0...
82331    82332 9e-06 udp - INT 2 0 104 0 111111.1072 25...
Length: 82332, dtype: object
Training set size: 74098
Test set size: 8234


# 분류기 설정 및 학습

In [28]:
!pip install transformers
from transformers import AutoTokenizer, AutoModelForSequenceClassification

# Load the BART tokenizer and model
tokenizer = AutoTokenizer.from_pretrained("facebook/bart-base")
model = AutoModelForSequenceClassification.from_pretrained("facebook/bart-base", num_labels=len(y.unique()))

batch_size = 32

def batch_tokenize(text_list, batch_size=32):
    encodings = {'input_ids': [], 'attention_mask': []}
    for i in range(0, len(text_list), batch_size):
        # Convert batch elements to strings before tokenization
        batch = [str(text) for text in text_list[i:i+batch_size]]
        batch_encodings = tokenizer(
            batch, truncation=True, padding='max_length', max_length=128)
        encodings['input_ids'].extend(batch_encodings['input_ids'])
        encodings['attention_mask'].extend(batch_encodings['attention_mask'])
    return encodings

train_encodings = batch_tokenize(X_train.tolist(), batch_size=batch_size)
test_encodings = batch_tokenize(X_test.tolist(), batch_size=batch_size)



Some weights of BartForSequenceClassification were not initialized from the model checkpoint at facebook/bart-base and are newly initialized: ['classification_head.dense.bias', 'classification_head.dense.weight', 'classification_head.out_proj.bias', 'classification_head.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [29]:
!pip install torch
!pip install transformers torch datasets
import torch
from torch.utils.data import Dataset
from sklearn.preprocessing import LabelEncoder

# Prepare dataset class
class Dataset(torch.utils.data.Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        if isinstance(labels[0], str):
            self.label_encoder = LabelEncoder()
            self.labels = self.label_encoder.fit_transform(labels)
        else:
            self.labels = labels
    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item['labels'] = torch.tensor(self.labels[idx], dtype=torch.long)
        return item
    def __len__(self):
        return len(self.labels)

# Create the datasets
train_dataset = Dataset(train_encodings, y_train.tolist())
test_dataset = Dataset(test_encodings, y_test.tolist())

print(f"Train dataset size: {len(train_dataset)}")
print(f"Test dataset size: {len(test_dataset)}")
print("\nSample from the test dataset:")
for i in range(3):
    print(test_dataset[i])

Train dataset size: 74098
Test dataset size: 8234

Sample from the test dataset:
{'input_ids': tensor([    0,  3170, 29312,     2,     1,     1,     1,     1,     1,     1,
            1,     1,     1,     1,     1,     1,     1,     1,     1,     1,
            1,     1,     1,     1,     1,     1,     1,     1,     1,     1,
            1,     1,     1,     1,     1,     1,     1,     1,     1,     1,
            1,     1,     1,     1,     1,     1,     1,     1,     1,     1,
            1,     1,     1,     1,     1,     1,     1,     1,     1,     1,
            1,     1,     1,     1,     1,     1,     1,     1,     1,     1,
            1,     1,     1,     1,     1,     1,     1,     1,     1,     1,
            1,     1,     1,     1,     1,     1,     1,     1,     1,     1,
            1,     1,     1,     1,     1,     1,     1,     1,     1,     1,
            1,     1,     1,     1,     1,     1,     1,     1,     1,     1,
            1,     1,     1,     1,     1,     

In [30]:
!pip install wandb -qqq
!pip install transformers

import wandb
from transformers import TrainingArguments, Trainer
import os

training_args = TrainingArguments(
    output_dir='./results',
    evaluation_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=32,
    per_device_eval_batch_size=32,
    num_train_epochs=3,
    weight_decay=0.01,
    fp16=True,
    gradient_accumulation_steps=4,
    report_to="none"
)

import random
!pip install wandb -qqq
!pip install transformers

import wandb
from transformers import TrainingArguments, Trainer
import os

training_args = TrainingArguments(
    output_dir='./results',
    evaluation_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=32,
    per_device_eval_batch_size=32,
    num_train_epochs=3,
    weight_decay=0.01,
    fp16=True,
    gradient_accumulation_steps=4,
    report_to="none"
)

import random
from torch.utils.data import Subset

sample_size = int(len(train_dataset) * 0.2) #Corrected line
random_indices = random.sample(range(len(train_dataset)), sample_size)
train_dataset = Subset(train_dataset, random_indices)

# Initialize Trainer with optimized settings
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset
)

# Start training
trainer.train()



Epoch,Training Loss,Validation Loss
1,No log,0.812082
2,No log,0.730211
3,No log,0.69669


Non-default generation parameters: {'early_stopping': True, 'num_beams': 4, 'no_repeat_ngram_size': 3, 'forced_bos_token_id': 0, 'forced_eos_token_id': 2}


TrainOutput(global_step=348, training_loss=0.9528066700902479, metrics={'train_runtime': 21067.512, 'train_samples_per_second': 2.11, 'train_steps_per_second': 0.017, 'total_flos': 3408808843464192.0, 'train_loss': 0.9528066700902479, 'epoch': 3.0})

In [31]:
from sklearn.metrics import accuracy_score, classification_report
import numpy as np

# Evaluate the model using the test dataset
test_results = trainer.evaluate()

# 3. 예측 진행 및 결과 확인
predictions = trainer.predict(test_dataset)

# Extract predictions based on the structure of PredictionOutput
logits = predictions.predictions[0]  # Assuming predictions is a tuple and the first element is the logits/probabilities

# Check shape and content of logits
print("Shape of logits:", logits.shape)
print("First few elements of logits:", logits[:5])

# Try to adapt predictions based on the model's actual output:
# If the output contains generated text, process it for classification first.
# For example, if it outputs probabilities for each class:

# pred_labels = np.argmax(logits, axis=-1) # original line
try:
    pred_labels = np.argmax(logits, axis=-1) # 예측된 라벨
except ValueError:
    print("Predictions are not in expected format. Attempting to fix...")
    # Assuming you get probabilities in logits:
    # Reshape to (num_samples, num_classes) if necessary
    num_samples = logits.shape[0]
    num_classes = model.config.num_labels
    if logits.shape != (num_samples, num_classes):
        logits = logits.reshape(num_samples, num_classes)
    pred_labels = np.argmax(logits, axis=-1) # 예측된 라벨

true_labels = predictions.label_ids  # 실제 라벨

print("\nSample of predicted labels:")
print(pred_labels[:10])  # 예측 라벨의 첫 10개 확인
print("\nSample of true labels:")
print(true_labels[:10])  # 실제 라벨의 첫 10개 확인

# 정확도 계산
accuracy = accuracy_score(true_labels, pred_labels)
print(f"Accuracy: {accuracy:.4f}")

# 5. 분류 리포트
print("\nClassification Report:")
print(classification_report(true_labels, pred_labels))

Shape of logits: (8234, 10)
First few elements of logits: [[-0.9454375  -1.3200552   0.31958053  0.31029183 -0.30421838 -1.0805866
   7.3548393  -0.50673056 -1.0897467  -2.1031249 ]
 [-0.43983078 -0.7707858  -0.03807732  1.6132381   1.5618694   2.7641957
  -2.3768482   0.16739221 -1.4480834  -2.8260398 ]
 [-0.8856015  -1.1621583  -0.09772405 -0.25729465 -0.2759677   6.3111935
  -1.1420863  -0.6079503  -0.8748362  -1.8644938 ]
 [-0.97672486 -1.1959221  -0.06396157  0.1391217  -0.51275593 -0.78947926
   7.3440437  -0.33033532 -1.1158454  -2.066278  ]
 [-0.85423386 -0.9021393   2.0610964   2.9820008   0.8621483   0.22778134
  -3.2783751   1.8100181  -1.5928679  -2.1706681 ]]

Sample of predicted labels:
[6 5 5 6 3 5 6 4 6 3]

Sample of true labels:
[6 5 5 6 3 5 6 3 6 3]
Accuracy: 0.7490

Classification Report:
              precision    recall  f1-score   support

           0       0.00      0.00      0.00        65
           1       0.00      0.00      0.00        59
           2      