# 載入 package

In [1]:
%pip install -q --upgrade transformers
%pip install -q -U accelerate

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m309.4/309.4 kB[0m [31m10.1 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m21.3/21.3 MB[0m [31m54.3 MB/s[0m eta [36m0:00:00[0m
[?25h

In [2]:
import nltk
nltk.download('punkt')
from nltk.tokenize import wordpunct_tokenize

import pandas as pd
import re
import numpy as np
from numpy import inf
from tqdm import tqdm
import time

import torch
import torch.nn as nn
from torch import cuda
from transformers import AutoTokenizer, AutoModel
from torch.utils.data import DataLoader
from torch.utils.data import TensorDataset, RandomSampler

from accelerate import Accelerator
from transformers import get_scheduler

from sklearn.preprocessing import OrdinalEncoder

from sklearn.metrics import accuracy_score, balanced_accuracy_score

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


In [3]:
import warnings
warnings.filterwarnings('ignore')

In [4]:
device = 'cuda' if cuda.is_available() else 'cpu'
print(device)

cuda


# 載入模型

In [5]:
tokenizer = AutoTokenizer.from_pretrained('distilbert/distilbert-base-uncased', add_special_tokens=True)
model = AutoModel.from_pretrained('/content/drive/Sess_MLM/distilbert_sessMLM') # 載入之前在 Feature-based further pre-traini 過的 SessMLM
model.to(device)

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

DistilBertModel(
  (embeddings): Embeddings(
    (word_embeddings): Embedding(30522, 768, padding_idx=0)
    (position_embeddings): Embedding(512, 768)
    (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (transformer): Transformer(
    (layer): ModuleList(
      (0-5): 6 x TransformerBlock(
        (attention): MultiHeadSelfAttention(
          (dropout): Dropout(p=0.1, inplace=False)
          (q_lin): Linear(in_features=768, out_features=768, bias=True)
          (k_lin): Linear(in_features=768, out_features=768, bias=True)
          (v_lin): Linear(in_features=768, out_features=768, bias=True)
          (out_lin): Linear(in_features=768, out_features=768, bias=True)
        )
        (sa_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
        (ffn): FFN(
          (dropout): Dropout(p=0.1, inplace=False)
          (lin1): Linear(in_features=768, out_features=3072, bias=True)
          (lin2): Li

In [6]:
tokenizer.is_fast

True

# 載入資料

## 2022/11 train

In [8]:
df_11_train = pd.read_pickle("/content/ap_table_202211_abnormal_train.pkl")

## 2022/11 - evaluation data

In [10]:
df_val = pd.read_pickle("/content/ap_table_202211_abnormal_val.pkl")

# Reset Index

In [11]:
df_11_train = df_11_train.reset_index(drop=True).sample(frac=1).reset_index(drop=True)

In [12]:
df_val = df_val.reset_index(drop = True)

# Segmentaion Funtion

In [13]:
def tokenization_addspace_new(s):
  word_list = wordpunct_tokenize(s.replace('[SEP]', 'Æ ')) # [SEP] 用 Æ 取代
  new_string = ""
  for i in range(len(word_list)):
    temp = re.split('([^a-zA-Z0-9])',word_list[i]) # 用特殊字元做文字分割
    for j in range(len(temp)):
      if temp[j] != "":  # 如果不是空字元
        if j != len(temp) - 2 and j != len(temp) - 1: # 如果不是最後兩個字元，則加空格
          if temp[j] == 'Æ':
            new_string += '[SEP]' + " " # 把 Æ 換回成 [SEP]
          else:
            new_string += temp[j] + " "
        else:
          if temp[j] == 'Æ':
            new_string += '[SEP]'
          else:
            new_string += temp[j]
    if i != len(word_list) - 1:
      new_string += " "  # 加空格
  return new_string.lower()  # 全部變小寫

# 文字前處理

In [14]:
df_11_train['text'] = ""
df_11_train['text'] = df_11_train['clean_payload_list'].apply(lambda x: tokenization_addspace_new(x))

In [15]:
df_val['text'] = ""
df_val['text'] = df_val['clean_payload_list'].apply(lambda x: tokenization_addspace_new(x))

In [16]:
df_11_train = df_11_train[df_11_train.text != ""].reset_index(drop=True).sample(frac=1).reset_index(drop=True)

In [17]:
df_val = df_val[df_val.text != ""].reset_index(drop=True).sample(frac=1).reset_index(drop=True)

In [18]:
print("2022/11 train: ", len(df_11_train))

2022/11 train:  140648


# 建立模型

In [19]:
class BERTClass(torch.nn.Module):
    def __init__(self, num_labels):
        super(BERTClass, self).__init__()
        self.l1 = model  # SessMLM 的 DistilBERT
        self.layer = nn.Sequential(  # Classifier
            nn.Linear(768, 768),
            nn.Tanh(),
            nn.Dropout(0.1),
            nn.Linear(768, num_labels),
            )

    def forward(self, ids, mask):
        outputs = self.l1(ids, attention_mask = mask, return_dict = True,  output_hidden_states=True)
        output = self.layer(outputs.last_hidden_state[:,0,:])  # 取 DistilBERT 最後一層 output 的 CLS 作為 Classifier 的 input
        return output

In [20]:
e2e_model = BERTClass(14)

# Label encoding

In [21]:
category_list = [["Reconnaissance (TA0043)"], ["Resource Development (TA0042)"], ["Initial Access (TA0001)"], ["Execution (TA0002)"], ["Persistence (TA0003)"],
["Privilege Escalation (TA0004)"], ["Defense Evasion (TA0005)"], ["Credential Access (TA0006)"], ["Discovery (TA0007)"], ["Lateral Movement (TA0008)"],
["Collection (TA0009)"], ["Command and Control (TA0011)"], ["Exfiltration (TA0010)"], ["Impact (TA0040)"]]

In [22]:
encoder = OrdinalEncoder(dtype = int)
encoder.fit(category_list)

In [23]:
encoder.categories_

[array(['Collection (TA0009)', 'Command and Control (TA0011)',
        'Credential Access (TA0006)', 'Defense Evasion (TA0005)',
        'Discovery (TA0007)', 'Execution (TA0002)',
        'Exfiltration (TA0010)', 'Impact (TA0040)',
        'Initial Access (TA0001)', 'Lateral Movement (TA0008)',
        'Persistence (TA0003)', 'Privilege Escalation (TA0004)',
        'Reconnaissance (TA0043)', 'Resource Development (TA0042)'],
       dtype=object)]

In [24]:
df_11_class_train = df_11_train
df_11_class_train['Tactic_concate_list'] = df_11_class_train.Tactic_concate.apply(lambda x: [x])
df_11_class_train['category'] = encoder.transform(df_11_class_train['Tactic_concate_list'].tolist())
df_11_class_train.head()

Unnamed: 0,Session_ID,Src_ISP,Datetime,Protocol,city,country,isp,iso_code,domain,session_time,...,pkt_num,dist,AP_name,new_observed,sess_num,pot,Tactic_concate,text,Tactic_concate_list,category
0,20221107_中華電信_http_1667790821.763175000,中華電信,2022-11-07 11:13:41,http,,United States,Hurricane Electric,US,,1667791000.0,...,1,0.0003538728,20221020_15_http_01_73,False,1,amun,Reconnaissance (TA0043),get / favicon . ico http / version [sep] host ...,[Reconnaissance (TA0043)],12
1,20221107_中嘉寬頻_http_1667791283.477593000,中嘉寬頻,2022-11-07 11:21:23,http,,Singapore,,SG,,1667791000.0,...,1,0.0,20221107_12_http_11_0,True,1,glastopf,Initial Access (TA0001),http / version 200 ok [sep] pragma : no - cach...,[Initial Access (TA0001)],8
2,20221107_中嘉寬頻_http_1667792183.600434000,中嘉寬頻,2022-11-07 11:36:23,http,,South Korea,Korea Telecom,KR,,1667792000.0,...,1,0.0,20221107_12_http_11_3,True,1,glastopf,Command and Control (TA0011),http / version 403 forbidden [sep] x - frame -...,[Command and Control (TA0011)],1
3,20221126_台灣固網_http_1669477467.390553000,台灣固網,2022-11-26 23:44:27,http,,Canada,OVH SAS,CA,,1669477000.0,...,1,0.0,20221115_07_http_01_57,False,1,glastopf,Reconnaissance (TA0043),get / login / exchange / web http / version [s...,[Reconnaissance (TA0043)],12
4,20221107_中嘉寬頻_http_1667777557.090568000,中嘉寬頻,2022-11-07 07:32:37,http,Melbourne,Australia,Over The Wire Pty,AU,otw.net.au,1667778000.0,...,1,1.192093e-07,20221107_12_http_11_48,True,1,glastopf,Initial Access (TA0001),/ div > [sep] < / body > [sep] < / html >,[Initial Access (TA0001)],8


In [25]:
df_val_class = df_val
df_val_class['Tactic_concate_list'] = df_val_class.Tactic_concate.apply(lambda x: [x])
df_val_class['category'] = encoder.transform(df_val_class['Tactic_concate_list'].tolist())
df_val_class.head()

Unnamed: 0,Session_ID,Src_ISP,Datetime,Protocol,city,country,isp,iso_code,domain,session_time,...,pkt_num,dist,AP_name,new_observed,sess_num,pot,Tactic_concate,text,Tactic_concate_list,category
0,20221107_台灣固網_http_1667790159.162222000,台灣固網,2022-11-07 11:02:39,http,,Canada,OVH SAS,CA,,1667790000.0,...,1,0.0,20221107_12_http_02_26,True,1,glastopf,Reconnaissance (TA0043),get / login / includes / zebra . conf http / v...,[Reconnaissance (TA0043)],12
1,20221124_台灣固網_http_1669302180.380663000,台灣固網,2022-11-24 23:03:00,http,,United Kingdom,Next Vision Ltd,GB,,1669302000.0,...,1,0.0,20221107_18_http_01_30,False,1,amun,Execution (TA0002),get / index . php ? s = / index / \ think \ ap...,[Execution (TA0002)],5
2,20221107_中嘉寬頻_http_1667792825.347972000,中嘉寬頻,2022-11-07 11:47:05,http,Chihuahua City,Mexico,Universidad Autonoma De Chihuahua,MX,,1667793000.0,...,1,0.0,20221107_12_http_11_12,True,1,glastopf,Command and Control (TA0011),http / version 503 service unavailable [sep] c...,[Command and Control (TA0011)],1
3,20221125_台灣固網_http_1669347948.611410000,台灣固網,2022-11-25 11:45:48,http,Amsterdam,Netherlands,SpectraIP B.V.,NL,alsycon.net,1669348000.0,...,1,1.192093e-07,20221125_12_http_02_3,True,1,glastopf,Credential Access (TA0006),get / ? q = ultrasurf http / version [sep] hos...,[Credential Access (TA0006)],2
4,20221116_台灣固網_http_1668611792.955832000,台灣固網,2022-11-16 23:16:32,http,,Russia,Petersburg Internet Network ltd.,RU,,1668612000.0,...,1,0.0,20221108_20_http_03_1,False,1,amun,Credential Access (TA0006),get http : / / ip / echo . php http / version ...,[Credential Access (TA0006)],2


# 建立 dataset

In [26]:
def get_freq(labels):  # 計算各 label 次數
  class_count = np.zeros((len(encoder.categories_[0])))
  for i in range(len(encoder.categories_[0])):
    for j in range(len(labels)):
      if labels[j] == i:
        class_count[i] += 1
  return class_count

In [27]:
def cal_weight(labels):  # 由次數轉換成 weight
  freq = 0
  freq += get_freq(labels)
  freq = freq / np.sum(freq)
  weight = np.median(freq) / freq
  weight[weight == inf] = 0
  return weight

In [None]:
labels = np.array(df_11_class_train['category'].values.ravel())
class_weights = torch.FloatTensor(cal_weight(labels)).to(device)  # 得到 class weight

In [28]:
def make_train_dataset(tokenizer, train):  # 建立 training dataset
  train_tokens = tokenizer.batch_encode_plus(train['text'].to_list(), max_length = 512, padding="max_length", truncation = True, return_token_type_ids=True)

  train_seq = torch.tensor(train_tokens['input_ids'])
  train_mask = torch.tensor(train_tokens['attention_mask'])
  # train_token = torch.tensor(train_tokens['token_type_ids'])
  train_y = torch.tensor(train['category'].to_list())

  train_data = TensorDataset(train_seq, train_mask, train_y)
  train_sampler = RandomSampler(train_data)
  trainloader = DataLoader(train_data,
                          sampler = train_sampler,
                          batch_size = 32, drop_last=True)

  return trainloader

In [29]:
def make_val_dataset(tokenizer, train):  # 建立 test dataset
  train_tokens = tokenizer.batch_encode_plus(train['text'].to_list(), max_length = 512, padding="max_length", truncation = True, return_token_type_ids=True)

  train_seq = torch.tensor(train_tokens['input_ids'])
  train_mask = torch.tensor(train_tokens['attention_mask'])
  # train_token = torch.tensor(train_tokens['token_type_ids'])
  train_y = torch.tensor(train['category'].to_list())

  train_data = TensorDataset(train_seq, train_mask, train_y)
  train_sampler = RandomSampler(train_data)
  trainloader = DataLoader(train_data,
                          sampler = train_sampler,
                          batch_size = 128, drop_last=False)

  return trainloader

In [31]:
trainloader = make_train_dataset(tokenizer, df_11_class_train)

In [32]:
valloader = make_val_dataset(tokenizer, df_val_class)

# 訓練模型

In [33]:
learning_rate = 5e-5
num_epochs = 4

criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.AdamW(model.parameters(), lr=learning_rate)

In [34]:
num_update_steps_per_epoch = len(trainloader)
num_training_steps = num_epochs * num_update_steps_per_epoch

lr_scheduler = get_scheduler(
    "linear",
    optimizer=optimizer,
    num_warmup_steps=num_training_steps*0.1,
    num_training_steps=num_training_steps,
)

In [35]:
accelerator = Accelerator()
e2e_model, optimizer, criterion, lr_scheduler, train_dataloader, val_dataloader = accelerator.prepare(
    e2e_model, optimizer, criterion, lr_scheduler, trainloader, valloader
)

In [36]:
e2e_model.train()

BERTClass(
  (l1): DistilBertModel(
    (embeddings): Embeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (transformer): Transformer(
      (layer): ModuleList(
        (0-5): 6 x TransformerBlock(
          (attention): MultiHeadSelfAttention(
            (dropout): Dropout(p=0.1, inplace=False)
            (q_lin): Linear(in_features=768, out_features=768, bias=True)
            (k_lin): Linear(in_features=768, out_features=768, bias=True)
            (v_lin): Linear(in_features=768, out_features=768, bias=True)
            (out_lin): Linear(in_features=768, out_features=768, bias=True)
          )
          (sa_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
          (ffn): FFN(
            (dropout): Dropout(p=0.1, inplace=False)
            (lin1): Linear(in_featu

In [37]:
start_time = time.time()
min_val_loss = 999999

for e in range(num_epochs):
  train_loss = 0.0
  e2e_model.train()
  for batch in tqdm(train_dataloader):
      optimizer.zero_grad()

      batch = [i.to(device) for i in batch]
      sent_id, masks, labels = batch

      optimizer.zero_grad()
      preds = e2e_model(sent_id, masks)
      loss = criterion(preds, labels)
      train_loss += loss.item() #計算loss總和

      accelerator.backward(loss)
      lr_scheduler.step()
      optimizer.step()

  print(f'Epoch:{e+1}\t\tTraining Loss: {train_loss / len(train_dataloader)} \n')

  e2e_model.eval()  # Evaluation
  val_loss = 0.0
  val_accu = []
  pred_list = []
  ans_list = []
  with torch.no_grad():
    for batch in tqdm(val_dataloader):

      batch = [i.to(device) for i in batch]
      sent_id, masks, labels = batch
      preds = e2e_model(sent_id, masks)
      val_loss += criterion(preds, labels).item()
      outputs = preds.detach().cpu().numpy()
      pred_list.append(outputs)
      ans_list.append(labels.detach().cpu().numpy())

  pred_list_final =[]

  for i in range(len(pred_list)):
    for j in range(len(pred_list[i])):
      temp = np.argmax(pred_list[i][j])
      pred_list_final.append(temp)

  a = pred_list_final
  b = [e for sl in ans_list for e in sl]

  y_true = np.array(b)
  y_pred = np.array(a)
  final_val = val_loss / len(val_dataloader)
  print()
  print("val loss: {:.4f} ".format(final_val))
  print("accuracy: {:.4f} ".format(accuracy_score(y_true, y_pred)))
  print("balanced accuracy: {:.4f} \n".format(balanced_accuracy_score(y_true, y_pred)))

  if(final_val < min_val_loss):  # 如果 validation loss 小於之前的訓練的話，便將模型儲存
    min_val_loss = final_val

    accelerator.wait_for_everyone()
    unwrapped_e2e_model = accelerator.unwrap_model(e2e_model)
    torch.save(unwrapped_e2e_model.state_dict(), '/content/drive/distilbert_sess_all.pt')
    model.save_pretrained("/content/drive/distilbert_sess_all")

print("\n--- %s seconds ---" % (time.time() - start_time))

100%|██████████| 4395/4395 [25:08<00:00,  2.91it/s]


Epoch:1		Training Loss: 0.3646762638826419 



100%|██████████| 138/138 [00:57<00:00,  2.39it/s]



val loss: 0.1700 
accuracy: 0.9565 
balanced accuracy: 0.8595 



100%|██████████| 4395/4395 [25:08<00:00,  2.91it/s]


Epoch:2		Training Loss: 0.14830769485411519 



100%|██████████| 138/138 [00:57<00:00,  2.39it/s]



val loss: 0.1297 
accuracy: 0.9614 
balanced accuracy: 0.8758 



100%|██████████| 4395/4395 [25:07<00:00,  2.92it/s]


Epoch:3		Training Loss: 0.11859009508658053 



100%|██████████| 138/138 [00:57<00:00,  2.39it/s]



val loss: 0.1187 
accuracy: 0.9631 
balanced accuracy: 0.9481 



100%|██████████| 4395/4395 [25:07<00:00,  2.92it/s]


Epoch:4		Training Loss: 0.10626225494085657 



100%|██████████| 138/138 [00:57<00:00,  2.39it/s]



val loss: 0.1128 
accuracy: 0.9643 
balanced accuracy: 0.9588 


--- 6268.392199039459 seconds ---
