In [None]:
! python3 -m pip install transformers

In [None]:
import torch
from torch.utils.data import Dataset, DataLoader
import numpy as np
from numpy import newaxis
import math

import os
import pandas as pd
import torch.nn as nn
from scipy.stats import chi2
from transformers import AutoTokenizer, AutoModelForMaskedLM, AutoModel

In [None]:
# Run on the GPU when CUDA is available; otherwise fall back to the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
print(device)

In [None]:
# Model / tokenization settings. Modify these values to get the best performance.
config = dict(
    num_labels=5,
    hidden_dropout_prob=0.15,
    hidden_size=768,
    max_length=512,
)

# Training-loop settings: batch size, epoch count, learning rate,
# logging / saving intervals and the output location.
training_parameters = dict(
    batch_size=16,
    epochs=10,
    output_folder="/kaggle/working",
    output_file="model.bin",
    learning_rate=2e-5,
    print_after_steps=100,
    save_steps=5000,
)

## Dataset class for preprocessing

In [None]:
class ReviewDataset(Dataset):
    """Torch dataset that tokenizes the ``text`` column and encodes the ``label`` column.

    Each item is the 4-tuple ``(input_ids, attention_mask, token_type_ids, label)``
    of tensors.
    """

    # Attack category name -> integer class id used as the training target.
    ATTACK_LABELS = {
        'Injection': 0,
        'Manipulation': 1,
        'Scanning for Vulnerable Software': 2,
        'HTTP abusion': 3,
        'Fake the Source of Data': 4,
    }

    def __init__(self, df, max_length=512, tokenizer=None):
        """Store the frame and prepare the tokenizer.

        Args:
            df: DataFrame with a ``text`` column and a ``label`` column whose
                values are keys of ``ATTACK_LABELS``.
            max_length: Length every sequence is padded / truncated to.
            tokenizer: Optional ready-made tokenizer to reuse. When omitted,
                the 'jackaduma/secBERT' tokenizer is loaded.
        """
        self.df = df
        self.max_length = max_length
        if tokenizer is None:
            tokenizer = AutoTokenizer.from_pretrained('jackaduma/secBERT')
        self.tokenizer = tokenizer

    @staticmethod
    def _first_chunk(values):
        """Keep only the first chunk when the tokenizer returned several.

        With ``return_overflowing_tokens=True`` a fast tokenizer returns a list
        of chunks (one row per overflow window). Long texts would then produce
        tensors with a different number of rows than short ones, which the
        default ``DataLoader`` collation cannot stack. Slicing to ``[:1]``
        keeps the leading axis, so every item has the same shape. A flat list
        (single sequence) is returned unchanged.
        """
        if len(values) > 0 and isinstance(values[0], (list, tuple)):
            return values[:1]
        return values

    def __getitem__(self, index):
        """Tokenize row ``index`` and return its tensors.

        Returns:
            Tuple ``(input_ids, attention_mask, token_type_ids, label)``.

        Raises:
            KeyError: If the row's label is not in ``ATTACK_LABELS`` or the
                tokenizer output lacks one of the three expected fields.
        """
        row = self.df.iloc[index]
        label = self.ATTACK_LABELS[row["label"]]

        encoded_input = self.tokenizer(
            row["text"],
            add_special_tokens=True,
            max_length=self.max_length,
            padding="max_length",
            return_overflowing_tokens=True,
            truncation=True,
        )

        # Index the fields directly: a missing field now fails with a KeyError
        # naming it, instead of an opaque torch.tensor(None) error.
        input_ids = self._first_chunk(encoded_input["input_ids"])
        attention_mask = self._first_chunk(encoded_input["attention_mask"])
        token_type_ids = self._first_chunk(encoded_input["token_type_ids"])

        return (
            torch.tensor(input_ids),
            torch.tensor(attention_mask),
            torch.tensor(token_type_ids),
            torch.tensor(label),
        )

    def __len__(self):
        """Number of rows in the wrapped frame."""
        return self.df.shape[0]

## Import the dataset, including the source dataset and the target dataset

In [None]:
# Read the CSV from the Kaggle input directory and preview its first rows.
df_train = pd.read_csv(
    '/kaggle/input/srbh2020-v2/dataset_capec_combine (1).csv'
)
df_train.head(5)

In [None]:
# Expose the `category` column under the name `label`, then remove every row
# whose label is 'Normal'.
df_train = df_train.assign(label=df_train['category'])
df_train = df_train.loc[df_train['label'].ne('Normal')]

In [None]:
from sklearn.model_selection import train_test_split

# Stratified 70/30 split of text and label. A fixed random_state makes the
# train/test membership reproducible across kernel restarts.
# NOTE: this cell reads `df_train` and then overwrites it with the training
# split, so re-running it without re-running the loading cells above would
# split the already-split frame again.
X_train, X_test, Y_train, Y_test = train_test_split(
    df_train['text'],
    df_train['label'],
    test_size=0.3,
    stratify=df_train['label'],
    shuffle=True,
    random_state=42,
)
df_train = pd.concat([X_train, Y_train], axis=1)
df_test = pd.concat([X_test, Y_test], axis=1)

# Trim the training frame to a whole number of batches (positional slice).
df_train = df_train.iloc[
    : len(df_train) // training_parameters['batch_size'] * training_parameters['batch_size']
]

source_dataset = ReviewDataset(df_train)
source_dataloader = DataLoader(
    dataset=source_dataset,
    batch_size=training_parameters["batch_size"],
    shuffle=True,
    num_workers=2,
)

In [None]:
# Number of training rows per attack label.
df_train.loc[:, 'label'].value_counts()

## Create model

In [None]:
import torch
import torch.nn as nn
import torch.optim as optim

class SecBertModel(nn.Module):
    """Pretrained 'jackaduma/secBert' encoder with a projection layer and an attack classifier.

    The encoder's pooled output goes through dropout, is projected to half of
    ``config["hidden_size"]`` and is then classified into
    ``config["num_labels"]`` classes (log-probabilities).
    """

    def __init__(self, num_frozen_layers=6):
        """Build the network and freeze the lower part of the encoder.

        Args:
            num_frozen_layers: Number of leading encoder layers whose
                parameters are frozen together with the embeddings.
        """
        super().__init__()

        num_labels = config["num_labels"]
        self.bert = AutoModel.from_pretrained('jackaduma/secBert')  # pretrained encoder
        self.dropout = nn.Dropout(config["hidden_dropout_prob"])

        # Reduce the dimensionality of the pooled vector (hidden_size -> hidden_size // 2).
        self.prj = nn.Linear(config["hidden_size"], config["hidden_size"] // 2)
        self.attack_classifier = nn.Sequential(
            nn.Linear(config["hidden_size"] // 2, num_labels),
            nn.LogSoftmax(dim=1),
        )

        # Freeze only the embeddings and the first `num_frozen_layers` encoder
        # layers; no other parameter is touched here.
        frozen_modules = [self.bert.embeddings, self.bert.encoder.layer[:num_frozen_layers]]
        for module in frozen_modules:
            for param in module.parameters():
                param.requires_grad = False

    def forward(
          self,
          input_ids=None,
          attention_mask=None,
          token_type_ids=None,
          labels=None
          ):
        """Run the encoder and both heads.

        Args:
            input_ids, attention_mask, token_type_ids: Forwarded to the encoder.
            labels: Accepted so callers can pass a whole batch dict with
                ``model(**inputs)``; it is not used here.

        Returns:
            Tuple ``(attack_pred, pooled_output_prj)``: the class
            log-probabilities and the projected pooled features.
        """
        outputs = self.bert(
            input_ids,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
        )

        pooled_output = self.dropout(outputs.pooler_output)
        pooled_output_prj = self.prj(pooled_output)
        attack_pred = self.attack_classifier(pooled_output_prj)

        # The prediction already lives on the device of the module's weights,
        # so no transfer through a notebook-global `device` is needed.
        return attack_pred, pooled_output_prj

In [None]:
def compute_accuracy(logits, labels):
    """Compute batch accuracy and a per-class count of the predictions.

    Args:
        logits: 2-D tensor of per-class scores, one row per sample.
        labels: 1-D tensor of true class ids, one per row of ``logits``.

    Returns:
        Tuple ``(acc, predicted_labels_dict)``: ``acc`` is a scalar float
        tensor with the fraction of correct predictions, and
        ``predicted_labels_dict`` maps each class id to how many samples were
        predicted as that class. The dict always contains the keys 0..6 and
        grows to cover every column of ``logits`` when there are more than 7.
    """
    # Callers index the returned dict with range(7), so never return fewer keys.
    min_tracked_classes = 7

    predicted_label = logits.argmax(dim=1)

    # Count all predictions in one vectorised call on the CPU instead of
    # reading every element with .item() (one device sync per sample).
    num_bins = max(min_tracked_classes, logits.shape[1])
    counts = torch.bincount(predicted_label.cpu(), minlength=num_bins).tolist()
    predicted_labels_dict = dict(enumerate(counts))

    acc = (predicted_label == labels).float().mean()

    return acc, predicted_labels_dict

In [None]:
from sklearn.metrics import precision_score, recall_score, confusion_matrix, classification_report,accuracy_score, f1_score
import time

def evaluate(model, df=None):
    """Evaluate ``model`` on a labelled frame and print classification metrics.

    Prints the classification report, macro F1 / recall / precision, accuracy,
    the confusion matrix, the per-class prediction counts and the elapsed time.

    Args:
        model: Module returning ``(attack_pred, features)`` when called with
            ``input_ids``, ``attention_mask``, ``token_type_ids`` and ``labels``.
        df: Frame to evaluate on, in the format ``ReviewDataset`` expects.
            Defaults to the notebook-level ``df_test``.

    Returns:
        None. The model is left in eval mode.
    """
    if df is None:
        df = df_test

    start_time = time.time()

    # Start from the same seven zeroed counters that are always printed.
    predicted_labels_dict = {class_id: 0 for class_id in range(7)}
    true_labels = []
    predicted_label = []

    model.eval()
    dataset = ReviewDataset(df)
    # None of the metrics below depends on sample order, so the loader is not shuffled.
    dataloader = DataLoader(
        dataset=dataset,
        batch_size=training_parameters["batch_size"],
        shuffle=False,
        num_workers=2,
    )

    with torch.no_grad():
        for input_ids, attention_mask, token_type_ids, labels in dataloader:
            inputs = {
                "input_ids": input_ids.squeeze(axis=1),
                "attention_mask": attention_mask.squeeze(axis=1),
                "token_type_ids": token_type_ids.squeeze(axis=1),
                "labels": labels,
            }
            inputs = {key: value.to(device) for key, value in inputs.items()}

            attack_pred, _ = model(**inputs)
            true_labels.extend(inputs['labels'].cpu().numpy())
            predicted_label.extend(attack_pred.max(dim=1)[1].cpu().numpy())

            # Merge whatever class ids the helper reports, so no fixed class
            # count is assumed here.
            _, batch_counts = compute_accuracy(attack_pred, inputs["labels"])
            for class_id, count in batch_counts.items():
                predicted_labels_dict[class_id] = predicted_labels_dict.get(class_id, 0) + count

    score = f1_score(true_labels, predicted_label, average="macro")
    precision = precision_score(true_labels, predicted_label, average="macro")
    recall = recall_score(true_labels, predicted_label, average="macro")
    report = classification_report(true_labels, predicted_label, digits=4)
    acc = accuracy_score(true_labels, predicted_label)

    print ('\n clasification report:\n', report)
    print ('F1 score:', score)
    print ('Recall:', recall)
    print ('Precision:', precision)
    print ('Acc:', acc)
    print('Confusion Matrix: \n',confusion_matrix(true_labels, predicted_label))
    print(predicted_labels_dict)
    print("Testing time:", time.time()-start_time)

## Training

In [None]:
# Learning rate and epoch count, read from the shared training settings.
lr, n_epochs = training_parameters["learning_rate"], training_parameters["epochs"]

In [None]:
%%time

import time
lr = training_parameters["learning_rate"]
n_epochs = training_parameters["epochs"]

model = DomainAdaptationModel()
model.to(device)

optimizer = optim.Adam(model.parameters(), lr)

loss_fn_attack_classifier = torch.nn.NLLLoss()

start_time = time.time()

max_batches = len(source_dataloader)
for epoch_idx in range(2):
    source_iterator = iter(source_dataloader)
    for batch_idx in range(max_batches):

        model.train()
        if(batch_idx%training_parameters["print_after_steps"] == 0 ):
            print("Training Step:", batch_idx)
        optimizer.zero_grad()

        # Souce dataset training update
        input_ids, attention_mask, token_type_ids, labels = next(source_iterator)
        inputs = {
            "input_ids": input_ids.squeeze(axis=1),
            "attention_mask": attention_mask.squeeze(axis=1),
            "token_type_ids" : token_type_ids.squeeze(axis=1),
            "labels" : labels,
        }

        for k, v in inputs.items():
            inputs[k] = v.to(device)

        attack_pred, pooled_output_prj_source = model(**inputs)
        loss_s_attack = loss_fn_attack_classifier(attack_pred, inputs["labels"])

        loss = loss_s_attack 
        loss.backward()
        optimizer.step()
    print("Epoch: " + str(epoch_idx))
print("Training time:", time.time()-start_time)

In [None]:
# Print the evaluation metrics for the trained model.
evaluate(model=model)