In [1]:
import pandas as pd
import numpy as np
from pytorch_pretrained_bert import BertTokenizer
import random
import torch
from tqdm.notebook import tqdm
from transformers import (
    AutoConfig,
    AutoModelForMaskedLM,
)
from torch import nn
import torch.nn.functional as F
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from torch.utils.data import TensorDataset, DataLoader, SequentialSampler

In [2]:
def dataPreprocessFromCSV(filename, input_ids, input_types, input_masks, label, task_label, task='Phish'):
    pad_size = 200
    tokenizer = BertTokenizer("path_to_the_vocab")  # Initialize the tokenizer
    data = pd.read_csv(filename, encoding='utf-8')
    for i, row in tqdm(data.iterrows(), total=len(data)):
        x1 = row['url']  # Replace with the column name in your CSV file where the text data is located
        x1 = tokenizer.tokenize(x1)
        tokens = ["[CLS]"] + x1 + ["[SEP]"]
        # Get input_id, seg_id, att_mask
        ids = tokenizer.convert_tokens_to_ids(tokens)
        types = [0] * (len(ids))
        masks = [1] * len(ids)
        # Pad if short, truncate if long
        if len(ids) < pad_size:
            types = types + [1] * (pad_size - len(ids))  # Set segment to 1 for the masked part
            masks = masks + [0] * (pad_size - len(ids))
            ids = ids + [0] * (pad_size - len(ids))
        else:
            types = types[:pad_size]
            masks = masks[:pad_size]
            ids = ids[:pad_size]
        input_ids.append(ids)
        input_types.append(types)
        input_masks.append(masks)
        assert len(ids) == len(masks) == len(types) == pad_size
        if task == 'Phish':
            y = row['label']
            if y == 'malicious':
                label.append([1])
                task_label.append([0])
            elif y == 'benign':
                label.append([0])
                task_label.append([0])
        elif task == 'Multi':
            y = row['label']
            if y == 'Games':
                label.append([0])
                task_label.append([1])
            elif y == 'Health':
                label.append([1])
                task_label.append([1])
            elif y == 'Kids':
                label.append([2])
                task_label.append([1])
            elif y == 'Reference':
                label.append([3])
                task_label.append([1])
            elif y == 'Shopping':
                label.append([4])
                task_label.append([1])
        elif task == 'Advertise':
            y = row['label']
            if y == 'white':
                label.append([0])
                task_label.append([2])
            elif y == 'advertise':
                label.append([1])
                task_label.append([2])

In [3]:
input_ids = {"Phish":[], "Multi":[], "Advertise":[]}  # input char ids
input_types = {"Phish":[], "Multi":[], "Advertise":[]}  # segment ids
input_masks = {"Phish":[], "Multi":[], "Advertise":[]}  # attention mask
label = {"Phish":[], "Multi":[], "Advertise":[]}
task_label = {"Phish":[], "Multi":[], "Advertise":[]}
dataset_root = {"Phish":"path_to_the_dataset", "Multi":"path_to_the_dataset", "Advertise":"path_to_the_dataset"}

dataPreprocessFromCSV(dataset_root["Phish"], input_ids["Phish"], input_types["Phish"], input_masks["Phish"], label["Phish"], task_label["Phish"], 'Phish')
dataPreprocessFromCSV(dataset_root["Multi"], input_ids["Multi"], input_types["Multi"], input_masks["Multi"], label["Multi"], task_label["Multi"], 'Multi')
dataPreprocessFromCSV(dataset_root["Advertise"], input_ids["Advertise"], input_types["Advertise"], input_masks["Advertise"], label["Advertise"], task_label["Advertise"], 'Advertise')

  0%|          | 0/639999 [00:00<?, ?it/s]

  0%|          | 0/316273 [00:00<?, ?it/s]

  0%|          | 0/47060 [00:00<?, ?it/s]

In [4]:
DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")
DEVICE

device(type='cuda')

In [5]:
def spiltDatast_bert(input_ids, input_types, input_masks, label, task_label):
    # Randomly shuffle the indices
    random_order = list(range(len(input_ids)))
    np.random.seed(2024)  # Fix the seed
    np.random.shuffle(random_order)
    print(random_order[:10])

    # Split the dataset into 80% training and 20% testing
    input_ids_train = np.array([input_ids[i] for i in random_order[:int(len(input_ids) * 0.8)]])
    input_types_train = np.array([input_types[i] for i in random_order[:int(len(input_ids) * 0.8)]])
    input_masks_train = np.array([input_masks[i] for i in random_order[:int(len(input_ids) * 0.8)]])
    y_train = np.array([label[i] for i in random_order[:int(len(input_ids) * 0.8)]])
    task_train = np.array([task_label[i] for i in random_order[:int(len(input_ids) * 0.8)]])
    print("input_ids_train.shape:" + str(input_ids_train.shape))
    print("input_types_train.shape:" + str(input_types_train.shape))
    print("input_masks_train.shape:" + str(input_masks_train.shape))
    print("y_train.shape:" + str(y_train.shape))
    print("task_train.shape:" + str(task_train.shape))

    input_ids_test = np.array([input_ids[i] for i in random_order[int(len(input_ids) * 0.8):]])
    input_types_test = np.array([input_types[i] for i in random_order[int(len(input_ids) * 0.8):]])
    input_masks_test = np.array([input_masks[i] for i in random_order[int(len(input_ids) * 0.8):]])
    y_test = np.array([label[i] for i in random_order[int(len(input_ids) * 0.8):]])
    task_test = np.array([task_label[i] for i in random_order[int(len(input_ids) * 0.8):]])
    print("input_ids_test.shape:" + str(input_ids_test.shape))
    print("input_types_test.shape:" + str(input_types_test.shape))
    print("input_masks_test.shape:" + str(input_masks_test.shape))
    print("y_test.shape:" + str(y_test.shape))
    print("task_test.shape:" + str(task_test.shape))

    return input_ids_train, input_types_train, input_masks_train, y_train, task_train, input_ids_test, input_types_test, input_masks_test, y_test, task_test

In [6]:
input_ids_train = {"Phish":[], "Multi":[], "Advertise":[]}
input_types_train = {"Phish":[], "Multi":[], "Advertise":[]}
input_masks_train = {"Phish":[], "Multi":[], "Advertise":[]}
y_train = {"Phish":[], "Multi":[], "Advertise":[]}
task_train = {"Phish":[], "Multi":[], "Advertise":[]}
input_ids_val = {"Phish":[], "Multi":[], "Advertise":[]}
input_types_val = {"Phish":[], "Multi":[], "Advertise":[]}
input_masks_val = {"Phish":[], "Multi":[], "Advertise":[]}
y_val = {"Phish":[], "Multi":[], "Advertise":[]}
task_val = {"Phish":[], "Multi":[], "Advertise":[]}

input_ids_train["Phish"], input_types_train["Phish"], input_masks_train["Phish"], y_train["Phish"], task_train["Phish"], input_ids_val["Phish"], input_types_val["Phish"], input_masks_val["Phish"], y_val["Phish"], task_val["Phish"] = spiltDatast_bert(
    input_ids["Phish"], input_types["Phish"], input_masks["Phish"], label["Phish"], task_label["Phish"])

input_ids_train["Multi"], input_types_train["Multi"], input_masks_train["Multi"], y_train["Multi"], task_train["Multi"], input_ids_val["Multi"], input_types_val["Multi"], input_masks_val["Multi"], y_val["Multi"], task_val["Multi"] = spiltDatast_bert(
    input_ids["Multi"], input_types["Multi"], input_masks["Multi"], label["Multi"], task_label["Multi"])

input_ids_train["Advertise"], input_types_train["Advertise"], input_masks_train["Advertise"], y_train["Advertise"], task_train["Advertise"], input_ids_val["Advertise"], input_types_val["Advertise"], input_masks_val["Advertise"], y_val["Advertise"], task_val["Advertise"] = spiltDatast_bert(
    input_ids["Advertise"], input_types["Advertise"], input_masks["Advertise"], label["Advertise"], task_label["Advertise"])

[148401, 209465, 456, 205277, 265839, 410052, 504627, 261678, 179483, 12326]
input_ids_train.shape:(511999, 200)
input_types_train.shape:(511999, 200)
input_masks_train.shape:(511999, 200)
y_train.shape:(511999, 1)
task_train.shape:(511999, 1)
input_ids_test.shape:(128000, 200)
input_types_test.shape:(128000, 200)
input_masks_test.shape:(128000, 200)
y_test.shape:(128000, 1)
task_test.shape:(128000, 1)
[142138, 126549, 271444, 181968, 282010, 101878, 31131, 165811, 70667, 191902]
input_ids_train.shape:(253018, 200)
input_types_train.shape:(253018, 200)
input_masks_train.shape:(253018, 200)
y_train.shape:(253018, 1)
task_train.shape:(253018, 1)
input_ids_test.shape:(63255, 200)
input_types_test.shape:(63255, 200)
input_masks_test.shape:(63255, 200)
y_test.shape:(63255, 1)
task_test.shape:(63255, 1)
[41297, 28631, 7067, 32129, 43874, 13286, 7760, 39658, 45347, 930]
input_ids_train.shape:(37648, 200)
input_types_train.shape:(37648, 200)
input_masks_train.shape:(37648, 200)
y_train.shape:(

In [7]:
for i in input_ids_train:
    if i == "Advertise":
        input_ids_train[i] = input_ids_train[i][:len(input_ids_train["Advertise"])]
    else:
        input_ids_train[i] = input_ids_train[i][:len(input_ids_train["Advertise"]) * 3]
for i in input_types_train:
    if i == "Advertise":
        input_types_train[i] = input_types_train[i][:len(input_ids_train["Advertise"])]
    else:
        input_types_train[i] = input_types_train[i][:len(input_ids_train["Advertise"]) * 3]
for i in input_masks_train:
    if i == "Advertise":
        input_masks_train[i] = input_masks_train[i][:len(input_ids_train["Advertise"])]
    else:
        input_masks_train[i] = input_masks_train[i][:len(input_ids_train["Advertise"]) * 3]
for i in y_train:
    if i == "Advertise":
        y_train[i] = y_train[i][:len(input_ids_train["Advertise"])]
    else:
        y_train[i] = y_train[i][:len(input_ids_train["Advertise"]) * 3]
for i in task_train:
    if i == "Advertise":
        task_train[i] = task_train[i][:len(input_ids_train["Advertise"])]
    else:
        task_train[i] = task_train[i][:len(input_ids_train["Advertise"]) * 3]
for i in input_ids_val:
    if i == "Advertise":
        input_ids_val[i] = input_ids_val[i][:len(input_ids_val["Advertise"])]
    else:
        input_ids_val[i] = input_ids_val[i][:len(input_ids_val["Advertise"]) * 3]
for i in input_types_val:
    if i == "Advertise":
        input_types_val[i] = input_types_val[i][:len(input_ids_val["Advertise"])]
    else:
        input_types_val[i] = input_types_val[i][:len(input_ids_val["Advertise"]) * 3]
for i in input_masks_val:
    if i == "Advertise":
        input_masks_val[i] = input_masks_val[i][:len(input_ids_val["Advertise"])]
    else:
        input_masks_val[i] = input_masks_val[i][:len(input_ids_val["Advertise"]) * 3]
for i in y_val:
    if i == "Advertise":
        y_val[i] = y_val[i][:len(input_ids_val["Advertise"])]
    else:
        y_val[i] = y_val[i][:len(input_ids_val["Advertise"]) * 3]
for i in task_val:
    if i == "Advertise":
        task_val[i] = task_val[i][:len(input_ids_val["Advertise"])]
    else:
        task_val[i] = task_val[i][:len(input_ids_val["Advertise"]) * 3]

In [8]:
BATCH_SIZE = 64

def reshapeArray(input_ids:dict, input_types:dict, input_masks:dict, y:dict, task:dict):
    index = [int(len(input_ids["Phish"])/BATCH_SIZE) * BATCH_SIZE, int(len(input_ids["Multi"])/BATCH_SIZE) * BATCH_SIZE, int(len(input_ids["Advertise"])/BATCH_SIZE) * BATCH_SIZE]
    input_ids["Phish"] = input_ids["Phish"][:index[0]]
    input_ids["Multi"] = input_ids["Multi"][:index[1]]
    input_ids["Advertise"] = input_ids["Advertise"][:index[2]]

    input_types["Phish"] = input_types["Phish"][:index[0]]
    input_types["Multi"] = input_types["Multi"][:index[1]]
    input_types["Advertise"] = input_types["Advertise"][:index[2]]

    input_masks["Phish"] = input_masks["Phish"][:index[0]]
    input_masks["Multi"] = input_masks["Multi"][:index[1]]
    input_masks["Advertise"] = input_masks["Advertise"][:index[2]]

    y["Phish"] = y["Phish"][:index[0]]
    y["Multi"] = y["Multi"][:index[1]]
    y["Advertise"] = y["Advertise"][:index[2]]

    task["Phish"] = task["Phish"][:index[0]]
    task["Multi"] = task["Multi"][:index[1]]
    task["Advertise"] = task["Advertise"][:index[2]]

    return input_ids, input_types, input_masks, y, task

In [9]:
input_ids_train, input_types_train, input_masks_train, y_train, task_train = reshapeArray(input_ids_train, input_types_train, input_masks_train, y_train, task_train)
input_ids_val, input_types_val, input_masks_val, y_val, task_val = reshapeArray(input_ids_val, input_types_val, input_masks_val, y_val, task_val)

In [10]:
print(input_ids_train["Multi"].shape)
print(y_train["Advertise"].shape)
print(input_ids_train["Phish"].shape)

(112896, 200)
(37632, 1)
(112896, 200)


In [11]:
BATCH_NUM_TASK_TRAIN = int(input_ids_train["Multi"].shape[0]/BATCH_SIZE)
BATCH_NUM_TASK_VAL = int(input_ids_val["Multi"].shape[0]/BATCH_SIZE)
BATCH_NUM_ADV_TRAIN = int(input_ids_train["Advertise"].shape[0]/BATCH_SIZE)
BATCH_NUM_ADV_VAL = int(input_ids_val["Advertise"].shape[0]/BATCH_SIZE)
print(BATCH_NUM_TASK_TRAIN)
print(BATCH_NUM_TASK_VAL)

1764
441


In [12]:
for i in input_ids_train:
    if i == "Advertise":
        input_ids_train[i] = input_ids_train[i].reshape((BATCH_NUM_ADV_TRAIN, BATCH_SIZE, 200))
    else:
        input_ids_train[i] = input_ids_train[i].reshape((BATCH_NUM_TASK_TRAIN, BATCH_SIZE, 200))
for i in input_types_train:
    if i == "Advertise":
        input_types_train[i] = input_types_train[i].reshape((BATCH_NUM_ADV_TRAIN, BATCH_SIZE, 200))
    else:
        input_types_train[i] = input_types_train[i].reshape((BATCH_NUM_TASK_TRAIN, BATCH_SIZE, 200))
for i in input_masks_train:
    if i == "Advertise":
        input_masks_train[i] = input_masks_train[i].reshape((BATCH_NUM_ADV_TRAIN, BATCH_SIZE, 200))
    else:
        input_masks_train[i] = input_masks_train[i].reshape((BATCH_NUM_TASK_TRAIN, BATCH_SIZE, 200))
for i in y_train:
    if i == "Advertise":
        y_train[i] = y_train[i].reshape((BATCH_NUM_ADV_TRAIN, BATCH_SIZE, 1))
    else:
        y_train[i] = y_train[i].reshape((BATCH_NUM_TASK_TRAIN, BATCH_SIZE, 1))
for i in task_train:
    if i == "Advertise":
        task_train[i] = task_train[i].reshape((BATCH_NUM_ADV_TRAIN, BATCH_SIZE, 1))
    else:
        task_train[i] = task_train[i].reshape((BATCH_NUM_TASK_TRAIN, BATCH_SIZE, 1))
for i in input_ids_val:
    if i == "Advertise":
        input_ids_val[i] = input_ids_val[i].reshape((BATCH_NUM_ADV_VAL, BATCH_SIZE, 200))
    else:
        input_ids_val[i] = input_ids_val[i].reshape((BATCH_NUM_TASK_VAL, BATCH_SIZE, 200))
for i in input_types_val:
    if i == "Advertise":
        input_types_val[i] = input_types_val[i].reshape((BATCH_NUM_ADV_VAL, BATCH_SIZE, 200))
    else:
        input_types_val[i] = input_types_val[i].reshape((BATCH_NUM_TASK_VAL, BATCH_SIZE, 200))
for i in input_masks_val:
    if i == "Advertise":
        input_masks_val[i] = input_masks_val[i].reshape((BATCH_NUM_ADV_VAL, BATCH_SIZE, 200))
    else:
        input_masks_val[i] = input_masks_val[i].reshape((BATCH_NUM_TASK_VAL, BATCH_SIZE, 200))
for i in y_val:
    if i == "Advertise":
        y_val[i] = y_val[i].reshape((BATCH_NUM_ADV_VAL, BATCH_SIZE, 1))
    else:
        y_val[i] = y_val[i].reshape((BATCH_NUM_TASK_VAL, BATCH_SIZE, 1))
for i in task_val:
    if i == "Advertise":
        task_val[i] = task_val[i].reshape((BATCH_NUM_ADV_VAL, BATCH_SIZE, 1))
    else:
        task_val[i] = task_val[i].reshape((BATCH_NUM_TASK_VAL, BATCH_SIZE, 1))

In [13]:
for i in input_ids_train:
    print(input_ids_train[i].shape)

(1764, 64, 200)
(1764, 64, 200)
(588, 64, 200)


In [14]:
input_ids_mtl_train = np.concatenate(tuple([input_ids_train[i] for i in input_ids_train]),axis=0)
input_types_mtl_train = np.concatenate(tuple([input_types_train[i] for i in input_types_train]), axis=0)
input_masks_mtl_train = np.concatenate(tuple([input_masks_train[i] for i in input_masks_train]), axis=0)
y_mtl_train = np.concatenate(tuple([y_train[i] for i in y_train]), axis=0)
task_mtl_train = np.concatenate(tuple([task_train[i] for i in task_train]), axis=0)
input_ids_mtl_val = np.concatenate(tuple([input_ids_val[i] for i in input_ids_val]), axis=0)
input_types_mtl_val = np.concatenate(tuple([input_types_val[i] for i in input_types_val]), axis=0)
input_masks_mtl_val = np.concatenate(tuple([input_masks_val[i] for i in input_masks_val]), axis=0)
y_mtl_val = np.concatenate(tuple([y_val[i] for i in y_val]), axis=0)
task_mtl_val = np.concatenate(tuple([task_val[i] for i in task_val]), axis=0)

In [15]:
np.random.seed(2024)  # Fix the seed
random_order = np.arange(len(input_ids_mtl_train))
np.random.shuffle(random_order)
print(random_order[:10])
input_ids_mtl_train = input_ids_mtl_train[random_order]
input_types_mtl_train = input_types_mtl_train[random_order]
input_masks_mtl_train = input_masks_mtl_train[random_order]
y_mtl_train = y_mtl_train[random_order]
task_mtl_train = task_mtl_train[random_order]

random_order = np.arange(len(input_ids_mtl_val))
np.random.shuffle(random_order)
print(random_order[:10])
input_ids_mtl_val = input_ids_mtl_val[random_order]
input_types_mtl_val = input_types_mtl_val[random_order]
input_masks_mtl_val = input_masks_mtl_val[random_order]
y_mtl_val = y_mtl_val[random_order]
task_mtl_val = task_mtl_val[random_order]

print(input_ids_mtl_train.shape)

[1514 2197 1192 2495   69 2323 1519  502 4093 3516]
[803 666 428  62 525 489 687 329 173 186]
(4116, 64, 200)


In [17]:
input_ids_mtl_train = input_ids_mtl_train.reshape(BATCH_NUM_TASK_TRAIN * BATCH_SIZE * 2 + BATCH_NUM_ADV_TRAIN * BATCH_SIZE, 200)
input_types_mtl_train = input_types_mtl_train.reshape(BATCH_NUM_TASK_TRAIN * BATCH_SIZE * 2 + BATCH_NUM_ADV_TRAIN * BATCH_SIZE, 200)
input_masks_mtl_train = input_masks_mtl_train.reshape(BATCH_NUM_TASK_TRAIN * BATCH_SIZE * 2 + BATCH_NUM_ADV_TRAIN * BATCH_SIZE, 200)
y_mtl_train = y_mtl_train.reshape(BATCH_NUM_TASK_TRAIN * BATCH_SIZE * 2 + BATCH_NUM_ADV_TRAIN * BATCH_SIZE, 1)
task_mtl_train = task_mtl_train.reshape(BATCH_NUM_TASK_TRAIN * BATCH_SIZE * 2 + BATCH_NUM_ADV_TRAIN * BATCH_SIZE, 1)
input_ids_mtl_val = input_ids_mtl_val.reshape(BATCH_NUM_TASK_VAL * BATCH_SIZE * 2 + BATCH_NUM_ADV_VAL * BATCH_SIZE, 200)
input_types_mtl_val = input_types_mtl_val.reshape(BATCH_NUM_TASK_VAL * BATCH_SIZE * 2 + BATCH_NUM_ADV_VAL * BATCH_SIZE, 200)
input_masks_mtl_val = input_masks_mtl_val.reshape(BATCH_NUM_TASK_VAL * BATCH_SIZE * 2 + BATCH_NUM_ADV_VAL * BATCH_SIZE, 200)
y_mtl_val = y_mtl_val.reshape(BATCH_NUM_TASK_VAL * BATCH_SIZE * 2 + BATCH_NUM_ADV_VAL * BATCH_SIZE, 1)
task_mtl_val = task_mtl_val.reshape(BATCH_NUM_TASK_VAL * BATCH_SIZE * 2 + BATCH_NUM_ADV_VAL * BATCH_SIZE, 1)

In [18]:
print(input_ids_mtl_train.shape)
print(task_mtl_val.shape)

(263424, 200)
(65856, 1)


In [19]:
train_data = TensorDataset(torch.tensor(input_ids_mtl_train).to(DEVICE),
                           torch.tensor(input_types_mtl_train).to(DEVICE),
                           torch.tensor(input_masks_mtl_train).to(DEVICE),
                           torch.tensor(y_mtl_train).to(DEVICE),
                           torch.tensor(task_mtl_train).to(DEVICE))
train_sampler = SequentialSampler(train_data)
train_loader = DataLoader(train_data, sampler=train_sampler, batch_size=BATCH_SIZE)

val_data = TensorDataset(torch.tensor(input_ids_mtl_val).to(DEVICE),
                         torch.tensor(input_types_mtl_val).to(DEVICE),
                         torch.tensor(input_masks_mtl_val).to(DEVICE),
                         torch.tensor(y_mtl_val).to(DEVICE),
                         torch.tensor(task_mtl_val).to(DEVICE))
val_sampler = SequentialSampler(val_data)
val_loader = DataLoader(val_data, sampler=val_sampler, batch_size=BATCH_SIZE)

In [20]:
config_kwargs = {
    "cache_dir": None,
    "revision": 'main',
    "use_auth_token": None,
    "hidden_dropout_prob": 0.1,
    "vocab_size": 5000,
}

config = AutoConfig.from_pretrained("path_to_the_config", **config_kwargs)
print(config)

bert_model = AutoModelForMaskedLM.from_config(
    config=config,
)
bert_model.resize_token_embeddings(config_kwargs["vocab_size"])
print(bert_model)

BertConfig {
  "_name_or_path": "./bert_model/bert_config/",
  "architectures": [
    "BertForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "gradient_checkpointing": false,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 0,
  "position_embedding_type": "absolute",
  "transformers_version": "4.36.2",
  "type_vocab_size": 2,
  "use_cache": true,
  "vocab_size": 5000
}

BertForMaskedLM(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(5000, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=F

In [21]:
bert_dict = torch.load("path_to_the_model", map_location='cpu')
bert_model.load_state_dict(bert_dict)

<All keys matched successfully>

In [22]:
class BertForSequenceClassification(nn.Module):
    def __init__(self, bert):
        super(BertForSequenceClassification, self).__init__()
        self.bert = bert
        for name, param in self.bert.named_parameters():
            param.requires_grad = True
        self.dropout = nn.Dropout(p=0.1)
        self.classifier_0 = nn.Linear(768, 2)
        self.classifier_1 = nn.Linear(768, 5)
        self.classifier_2 = nn.Linear(768, 2)

    def forward(self, x):
        context = x[0]
        types = x[1]
        mask = x[2]
        task = x[3]
        outputs = self.bert(context, attention_mask=mask, token_type_ids=types, output_hidden_states=True)
        hidden_states = outputs.hidden_states[-1][:,0,:]
        out = self.dropout(hidden_states)
        if task[0][0] == 0 and torch.all(task == task[0]):
            out = self.classifier_0(out)
        elif task[0][0] == 1 and torch.all(task == task[0]):
            out = self.classifier_1(out)
        elif task[0][0] == 2 and torch.all(task == task[0]):
            out = self.classifier_2(out)
        else:
            print("something wrong with the data")
        return out

In [23]:
model = BertForSequenceClassification(bert_model)
model.bert.cls = nn.Sequential()
model.to(DEVICE)

BertForSequenceClassification(
  (bert): BertForMaskedLM(
    (bert): BertModel(
      (embeddings): BertEmbeddings(
        (word_embeddings): Embedding(5000, 768, padding_idx=0)
        (position_embeddings): Embedding(512, 768)
        (token_type_embeddings): Embedding(2, 768)
        (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
        (dropout): Dropout(p=0.1, inplace=False)
      )
      (encoder): BertEncoder(
        (layer): ModuleList(
          (0-11): 12 x BertLayer(
            (attention): BertAttention(
              (self): BertSelfAttention(
                (query): Linear(in_features=768, out_features=768, bias=True)
                (key): Linear(in_features=768, out_features=768, bias=True)
                (value): Linear(in_features=768, out_features=768, bias=True)
                (dropout): Dropout(p=0.1, inplace=False)
              )
              (output): BertSelfOutput(
                (dense): Linear(in_features=768, out_features=768, 

In [24]:
optimizer = torch.optim.AdamW(model.parameters(), lr=2e-5, weight_decay=1e-4)

In [25]:
def train(model, device, train_loader, optimizer, epoch):
    model.train()
    for batch_idx, (x1, x2, x3, y, task) in enumerate(train_loader):
        x1, x2, x3, y, task = x1.to(device), x2.to(device), x3.to(device), y.to(device), task.to(device)
        assert torch.all(task == task[0])
        y_pred = model([x1, x2, x3, task])
        model.zero_grad()

        loss = F.cross_entropy(y_pred, y.squeeze())
        loss.backward()
        optimizer.step()

        if task[0][0] == 0 and torch.all(task == task[0]):
            task_name = "Phish"
        elif task[0][0] == 1 and torch.all(task == task[0]):
            task_name = "Multi"
        elif task[0][0] == 2 and torch.all(task == task[0]):
            task_name = "Advertise"
        else:
            task_name = "None"

        if (batch_idx + 1) % 100 == 0:
            print('Train Epoch: {} [{}/{} ({:.2f}%)]\t Loss: {:.6f} Task: {}'.format(epoch, (batch_idx + 1) * len(x1),
                                                                            len(train_loader.dataset),
                                                                            100. * batch_idx / len(train_loader),
                                                                            loss.item(), task_name))

In [26]:
def validation(model, device, test_loader):
    """
    Perform model validation on the test data.

    :param model: The model to be validated.
    :param device: The device to run validation on (e.g., CPU or GPU).
    :param test_loader: The data loader for test data.
    :return: A tuple containing accuracy, precision, recall, and F1 score.
    """
    model.eval()
    test_loss = [0.0, 0.0, 0.0]
    test_len = [0, 0, 0]
    task_name = ["Phish", "Multi", "Advertise"]
    y_true = {"Phish":[], "Multi":[], "Advertise":[]}
    y_pred = {"Phish":[], "Multi":[], "Advertise":[]}

    for batch_idx, (x1, x2, x3, y, task) in enumerate(test_loader):
        x1, x2, x3, y, task = x1.to(device), x2.to(device), x3.to(device), y.to(device), task.to(device)
        assert torch.all(task == task[0])

        with torch.no_grad():
            y_ = model([x1, x2, x3, task])

        if task[0][0] == 0 and torch.all(task == task[0]):
            test_loss[0] += F.cross_entropy(y_, y.squeeze()).item()
            test_len[0] += 1
        if task[0][0] == 1 and torch.all(task == task[0]):
            test_loss[1] += F.cross_entropy(y_, y.squeeze()).item()
            test_len[1] += 1
        if task[0][0] == 2 and torch.all(task == task[0]):
            test_loss[2] += F.cross_entropy(y_, y.squeeze()).item()
            test_len[2] += 1

        pred = y_.max(-1, keepdim=True)[1]  # .max(): 2 outputs, representing the maximum value and its index

        if task[0][0] == 0 and torch.all(task == task[0]):
            y_true["Phish"].extend(y.cpu().numpy())
            y_pred["Phish"].extend(pred.cpu().numpy())
        elif task[0][0] == 1 and torch.all(task == task[0]):
            y_true["Multi"].extend(y.cpu().numpy())
            y_pred["Multi"].extend(pred.cpu().numpy())
        elif task[0][0] == 2 and torch.all(task == task[0]):
            y_true["Advertise"].extend(y.cpu().numpy())
            y_pred["Advertise"].extend(pred.cpu().numpy())

    test_loss[0] /= test_len[0]
    test_loss[1] /= test_len[1]
    test_loss[2] /= test_len[2]

    accuracy = {"Phish":0.0, "Multi":0.0, "Advertise":0.0}
    precision = {"Phish":0.0, "Multi":0.0, "Advertise":0.0}
    recall = {"Phish":0.0, "Multi":0.0, "Advertise":0.0}
    f1 = {"Phish":0.0, "Multi":0.0, "Advertise":0.0}

    for i in accuracy:
        accuracy[i] = accuracy_score(y_true[i], y_pred[i])
    for i in precision:
        precision[i] = precision_score(y_true[i], y_pred[i], average='macro')
    for i in recall:
        recall[i] = recall_score(y_true[i], y_pred[i], average='macro')
    for i in f1:
        f1[i] = f1_score(y_true[i], y_pred[i], average='macro')

    print('Test set: Task: {} Average loss: {:.4f}, Accuracy: {:.2f}%, Precision: {:.2f}%, Recall: {:.2f}%, F1: {:.2f}%'.format(
        task_name[0], test_loss[0], accuracy["Phish"] * 100, precision["Phish"] * 100, recall["Phish"] * 100, f1["Phish"] * 100))

    print('Test set: Task: {} Average loss: {:.4f}, Accuracy: {:.2f}%, Precision: {:.2f}%, Recall: {:.2f}%, F1: {:.2f}%'.format(
        task_name[1], test_loss[1], accuracy["Multi"] * 100, precision["Multi"] * 100, recall["Multi"] * 100, f1["Multi"] * 100))

    print('Test set: Task: {} Average loss: {:.4f}, Accuracy: {:.2f}%, Precision: {:.2f}%, Recall: {:.2f}%, F1: {:.2f}%'.format(
        task_name[2], test_loss[2], accuracy["Advertise"] * 100, precision["Advertise"] * 100, recall["Advertise"] * 100, f1["Phish"] * 100))

    return accuracy, precision, recall, f1

In [27]:
torch.cuda.empty_cache()

In [None]:
best_acc = {"Phish":0.0, "Multi":0.0, "Advertise":0.0}
NUM_EPOCHS = 20
for epoch in range(1, NUM_EPOCHS + 1):
    train(model, DEVICE, train_loader, optimizer, epoch)
    acc, precision, recall, f1 = validation(model, DEVICE, val_loader)
    if best_acc["Phish"] < acc["Phish"]:
        task_name = "Phish"
        PATH = '/hy-tmp/modelx_MTL_BERT_{}.pth'.format(task_name)
        best_acc["Phish"] = acc["Phish"]
        torch.save(model.state_dict(), PATH)
    if best_acc["Multi"] < acc["Multi"]:
        task_name = "Multi"
        PATH = '/hy-tmp/modelx_MTL_BERT_{}.pth'.format(task_name)
        best_acc["Multi"] = acc["Multi"]
        torch.save(model.state_dict(), PATH)
    if best_acc["Advertise"] < acc["Advertise"]:
        task_name = "Advertise"
        PATH = '/hy-tmp/modelx_MTL_BERT_{}.pth'.format(task_name)
        best_acc["Advertise"] = acc["Advertise"]
        torch.save(model.state_dict(), PATH)
    print("Phish: acc is: {:.4f}, best acc is {:.4f}\n".format(acc["Phish"], best_acc["Phish"]))
    print("Multi: acc is: {:.4f}, best acc is {:.4f}\n".format(acc["Multi"], best_acc["Multi"]))
    print("Advertise: acc is: {:.4f}, best acc is {:.4f}\n".format(acc["Advertise"], best_acc["Advertise"]))

Test set: Task: Phish Average loss: 0.1811, Accuracy: 92.98%, Precision: 93.03%, Recall: 92.97%, F1: 92.97%
Test set: Task: Multi Average loss: 1.0007, Accuracy: 63.02%, Precision: 60.04%, Recall: 58.17%, F1: 58.11%
Test set: Task: Advertise Average loss: 0.0355, Accuracy: 98.94%, Precision: 98.94%, Recall: 98.94%, F1: 92.97%
Phish: acc is: 0.9298, best acc is 0.9298

Multi: acc is: 0.6302, best acc is 0.6302

Advertise: acc is: 0.9894, best acc is 0.9894

Test set: Task: Phish Average loss: 0.1587, Accuracy: 94.19%, Precision: 94.23%, Recall: 94.17%, F1: 94.18%
Test set: Task: Multi Average loss: 0.9022, Accuracy: 67.59%, Precision: 66.69%, Recall: 63.55%, F1: 64.24%
Test set: Task: Advertise Average loss: 0.0305, Accuracy: 99.12%, Precision: 99.12%, Recall: 99.12%, F1: 94.18%
Phish: acc is: 0.9419, best acc is 0.9419

Multi: acc is: 0.6759, best acc is 0.6759

Advertise: acc is: 0.9912, best acc is 0.9912

Test set: Task: Phish Average loss: 0.1387, Accuracy: 94.84%, Precision: 94.87