In [1]:
import torch
import torch.nn as nn
from torch.utils.data import Dataset

In [2]:
import pandas as pd
import numpy as np

In [4]:
# Load the malware dataset; later cells read its 'class' and 'newApis' columns.
malware_calls = pd.read_csv("../datasets/Oliveria.csv")

In [5]:
# Preview the first rows of the loaded frame.
malware_calls.head()

In [6]:
# Show how many samples each class has.
malware_calls['class'].value_counts()

In [7]:
# Number of distinct classes; sizes the classification head of the model.
num_classes = malware_calls["class"].nunique()

In [8]:
# Per-class loss weights: 1 - (share of rows belonging to the class), so rarer
# classes get a larger weight.
# NOTE(review): sort_index() orders the weights by the sorted *current* values of
# the 'class' column. nn.CrossEntropyLoss looks weights up by integer label id, so
# this order must agree with the label encoding used for training - confirm.
class_weights = (1 - (malware_calls['class'].value_counts().sort_index() / len(malware_calls))).values
# Use the GPU when one is visible, otherwise stay on the CPU instead of raising.
class_weights = torch.from_numpy(class_weights).float().to(
    "cuda" if torch.cuda.is_available() else "cpu"
)
class_weights

In [47]:
from transformers import Trainer
class WeightedLossTrainer(Trainer):
    """Trainer whose loss is class-weighted cross-entropy.

    The weights are taken from the notebook-level ``class_weights`` tensor, which
    is looked up each time the loss is computed.
    """

    def compute_loss(self, model, inputs, return_outputs=False, num_items_in_batch=None):
        """Run a forward pass and return the weighted cross-entropy loss.

        Args:
            model: the model being trained; called as ``model(**inputs)``.
            inputs: batch mapping; must contain ``"labels"``.
            return_outputs: when true, return ``(loss, outputs)`` instead of ``loss``.
            num_items_in_batch: accepted (and ignored) so the override also works
                with Trainer versions that pass this argument.

        Returns:
            The loss tensor, or ``(loss, outputs)`` when ``return_outputs`` is true.
        """
        outputs = model(**inputs)
        logits = outputs.get("logits")
        labels = inputs.get("labels")
        # Keep the weights on the same device as the logits so the loss does not
        # depend on where the weight tensor was originally created.
        loss_func = nn.CrossEntropyLoss(weight=class_weights.to(logits.device))
        loss = loss_func(logits, labels)
        return (loss, outputs) if return_outputs else loss

In [9]:
from transformers import AutoConfig, AutoModelForSequenceClassification
from transformers import BertForSequenceClassification, BertTokenizer

# Load the "bert-base-cased" checkpoint as a sequence classifier with one output
# per class, together with the tokenizer for the same checkpoint.
model = AutoModelForSequenceClassification.from_pretrained("bert-base-cased", num_labels=num_classes)
tokenizer = BertTokenizer.from_pretrained("bert-base-cased")

In [49]:
# Set the tokenizer's maximum sequence length to 512 tokens.
# NOTE(review): the later tokenizer calls pass no explicit max_length, so they
# presumably pad/truncate to this value - confirm.
tokenizer.model_max_length = 512

In [54]:
# Malware family name -> integer label id used for training.
CAT2IDX = {
    'Trojan': 0,
    'Adware': 1,
    'Downloader': 2,
    'Ransomware': 3,
    'Agent': 4,
    'Riskware': 5,
    'Backdoor': 6,
    'Dropper': 7,
    'Virus': 8,
}

# Integer label id -> malware family name (inverse of CAT2IDX).
IDX2CAT = {label_id: family for family, label_id in CAT2IDX.items()}

In [51]:
# Encode the string class names as integer label ids via CAT2IDX.
# Guarded so that re-running this cell on an already-encoded column is a no-op
# instead of a KeyError.
if not pd.api.types.is_integer_dtype(malware_calls['class']):
    malware_calls['class'] = malware_calls['class'].map(lambda x: CAT2IDX[x])

# Rebuild the loss weights in label-id order. When the notebook runs top to
# bottom, class_weights was first computed from the string labels and ordered by
# sorted class name, which does not match the ids in CAT2IDX, whereas
# nn.CrossEntropyLoss indexes its weights by label id.
label_freq = (
    malware_calls['class'].value_counts().reindex(range(len(CAT2IDX)), fill_value=0)
    / len(malware_calls)
)
class_weights = torch.tensor(
    (1 - label_freq).to_numpy(), dtype=torch.float, device=class_weights.device
)

In [53]:
from sklearn.model_selection import train_test_split
# Hold out 20% of the samples as the test split, stratified by class.
X_train, X_test, Y_train, Y_test = train_test_split(
    malware_calls["newApis"],
    malware_calls["class"],
    test_size=0.2,
    random_state=75,
    stratify=malware_calls["class"],
)

In [55]:
# Split 20% of the remaining training data off as a validation set,
# again stratified by class.
X_train, X_val, Y_train, Y_val = train_test_split(
    X_train,
    Y_train,
    test_size=0.2,
    random_state=75,
    stratify=Y_train,
)

In [56]:
def get_list_strs(df):
    """Return the values of ``df`` as a plain Python list, in positional order.

    Args:
        df: a pandas Series (or DataFrame); its index labels are ignored.

    Returns:
        A list with one entry per row of ``df``.
    """
    # Materialise .values once; the previous index loop re-evaluated it for
    # every element while producing exactly this list.
    return list(df.values)

In [57]:
# Tokenize the train, validation and test texts with identical settings:
# pad to the maximum length, truncate longer inputs, return PyTorch tensors.
train_encodings, val_encodings, test_encodings = (
    tokenizer(get_list_strs(texts), padding="max_length", truncation=True, return_tensors="pt")
    for texts in (X_train, X_val, X_test)
)

In [58]:
class MalwareDataset(Dataset):
    """Map-style dataset pairing tokenizer encodings with labels.

    Args:
        encodings: mapping from field name to an indexable batch of values
            (tensors or nested lists), one entry per sample.
        labels: indexable collection of labels, one per sample.
    """

    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    @staticmethod
    def _to_tensor(value):
        """Return an independent tensor copy of ``value``."""
        # torch.tensor() on an existing tensor emits a copy-construct UserWarning;
        # clone().detach() is the equivalent copy without the warning.
        if isinstance(value, torch.Tensor):
            return value.clone().detach()
        return torch.tensor(value)

    def __getitem__(self, idx):
        """Return one sample: every encoding field at ``idx`` plus ``'labels'``."""
        item = {key: self._to_tensor(val[idx]) for key, val in self.encodings.items()}
        item['labels'] = self._to_tensor(self.labels[idx])
        return item

    def __len__(self):
        """Number of samples, taken from the labels."""
        return len(self.labels)

In [59]:
# Wrap each split's encodings and label array in a MalwareDataset.
trainset = MalwareDataset(encodings=train_encodings, labels=Y_train.values)
valset = MalwareDataset(encodings=val_encodings, labels=Y_val.values)
testset = MalwareDataset(encodings=test_encodings, labels=Y_test.values)

In [60]:
import numpy as np
from datasets import load_metric
from sklearn.metrics import roc_auc_score
from scipy.special import softmax

def compute_metrics(eval_pred):
    """Compute classification metrics for one evaluation pass.

    Args:
        eval_pred: pair ``(logits, labels)``. Hard predictions are the argmax of
            the logits over the last axis; the AUC uses the softmax of the
            logits over axis 1.

    Returns:
        dict with keys ``precision``, ``recall``, ``acc``, ``mcc``, ``f1`` and
        ``auc``. Precision, recall and F1 are macro-averaged; the AUC is the
        macro-averaged one-vs-one ROC AUC.
    """
    # Call the scikit-learn metric functions directly rather than going through
    # load_metric, which was invoked five times on every evaluation pass.
    from sklearn.metrics import (
        accuracy_score,
        f1_score,
        matthews_corrcoef,
        precision_score,
        recall_score,
    )

    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)

    precision = float(precision_score(labels, predictions, average="macro"))
    recall = float(recall_score(labels, predictions, average="macro"))
    f1 = float(f1_score(labels, predictions, average="macro"))
    acc = float(accuracy_score(labels, predictions))
    mcc = float(matthews_corrcoef(labels, predictions))
    auc = roc_auc_score(labels, softmax(logits, axis=1), multi_class='ovo', average='macro')
    return {"precision": precision, "recall": recall, "acc": acc, "mcc": mcc, "f1": f1, "auc": auc}

In [61]:
from transformers import TrainingArguments

training_args = TrainingArguments(
    # Output location and which phases to run.
    output_dir='./results',
    do_train=True,
    do_eval=True,

    # Evaluate and checkpoint once per epoch, then reload the checkpoint with
    # the highest eval_auc when training ends.
    evaluation_strategy='epoch',
    save_strategy='epoch',
    load_best_model_at_end=True,
    metric_for_best_model='eval_auc',
    greater_is_better=True,

    # Schedule and batch sizes.
    num_train_epochs=20,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    dataloader_num_workers=8,

    # Optimisation: cosine learning-rate schedule with 500 warmup steps.
    lr_scheduler_type='cosine',
    warmup_steps=500,
    weight_decay=0.01,

    # Optional settings left disabled:
    #fp16=True,
    #logging_dir='./logs',
    #logging_steps=10,
)

In [62]:
from transformers import Trainer

# Build the weighted-loss trainer: train on trainset, evaluate on valset.
trainer = WeightedLossTrainer(
    model=model,
    args=training_args,
    train_dataset=trainset,
    eval_dataset=valset,
    compute_metrics=compute_metrics,
)

In [11]:
# Run fine-tuning with the trainer configured above.
trainer.train()

In [None]:
# Evaluate the trained model on the held-out test split.
trainer.evaluate(eval_dataset=testset)