In [None]:
!pip3 install -U transformers datasets evaluate accelerate
!pip3 install --upgrade torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu118

In [1]:
import evaluate
import numpy as np
from datasets import load_dataset
from transformers import AutoTokenizer, DataCollatorWithPadding, AutoModelForSequenceClassification, TrainingArguments, Trainer

In [2]:
def preprocess_function(data):
    """Tokenize the ``"text"`` field of a batch of examples.

    Used with ``dataset.map(..., batched=True)``, so ``data["text"]`` is a
    batch of strings. Sequences are truncated but deliberately *not* padded
    here: the ``DataCollatorWithPadding`` given to the ``Trainer`` pads each
    training batch to its own longest sequence, which is cheaper than padding
    every map batch up front.

    Args:
        data: Batch of examples containing a ``"text"`` entry.

    Returns:
        The tokenizer output for the batch (unpadded, truncated).
    """
    return tokenizer(data["text"], truncation=True)

# Checkpoint name shared by the tokenizer here and the classifier built later.
model_name = "distilbert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Load the dataset and tokenize it in batches with preprocess_function.
dataset = load_dataset('chathuru/cicids2018')
tokenized_dataset = dataset.map(preprocess_function, batched=True)

# Collator handed to the Trainer to assemble padded batches.
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

# Class index <-> class name mappings passed to the model.
id2label = {0: "BENIGN", 1: "MALICIOUS"}
label2id = {name: index for index, name in id2label.items()}

In [3]:
# Load the metric once; compute_metrics runs on every evaluation pass and
# should not reload it each time.
mrpc_metric = evaluate.load("glue", "mrpc")


def compute_metrics(eval_preds):
    """Score a set of evaluation predictions with the GLUE/MRPC metric.

    Args:
        eval_preds: Pair ``(logits, labels)`` as passed by the ``Trainer``.

    Returns:
        The dictionary produced by the metric's ``compute`` call.
    """
    logits, labels = eval_preds
    # The predicted class is the index of the largest logit on the last axis.
    predictions = np.argmax(logits, axis=-1)
    return mrpc_metric.compute(predictions=predictions, references=labels)

In [None]:
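# Alternative (disabled): accuracy-only metric. Uncommenting and running this
# cell would replace the compute_metrics defined in the previous cell.
# accuracy = evaluate.load("accuracy")

# def compute_metrics(eval_pred):
#     predictions, labels = eval_pred
#     predictions = np.argmax(predictions, axis=1)
#     return accuracy.compute(predictions=predictions, references=labels)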

In [None]:
# Build the sequence classifier from the pretrained checkpoint, with two
# output labels and the label-name mappings attached.
model = AutoModelForSequenceClassification.from_pretrained(
    model_name,
    num_labels=2,
    id2label=id2label,
    label2id=label2id,
)

In [14]:
# Hyperparameters and checkpoint policy for the fine-tuning run.
training_args = TrainingArguments(
    output_dir="cicids2018-distilbert",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=4,
    weight_decay=0.01,
    # `evaluation_strategy` was renamed to `eval_strategy` (transformers >= 4.41)
    # and the old name is rejected by recent releases, which the install cell
    # pulls in via `-U`.
    eval_strategy="epoch",
    save_strategy="epoch",
    #load_best_model_at_end=True
)

# Wire the model, data splits, collator and metric function into a Trainer.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["test"],
    # `tokenizer=` is deprecated on Trainer in favour of `processing_class=`
    # (transformers >= 4.46); the install cell upgrades to the latest release.
    processing_class=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

In [None]:
# Run fine-tuning; the returned training summary is shown as the cell output.
train_result = trainer.train()
train_result

In [None]:
# Log in to the Hugging Face Hub from inside the notebook; this has to happen
# before the push_to_hub call in the next cell.
from huggingface_hub import notebook_login

notebook_login()

In [None]:
# Write the final model to the local output directory.
trainer.save_model("cicids2018-distilbert")
# The first positional parameter of Trainer.push_to_hub is the commit message,
# not the repository name, so pass the message by keyword. The target repo is
# derived from the training arguments (hub_model_id, else the output_dir name).
trainer.push_to_hub(commit_message="End of training")