In [2]:
import pandas as pd
from sklearn.preprocessing import MultiLabelBinarizer,LabelBinarizer
from datasets import Dataset
from transformers import AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer
import numpy as np
from sklearn.metrics import f1_score, roc_auc_score, accuracy_score
import torch

In [5]:
# Load the CVE records from an Excel workbook; the path is relative, so it is
# resolved against the kernel's current working directory.
df = pd.read_excel("final_data.xlsx")

In [6]:
# Annotation columns of the data frame that serve as prediction targets.
# The order matters: later cells iterate over this list when they build
# the combined indicator matrix.
labels = [
    'Operating_System',
    'Software_Component',
    'Version',
    'Impact',
    'Affected_Hardware',
    'Network_Requirements',
    'Affected_Protocols',
    'Authentication_Required',
    'Privileges_Required',
    'User_Interaction_Required',
    'Vendor',
]


In [7]:
# Replace missing entries in the target columns with the placeholder string
# 'unknown' so that every row has a value for every target.
df[labels] = df[labels].fillna('unknown')

In [8]:
# Cast all target columns to strings in a single frame-level operation so
# every class value has the same Python type before binarization.
df[labels] = df[labels].astype(str)

In [9]:
# Gather each row's target values into one Python list and store the lists
# in a new 'labels' column.
df['labels'] = df[labels].to_numpy().tolist()

In [10]:
# Show the frame (pandas abbreviates the rendered rows) to inspect the
# target columns together with the newly added 'labels' column.
df

Unnamed: 0,ID,CVE-ID,CVSS-V3,CVSS-V2,SEVERITY,DESCRIPTION,CWE-ID,Operating_System,Software_Component,Version,Impact,Affected_Hardware,Network_Requirements,Affected_Protocols,Authentication_Required,Privileges_Required,User_Interaction_Required,Vendor,labels
0,1,CVE-1999-0001,,5.0,MEDIUM,ip_input.c in BSD-derived TCP/IP implementatio...,CWE-20,BSD-derived,ip_input.c,unknown,denial of service (crash or hang),unknown,remote,TCP/IP,unknown,unknown,unknown,unknown,"[BSD-derived, ip_input.c, unknown, denial of s..."
1,2,CVE-1999-0002,,10.0,HIGH,Buffer overflow in NFS mountd gives root acces...,CWE-119,Linux,NFS mountd,unknown,root access,unknown,remote,NFS,unknown,unknown,unknown,unknown,"[Linux, NFS mountd, unknown, root access, unkn..."
2,3,CVE-1999-0003,,10.0,HIGH,Execute commands as root via buffer overflow i...,NVD-CWE-Other,unknown,unknown,unknown,unknown,unknown,unknown,unknown,unknown,unknown,unknown,unknown,"[unknown, unknown, unknown, unknown, unknown, ..."
3,4,CVE-1999-0004,,5.0,MEDIUM,"MIME buffer overflow in email clients, e.g. So...",NVD-CWE-Other,unknown,unknown,unknown,unknown,unknown,unknown,unknown,unknown,unknown,unknown,unknown,"[unknown, unknown, unknown, unknown, unknown, ..."
4,5,CVE-1999-0005,,10.0,HIGH,Arbitrary command execution via IMAP buffer ov...,NVD-CWE-Other,unknown,unknown,unknown,unknown,unknown,unknown,unknown,unknown,unknown,unknown,unknown,"[unknown, unknown, unknown, unknown, unknown, ..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
11494,11495,CVE-2021-0607,7.8,4.6,HIGH,"In iaxxx_calc_i2s_div of iaxxx-codec.c, there ...",CWE-119,Android,Android kernel,unknown,local escalation of privilege,unknown,unknown,unknown,unknown,no additional execution privileges needed,unknown,unknown,"[Android, Android kernel, unknown, local escal..."
11495,11496,CVE-2021-0608,7.8,4.6,HIGH,"In handleAppLaunch of AppLaunchActivity.java, ...",CWE-610,Android,Android kernel,A-174870704,local escalation of privilege,unknown,unknown,unknown,unknown,no additional execution privileges needed,no,unknown,"[Android, Android kernel, A-174870704, local e..."
11496,11497,CVE-2021-0610,7.8,4.6,HIGH,"In memory management driver, there is a possib...",CWE-190,unknown,memory management driver,unknown,local escalation of privilege,unknown,unknown,unknown,unknown,unknown,No,unknown,"[unknown, memory management driver, unknown, l..."
11497,11498,CVE-2021-0611,7.8,4.6,HIGH,"In m4u, there is a possible memory corruption ...",CWE-416,unknown,m4u,unknown,local escalation of privilege,unknown,unknown,unknown,unknown,System execution privileges,No,unknown,"[unknown, m4u, unknown, local escalation of pr..."


In [11]:
# Binarize every target column on its own and keep the fitted encoder for
# each one. Without the encoders the columns of the combined matrix cannot be
# mapped back to class names once the model produces predictions.
label_binarizers = {}
binary_labels_list = []
for label in labels:
    lb = LabelBinarizer()
    binary_labels_list.append(lb.fit_transform(df[label]))
    # lb.classes_ names this block's columns; lb.inverse_transform decodes them.
    label_binarizers[label] = lb

# Concatenate the per-column indicator blocks side by side; the block order
# follows the order of `labels`.
binary_labels = np.concatenate(binary_labels_list, axis=1)

In [12]:
# NOTE(review): this repeats the concatenation already done at the end of the
# previous cell, so it rebuilds the same array from binary_labels_list.
binary_labels = np.concatenate(binary_labels_list, axis=1)

In [13]:
# The second dimension is the total number of indicator columns produced for
# all target columns combined (13783 in the recorded run), not the number of
# target columns (11).
print("Binary labels shape:", binary_labels.shape)

Binary labels shape: (11499, 13783)


In [14]:
# Store one indicator vector per row: iterating the 2-D matrix yields its
# rows, which become the cells of the new 'binary_labels' column.
df['binary_labels'] = [row for row in binary_labels]

In [15]:
# Wrap the pandas frame in a `datasets.Dataset` so it can be mapped and split
# in the cells below.
dataset = Dataset.from_pandas(df)

In [16]:
# Tokenizer for the "bert-base-uncased" checkpoint; the same checkpoint name
# is used when the classification model is created further down.
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")



In [27]:
def preprocess_data(examples):
    """Tokenize a batch of descriptions and attach float label vectors.

    Args:
        examples: Batch mapping that provides a 'DESCRIPTION' entry with the
            texts and a 'binary_labels' entry with one indicator vector per
            text.

    Returns:
        The tokenizer output for the batch with an extra "labels" entry that
        holds one float32 NumPy array per example.
    """
    tokenized = tokenizer(
        examples['DESCRIPTION'],
        padding="max_length",
        truncation=True,
        max_length=128,
    )

    # The indicator vectors are converted to float32 one example at a time.
    float_labels = []
    for indicator_row in examples['binary_labels']:
        float_labels.append(np.array(indicator_row, dtype=np.float32))
    tokenized["labels"] = float_labels

    return tokenized

In [28]:
# Apply preprocess_data batch-wise and drop every original column, so only
# the fields returned by preprocess_data remain in the encoded dataset.
encoded_dataset = dataset.map(preprocess_data, batched=True, remove_columns=dataset.column_names)

Map:   0%|          | 0/11499 [00:00<?, ? examples/s]

In [29]:
# Report how many distinct values each target column holds; the counts show
# how many indicator columns each target contributes.
for label in labels:
    unique_values = pd.unique(df[label])
    print(f"Unique values in {label}: {len(unique_values)}")

Unique values in Operating_System: 219
Unique values in Software_Component: 5653
Unique values in Version: 4118
Unique values in Impact: 2127
Unique values in Affected_Hardware: 187
Unique values in Network_Requirements: 34
Unique values in Affected_Protocols: 277
Unique values in Authentication_Required: 6
Unique values in Privileges_Required: 142
Unique values in User_Interaction_Required: 16
Unique values in Vendor: 1004


In [30]:
# Hold out 20% of the examples for evaluation. The fixed seed makes the
# shuffle, and with it the reported metrics, reproducible across kernel
# restarts; without it every run evaluates on a different subset.
train_test_split = encoded_dataset.train_test_split(test_size=0.2, seed=42)
train_dataset = train_test_split['train']
eval_dataset = train_test_split['test']

In [31]:
# Sequence-classification model on top of "bert-base-uncased", configured for
# multi-label classification with one output per indicator column.
model = AutoModelForSequenceClassification.from_pretrained(
    "bert-base-uncased",
    problem_type="multi_label_classification",
    num_labels=binary_labels.shape[1]  # width of the indicator matrix; must equal the length of each label vector
)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [32]:
# Training configuration. Evaluation and checkpointing both run once per
# epoch; "f1" is the key that compute_metrics returns for the micro-averaged
# F1 score and is used here to select the best checkpoint.
args = TrainingArguments(
    "bert-finetuned-description",  # first positional argument; presumably the output directory — confirm
    evaluation_strategy="epoch",
    save_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=5,
    weight_decay=0.01,
    load_best_model_at_end=True,
    metric_for_best_model="f1",
)




In [33]:
def compute_metrics(p):
    """Convert raw model outputs into multi-label evaluation metrics.

    Args:
        p: Prediction object exposing ``predictions`` (the logits, or a tuple
            whose first item holds the logits) and ``label_ids`` (the
            reference indicator vectors).

    Returns:
        dict with the keys 'f1' (micro-averaged F1), 'roc_auc'
        (micro-averaged ROC AUC computed on the probabilities) and
        'accuracy' (``accuracy_score`` on the thresholded predictions).
    """
    logits = p.predictions
    if isinstance(logits, tuple):
        logits = logits[0]

    # Squash the logits into probabilities, then binarize them at 0.5.
    probs = torch.sigmoid(torch.Tensor(logits))
    y_pred = (probs >= 0.5).numpy().astype(np.float64)
    y_true = p.label_ids

    return {
        'f1': f1_score(y_true=y_true, y_pred=y_pred, average='micro'),
        'roc_auc': roc_auc_score(y_true, probs, average='micro'),
        'accuracy': accuracy_score(y_true, y_pred),
    }

In [34]:
# Bundle the model, training configuration, data splits, tokenizer and metric
# function into a Trainer that drives fine-tuning and evaluation.
trainer = Trainer(
    model=model,
    args=args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics
)

In [None]:
# Fine-tune the model; the recorded table below lists the losses and the
# compute_metrics values after each of the five epochs.
trainer.train()



Epoch,Training Loss,Validation Loss,F1,Roc Auc,Accuracy
1,0.0182,0.00961,0.618772,0.901559,0.0
2,0.0061,0.004593,0.618772,0.910594,0.0
3,0.004,0.003432,0.641884,0.914364,0.0
4,0.0031,0.003021,0.641884,0.915721,0.0
5,0.003,0.002903,0.618772,0.915722,0.0


In [36]:
# Evaluate on eval_dataset with the model currently held by the trainer and
# show the resulting metric dictionary.
trainer.evaluate()

{'eval_loss': 0.0034321495331823826,
 'eval_f1': 0.6418840579710144,
 'eval_roc_auc': 0.9143640756345565,
 'eval_accuracy': 0.0,
 'eval_runtime': 84.3751,
 'eval_samples_per_second': 27.259,
 'eval_steps_per_second': 3.413,
 'epoch': 5.0}

In [37]:
# Persist the fine-tuned model and its tokenizer side by side so both can be
# reloaded from the same directory.
output_dir = "nlp-bert"  # relative target directory; change as needed
model.save_pretrained(output_dir)
tokenizer.save_pretrained(output_dir)

('nlp-bert/tokenizer_config.json',
 'nlp-bert/special_tokens_map.json',
 'nlp-bert/vocab.txt',
 'nlp-bert/added_tokens.json',
 'nlp-bert/tokenizer.json')