In [1]:
import torch
from torch.utils.data import DataLoader
from sklearn.model_selection import train_test_split
import pandas as pd
from commit_transformer.tokenizer import build_vocab
from commit_transformer.preprosser import CommitDataset
from commit_transformer.model import CombinedModel

In [2]:
df = pd.read_json(r'../datasets/patch_db.json', encoding='utf_8_sig')
df.dropna(inplace=True)
label2id={'non-security':0,'security':1}
df = df.replace({"category": label2id})
df

train, test = train_test_split(df, test_size=0.3, random_state=42)
test, val = train_test_split(test, test_size=0.5, random_state=42)
train,_ = train_test_split(train, train_size=0.1, random_state=42)
test, val = train_test_split(test, train_size=0.1, random_state=42)
val, _ = train_test_split(val, train_size=0.1, random_state=42)

train.reset_index(inplace=True)
test.reset_index(inplace=True)
val.reset_index(inplace=True)

  df = df.replace({"category": label2id})


In [3]:
train_data = []
for index, row in train.iterrows():
  train_data.append([row['commit_message'],row['diff_code'],row['category']])

val_data = []
for index, row in val.iterrows():
  val_data.append([row['commit_message'],row['diff_code'],row['category']])
test_data = []
for index, row in test.iterrows():
  test_data.append([row['commit_message'],row['diff_code'],row['category']])

In [4]:

# # Split the dataset
# train_data, val_data = train_test_split(data, test_size=0.2, random_state=42)

# # Build vocabulary
vocab = build_vocab(train_data)

# Create Datasets and DataLoaders
train_dataset = CommitDataset(train_data, vocab)
val_dataset = CommitDataset(val_data, vocab)
test_dataset = CommitDataset(test_data, vocab)

train_loader = DataLoader(train_dataset, batch_size=8, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=8, shuffle=False)
test_loader = DataLoader(test_dataset, batch_size=8, shuffle=False)

In [5]:


# Initialize the model
vocab_size = len(vocab)
embed_dim = 128  # Dimension for embeddings
num_heads = 8
hidden_dim = 512
num_layers = 6
dropout = 0.3

model = CombinedModel(vocab_size, embed_dim, num_heads, hidden_dim, num_layers, dropout)

# Train the model
model.trainer(train_loader, val_loader, num_epochs=10)

Epoch 1/10 Loss: 0.6569: 100%|██████████| 314/314 [00:10<00:00, 31.17batch/s]
Epoch 2/10 Loss: 0.6213: 100%|██████████| 314/314 [00:10<00:00, 30.73batch/s]
Epoch 3/10 Loss: 0.6234: 100%|██████████| 314/314 [00:10<00:00, 30.82batch/s]
Epoch 4/10 Loss: 0.6155: 100%|██████████| 314/314 [00:10<00:00, 30.55batch/s]
Epoch 5/10 Loss: 0.6133: 100%|██████████| 314/314 [00:10<00:00, 30.76batch/s]
Epoch 6/10 Loss: 0.6079: 100%|██████████| 314/314 [00:10<00:00, 30.65batch/s]
Epoch 7/10 Loss: 0.6007: 100%|██████████| 314/314 [00:10<00:00, 30.25batch/s]
Epoch 8/10 Loss: 0.5933: 100%|██████████| 314/314 [00:10<00:00, 30.19batch/s]
Epoch 9/10 Loss: 0.5819: 100%|██████████| 314/314 [00:10<00:00, 29.98batch/s]
Epoch 10/10 Loss: 0.5749: 100%|██████████| 314/314 [00:10<00:00, 30.23batch/s]


Validation Accuracy: 0.6832298136645962
Precision: 0.5652173913043478
Recall: 0.3151515151515151
F1-Score: 0.4046692607003891


In [6]:
torch.save(model, "transformer_entire_bert_model.pth")

In [8]:
from sklearn.metrics import precision_score, recall_score, f1_score, precision_recall_curve,classification_report

In [9]:
test_labels, test_predictions = model.evaluate(test_loader)
print(classification_report(test_labels,test_predictions))

Validation Accuracy: 0.7374301675977654
Precision: 0.6326530612244898
Recall: 0.3712574850299401
F1-Score: 0.4679245283018868
              precision    recall  f1-score   support

           0       0.76      0.90      0.83       370
           1       0.63      0.37      0.47       167

    accuracy                           0.74       537
   macro avg       0.70      0.64      0.65       537
weighted avg       0.72      0.74      0.71       537

