In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from xgboost import XGBClassifier
from sklearn.metrics import classification_report, accuracy_score
from commit_transformer.tokenizer import build_vocab
from commit_transformer.preprosser import CommitDataset
from commit_transformer.model import CombinedModel
import torch
from torch.utils.data import DataLoader

df = pd.read_csv(r'../datasets/dataset.csv', encoding='utf_8_sig')
df.dropna(inplace=True)
label2id={'negative':0,'positive':1}
df = df.replace({"label": label2id})
df

train, _ = train_test_split(df, train_size=0.35, random_state=42)
train, test = train_test_split(train, test_size=0.3, random_state=42)
test, val = train_test_split(test, train_size=0.5, random_state=42)

train.reset_index(inplace=True)
test.reset_index(inplace=True)
val.reset_index(inplace=True)

  df = df.replace({"label": label2id})


In [2]:
# df = df[['message','diff','label']] 
# df

In [3]:
train_data = []
for index, row in train.iterrows():
  train_data.append([row['message'],row['diff'],row['label']])

val_data = []
for index, row in val.iterrows():
  val_data.append([row['message'],row['diff'],row['label']])
test_data = []
for index, row in test.iterrows():
  test_data.append([row['message'],row['diff'],row['label']])

In [4]:

# # Split the dataset
# train_data, val_data = train_test_split(data, test_size=0.2, random_state=42)

# # Build vocabulary
vocab = build_vocab(train_data)

# Create Datasets and DataLoaders
train_dataset = CommitDataset(train_data, vocab)
val_dataset = CommitDataset(val_data, vocab)
test_dataset = CommitDataset(test_data, vocab)

train_loader = DataLoader(train_dataset, batch_size=8, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=8, shuffle=False)
test_loader = DataLoader(test_dataset, batch_size=8, shuffle=False)

In [5]:


# Initialize the model
vocab_size = len(vocab)
embed_dim = 128  # Dimension for embeddings
num_heads = 8
hidden_dim = 512
num_layers = 6
dropout = 0.3

model = CombinedModel(vocab_size, embed_dim, num_heads, hidden_dim, num_layers, dropout)

# Train the model
model.trainer(train_loader, val_loader, num_epochs=10)

Epoch 1/10 Loss: 0.6664: 100%|██████████| 309/309 [00:13<00:00, 23.74batch/s]
Epoch 2/10 Loss: 0.6451: 100%|██████████| 309/309 [00:12<00:00, 24.36batch/s]
Epoch 3/10 Loss: 0.6373: 100%|██████████| 309/309 [00:12<00:00, 24.38batch/s]
Epoch 4/10 Loss: 0.6288: 100%|██████████| 309/309 [00:12<00:00, 24.17batch/s]
Epoch 5/10 Loss: 0.6234: 100%|██████████| 309/309 [00:12<00:00, 23.94batch/s]
Epoch 6/10 Loss: 0.6152: 100%|██████████| 309/309 [00:12<00:00, 24.09batch/s]
Epoch 7/10 Loss: 0.6010: 100%|██████████| 309/309 [00:12<00:00, 24.12batch/s]
Epoch 8/10 Loss: 0.5982: 100%|██████████| 309/309 [00:12<00:00, 24.33batch/s]
Epoch 9/10 Loss: 0.5901: 100%|██████████| 309/309 [00:12<00:00, 23.94batch/s]
Epoch 10/10 Loss: 0.5749: 100%|██████████| 309/309 [00:12<00:00, 24.11batch/s]


Validation Accuracy: 0.6786389413988658
Precision: 0.6298342541436464
Recall: 0.5253456221198156
F1-Score: 0.5728643216080402


In [6]:
model.evaluate(test_loader)

Validation Accuracy: 0.7140151515151515
Precision: 0.6193181818181818
Recall: 0.5647668393782384
F1-Score: 0.5907859078590786


([0,
  0,
  0,
  0,
  0,
  0,
  1,
  1,
  0,
  0,
  0,
  0,
  1,
  0,
  1,
  0,
  1,
  1,
  0,
  1,
  1,
  1,
  0,
  0,
  0,
  1,
  1,
  0,
  1,
  0,
  1,
  1,
  0,
  0,
  0,
  0,
  1,
  0,
  0,
  0,
  0,
  1,
  0,
  0,
  1,
  1,
  0,
  1,
  0,
  1,
  0,
  0,
  1,
  1,
  1,
  0,
  1,
  1,
  1,
  1,
  1,
  0,
  1,
  0,
  0,
  0,
  0,
  0,
  0,
  1,
  0,
  0,
  1,
  1,
  0,
  0,
  1,
  1,
  1,
  0,
  0,
  0,
  1,
  1,
  0,
  1,
  1,
  1,
  1,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  1,
  0,
  0,
  1,
  0,
  0,
  0,
  1,
  0,
  1,
  1,
  1,
  0,
  1,
  0,
  0,
  0,
  0,
  1,
  1,
  1,
  0,
  0,
  0,
  1,
  0,
  1,
  0,
  0,
  0,
  0,
  0,
  0,
  1,
  0,
  0,
  1,
  0,
  1,
  0,
  0,
  0,
  1,
  0,
  1,
  0,
  0,
  0,
  0,
  1,
  0,
  0,
  1,
  0,
  1,
  0,
  0,
  1,
  1,
  1,
  0,
  0,
  1,
  0,
  1,
  0,
  0,
  1,
  1,
  1,
  0,
  0,
  0,
  0,
  1,
  0,
  1,
  0,
  1,
  0,
  0,
  0,
  0,
  1,
  1,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  1,
  0,
  0,
  1,
  0,
  0,
  0,
  0,
  1,
  0,
  1,


In [7]:
torch.save(model, "transformer_entire_bert_model_900repo.pth")

In [8]:
from sklearn.metrics import precision_score, recall_score, f1_score, precision_recall_curve,classification_report

In [9]:
test_labels, test_predictions = model.evaluate(test_loader)
print(classification_report(test_labels,test_predictions))

Validation Accuracy: 0.7140151515151515
Precision: 0.6193181818181818
Recall: 0.5647668393782384
F1-Score: 0.5907859078590786
              precision    recall  f1-score   support

           0       0.76      0.80      0.78       335
           1       0.62      0.56      0.59       193

    accuracy                           0.71       528
   macro avg       0.69      0.68      0.69       528
weighted avg       0.71      0.71      0.71       528

