In [1]:
import json
import pickle
import torch
import numpy as np
from transformers import AutoTokenizer, AutoModel
from datasets import Dataset
from datasets import load_dataset
import os
import math

In [10]:
output_dir = "/mimer/NOBACKUP/groups/naiss2025-5-243/diff_embeddings2"
all_diffs = []

for i in range(22):
    filename = f"diff_embeddings_chunk_{i:04d}.pkl"
    filepath = os.path.join(output_dir, filename)

    with open(filepath, "rb") as f:
        chunk = pickle.load(f)
        all_diffs.extend(chunk)

Loaded 440000 total diffs.


In [11]:
dataset = load_dataset(
    "NicholasOgenstad/my-runbugrun-dataset",
    data_files="runbugrun_all_pairs_with_language.json",
    split="train"
)
dataset = dataset.filter(lambda example: example["language"] != "tests")

buggy = dataset['buggy_code']
fixed = dataset['fixed_code']

bug_label = dataset['labels']
language = dataset['language']
bug_label = bug_label[:len(all_diffs)]
language = language[:len(all_diffs)]

change_count = []
for i in bug_label:
    if i == None:
        change_count.append(0)
    else: 
        change_count.append(len(i))

In [17]:
new_diffs = []
new_change_count = []
new_bug_label = []

for i in range(len(all_diffs)):
    if change_count[i] > 15 or change_count[i] == 0:
        continue
    else:
        new_diffs.append(all_diffs[i])
        new_change_count.append(change_count[i])
        new_bug_label.append(bug_label[i])

cpp_diffs = []
cpp_change = []
cpp_bug_label = []

for i in range(len(new_diffs)):
    if language[i] == 'cpp':
        cpp_diffs.append(new_diffs[i])
        cpp_change.append(new_change_count[i])
        cpp_bug_label.append(new_bug_label[i])

In [20]:
flat = [item for sublist in cpp_bug_label for item in sublist]
unique_strings = sorted(set(flat))
string_to_int = {s: i+1 for i, s in enumerate(unique_strings)}
mapped_data = [[string_to_int[s] for s in sublist] for sublist in cpp_bug_label]

In [24]:
from collections import Counter

flat_numbers = [n for sublist in mapped_data for n in sublist]
distribution = Counter(flat_numbers)
print(distribution)

Counter({31: 54125, 14: 54036, 86: 42981, 57: 38745, 82: 34255, 103: 31218, 74: 30350, 95: 28469, 10: 25696, 42: 25621, 58: 19395, 102: 13132, 129: 13119, 105: 12281, 55: 12021, 12: 11789, 2: 10567, 56: 9970, 118: 9116, 127: 8911, 43: 8607, 68: 8295, 97: 7411, 22: 7238, 20: 6254, 125: 5964, 128: 5782, 13: 5621, 49: 5545, 83: 5513, 101: 5513, 41: 5486, 84: 4109, 100: 4109, 11: 4095, 1: 3737, 104: 3277, 120: 3229, 87: 2345, 79: 2214, 107: 2028, 61: 1538, 29: 1495, 32: 1495, 35: 1381, 109: 1372, 53: 1282, 28: 1139, 33: 1139, 59: 1130, 9: 1130, 75: 1069, 122: 920, 112: 873, 93: 870, 119: 798, 126: 794, 47: 794, 30: 772, 52: 669, 66: 637, 27: 621, 73: 574, 36: 559, 60: 551, 8: 551, 26: 511, 63: 487, 7: 432, 121: 408, 99: 398, 37: 327, 91: 285, 65: 283, 34: 244, 46: 244, 76: 222, 67: 213, 45: 212, 88: 194, 96: 189, 3: 176, 70: 176, 48: 147, 62: 144, 78: 141, 77: 112, 54: 110, 21: 105, 38: 100, 106: 98, 39: 84, 116: 83, 85: 79, 115: 62, 50: 62, 16: 55, 15: 40, 44: 38, 80: 33, 64: 31, 4: 29, 6

In [30]:
import numpy as np
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score

cpp_diffs = np.array(cpp_diffs)

all_labels = sorted(set(l for sample in mapped_data for l in sample))
label_to_idx = {label: idx for idx, label in enumerate(all_labels)}
idx_to_label = {idx: label for label, idx in label_to_idx.items()}
n_labels = len(label_to_idx)

n_samples = len(mapped_data)
y_multi_hot = np.zeros((n_samples, n_labels), dtype=np.float32)
for i, labels in enumerate(mapped_data):
    for label in labels:
        y_multi_hot[i, label_to_idx[label]] = 1.0

X_train, X_val, y_train, y_val = train_test_split(
    cpp_diffs, y_multi_hot, test_size=0.2, random_state=42
)

class MultiLabelDataset(Dataset):
    def __init__(self, X, y):
        self.X = torch.from_numpy(X).float()
        self.y = torch.from_numpy(y).float()
    def __len__(self):
        return len(self.X)
    def __getitem__(self, idx):
        return self.X[idx], self.y[idx]

batch_size = 256


train_ds = MultiLabelDataset(X_train, y_train)
val_ds = MultiLabelDataset(X_val, y_val)

train_dl = DataLoader(train_ds, batch_size=batch_size, shuffle=True, num_workers=4)
val_dl = DataLoader(val_ds, batch_size=batch_size, num_workers=4)

class MultiLabelNN(nn.Module):
    def __init__(self, input_dim, output_dim):
        super().__init__()
        self.net = nn.Sequential(
            nn.Linear(input_dim, 512),
            nn.ReLU(),
            nn.Dropout(0.3),
            nn.Linear(512, 256),
            nn.ReLU(),
            nn.Dropout(0.3),
            nn.Linear(256, output_dim),
            nn.Sigmoid()
        )
    def forward(self, x):
        return self.net(x)


device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = MultiLabelNN(1024, n_labels).to(device)

criterion = nn.BCELoss()
optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)


n_epochs = 30

for epoch in range(1, n_epochs+1):
    model.train()
    train_loss = 0.0
    all_train_preds = []
    all_train_targets = []

    for xb, yb in train_dl:
        xb = xb.to(device)
        yb = yb.to(device)

        optimizer.zero_grad()
        preds = model(xb)
        loss = criterion(preds, yb)
        loss.backward()
        optimizer.step()

        train_loss += loss.item() * xb.size(0)

        all_train_preds.append(preds.detach().cpu())
        all_train_targets.append(yb.cpu())

    train_loss /= len(train_ds)
    train_preds = torch.cat(all_train_preds).numpy()
    train_targets = torch.cat(all_train_targets).numpy()

    model.eval()
    val_loss = 0.0
    all_val_preds = []
    all_val_targets = []

    with torch.no_grad():
        for xb, yb in val_dl:
            xb = xb.to(device)
            yb = yb.to(device)
            preds = model(xb)
            loss = criterion(preds, yb)
            val_loss += loss.item() * xb.size(0)

            all_val_preds.append(preds.cpu())
            all_val_targets.append(yb.cpu())

    val_loss /= len(val_ds)
    val_preds = torch.cat(all_val_preds).numpy()
    val_targets = torch.cat(all_val_targets).numpy()

    train_preds_bin = (train_preds >= 0.5).astype(int)
    val_preds_bin = (val_preds >= 0.5).astype(int)

    train_f1 = f1_score(train_targets, train_preds_bin, average='micro', zero_division=0)
    val_f1 = f1_score(val_targets, val_preds_bin, average='micro', zero_division=0)

    print(f"Epoch {epoch:02d}: "
          f"Train Loss={train_loss:.4f} F1={train_f1:.4f} "
          f"Val Loss={val_loss:.4f} F1={val_f1:.4f}")


Epoch 01: Train Loss=0.0818 F1=0.1218 Val Loss=0.0568 F1=0.2615
Epoch 02: Train Loss=0.0534 F1=0.3461 Val Loss=0.0475 F1=0.4241
Epoch 03: Train Loss=0.0471 F1=0.4587 Val Loss=0.0429 F1=0.5037
Epoch 04: Train Loss=0.0435 F1=0.5175 Val Loss=0.0401 F1=0.5465
Epoch 05: Train Loss=0.0411 F1=0.5537 Val Loss=0.0381 F1=0.5885
Epoch 06: Train Loss=0.0393 F1=0.5789 Val Loss=0.0365 F1=0.6093
Epoch 07: Train Loss=0.0378 F1=0.5963 Val Loss=0.0353 F1=0.6227
Epoch 08: Train Loss=0.0366 F1=0.6114 Val Loss=0.0343 F1=0.6363
Epoch 09: Train Loss=0.0356 F1=0.6242 Val Loss=0.0336 F1=0.6425
Epoch 10: Train Loss=0.0348 F1=0.6335 Val Loss=0.0330 F1=0.6476
Epoch 11: Train Loss=0.0341 F1=0.6426 Val Loss=0.0324 F1=0.6573
Epoch 12: Train Loss=0.0334 F1=0.6502 Val Loss=0.0319 F1=0.6624
Epoch 13: Train Loss=0.0328 F1=0.6567 Val Loss=0.0314 F1=0.6727
Epoch 14: Train Loss=0.0323 F1=0.6630 Val Loss=0.0311 F1=0.6737
Epoch 15: Train Loss=0.0319 F1=0.6676 Val Loss=0.0308 F1=0.6797
Epoch 16: Train Loss=0.0314 F1=0.6729 Va

In [31]:
model.eval()

MultiLabelNN(
  (net): Sequential(
    (0): Linear(in_features=1024, out_features=512, bias=True)
    (1): ReLU()
    (2): Dropout(p=0.3, inplace=False)
    (3): Linear(in_features=512, out_features=256, bias=True)
    (4): ReLU()
    (5): Dropout(p=0.3, inplace=False)
    (6): Linear(in_features=256, out_features=129, bias=True)
    (7): Sigmoid()
  )
)

In [38]:
xb, yb = next(iter(val_dl))
xb = xb.to(device)
with torch.no_grad():
    preds = model(xb)   # shape: (batch_size, 129)
probs = preds.cpu().numpy()
threshold = 0.4
binary_preds = (probs >= threshold).astype(int)
n_to_show = 15

true_labels = yb.cpu().numpy()

for i in range(n_to_show):
    pred_indices = np.where(binary_preds[i] == 1)[0]
    true_indices = np.where(true_labels[i] == 1)[0]
    
    pred_labels = [idx_to_label[idx] for idx in pred_indices]
    true_labels_list = [idx_to_label[idx] for idx in true_indices]
    
    print(f"Sample {i+1}")
    print(f"True Labels: {sorted(true_labels_list)}")
    print(f"Predicted :  {sorted(pred_labels)}")
    print("-"*40)


Sample 1
True Labels: [1]
Predicted :  [1]
----------------------------------------
Sample 2
True Labels: [13, 14, 86, 103]
Predicted :  [86, 103]
----------------------------------------
Sample 3
True Labels: [31, 61]
Predicted :  []
----------------------------------------
Sample 4
True Labels: [1]
Predicted :  [1]
----------------------------------------
Sample 5
True Labels: [11, 31, 82]
Predicted :  [82]
----------------------------------------
Sample 6
True Labels: [31, 74]
Predicted :  [31, 74]
----------------------------------------
Sample 7
True Labels: [31, 74]
Predicted :  [31, 74]
----------------------------------------
Sample 8
True Labels: [57, 102, 103]
Predicted :  [86, 102, 103]
----------------------------------------
Sample 9
True Labels: [10, 57, 68, 105]
Predicted :  [57, 68, 105]
----------------------------------------
Sample 10
True Labels: [10, 82]
Predicted :  [10, 57, 82, 118]
----------------------------------------
Sample 11
True Labels: [42, 55, 95]
Pred