In [1]:
import json
import pickle
import torch
import numpy as np
from transformers import AutoTokenizer, AutoModel
from datasets import Dataset
from datasets import load_dataset
import os
import math

In [2]:
# --- Model setup -----------------------------------------------------------
# Load the ModernBERT-large tokenizer and encoder from the Hugging Face hub.
model_id = "answerdotai/ModernBERT-large"
tokenizer = AutoTokenizer.from_pretrained(model_id)
model = AutoModel.from_pretrained(model_id)
# Inference only: switch the module to evaluation mode.
model.eval()

# Use the GPU when one is visible, otherwise fall back to CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)
# Process-wide PyTorch setting for float32 matmul precision ('high' rather
# than the default) — see the PyTorch docs for the exact numerical trade-off.
torch.set_float32_matmul_precision('high')

# --- Dataset ---------------------------------------------------------------
# Load the buggy/fixed code pairs from the hub (single JSON file, "train" split).
dataset = load_dataset(
    "NicholasOgenstad/my-runbugrun-dataset",
    data_files="runbugrun_all_pairs_with_language.json",
    split="train"
)
# Drop every row whose "language" field is the literal string "tests".
dataset = dataset.filter(lambda example: example["language"] != "tests")

# Raw source columns of the filtered dataset.
# NOTE(review): the embedding loop further down rebinds `buggy` and `fixed`
# to embedding tensors, so these columns are no longer reachable under these
# names once that cell has run.
buggy = dataset['buggy_code']
fixed = dataset['fixed_code']

2025-07-16 11:04:24.292631: I tensorflow/core/util/port.cc:113] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2025-07-16 11:04:24.372332: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2025-07-16 11:04:24.372371: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2025-07-16 11:04:24.375900: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


In [3]:
def load_tokenized_chunk(save_dir, chunk_num):
    """Read one pickled chunk and split it into its buggy and fixed parts.

    The file ``chunk_{chunk_num:04d}.pkl`` inside ``save_dir`` is expected to
    unpickle to a mapping with the keys ``'chunk_size'``, ``'input_ids'`` and
    ``'attention_mask'``. The first ``chunk_size`` entries of each sequence are
    returned as the buggy encodings, everything after them as the fixed
    encodings.

    Args:
        save_dir: Directory that holds the chunk files.
        chunk_num: Integer chunk index (zero-padded to four digits in the name).

    Returns:
        A ``(buggy_tokenized, fixed_tokenized)`` tuple of dicts, each with the
        keys ``'input_ids'`` and ``'attention_mask'``.
    """
    path = os.path.join(save_dir, f'chunk_{chunk_num:04d}.pkl')

    # NOTE: pickle.load can execute arbitrary code — only use on trusted files.
    with open(path, 'rb') as handle:
        payload = pickle.load(handle)

    split_at = payload['chunk_size']
    keys = ('input_ids', 'attention_mask')

    buggy_tokenized = {key: payload[key][:split_at] for key in keys}
    fixed_tokenized = {key: payload[key][split_at:] for key in keys}
    return buggy_tokenized, fixed_tokenized

In [4]:
def get_mean_pooled_embeddings(input_ids, attention_mask):
    """Encode a batch with the global ``model`` and mean-pool over dim 1.

    The model's ``last_hidden_state`` is multiplied by the attention mask,
    summed over dim 1 and divided by the per-row mask sum, so positions whose
    mask value is 0 do not contribute to the average. Runs without gradient
    tracking and inside a CUDA autocast context.

    Args:
        input_ids: Token id tensor passed straight to the model.
        attention_mask: Mask tensor passed to the model and used as pooling
            weights (presumably (batch, seq) with 0/1 entries — TODO confirm).

    Returns:
        Tensor with dim 1 of the hidden states reduced away
        (presumably (batch, hidden) — TODO confirm).
    """
    with torch.no_grad():
        with torch.autocast("cuda"):
            token_states = model(
                input_ids=input_ids, attention_mask=attention_mask
            ).last_hidden_state

            # Broadcast the mask across the hidden dimension as float weights.
            weights = attention_mask.unsqueeze(-1).expand_as(token_states).float()

            weighted_total = torch.sum(token_states * weights, dim=1)
            # Clamp so a fully masked row divides by 1e-9 instead of 0.
            token_counts = torch.sum(weights, dim=1).clamp(min=1e-9)

            return weighted_total / token_counts

In [5]:
def write_diff_file(chunk_num, diff_array, out_dir=None):
    """Concatenate per-batch difference embeddings and pickle them as one array.

    The tensors in ``diff_array`` are concatenated along dim 0, moved to the
    CPU, converted to a NumPy array and pickled to
    ``diff_embeddings_chunk_{chunk_num:04d}.pkl``.

    Args:
        chunk_num: Integer chunk index, zero-padded to four digits in the
            file name.
        diff_array: Sequence of tensors to concatenate. It must contain at
            least one tensor because it is handed to ``torch.cat``.
        out_dir: Directory to write into; it is created if missing. When left
            as ``None`` the notebook-level global ``output_dir`` is used, which
            is the original behaviour. Passing it explicitly removes the
            dependency on that global having been defined first.

    Returns:
        None. The only effect is the file written to disk.
    """
    # Resolve the global lazily so an explicit out_dir never touches it.
    target_dir = output_dir if out_dir is None else out_dir
    os.makedirs(target_dir, exist_ok=True)

    diff_embeddings = torch.cat(diff_array, dim=0)
    diff_embeddings_np = diff_embeddings.cpu().numpy()

    output_file = os.path.join(target_dir, f'diff_embeddings_chunk_{chunk_num:04d}.pkl')
    with open(output_file, 'wb') as f:
        pickle.dump(diff_embeddings_np, f)

In [6]:
def embed_in_batches(encodings, batch_size=192):
    """Mean-pool embeddings for a set of encodings, ``batch_size`` rows at a time.

    Every tensor in ``encodings`` is first moved to the model's device. The
    ``'input_ids'`` and ``'attention_mask'`` tensors are then sliced into
    consecutive row ranges, each range is embedded with
    ``get_mean_pooled_embeddings``, and the results are concatenated.

    Args:
        encodings: Mapping of tensors that includes ``'input_ids'`` and
            ``'attention_mask'``; the row count is taken from
            ``'input_ids'.shape[0]``.
        batch_size: Maximum number of rows per forward pass.

    Returns:
        The concatenated pooled embeddings as a CPU tensor.
    """
    n_rows = encodings['input_ids'].shape[0]

    on_device = {}
    for key, tensor in encodings.items():
        on_device[key] = tensor.to(model.device, non_blocking=True)

    # Slicing past the end is clamped, so the last batch is simply shorter.
    pooled_chunks = [
        get_mean_pooled_embeddings(
            on_device["input_ids"][lo:lo + batch_size],
            on_device["attention_mask"][lo:lo + batch_size],
        )
        for lo in range(0, n_rows, batch_size)
    ]

    return torch.cat(pooled_chunks).cpu()

In [None]:
# Driver: for every tokenized chunk, embed the buggy and fixed encodings,
# take their difference (fixed - buggy) and write one pickle per chunk.
tokenized_dir = "/mimer/NOBACKUP/groups/naiss2025-5-243/tokenized_chunks2"
output_dir = "/mimer/NOBACKUP/groups/naiss2025-5-243/diff_embeddings2"

# Number of rows handed to embed_in_batches per outer step.
step_size = 1000

for chunk_num in range(0, 23):

    buggy_data, fixed_data = load_tokenized_chunk(tokenized_dir, chunk_num)
    total_size = buggy_data['input_ids'].shape[0]

    # Derive the step count from the chunk's actual row count instead of a
    # fixed 20: a shorter chunk would otherwise yield empty slices (and
    # torch.cat on an empty list fails), while a longer chunk would be
    # silently truncated to its first 20 * step_size rows.
    num_batches = math.ceil(total_size / step_size)

    diff_final = []
    for batch_idx in range(num_batches):
        print(f"Processing chunk {chunk_num}, batch {batch_idx}")
        start = batch_idx * step_size
        end = min(start + step_size, total_size)

        buggy_batch = {
            'input_ids': buggy_data['input_ids'][start:end],
            'attention_mask': buggy_data['attention_mask'][start:end]
        }
        fixed_batch = {
            'input_ids': fixed_data['input_ids'][start:end],
            'attention_mask': fixed_data['attention_mask'][start:end]
        }

        # NOTE: these names rebind the `buggy` / `fixed` dataset columns
        # defined in the setup cell; kept as-is so later cells see the same
        # globals as before.
        buggy = embed_in_batches(buggy_batch)
        fixed = embed_in_batches(fixed_batch)

        diff_batch = fixed - buggy
        diff_final.append(diff_batch)

    if not diff_final:
        # Nothing to concatenate for an empty chunk; skip instead of crashing.
        print(f"Skipping chunk {chunk_num}: no rows to embed")
        continue

    write_diff_file(chunk_num, diff_final)
    # Release cached GPU blocks before loading the next chunk.
    torch.cuda.empty_cache()

Processing chunk 16, batch 0
Processing chunk 16, batch 1
Processing chunk 16, batch 2
Processing chunk 16, batch 3
Processing chunk 16, batch 4
Processing chunk 16, batch 5
Processing chunk 16, batch 6
Processing chunk 16, batch 7
Processing chunk 16, batch 8
Processing chunk 16, batch 9
Processing chunk 16, batch 10
Processing chunk 16, batch 11
Processing chunk 16, batch 12
Processing chunk 16, batch 13
Processing chunk 16, batch 14
Processing chunk 16, batch 15
Processing chunk 16, batch 16
Processing chunk 16, batch 17
Processing chunk 16, batch 18
Processing chunk 16, batch 19
Saved diff embeddings for chunk 16 to /mimer/NOBACKUP/groups/naiss2025-5-243/diff_embeddings2/diff_embeddings_chunk_0016.pkl
Shape: (20000, 1024)
Processing chunk 17, batch 0
Processing chunk 17, batch 1
Processing chunk 17, batch 2
Processing chunk 17, batch 3
Processing chunk 17, batch 4
Processing chunk 17, batch 5
Processing chunk 17, batch 6
Processing chunk 17, batch 7
Processing chunk 17, batch 8
Pro