In [1]:
import gc
import math
import random

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import torch
from datasets import load_dataset
from safetensors.torch import save_file
from sklearn.cluster import MiniBatchKMeans
from transformers import AutoTokenizer, AutoModel

In [2]:
# Checkpoint used for both the tokenizer and the encoder.
model_id = "answerdotai/ModernBERT-large"

# Run on the GPU when CUDA reports one, otherwise stay on the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

tokenizer = AutoTokenizer.from_pretrained(model_id)

# `.eval()` returns the module itself, so it can be chained onto the load call.
model = AutoModel.from_pretrained(model_id).eval()

# Move the weights to the selected device; as the last expression of the cell
# this also displays the module structure.
model.to(device)

2025-06-26 13:51:41.532299: I tensorflow/core/util/port.cc:113] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2025-06-26 13:51:41.706590: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2025-06-26 13:51:41.706628: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2025-06-26 13:51:41.727899: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


ModernBertModel(
  (embeddings): ModernBertEmbeddings(
    (tok_embeddings): Embedding(50368, 1024, padding_idx=50283)
    (norm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
    (drop): Dropout(p=0.0, inplace=False)
  )
  (layers): ModuleList(
    (0): ModernBertEncoderLayer(
      (attn_norm): Identity()
      (attn): ModernBertAttention(
        (Wqkv): Linear(in_features=1024, out_features=3072, bias=False)
        (rotary_emb): ModernBertRotaryEmbedding()
        (Wo): Linear(in_features=1024, out_features=1024, bias=False)
        (out_drop): Identity()
      )
      (mlp_norm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
      (mlp): ModernBertMLP(
        (Wi): Linear(in_features=1024, out_features=5248, bias=False)
        (act): GELUActivation()
        (drop): Dropout(p=0.0, inplace=False)
        (Wo): Linear(in_features=2624, out_features=1024, bias=False)
      )
    )
    (1-27): 27 x ModernBertEncoderLayer(
      (attn_norm): LayerNorm((1024,), eps

In [3]:
import json
# Load the training file of the RunBugRun dataset and pull out the columns
# used by the rest of the notebook.
dataset = load_dataset("NicholasOgenstad/RunBugRun", data_files="train.json")
train_split = dataset['train']

buggy = train_split['buggy_code']
fixed = train_split['fixed_code']
token_count = train_split['token_count']

# Each `bug_type` entry is stored as a JSON string; decode every one of them.
bug_type = list(map(json.loads, train_split['bug_type']))

In [4]:
# Drop every sample whose recorded token count exceeds the limit, keeping the
# four parallel columns aligned.
#
# This is done in a single pass instead of calling `list.pop(i)` inside a loop:
# popping from the middle of a list shifts every following element, which makes
# the loop quadratic on a dataset of this size. Building new lists also does not
# require the columns to be mutable lists.
MAX_TOKEN_COUNT = 3000

# The filter below pairs the columns positionally, so they must line up.
assert len(token_count) == len(buggy) == len(fixed) == len(bug_type), \
    "token_count, buggy, fixed and bug_type must have the same length"

kept_rows = [
    row
    for row in zip(token_count, buggy, fixed, bug_type)
    # Same predicate as before: a sample is dropped when its count is > limit.
    if not row[0] > MAX_TOKEN_COUNT
]

if kept_rows:
    token_count, buggy, fixed, bug_type = map(list, zip(*kept_rows))
else:
    # zip(*[]) yields nothing to unpack, so handle "everything filtered" explicitly.
    token_count, buggy, fixed, bug_type = [], [], [], []

del kept_rows

In [10]:
import json
import pickle
import os
from datasets import load_dataset

def pretokenize_in_chunks(buggy, fixed, tokenizer, chunk_size=10000, save_dir="tokenized_chunks"):
    """Tokenize paired buggy/fixed samples chunk by chunk and pickle each chunk.

    For every chunk, the buggy samples followed by the fixed samples are passed
    to ``tokenizer`` in one call, and the resulting ``input_ids`` and
    ``attention_mask`` are pickled to ``<save_dir>/chunk_NNNN.pkl`` together
    with the number of sample pairs in the chunk (``chunk_size``) and the index
    of its first sample (``chunk_idx``). A ``chunk_info.json`` file records the
    total sample count, the chunk size and the number of chunks.

    The function is resumable: work is skipped only when ``chunk_info.json``
    matches the current request and the chunk file is present. A directory
    left behind by an interrupted run is therefore completed rather than
    mistaken for a finished one. Chunk files are written to a temporary name
    and renamed into place so that an interruption cannot leave a truncated
    ``chunk_NNNN.pkl`` behind.

    Args:
        buggy: Sized, sliceable sequence of buggy source strings.
        fixed: Sequence of fixed source strings, same length as ``buggy``.
        tokenizer: Callable invoked as
            ``tokenizer(texts, padding=True, truncation=True, max_length=4000,
            return_tensors="pt")``; its result is indexed with ``'input_ids'``
            and ``'attention_mask'``.
        chunk_size: Number of sample pairs per chunk; must be positive.
        save_dir: Directory the chunk files are written to.

    Returns:
        ``save_dir``.

    Raises:
        ValueError: If ``chunk_size`` is not positive or the two inputs differ
            in length.
    """
    if chunk_size <= 0:
        raise ValueError(f"chunk_size must be a positive integer, got {chunk_size}")

    total_samples = len(buggy)
    if len(fixed) != total_samples:
        raise ValueError(
            f"buggy and fixed must have the same length, got {total_samples} and {len(fixed)}"
        )

    chunk_info = {
        'total_samples': total_samples,
        'chunk_size': chunk_size,
        'num_chunks': (total_samples + chunk_size - 1) // chunk_size
    }
    info_path = os.path.join(save_dir, 'chunk_info.json')
    chunk_paths = [
        os.path.join(save_dir, f'chunk_{chunk_number:04d}.pkl')
        for chunk_number in range(chunk_info['num_chunks'])
    ]

    # Existing chunks are only trustworthy if they were produced for the same
    # sample count and chunk size as this call.
    previous_info = None
    if os.path.isfile(info_path):
        try:
            with open(info_path) as f:
                previous_info = json.load(f)
        except (OSError, ValueError):
            # Unreadable or truncated metadata: treat the directory as stale.
            previous_info = None
    reusable = previous_info == chunk_info

    if reusable and all(os.path.isfile(path) for path in chunk_paths):
        print(f"Tokenized chunks already exist in {save_dir}")
        return save_dir

    os.makedirs(save_dir, exist_ok=True)

    if not reusable:
        # Remove chunk files this run is going to rewrite anyway, *before* the
        # new metadata is written; otherwise an interruption could leave stale
        # chunks next to matching metadata and they would be reused later.
        for path in chunk_paths:
            if os.path.exists(path):
                os.remove(path)

    print(f"Tokenizing {total_samples} samples in chunks of {chunk_size}...")

    with open(info_path, 'w') as f:
        json.dump(chunk_info, f)

    for chunk_number, chunk_idx in enumerate(range(0, total_samples, chunk_size)):
        chunk_file = chunk_paths[chunk_number]
        if os.path.isfile(chunk_file):
            # Only reachable when resuming a run with identical settings.
            print(f"Skipping chunk {chunk_number + 1}/{chunk_info['num_chunks']}: already tokenized")
            continue

        chunk_end = min(chunk_idx + chunk_size, total_samples)
        current_chunk_size = chunk_end - chunk_idx

        print(f"Tokenizing chunk {chunk_idx//chunk_size + 1}/{chunk_info['num_chunks']}: samples {chunk_idx}-{chunk_end-1}")

        # Get current chunk text
        buggy_chunk = buggy[chunk_idx:chunk_end]
        fixed_chunk = fixed[chunk_idx:chunk_end]

        # Buggy samples first, then the fixed ones: the first
        # `current_chunk_size` rows of the encodings are the buggy samples.
        chunk_text = list(buggy_chunk) + list(fixed_chunk)

        # NOTE(review): padding=True presumably pads every row to the longest
        # sequence in this call, which can be large for big chunks -- confirm
        # memory use is acceptable for the chosen chunk_size.
        chunk_encodings = tokenizer(
            chunk_text,
            padding=True,
            truncation=True,
            max_length=4000,
            return_tensors="pt"
        )

        chunk_data = {
            'input_ids': chunk_encodings['input_ids'],
            'attention_mask': chunk_encodings['attention_mask'],
            'chunk_size': current_chunk_size,
            'chunk_idx': chunk_idx
        }

        # Write under a temporary name and rename so that the final file name
        # only ever refers to a fully written pickle.
        tmp_file = chunk_file + '.tmp'
        with open(tmp_file, 'wb') as f:
            pickle.dump(chunk_data, f)
        os.replace(tmp_file, chunk_file)

        # Release the chunk's buffers before the next one is built.
        del chunk_encodings, chunk_data, chunk_text, buggy_chunk, fixed_chunk
        gc.collect()

    print(f"Tokenization complete! Saved {chunk_info['num_chunks']} chunks to {save_dir}")
    return save_dir

# Destination directory for the pickled token chunks.
save_path = "/mimer/NOBACKUP/groups/naiss2025-5-243/tokenized_chunks"

# Tokenize the filtered buggy/fixed pairs, 20000 pairs per chunk file.
tokenized_dir = pretokenize_in_chunks(
    buggy=buggy,
    fixed=fixed,
    tokenizer=tokenizer,
    chunk_size=20000,
    save_dir=save_path,
)
print(f"Tokenized data saved to: {tokenized_dir}")

Tokenizing 714238 samples in chunks of 20000...
Tokenizing chunk 1/36: samples 0-19999


KeyboardInterrupt: 