In [1]:
from datasets import load_dataset

# Fetch the bug/fix pair collection from the Hub and, in the same expression,
# discard every row whose "language" field equals "tests".
dataset = load_dataset(
    "NicholasOgenstad/my-runbugrun-dataset",
    data_files="runbugrun_all_pairs_with_language.json",
    split="train",
).filter(lambda row: row["language"] != "tests")

In [2]:
# Pull the two source-code columns out of the filtered dataset; the i-th entry
# of each column comes from the same dataset row.
buggy, fixed = dataset['buggy_code'], dataset['fixed_code']

In [3]:
import torch
from datasets import load_dataset
from transformers import AutoTokenizer, AutoModel
import numpy as np
import pandas as pd
import random
import matplotlib.pyplot as plt
from sklearn.cluster import MiniBatchKMeans
from safetensors.torch import save_file
import math

In [4]:
# Pick the compute device first: GPU when CUDA is available, otherwise CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Load the tokenizer and encoder for the chosen checkpoint; switch the encoder
# to evaluation mode right away.
model_id = "answerdotai/ModernBERT-large"
tokenizer = AutoTokenizer.from_pretrained(model_id)
model = AutoModel.from_pretrained(model_id).eval()

# Move the weights to the device. Kept as the last expression so the notebook
# displays the module structure.
model.to(device)

tokenizer_config.json: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/694 [00:00<?, ?B/s]

config.json: 0.00B [00:00, ?B/s]

2025-07-15 23:07:14.633205: I tensorflow/core/util/port.cc:113] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2025-07-15 23:07:14.712246: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2025-07-15 23:07:14.712279: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2025-07-15 23:07:14.715907: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


model.safetensors:   0%|          | 0.00/1.58G [00:00<?, ?B/s]

ModernBertModel(
  (embeddings): ModernBertEmbeddings(
    (tok_embeddings): Embedding(50368, 1024, padding_idx=50283)
    (norm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
    (drop): Dropout(p=0.0, inplace=False)
  )
  (layers): ModuleList(
    (0): ModernBertEncoderLayer(
      (attn_norm): Identity()
      (attn): ModernBertAttention(
        (Wqkv): Linear(in_features=1024, out_features=3072, bias=False)
        (rotary_emb): ModernBertRotaryEmbedding()
        (Wo): Linear(in_features=1024, out_features=1024, bias=False)
        (out_drop): Identity()
      )
      (mlp_norm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
      (mlp): ModernBertMLP(
        (Wi): Linear(in_features=1024, out_features=5248, bias=False)
        (act): GELUActivation()
        (drop): Dropout(p=0.0, inplace=False)
        (Wo): Linear(in_features=2624, out_features=1024, bias=False)
      )
    )
    (1-27): 27 x ModernBertEncoderLayer(
      (attn_norm): LayerNorm((1024,), eps

In [8]:
import json
import pickle
import os
from datasets import load_dataset
import gc

def pretokenize_in_chunks(buggy, fixed, tokenizer, chunk_size=10000, save_dir="tokenized_chunks"):
    """Tokenize paired buggy/fixed samples chunk by chunk and pickle each chunk.

    For every slice of ``chunk_size`` samples, the buggy texts followed by the
    fixed texts are passed to ``tokenizer`` in a single call, and the resulting
    ``input_ids`` / ``attention_mask`` are written to
    ``<save_dir>/chunk_NNNN.pkl``. A ``chunk_info.json`` file describing the
    run (total samples, chunk size, number of chunks) is written first.

    Args:
        buggy: Sliceable sequence of buggy source strings.
        fixed: Sliceable sequence of fixed source strings; must have the same
            length as ``buggy`` so that sample ``i`` of both forms a pair.
        tokenizer: Callable invoked as
            ``tokenizer(texts, padding=True, truncation=True, max_length=4000,
            return_tensors="pt")`` and returning a mapping with ``input_ids``
            and ``attention_mask`` entries.
        chunk_size: Number of sample pairs per chunk file; must be positive.
        save_dir: Directory that receives the output files (created if needed).

    Returns:
        ``save_dir``, unchanged.

    Raises:
        ValueError: If ``chunk_size`` is not positive, or if ``buggy`` and
            ``fixed`` differ in length.
    """
    # Validate before touching the filesystem. Without these checks a negative
    # chunk_size silently writes nothing, zero fails with an opaque
    # ZeroDivisionError, and a length mismatch produces chunk files whose
    # stored 'chunk_size' no longer marks the buggy/fixed boundary.
    if chunk_size <= 0:
        raise ValueError(f"chunk_size must be a positive integer, got {chunk_size}")

    total_samples = len(buggy)
    if len(fixed) != total_samples:
        raise ValueError(
            f"buggy and fixed must have the same length, got {total_samples} and {len(fixed)}"
        )

    os.makedirs(save_dir, exist_ok=True)

    print(f"Tokenizing {total_samples} samples in chunks of {chunk_size}...")

    chunk_info = {
        'total_samples': total_samples,
        'chunk_size': chunk_size,
        # Ceiling division: a final partial chunk still counts as one chunk.
        'num_chunks': (total_samples + chunk_size - 1) // chunk_size
    }

    with open(os.path.join(save_dir, 'chunk_info.json'), 'w') as f:
        json.dump(chunk_info, f)

    for chunk_number, chunk_idx in enumerate(range(0, total_samples, chunk_size)):
        chunk_end = min(chunk_idx + chunk_size, total_samples)
        current_chunk_size = chunk_end - chunk_idx

        print(f"Tokenizing chunk {chunk_number + 1}/{chunk_info['num_chunks']}: samples {chunk_idx}-{chunk_end-1}")

        # Buggy samples first, then their fixed counterparts, so the first
        # `current_chunk_size` input texts are buggy and the rest are fixed.
        chunk_text = list(buggy[chunk_idx:chunk_end]) + list(fixed[chunk_idx:chunk_end])

        # padding=True is applied per tokenizer call, i.e. per chunk.
        # NOTE(review): padded width may therefore differ between chunk files
        # - confirm downstream loaders do not assume a common width.
        chunk_encodings = tokenizer(
            chunk_text,
            padding=True,
            truncation=True,
            max_length=4000,
            return_tensors="pt"
        )

        chunk_data = {
            'input_ids': chunk_encodings['input_ids'],
            'attention_mask': chunk_encodings['attention_mask'],
            'chunk_size': current_chunk_size,  # number of buggy (== fixed) samples
            'chunk_idx': chunk_idx             # index of the chunk's first sample
        }

        # NOTE: pickle files must only be loaded from trusted locations;
        # unpickling can execute arbitrary code.
        chunk_file = os.path.join(save_dir, f'chunk_{chunk_number:04d}.pkl')
        with open(chunk_file, 'wb') as f:
            pickle.dump(chunk_data, f)

        # Drop the large per-chunk objects before the next iteration.
        del chunk_encodings, chunk_data, chunk_text
        gc.collect()

    print(f"Tokenization complete! Saved {chunk_info['num_chunks']} chunks to {save_dir}")
    return save_dir

# Output directory for the pickled chunks.
save_path = "/mimer/NOBACKUP/groups/naiss2025-5-243/tokenized_chunks2"

# Tokenize every buggy/fixed pair, 20000 pairs per chunk file, then report
# the directory the function returned.
tokenized_dir = pretokenize_in_chunks(
    buggy=buggy,
    fixed=fixed,
    tokenizer=tokenizer,
    chunk_size=20000,
    save_dir=save_path,
)
print(f"Tokenized data saved to: {tokenized_dir}")

Tokenizing 456749 samples in chunks of 20000...
Tokenizing chunk 1/23: samples 0-19999
Tokenizing chunk 2/23: samples 20000-39999
Tokenizing chunk 3/23: samples 40000-59999
Tokenizing chunk 4/23: samples 60000-79999
Tokenizing chunk 5/23: samples 80000-99999
Tokenizing chunk 6/23: samples 100000-119999
Tokenizing chunk 7/23: samples 120000-139999
Tokenizing chunk 8/23: samples 140000-159999
Tokenizing chunk 9/23: samples 160000-179999
Tokenizing chunk 10/23: samples 180000-199999
Tokenizing chunk 11/23: samples 200000-219999
Tokenizing chunk 12/23: samples 220000-239999
Tokenizing chunk 13/23: samples 240000-259999
Tokenizing chunk 14/23: samples 260000-279999
Tokenizing chunk 15/23: samples 280000-299999
Tokenizing chunk 16/23: samples 300000-319999
Tokenizing chunk 17/23: samples 320000-339999
Tokenizing chunk 18/23: samples 340000-359999
Tokenizing chunk 19/23: samples 360000-379999
Tokenizing chunk 20/23: samples 380000-399999
Tokenizing chunk 21/23: samples 400000-419999
Tokenizin