In [1]:
import json
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import RobertaTokenizer, GPT2Tokenizer, EncoderDecoderModel, AdamW

In [3]:
import json

def load_and_preprocess_data(file_path, limit=20000):
    with open(file_path, "r", encoding="utf-8") as f:
        data = json.load(f)
    return data[:limit]

# Replace with actual path
train_file = "/kaggle/input/pyfixai/train.jsonl"
train_data = load_and_preprocess_data(train_file)

print(f"Training Samples: {len(train_data)}")
print(train_data[0])

Training Samples: 20000
{'src_id': 'p00001_s631177546', 'src': ['from', 'sys', 'import', 'stdin', 'NEW_LINE', 'x', '=', '[', 'int', '(', 'input', '(', ')', ')', 'for', 'i', 'in', 'range', '(', '10', ')', ']', 'NEW_LINE', 'x', '.', 'reverse', '(', ')', 'NEW_LINE', 'for', 'i', 'in', 'range', '(', '3', ')', ':', 'NEW_LINE', 'INDENT', 'print', '(', 'i', ')', 'NEW_LINE', 'DEDENT'], 'src_verdict': 'Wrong Answer', 'tgt': ['from', 'sys', 'import', 'stdin', 'NEW_LINE', 'x', '=', '[', 'int', '(', 'input', '(', ')', ')', 'for', 'i', 'in', 'range', '(', '10', ')', ']', 'NEW_LINE', 'x', '.', 'sort', '(', 'reverse', '=', 'True', ')', 'NEW_LINE', 'for', 'i', 'in', 'range', '(', '3', ')', ':', 'NEW_LINE', 'INDENT', 'print', '(', 'x', '[', 'i', ']', ')', 'NEW_LINE', 'DEDENT'], 'tgt_id': 'p00001_s854661751'}


In [6]:
valid_file = "/kaggle/input/pyfixai/valid.jsonl"
valid_data = load_and_preprocess_data(valid_file)

print(f"Valid Samples: {len(valid_data)}")
print(valid_data[0])

Valid Samples: 20000
{'src_id': 'p02548_s429693143', 'src': ['N', '=', 'int', '(', 'input', '(', ')', ')', 'NEW_LINE', 'K', '=', '0', 'NEW_LINE', 'for', 'C', 'in', 'range', '(', '1', ',', 'N', ')', ':', 'NEW_LINE', 'INDENT', 'K', '+=', 'sum', '(', 'A', '*', 'B', '==', 'N', '-', 'C', 'for', 'A', 'in', 'range', '(', '1', ',', 'N', ')', 'for', 'B', 'in', 'range', '(', '1', ',', 'N', ')', ')', 'NEW_LINE', 'DEDENT', 'print', '(', 'K', ')', 'NEW_LINE'], 'src_verdict': 'Time Limit Exceeded', 'tgt': ['import', 'math', 'NEW_LINE', 'N', '=', 'int', '(', 'input', '(', ')', ')', 'NEW_LINE', 'A', '=', '[', '0', ']', '*', '(', 'N', ')', 'NEW_LINE', 'for', 'i', 'in', 'range', '(', '0', ',', 'N', ')', ':', 'NEW_LINE', 'INDENT', 'A', '[', 'i', ']', '=', 'math', '.', 'floor', '(', '(', 'N', '-', '1', ')', '/', '(', 'i', '+', '1', ')', ')', 'NEW_LINE', 'DEDENT', 'print', '(', 'sum', '(', 'A', ')', ')', 'NEW_LINE'], 'tgt_id': 'p02548_s184073642'}


In [3]:
class CodeFixDataset(Dataset):
    def __init__(self, data, encoder_tokenizer, decoder_tokenizer, max_length=512):
        self.data = data
        self.encoder_tokenizer = encoder_tokenizer
        self.decoder_tokenizer = decoder_tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        sample = self.data[idx]
        buggy_code = " ".join(sample['src']).replace("NEW_LINE", "\n").replace("INDENT", "").replace("DEDENT", "")
        fixed_code = " ".join(sample['tgt']).replace("NEW_LINE", "\n").replace("INDENT", "").replace("DEDENT", "")

        inputs = self.encoder_tokenizer(
            buggy_code,
            padding="max_length",
            truncation=True,
            max_length=self.max_length,
            return_tensors="pt"
        )

        labels = self.decoder_tokenizer(
            fixed_code,
            padding="max_length",
            truncation=True,
            max_length=self.max_length,
            return_tensors="pt"
        )

        labels_input_ids = labels["input_ids"].squeeze()
        labels_input_ids[labels_input_ids == decoder_tokenizer.pad_token_id] = -100

        return {
            "input_ids": inputs["input_ids"].squeeze(),
            "attention_mask": inputs["attention_mask"].squeeze(),
            "labels": labels_input_ids
        }


In [8]:
encoder_tokenizer = RobertaTokenizer.from_pretrained("microsoft/codebert-base")
decoder_tokenizer = GPT2Tokenizer.from_pretrained("gpt2")

# GPT-2 doesn't have a pad token by default
decoder_tokenizer.pad_token = decoder_tokenizer.eos_token

# Create datasets and loaders
train_dataset = CodeFixDataset(train_data, encoder_tokenizer, decoder_tokenizer)
val_dataset = CodeFixDataset(valid_data, encoder_tokenizer, decoder_tokenizer)
train_loader = DataLoader(train_dataset, batch_size=4, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=4, shuffle=False)

print(f"Train Batches: {len(train_loader)}, Validation Batches: {len(val_loader)}")

# Load encoder-decoder model
model = EncoderDecoderModel.from_encoder_decoder_pretrained(
    "microsoft/codebert-base", "gpt2"
)

model.config.decoder_start_token_id = decoder_tokenizer.bos_token_id
model.config.pad_token_id = decoder_tokenizer.pad_token_id
model.config.vocab_size = model.config.decoder.vocab_size

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/150 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/498 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/665 [00:00<?, ?B/s]

Train Batches: 5000, Validation Batches: 5000


pytorch_model.bin:   0%|          | 0.00/499M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/548M [00:00<?, ?B/s]

Some weights of GPT2LMHeadModel were not initialized from the model checkpoint at gpt2 and are newly initialized: ['h.0.crossattention.c_attn.bias', 'h.0.crossattention.c_attn.weight', 'h.0.crossattention.c_proj.bias', 'h.0.crossattention.c_proj.weight', 'h.0.crossattention.q_attn.bias', 'h.0.crossattention.q_attn.weight', 'h.0.ln_cross_attn.bias', 'h.0.ln_cross_attn.weight', 'h.1.crossattention.c_attn.bias', 'h.1.crossattention.c_attn.weight', 'h.1.crossattention.c_proj.bias', 'h.1.crossattention.c_proj.weight', 'h.1.crossattention.q_attn.bias', 'h.1.crossattention.q_attn.weight', 'h.1.ln_cross_attn.bias', 'h.1.ln_cross_attn.weight', 'h.10.crossattention.c_attn.bias', 'h.10.crossattention.c_attn.weight', 'h.10.crossattention.c_proj.bias', 'h.10.crossattention.c_proj.weight', 'h.10.crossattention.q_attn.bias', 'h.10.crossattention.q_attn.weight', 'h.10.ln_cross_attn.bias', 'h.10.ln_cross_attn.weight', 'h.11.crossattention.c_attn.bias', 'h.11.crossattention.c_attn.weight', 'h.11.crossat

generation_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

EncoderDecoderModel(
  (encoder): RobertaModel(
    (embeddings): RobertaEmbeddings(
      (word_embeddings): Embedding(50265, 768, padding_idx=1)
      (position_embeddings): Embedding(514, 768, padding_idx=1)
      (token_type_embeddings): Embedding(1, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): RobertaEncoder(
      (layer): ModuleList(
        (0-11): 12 x RobertaLayer(
          (attention): RobertaAttention(
            (self): RobertaSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): RobertaSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNo

In [9]:
# Optimizer
optimizer = AdamW(model.parameters(), lr=5e-5)
best_val_loss = float("inf")

# Training loop
for epoch in range(5):
    print(f"Starting Epoch: {epoch+1}")
    model.train()
    train_loss = 0

    for i, batch in enumerate(train_loader):
        if i%500 == 0:
            print(i)
            
        optimizer.zero_grad()

        input_ids = batch["input_ids"].to(device)
        attention_mask = batch["attention_mask"].to(device)
        labels = batch["labels"].to(device)

        outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss
        train_loss += loss.item()
        loss.backward()
        optimizer.step()

    avg_train_loss = train_loss / len(train_loader)
    print(f"Epoch {epoch+1}, Train Loss: {avg_train_loss:.4f}")

    # Validation
    model.eval()
    val_loss = 0
    with torch.no_grad():
        for i, batch in enumerate(val_loader):
            if i%500 == 0:
                print(i)
                
            input_ids = batch["input_ids"].to(device)
            attention_mask = batch["attention_mask"].to(device)
            labels = batch["labels"].to(device)

            outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
            val_loss += outputs.loss.item()

    avg_val_loss = val_loss / len(val_loader)
    print(f"Epoch {epoch+1}, Validation Loss: {avg_val_loss:.4f}")

    if avg_val_loss < best_val_loss:
        best_val_loss = avg_val_loss
        model.save_pretrained("best_codebert_gpt2_model")
        encoder_tokenizer.save_pretrained("best_codebert_gpt2_model")
        decoder_tokenizer.save_pretrained("best_codebert_gpt2_model")
        print("Saved new best model!")



Starting Epoch: 1
0


  decoder_attention_mask = decoder_input_ids.new_tensor(decoder_input_ids != self.config.pad_token_id)


500
1000
1500
2000
2500
3000
3500
4000
4500
Epoch 1, Train Loss: 0.7695
0
500
1000
1500
2000
2500
3000
3500
4000
4500
Epoch 1, Validation Loss: 1.1680
Saved new best model!
Starting Epoch: 2
0


  decoder_attention_mask = decoder_input_ids.new_tensor(decoder_input_ids != self.config.pad_token_id)


500
1000
1500
2000
2500
3000
3500
4000
4500
Epoch 2, Train Loss: 0.4079
0
500
1000
1500
2000
2500
3000
3500
4000
4500
Epoch 2, Validation Loss: 1.1628
Saved new best model!
Starting Epoch: 3
0


  decoder_attention_mask = decoder_input_ids.new_tensor(decoder_input_ids != self.config.pad_token_id)


500
1000
1500
2000
2500
3000
3500
4000
4500
Epoch 3, Train Loss: 0.2793
0
500
1000
1500
2000
2500
3000
3500
4000
4500
Epoch 3, Validation Loss: 1.2014
Starting Epoch: 4
0
500
1000
1500
2000
2500
3000
3500
4000
4500
Epoch 4, Train Loss: 0.2046
0
500
1000
1500
2000
2500
3000
3500
4000
4500
Epoch 4, Validation Loss: 1.2364
Starting Epoch: 5
0
500
1000
1500
2000
2500
3000
3500
4000
4500
Epoch 5, Train Loss: 0.1562
0
500
1000
1500
2000
2500
3000
3500
4000
4500
Epoch 5, Validation Loss: 1.2736


In [11]:
!zip -r /kaggle/working/output_folder.zip /kaggle/working/best_codebert_gpt2_model

updating: kaggle/working/best_codebert_gpt2_model/ (stored 0%)
updating: kaggle/working/best_codebert_gpt2_model/merges.txt (deflated 53%)
updating: kaggle/working/best_codebert_gpt2_model/generation_config.json (deflated 24%)
updating: kaggle/working/best_codebert_gpt2_model/vocab.json (deflated 68%)
updating: kaggle/working/best_codebert_gpt2_model/model.safetensors (deflated 7%)
updating: kaggle/working/best_codebert_gpt2_model/config.json (deflated 76%)
updating: kaggle/working/best_codebert_gpt2_model/tokenizer_config.json (deflated 56%)
updating: kaggle/working/best_codebert_gpt2_model/special_tokens_map.json (deflated 74%)


In [12]:
!ls -lh /kaggle/working/

total 984M
drwxr-xr-x 2 root root 4.0K Apr  6 08:45 best_codebert_gpt2_model
-rw-r--r-- 1 root root 984M Apr  6 13:39 output_folder.zip


In [14]:
import json
import requests
from google.colab import auth  # works in Kaggle too
import google.auth
from google.auth.transport.requests import Request




auth.authenticate_user()
creds, _ = google.auth.default()
creds.refresh(Request())
access_token = creds.token

In [17]:
file_path = "/kaggle/working/output_folder.zip"  # Change this
file_name = "output_folder.zip"

headers = {
    "Authorization": f"Bearer {access_token}"
}

metadata = {
    "name": file_name,
    "mimeType": "application/zip"
}

files = {
    "data": ("metadata", json.dumps(metadata), "application/json"),
    "file": open(file_path, "rb")
}

upload_url = "https://www.googleapis.com/upload/drive/v3/files?uploadType=multipart"

res = requests.post(upload_url, headers=headers, files=files)
res.raise_for_status()

print("✅ Upload successful!")
print("📁 File ID:", res.json()["id"])

✅ Upload successful!
📁 File ID: 1yCQVs_tisx7_SmBftMte6MkxexQ5FDIG


In [4]:
def load_and_preprocess_data(file_path, limit=1):
    with open(file_path, "r", encoding="utf-8") as f:
        data = json.load(f)
    return data[:limit]

test_data = load_and_preprocess_data("/kaggle/input/pyfix-test/test.jsonl", limit=1)
encoder_tokenizer = RobertaTokenizer.from_pretrained("microsoft/codebert-base")
decoder_tokenizer = GPT2Tokenizer.from_pretrained("gpt2")

# GPT-2 doesn't have a pad token by default
decoder_tokenizer.pad_token = decoder_tokenizer.eos_token

# Create datasets and loaders
test_dataset = CodeFixDataset(test_data, encoder_tokenizer, decoder_tokenizer)
test_loader = DataLoader(test_dataset, batch_size=4, shuffle=True)

model = EncoderDecoderModel.from_pretrained("/kaggle/working/best_codebert_gpt2_model")
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)
model.eval()

def test_model(input_code: str):
    inputs = encoder_tokenizer(input_code, return_tensors="pt", padding=True, truncation=True).to(device)
    
    with torch.no_grad():
        output_ids = model.generate(
            **inputs,
            max_length=128,
            num_beams=4,
            early_stopping=True
        )
    
    decoded_output = decoder_tokenizer.decode(output_ids[0], skip_special_tokens=True)
    return decoded_output

test_snippet = test_data[0]['src']
fixed_code = test_model(test_snippet)
print("Fixed code:", fixed_code)

OSError: Incorrect path_or_model_id: '/kaggle/working/best_codebert_gpt2_model'. Please provide either the path to a local folder or the repo_id of a model on the Hub.