In [None]:
import json
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import RobertaTokenizer, GPT2Tokenizer, EncoderDecoderModel, AdamW

In [None]:
import json

def load_and_preprocess_data(file_path, limit=20000):
    """Load at most ``limit`` samples from a JSON or JSON Lines file.

    The file is first parsed as a single JSON document (e.g. one big array).
    If that fails, it is re-read as JSON Lines: one JSON value per non-empty
    line, which is what a ``.jsonl`` extension normally denotes.

    Args:
        file_path: Path to the UTF-8 encoded data file.
        limit: Slice bound applied to the loaded records (``None`` keeps all).

    Returns:
        A list containing the first ``limit`` records.

    Raises:
        json.JSONDecodeError: If the file is neither a valid JSON document
            nor valid JSON Lines.
    """
    with open(file_path, "r", encoding="utf-8") as f:
        try:
            data = json.load(f)
        except json.JSONDecodeError:
            # Not a single JSON document: rewind and parse line by line.
            f.seek(0)
            # Only stop early for a plain non-negative cap; other limits are
            # left to the final slice so slicing semantics stay unchanged.
            stop_early = limit is not None and limit >= 0
            data = []
            for line in f:
                if stop_early and len(data) >= limit:
                    break
                line = line.strip()
                if line:
                    data.append(json.loads(line))

    # A JSON Lines file holding exactly one record parses as a bare value.
    if not isinstance(data, list):
        data = [data]
    return data[:limit]

# Path of the training split; replace with the actual location if it differs.
train_file = "/kaggle/input/pyfixai/train.jsonl"
# Load the training samples (capped by load_and_preprocess_data's default limit).
train_data = load_and_preprocess_data(train_file)

# Sanity check: report how many samples were loaded and show the first one.
print(f"Training Samples: {len(train_data)}")
print(train_data[0])

In [None]:
# Path of the validation split.
valid_file = "/kaggle/input/pyfixai/valid.jsonl"
# Load the validation samples (capped by load_and_preprocess_data's default limit).
valid_data = load_and_preprocess_data(valid_file)

# Sanity check: report how many samples were loaded and show the first one.
print(f"Valid Samples: {len(valid_data)}")
print(valid_data[0])

In [None]:
class CodeFixDataset(Dataset):
    """Pairs of buggy / fixed code prepared for an encoder-decoder model.

    Each sample is a mapping with ``'src'`` and ``'tgt'`` token sequences.
    The tokens are joined with spaces, ``NEW_LINE`` markers become real
    newlines and ``INDENT`` / ``DEDENT`` markers are dropped before the text
    is tokenized.

    Args:
        data: Indexable collection of samples.
        encoder_tokenizer: Tokenizer applied to the buggy code (``'src'``).
        decoder_tokenizer: Tokenizer applied to the fixed code (``'tgt'``).
        max_length: Length every sequence is padded / truncated to.
    """

    def __init__(self, data, encoder_tokenizer, decoder_tokenizer, max_length=512):
        self.data = data
        self.encoder_tokenizer = encoder_tokenizer
        self.decoder_tokenizer = decoder_tokenizer
        self.max_length = max_length

    def __len__(self):
        """Return the number of samples."""
        return len(self.data)

    @staticmethod
    def _detokenize(tokens):
        """Turn a token sequence with layout markers back into source text."""
        return " ".join(tokens).replace("NEW_LINE", "\n").replace("INDENT", "").replace("DEDENT", "")

    def __getitem__(self, idx):
        """Return ``input_ids``, ``attention_mask`` and ``labels`` for sample ``idx``.

        Padding positions in ``labels`` are set to ``-100``.
        """
        sample = self.data[idx]
        buggy_code = self._detokenize(sample['src'])
        fixed_code = self._detokenize(sample['tgt'])

        inputs = self.encoder_tokenizer(
            buggy_code,
            padding="max_length",
            truncation=True,
            max_length=self.max_length,
            return_tensors="pt"
        )

        # Use the tokenizer stored on the instance, not a notebook-level
        # global, so the dataset works with whatever tokenizer it was given.
        labels = self.decoder_tokenizer(
            fixed_code,
            padding="max_length",
            truncation=True,
            max_length=self.max_length,
            return_tensors="pt"
        )

        # squeeze(0) removes only the batch axis added by return_tensors="pt".
        labels_input_ids = labels["input_ids"].squeeze(0)
        # Locate padding through the attention mask rather than by token id:
        # when the pad token is an alias of another token (e.g. EOS), id-based
        # masking would also blank out genuine occurrences of that token.
        labels_input_ids[labels["attention_mask"].squeeze(0) == 0] = -100

        return {
            "input_ids": inputs["input_ids"].squeeze(0),
            "attention_mask": inputs["attention_mask"].squeeze(0),
            "labels": labels_input_ids
        }


In [None]:
# CodeBERT's tokenizer prepares the encoder input; GPT-2's tokenizer prepares the decoder targets.
encoder_tokenizer = RobertaTokenizer.from_pretrained("microsoft/codebert-base")
decoder_tokenizer = GPT2Tokenizer.from_pretrained("gpt2")

# GPT-2 doesn't have a pad token by default, so its end-of-sequence token is reused for padding
decoder_tokenizer.pad_token = decoder_tokenizer.eos_token

# Create datasets and loaders (training batches are shuffled, validation batches are not)
train_dataset = CodeFixDataset(train_data, encoder_tokenizer, decoder_tokenizer)
val_dataset = CodeFixDataset(valid_data, encoder_tokenizer, decoder_tokenizer)
train_loader = DataLoader(train_dataset, batch_size=4, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=4, shuffle=False)

print(f"Train Batches: {len(train_loader)}, Validation Batches: {len(val_loader)}")

# Load encoder-decoder model: CodeBERT checkpoint as encoder, GPT-2 checkpoint as decoder
model = EncoderDecoderModel.from_encoder_decoder_pretrained(
    "microsoft/codebert-base", "gpt2"
)

# Express the special-token ids and vocabulary size in terms of the decoder side.
model.config.decoder_start_token_id = decoder_tokenizer.bos_token_id
model.config.pad_token_id = decoder_tokenizer.pad_token_id
model.config.vocab_size = model.config.decoder.vocab_size

# Run on the GPU when CUDA is available, otherwise on the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

In [None]:
# Optimizer
# NOTE(review): this is the AdamW imported from transformers at the top of the
# notebook; upstream recommends torch.optim.AdamW instead - consider switching.
optimizer = AdamW(model.parameters(), lr=5e-5)
best_val_loss = float("inf")
# Directory holding the checkpoint with the lowest validation loss seen so far.
best_model_dir = "best_codebert_gpt2_model"

# Training loop: one pass over train_loader, then one over val_loader, per epoch.
for epoch in range(5):
    print(f"Starting Epoch: {epoch+1}")
    model.train()
    train_loss = 0

    for i, batch in enumerate(train_loader):
        # Lightweight progress indicator.
        if i % 500 == 0:
            print(i)

        optimizer.zero_grad()

        input_ids = batch["input_ids"].to(device)
        attention_mask = batch["attention_mask"].to(device)
        labels = batch["labels"].to(device)

        # Passing labels makes the model return the training loss directly.
        outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss
        train_loss += loss.item()
        loss.backward()
        optimizer.step()

    avg_train_loss = train_loss / len(train_loader)
    print(f"Epoch {epoch+1}, Train Loss: {avg_train_loss:.4f}")

    # Validation: same forward pass without gradient tracking or weight updates.
    model.eval()
    val_loss = 0
    with torch.no_grad():
        for i, batch in enumerate(val_loader):
            if i % 500 == 0:
                print(i)

            input_ids = batch["input_ids"].to(device)
            attention_mask = batch["attention_mask"].to(device)
            labels = batch["labels"].to(device)

            outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
            val_loss += outputs.loss.item()

    avg_val_loss = val_loss / len(val_loader)
    print(f"Epoch {epoch+1}, Validation Loss: {avg_val_loss:.4f}")

    # Keep only the checkpoint that performs best on the validation set.
    if avg_val_loss < best_val_loss:
        best_val_loss = avg_val_loss
        model.save_pretrained(best_model_dir)
        # The two tokenizers write identically named files (vocab.json,
        # merges.txt, ...), so saving both into the same folder leaves only
        # the one saved last. The decoder tokenizer stays at the top level
        # (the files that survived before); the encoder tokenizer gets its
        # own sub-folder so it is no longer overwritten.
        encoder_tokenizer.save_pretrained(f"{best_model_dir}/encoder_tokenizer")
        decoder_tokenizer.save_pretrained(best_model_dir)
        print("Saved new best model!")

In [None]:
# Recursively pack the saved model directory into a single zip archive in /kaggle/working.
!zip -r /kaggle/working/output_folder.zip /kaggle/working/best_codebert_gpt2_model

In [None]:
# List the working directory with human-readable sizes (e.g. to check the archive was created).
!ls -lh /kaggle/working/

In [None]:
import json
import requests
from google.colab import auth  # works in Kaggle too
import google.auth
from google.auth.transport.requests import Request




# Sign in to Google for this notebook session.
auth.authenticate_user()
# Fetch the default credentials and refresh them so that a current token is available.
creds, _ = google.auth.default()
creds.refresh(Request())
# Token sent as the Bearer credential by the upload cell.
access_token = creds.token

In [None]:
# Archive to upload and the name it should get in Google Drive.
file_path = "/kaggle/working/output_folder.zip"  # Change this
file_name = "output_folder.zip"

# The access token obtained in the authentication cell authorises the request.
headers = {
    "Authorization": f"Bearer {access_token}"
}

# Drive file metadata, sent as the JSON part of the multipart request.
metadata = {
    "name": file_name,
    "mimeType": "application/zip"
}

upload_url = "https://www.googleapis.com/upload/drive/v3/files?uploadType=multipart"

# Open the archive in a context manager so the handle is closed even when the
# request fails; previously the file object was never closed.
with open(file_path, "rb") as archive:
    files = {
        "data": ("metadata", json.dumps(metadata), "application/json"),
        "file": archive
    }
    res = requests.post(upload_url, headers=headers, files=files)

# Fail loudly on any HTTP error status instead of printing a false success.
res.raise_for_status()

print(" Upload successful!")
print("File ID:", res.json()["id"])

In [None]:
def load_and_preprocess_data(file_path, limit=1):
    """Load at most ``limit`` samples from a JSON or JSON Lines file.

    NOTE(review): this redefines the loader declared earlier in the notebook
    with a different default ``limit``; consider keeping a single definition.

    The file is first parsed as a single JSON document (e.g. one big array).
    If that fails, it is re-read as JSON Lines: one JSON value per non-empty
    line, which is what a ``.jsonl`` extension normally denotes.

    Args:
        file_path: Path to the UTF-8 encoded data file.
        limit: Slice bound applied to the loaded records (``None`` keeps all).

    Returns:
        A list containing the first ``limit`` records.

    Raises:
        json.JSONDecodeError: If the file is neither a valid JSON document
            nor valid JSON Lines.
    """
    with open(file_path, "r", encoding="utf-8") as f:
        try:
            data = json.load(f)
        except json.JSONDecodeError:
            # Not a single JSON document: rewind and parse line by line.
            f.seek(0)
            # Only stop early for a plain non-negative cap; other limits are
            # left to the final slice so slicing semantics stay unchanged.
            stop_early = limit is not None and limit >= 0
            data = []
            for line in f:
                if stop_early and len(data) >= limit:
                    break
                line = line.strip()
                if line:
                    data.append(json.loads(line))

    # A JSON Lines file holding exactly one record parses as a bare value.
    if not isinstance(data, list):
        data = [data]
    return data[:limit]

# Load a single test sample for a quick smoke test of the trained model.
test_data = load_and_preprocess_data("/kaggle/input/pyfix-test/test.jsonl", limit=1)
encoder_tokenizer = RobertaTokenizer.from_pretrained("microsoft/codebert-base")
decoder_tokenizer = GPT2Tokenizer.from_pretrained("gpt2")

# GPT-2 doesn't have a pad token by default
decoder_tokenizer.pad_token = decoder_tokenizer.eos_token

# Create datasets and loaders
test_dataset = CodeFixDataset(test_data, encoder_tokenizer, decoder_tokenizer)
# Evaluation data is iterated in a fixed order (as val_loader is) so that
# results are reproducible from run to run.
test_loader = DataLoader(test_dataset, batch_size=4, shuffle=False)

# Reload the best checkpoint written by the training loop and switch to inference mode.
model = EncoderDecoderModel.from_pretrained("/kaggle/working/best_codebert_gpt2_model")
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)
model.eval()

def test_model(input_code: str):
    """Generate a candidate fix for ``input_code`` and return it as text.

    The snippet is encoded with the notebook-level ``encoder_tokenizer``,
    passed through ``model.generate`` using beam search, and the first
    generated sequence is decoded with ``decoder_tokenizer``.

    Args:
        input_code: Source code to repair.

    Returns:
        The decoded first generated sequence with special tokens removed.
    """
    encoded = encoder_tokenizer(
        input_code,
        return_tensors="pt",
        padding=True,
        truncation=True,
    ).to(device)

    # Inference only: gradients are not needed.
    with torch.no_grad():
        generated = model.generate(
            **encoded,
            max_length=128,
            num_beams=4,
            early_stopping=True,
        )

    first_sequence = generated[0]
    return decoder_tokenizer.decode(first_sequence, skip_special_tokens=True)

# 'src' holds a token sequence (CodeFixDataset joins it the same way). Passing
# the raw list to the tokenizer would treat every token as a separate batch
# item, so rebuild the source text first with the training-time normalisation.
raw_src = test_data[0]['src']
test_snippet = raw_src if isinstance(raw_src, str) else " ".join(raw_src)
test_snippet = test_snippet.replace("NEW_LINE", "\n").replace("INDENT", "").replace("DEDENT", "")
fixed_code = test_model(test_snippet)
print("Fixed code:", fixed_code)