In [None]:
# from datasets import Dataset
# import json

# # Load your dataset file (assuming JSONL or similar structure)
# path = "../dataset/data/python/processed_with_verdict/test.jsonl"
# limit = 10

# data = []
# with open(path, 'r', encoding='utf-8') as f:
#     for i, line in enumerate(f):
#         if i >= limit:
#             break
#         data.append(json.loads(line.strip()))

# # Convert 'src' and 'tgt' (token lists) into strings
# for entry in data:
#     print(entry)
#     entry["source"] = " ".join(entry["src"])
#     entry["target"] = " ".join(entry["tgt"])

# # Create HuggingFace Dataset
# hf_dataset = Dataset.from_list(data)

In [1]:
from datasets import Dataset

# Dummy data simulating (buggy_code, fixed_code) pairs
dummy_data = [
    {
        "src_id": "sample001_buggy",
        "src": ["print", "(", "'hello", "world'", ")", "NEW_LINE"],
        "src_verdict": "Wrong Answer",
        "tgt": ["print", "(", "'hello world'", ")", "NEW_LINE"],
        "tgt_id": "sample001_fixed"
    },
    {
        "src_id": "sample002_buggy",
        "src": ["for", "i", "in", "range", "(", "5", ")", ":", "NEW_LINE", "INDENT", "print", "(", "i", ")", "NEW_LINE", "DEDENT"],
        "src_verdict": "Wrong Answer",
        "tgt": ["for", "i", "in", "range", "(", "5", ")", ":", "NEW_LINE", "INDENT", "print", "(", "i", ")", "NEW_LINE", "DEDENT"],
        "tgt_id": "sample002_fixed"
    }
]

# Convert to HuggingFace Dataset
dataset = Dataset.from_list(dummy_data)


In [4]:

from transformers import AutoTokenizer

model_name = "Salesforce/codet5-small"
tokenizer = AutoTokenizer.from_pretrained(model_name)


def tokenize_function(example):
    input_text = f"fix: {' '.join(example['src'])}"
    target_text = ' '.join(example['tgt'])

    model_inputs = tokenizer(input_text, truncation=True, padding="max_length", max_length=128)
    labels = tokenizer(target_text, truncation=True, padding="max_length", max_length=128)

    model_inputs["labels"] = labels["input_ids"]
    return model_inputs


tokenized_dataset = dataset.map(tokenize_function)


Map:   0%|          | 0/2 [00:00<?, ? examples/s]

In [5]:
from transformers import T5ForConditionalGeneration, Trainer, TrainingArguments

model = T5ForConditionalGeneration.from_pretrained(model_name)

training_args = TrainingArguments(
    output_dir="./codet5-dummy-checkpoint",
    per_device_train_batch_size=2,
    num_train_epochs=5,
    logging_dir="./logs",
    logging_steps=1,
    save_strategy="no",  # avoid checkpointing for tiny test
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset,
)

trainer.train()





  0%|          | 0/5 [00:00<?, ?it/s]

Passing a tuple of `past_key_values` is deprecated and will be removed in Transformers v4.48.0. You should pass an instance of `EncoderDecoderCache` instead, e.g. `past_key_values=EncoderDecoderCache.from_legacy_cache(past_key_values)`.


{'loss': 12.1254, 'grad_norm': 279.8616638183594, 'learning_rate': 4e-05, 'epoch': 1.0}
{'loss': 4.2931, 'grad_norm': 235.23875427246094, 'learning_rate': 3e-05, 'epoch': 2.0}
{'loss': 4.1198, 'grad_norm': 191.2189178466797, 'learning_rate': 2e-05, 'epoch': 3.0}
{'loss': 2.993, 'grad_norm': 22.768264770507812, 'learning_rate': 1e-05, 'epoch': 4.0}
{'loss': 2.9348, 'grad_norm': 8.374885559082031, 'learning_rate': 0.0, 'epoch': 5.0}
{'train_runtime': 24.2825, 'train_samples_per_second': 0.412, 'train_steps_per_second': 0.206, 'train_loss': 5.2932213306427, 'epoch': 5.0}


TrainOutput(global_step=5, training_loss=5.2932213306427, metrics={'train_runtime': 24.2825, 'train_samples_per_second': 0.412, 'train_steps_per_second': 0.206, 'total_flos': 338354503680.0, 'train_loss': 5.2932213306427, 'epoch': 5.0})

In [6]:
trainer.save_model("./codet5-fix-model")
tokenizer.save_pretrained("./codet5-fix-model")


('./codet5-fix-model\\tokenizer_config.json',
 './codet5-fix-model\\special_tokens_map.json',
 './codet5-fix-model\\vocab.json',
 './codet5-fix-model\\merges.txt',
 './codet5-fix-model\\added_tokens.json',
 './codet5-fix-model\\tokenizer.json')

In [10]:
import torch 
def suggest_fix(code):
    model.eval()
    input_text = f"fix: {code}"
    inputs = tokenizer(input_text, return_tensors="pt", truncation=True, padding=True, max_length=128)

    with torch.no_grad():
        output = model.generate(**inputs, max_length=128)
    
    return tokenizer.decode(output[0], skip_special_tokens=True)

# Try a buggy code
buggy = "print ()'hello world' )"
print("🔧 Suggested fix:", suggest_fix(buggy))


🔧 Suggested fix: print ( 'hello world' )
