In [None]:
from datasets import load_dataset
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, TrainingArguments, Trainer, DataCollatorForSeq2Seq
from peft import LoraConfig, get_peft_model, TaskType
import torch
import numpy as np

In [None]:
# Run configuration: every value a reader might want to tune lives in this cell.
MODEL_NAME: str = "Salesforce/codet5-base"  # checkpoint id handed to the from_pretrained() calls
OUTPUT_DIR: str = "outputs/codet5-lora"     # used as the Trainer output_dir and the tokenizer save path
MAX_INPUT: int = 128    # token cap applied to the prompted source code
MAX_OUTPUT: int = 64    # token cap applied to docstrings and to generated summaries
EPOCHS: int = 1         # one pass for a quick run; increase later
BATCH: int = 4          # per-device batch size for both training and evaluation
LR: float = 5e-4        # learning rate passed to TrainingArguments

In [None]:
# Load the "python" configuration of the CodeXGLUE code-to-text dataset.
dataset = load_dataset(path="code_x_glue_ct_code_to_text", name="python")
# Print the loaded dataset object as a quick sanity check before tokenizing.
print(dataset)

In [None]:
# Load the tokenizer for the MODEL_NAME checkpoint; preprocess() and summarize() both rely on it.
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

def preprocess(batch):
    """Tokenize a batch of code/docstring pairs for seq2seq fine-tuning.

    Args:
        batch: Mapping with a "code" column (source snippets) and a
            "docstring" column (reference summaries), each a list of strings.

    Returns:
        The tokenizer output for the prompted code, with an added "labels"
        entry holding the docstring token ids. Padding positions in the
        labels are set to -100 so they are ignored by the loss.
    """
    prompts = [f"summarize: {code}" for code in batch["code"]]
    model_inputs = tokenizer(prompts, max_length=MAX_INPUT, truncation=True, padding="max_length")
    targets = tokenizer(batch["docstring"], max_length=MAX_OUTPUT, truncation=True, padding="max_length")

    # The labels are already padded to MAX_OUTPUT here, so a downstream collator has
    # nothing left to pad and will not mask them for us. Without this replacement the
    # model would be trained to predict pad tokens on every short docstring.
    pad_id = tokenizer.pad_token_id
    model_inputs["labels"] = [
        [-100 if token_id == pad_id else token_id for token_id in sequence]
        for sequence in targets["input_ids"]
    ]
    return model_inputs

# Apply preprocess() to the dataset; batched=True passes it a dict of column lists per call.
tokenized = dataset.map(preprocess, batched=True)

In [None]:
# Load the pretrained seq2seq base model; the LoRA adapters are attached to it in the next cell.
model = AutoModelForSeq2SeqLM.from_pretrained(MODEL_NAME)

In [None]:
# LoRA hyperparameters for a seq2seq model: rank 16, alpha (scaling) 32, adapter dropout 0.1.
lora_config = LoraConfig(
    r=16,
    lora_alpha=32,
    lora_dropout=0.1,
    task_type=TaskType.SEQ_2_SEQ_LM,
)

# Wrap the base model with the adapter config and report how many parameters will train.
# NOTE: this rebinds `model`, so re-running this cell without re-running the model-loading
# cell first would wrap an already-wrapped model.
model = get_peft_model(model, lora_config)
model.print_trainable_parameters()

In [None]:
# Enable fp16 mixed precision only when a CUDA device is present.
use_fp16 = torch.cuda.is_available()

# Trainer settings: evaluate and checkpoint once per epoch, log every 50 steps.
args = TrainingArguments(
    output_dir=OUTPUT_DIR,
    num_train_epochs=EPOCHS,
    learning_rate=LR,
    per_device_train_batch_size=BATCH,
    per_device_eval_batch_size=BATCH,
    eval_strategy="epoch",
    save_strategy="epoch",
    logging_dir="./logs",
    logging_steps=50,
    fp16=use_fp16,
)

In [None]:
# Seq2seq-aware collator that batches the tokenized examples for the model.
collator = DataCollatorForSeq2Seq(tokenizer, model=model)

# Train and evaluate on fixed-size prefixes of the splits to keep the run short.
train_subset = tokenized["train"].select(range(40000))
eval_subset = tokenized["validation"].select(range(1000))

trainer = Trainer(
    model=model,
    args=args,
    data_collator=collator,
    tokenizer=tokenizer,
    train_dataset=train_subset,
    eval_dataset=eval_subset,
)

# Run fine-tuning; evaluation and checkpointing follow the strategies set in `args`.
trainer.train()

In [None]:
# Write the trained LoRA adapter to OUTPUT_DIR itself. Previously only the tokenizer was
# saved here, so the adapter existed only inside the Trainer's checkpoint-* subfolders
# even though the message below reports it as saved in OUTPUT_DIR.
model.save_pretrained(OUTPUT_DIR)
tokenizer.save_pretrained(OUTPUT_DIR)

print("Training done. Model + LoRA adapter saved in:", OUTPUT_DIR)

In [None]:
def summarize(code):
    """Generate a natural-language summary for a snippet of source code.

    Args:
        code: Source code to summarize, as a string.

    Returns:
        The decoded text of the first generated sequence, with special
        tokens stripped.
    """
    # Make sure dropout is disabled during generation, whatever mode the
    # model was left in by earlier cells.
    model.eval()

    # Use the same prompt prefix and input length cap as preprocess() so the
    # model sees inputs shaped like the ones it was fine-tuned on.
    inputs = tokenizer(
        f"summarize: {code}",
        return_tensors="pt",
        truncation=True,
        max_length=MAX_INPUT,
    ).to(model.device)

    output = model.generate(**inputs, max_length=MAX_OUTPUT)
    return tokenizer.decode(output[0], skip_special_tokens=True)

# Smoke test: summarize a tiny recursive function with the fine-tuned model.
sample_code = "def factorial(n): return 1 if n==0 else n*factorial(n-1)"

print("\n Test prediction:")
print(summarize(sample_code))