# Fine-tune CodeT5 to Generate YAML API Specs from Natural Language

In [None]:
!pip install transformers datasets peft trl accelerate evaluate --quiet

In [None]:

from datasets import load_dataset, Dataset
import json

# Upload your JSONL training file in Colab
from google.colab import files
# Prompt for a file upload; the keys of `uploaded` are used as file names below.
uploaded = files.upload()
if not uploaded:
    # Fail with a clear message instead of an opaque IndexError further down.
    raise ValueError("No file was uploaded; upload a JSONL training file to continue.")

# Only the first uploaded file is used.
file_name = next(iter(uploaded))
with open(file_name, "r", encoding="utf-8") as f:
    # One JSON object per line. Blank lines (e.g. a trailing newline at the end
    # of the file) are skipped because json.loads would reject them.
    lines = [json.loads(line) for line in f if line.strip()]

dataset = Dataset.from_list(lines)
# Hold out 10% for evaluation; the fixed seed keeps the split reproducible
# across notebook re-runs.
dataset = dataset.train_test_split(test_size=0.1, seed=42)
dataset


In [None]:

from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, Seq2SeqTrainer, Seq2SeqTrainingArguments, DataCollatorForSeq2Seq

# Checkpoint identifier shared by the tokenizer and the seq2seq model.
model_name = "Salesforce/codet5-small"

# Load the tokenizer first, then the model, from the same checkpoint.
tokenizer, model = (
    AutoTokenizer.from_pretrained(model_name),
    AutoModelForSeq2SeqLM.from_pretrained(model_name),
)


In [None]:

# Token budgets: inputs/targets longer than these are truncated.
max_input_len = 256
max_target_len = 512


def preprocess(example):
    """Tokenize the "inputs" and "targets" fields into model features.

    Works on a single example or on a batch (as passed by
    ``Dataset.map(..., batched=True)``), since the tokenizer accepts either a
    string or a list of strings.

    No padding is applied here on purpose: padding targets to a fixed length
    would place pad-token ids in ``labels`` and the loss would be computed on
    them. The ``DataCollatorForSeq2Seq`` handed to the trainer pads each batch
    dynamically and fills label padding with -100, which the loss ignores.

    Returns the tokenizer output for the inputs with an added ``labels``
    entry holding the target token ids.
    """
    model_inputs = tokenizer(
        example["inputs"], max_length=max_input_len, truncation=True
    )
    target_encoding = tokenizer(
        example["targets"], max_length=max_target_len, truncation=True
    )
    model_inputs["labels"] = target_encoding["input_ids"]
    return model_inputs


# Drop the raw text columns so only model features remain.
tokenized_dataset = dataset.map(
    preprocess,
    batched=True,
    remove_columns=dataset["train"].column_names,
)


In [None]:

# Hyperparameters and bookkeeping for the fine-tuning run.
training_args = Seq2SeqTrainingArguments(
    # Output locations and checkpoint retention.
    output_dir="./results",
    logging_dir="./logs",
    save_total_limit=2,
    # Optimisation settings.
    num_train_epochs=5,
    learning_rate=5e-5,
    weight_decay=0.01,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    fp16=False,
    # Evaluate once per epoch, generating sequences during evaluation.
    # NOTE(review): newer transformers releases appear to rename
    # `evaluation_strategy` to `eval_strategy` -- confirm against the
    # installed version.
    evaluation_strategy="epoch",
    predict_with_generate=True,
)

# Collator that batches the tokenized examples for the seq2seq model.
seq2seq_collator = DataCollatorForSeq2Seq(tokenizer, model=model)

trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    data_collator=seq2seq_collator,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["test"],
    # NOTE(review): recent transformers versions may prefer
    # `processing_class=` over `tokenizer=` -- confirm before upgrading.
    tokenizer=tokenizer,
)


In [None]:
# Start fine-tuning with the trainer configured above.
trainer.train()

## Inference: Generate YAML from Natural Language Instruction

In [None]:

prompt = "Design an API to list all logs with timestamp, level, message, and source."

# Tokenize with the same input truncation used for training, and move the
# tensors to the model's device: after training the model may live on an
# accelerator, and generate() fails if inputs and weights are on different devices.
encoded = tokenizer(
    prompt,
    return_tensors="pt",
    truncation=True,
    max_length=max_input_len,
).to(model.device)
inputs = encoded.input_ids

# Switch off dropout so generation is not perturbed by training-mode layers.
model.eval()
outputs = model.generate(
    inputs,
    attention_mask=encoded.attention_mask,
    max_new_tokens=512,
)
yaml_result = tokenizer.decode(outputs[0], skip_special_tokens=True)

# Save for downstream use
with open("generated_spec.yaml", "w", encoding="utf-8") as f:
    f.write(yaml_result)

print(yaml_result)
