# 🔧 Train CodeT5 (YAML Generator) with Minimal Setup in Google Colab

In [None]:
# Install pinned versions of transformers and peft, plus the latest datasets
# and accelerate; --quiet suppresses most of pip's output.
# NOTE(review): peft is installed here but not imported by any visible cell.
!pip install --upgrade transformers==4.41.2 peft==0.10.0 datasets accelerate --quiet


In [None]:
from google.colab import files
# Open the Colab upload dialog; the result is keyed by uploaded file name.
uploaded = files.upload()
if not uploaded:
    # An empty result would otherwise surface as an opaque IndexError below.
    raise RuntimeError("No file was uploaded; re-run this cell and select the dataset file.")
# Only the first uploaded file is used as the dataset.
file_name = next(iter(uploaded))


In [None]:
import json
from datasets import Dataset

# Load the uploaded JSON Lines file: one JSON object per line.
with open(file_name, "r", encoding="utf-8") as f:
    # Iterate the file lazily and skip blank lines (e.g. an extra trailing
    # newline), which would otherwise make json.loads raise JSONDecodeError.
    raw_data = [json.loads(line) for line in f if line.strip()]

# Hold out 10% of the records as the evaluation split.
dataset = Dataset.from_list(raw_data).train_test_split(test_size=0.1)
dataset


In [None]:
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

# Hub identifier of the pretrained checkpoint that is fine-tuned below.
model_id = "Salesforce/codet5-small"
# Tokenizer and seq2seq model are both loaded from the same checkpoint id.
tokenizer = AutoTokenizer.from_pretrained(model_id)
model = AutoModelForSeq2SeqLM.from_pretrained(model_id)


In [None]:
# Fixed token lengths used by tokenize(): prompts ("inputs") are padded or
# truncated to max_input_len, reference outputs ("targets") to max_target_len.
max_input_len = 256
max_target_len = 512

def tokenize(example):
    """Tokenize one ``{"inputs": ..., "targets": ...}`` record for seq2seq training.

    Both texts are padded/truncated to fixed lengths (``max_input_len`` for
    the prompt, ``max_target_len`` for the target). Padding positions in the
    labels are replaced with ``-100`` so the loss ignores them; without this
    the model is also trained to predict the padding that fills most of each
    fixed-length target.

    Args:
        example: A single (non-batched) dataset row with string fields
            ``"inputs"`` and ``"targets"``.

    Returns:
        The tokenizer's encoding of the prompt with an added ``"labels"``
        entry holding the masked target token ids.
    """
    input_enc = tokenizer(example["inputs"], max_length=max_input_len, padding="max_length", truncation=True)
    target_enc = tokenizer(example["targets"], max_length=max_target_len, padding="max_length", truncation=True)
    pad_id = tokenizer.pad_token_id
    # -100 is the label value the loss skips; real tokens are kept unchanged.
    input_enc["labels"] = [tok if tok != pad_id else -100 for tok in target_enc["input_ids"]]
    return input_enc

# Apply tokenize() to every row of each split and drop the raw text columns,
# leaving only the tokenized fields returned by tokenize().
tokenized = dataset.map(tokenize, remove_columns=["inputs", "targets"])


In [None]:
from transformers import Seq2SeqTrainingArguments, Seq2SeqTrainer, DataCollatorForSeq2Seq

# Training configuration for the fine-tuning run.
args = Seq2SeqTrainingArguments(
    # Where checkpoints and logs are written.
    output_dir="./results",
    logging_dir="./logs",
    # Evaluate and checkpoint on the same per-epoch schedule.
    evaluation_strategy="epoch",
    save_strategy="epoch",
    # Optimisation settings.
    num_train_epochs=5,
    learning_rate=5e-5,
    weight_decay=0.01,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    fp16=False,
    # Use generation when producing predictions during evaluation.
    predict_with_generate=True,
)

# Wire the model, the tokenized splits and the batching logic into the trainer.
trainer = Seq2SeqTrainer(
    args=args,
    model=model,
    tokenizer=tokenizer,
    # The collator assembles individual tokenized rows into batches.
    data_collator=DataCollatorForSeq2Seq(tokenizer),
    eval_dataset=tokenized["test"],
    train_dataset=tokenized["train"],
)


In [None]:
# Run fine-tuning with the trainer configured in the previous cell.
trainer.train()

In [None]:
# Generate a YAML spec for a sample prompt with the fine-tuned model.
prompt = "Design an API to list all devices, including fields like hostname, OS version, and tech stack."

# Tokenize with truncation and move the tensors to the model's device: after
# training the model may live on the GPU, and CPU inputs would then make
# generate() fail with a device-mismatch error.
encoded = tokenizer(prompt, return_tensors="pt", truncation=True).to(model.device)
inputs = encoded.input_ids

# Disable dropout for inference and pass the attention mask explicitly.
model.eval()
outputs = model.generate(inputs, attention_mask=encoded.attention_mask, max_new_tokens=512)
yaml_output = tokenizer.decode(outputs[0], skip_special_tokens=True)

# Write as UTF-8 so non-ASCII characters in the output are saved reliably.
with open("generated_spec.yaml", "w", encoding="utf-8") as f:
    f.write(yaml_output)

print("📄 YAML Generated:")
print(yaml_output)
