# 🚀 Fine-tune `flan-t5-small` to Generate YAML Specs from Natural Prompts (Colab-Compatible)

In [1]:
# ✅ Install minimal required packages (safe with NumPy 2.0.2)
!pip install transformers datasets --quiet


In [12]:
from google.colab import files
uploaded = files.upload()
file_name = list(uploaded.keys())[0]


Saving flan_t5_yaml_training_data_300.jsonl to flan_t5_yaml_training_data_300.jsonl


In [13]:
import json
from datasets import Dataset

with open(file_name, 'r') as f:
    raw_data = [json.loads(line) for line in f.readlines()]

dataset = Dataset.from_list(raw_data).train_test_split(test_size=0.1)
dataset


DatasetDict({
    train: Dataset({
        features: ['inputs', 'targets'],
        num_rows: 270
    })
    test: Dataset({
        features: ['inputs', 'targets'],
        num_rows: 30
    })
})

In [22]:
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

model_id = "google/flan-t5-base"
tokenizer = AutoTokenizer.from_pretrained(model_id)
model = AutoModelForSeq2SeqLM.from_pretrained(model_id)


tokenizer_config.json:   0%|          | 0.00/2.54k [00:00<?, ?B/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/2.42M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/2.20k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.40k [00:00<?, ?B/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/990M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

In [23]:
max_input_len = 256
max_target_len = 512

def tokenize(example):
    input_enc = tokenizer(example["inputs"], max_length=max_input_len, truncation=True, padding="max_length")
    target_enc = tokenizer(example["targets"], max_length=max_target_len, truncation=True, padding="max_length")
    input_enc["labels"] = target_enc["input_ids"]
    return input_enc

tokenized = dataset.map(tokenize, remove_columns=["inputs", "targets"])


Map:   0%|          | 0/270 [00:00<?, ? examples/s]

Map:   0%|          | 0/30 [00:00<?, ? examples/s]

In [24]:
from transformers import Seq2SeqTrainer, Seq2SeqTrainingArguments, DataCollatorForSeq2Seq

training_args = Seq2SeqTrainingArguments(
    output_dir="./results",
    logging_strategy="epoch",       # log after each epoch
    logging_first_step=True,        # log on the first step
    eval_strategy="epoch",
    report_to="none",
    learning_rate=3e-4,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    num_train_epochs=5,
    weight_decay=0.01,
    save_strategy="epoch",
    predict_with_generate=True,
    fp16=False,
    logging_dir="./logs"
)

trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=tokenized["train"],
    eval_dataset=tokenized["test"],
    tokenizer=tokenizer,
    data_collator=DataCollatorForSeq2Seq(tokenizer)
)


  trainer = Seq2SeqTrainer(


In [25]:
trainer.train()

Epoch,Training Loss,Validation Loss
1,2.2987,0.003403
2,0.0041,2.4e-05
3,0.0012,1.7e-05
4,0.0006,1e-05
5,0.0003,9e-06


TrainOutput(global_step=340, training_loss=0.5996799159904613, metrics={'train_runtime': 950.9422, 'train_samples_per_second': 1.42, 'train_steps_per_second': 0.358, 'total_flos': 462211212902400.0, 'train_loss': 0.5996799159904613, 'epoch': 5.0})

## 🔍 Inference: Generate YAML from Prompt

In [30]:
# Move the model to the GPU
model.to("cuda")

prompt = """
Generate a clean YAML API specification for managing support tickets.

Requirements:
- Table name: tickets
- Class name: Ticket
- Fields: id (Integer, primary_key), title (String), description (String), priority (String), status (String)
- One endpoint:
    - path: /tickets
    - method: GET
    - action: list_all

Ensure the YAML is well-structured and no keys are repeated or malformed.
"""



inputs = tokenizer(prompt, return_tensors="pt").input_ids

# Move the input tensor to the GPU
inputs = inputs.to("cuda")

outputs = model.generate(
    inputs,
    max_new_tokens=512,
    repetition_penalty=1.2,  # 🔁 discourages token loops
    num_beams=4,              # 🔍 beam search improves structure
    early_stopping=True
)
yaml_output = tokenizer.decode(outputs[0], skip_special_tokens=True)

with open("generated_spec.yaml", "w") as f:
    f.write(yaml_output)

print("📄 Generated YAML:")
print(yaml_output)

📄 Generated YAML:
table: tickets class_name: Ticket columns: - name: id type: Integer primary_key: true - name: primary_key type: String - name: title type: String - name: description type: String - name: priority type: String - name: status type: String endpoints: - path: /tickets method: GET action: list_all 
