# 🧠 Fine-tune T5-base to Generate JSON Tickets from Support Issues

This notebook trains a `t5-base` model using HuggingFace Transformers on a dataset of customer issue descriptions and structured ticket JSON outputs. It includes evaluation metrics like ROUGE and exact match.

In [3]:
# ✨ Import required libraries
import json
from datasets import Dataset
from transformers import T5Tokenizer, T5ForConditionalGeneration, Seq2SeqTrainer, Seq2SeqTrainingArguments, DataCollatorForSeq2Seq
import torch
import random
import evaluate
import numpy as np
from google.colab import files

  from .autonotebook import tqdm as notebook_tqdm

A module that was compiled using NumPy 1.x cannot be run in
NumPy 2.3.1 as it may crash. To support both 1.x and 2.x
versions of NumPy, modules must be compiled with NumPy 2.0.
Some module may need to rebuild instead e.g. with 'pybind11>=2.12'.

If you are a user of the module, the easiest solution will be to
downgrade to 'numpy<2' or try to upgrade the affected module.
We expect that some modules will need time to support NumPy 2.

Traceback (most recent call last):  File "<frozen runpy>", line 198, in _run_module_as_main
  File "<frozen runpy>", line 88, in _run_code
  File "/Users/aristideisingizwe/Documents/projects/sautidesk/sautidesk-model/venv/lib/python3.12/site-packages/ipykernel_launcher.py", line 18, in <module>
    app.launch_new_instance()
  File "/Users/aristideisingizwe/Documents/projects/sautidesk/sautidesk-model/venv/lib/python3.12/site-packages/traitlets/config/application.py", line 1075, in launch_instance
    app.st

ModuleNotFoundError: No module named 'google'

In [None]:
# ✨ Upload your JSON file
data_file = files.upload()  # Upload `synthetic_tickets_updated.json`

with open("data/training_data.json", "r") as f:
    raw_data = json.load(f)

print(f"✅ Loaded {len(raw_data)} samples")

In [None]:
# ✨ Prepare dataset

def format_example(example):
    input_text = f"Generate ticket from: {example['text']}"
    target_text = json.dumps(example["label"])
    return {"input": input_text, "target": target_text}

formatted_data = [format_example(example) for example in raw_data]
dataset = Dataset.from_list(formatted_data)

# Split into train/test
dataset = dataset.train_test_split(test_size=0.1)
train_data = dataset["train"]
test_data = dataset["test"]

In [None]:
# ✨ Tokenize
model_name = "t5-base"
tokenizer = T5Tokenizer.from_pretrained(model_name)

MAX_INPUT_LEN = 256
MAX_TARGET_LEN = 512

def tokenize_function(example):
    model_inputs = tokenizer(
        example["input"],
        max_length=MAX_INPUT_LEN,
        truncation=True,
        padding="max_length"
    )
    labels = tokenizer(
        example["target"],
        max_length=MAX_TARGET_LEN,
        truncation=True,
        padding="max_length"
    )
    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

tokenized_train = train_data.map(tokenize_function, batched=True)
tokenized_test = test_data.map(tokenize_function, batched=True)

In [None]:
# ✨ Load model
model = T5ForConditionalGeneration.from_pretrained(model_name)

In [None]:
# ✨ Define metrics
rouge = evaluate.load("rouge")

def compute_metrics(eval_preds):
    preds, labels = eval_preds
    decoded_preds = tokenizer.batch_decode(preds, skip_special_tokens=True)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    decoded_preds = [p.strip() for p in decoded_preds]
    decoded_labels = [l.strip() for l in decoded_labels]

    result = rouge.compute(predictions=decoded_preds, references=decoded_labels, use_stemmer=True)
    exact_matches = [int(p == l) for p, l in zip(decoded_preds, decoded_labels)]
    exact_match_score = np.mean(exact_matches)

    result["exact_match"] = exact_match_score
    return result

In [None]:
# ✨ Training arguments
training_args = Seq2SeqTrainingArguments(
    output_dir="./t5-ticket-output",
    evaluation_strategy="epoch",
    logging_dir="./logs",
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    num_train_epochs=5,
    weight_decay=0.01,
    save_total_limit=2,
    save_strategy="epoch",
    logging_steps=20,
    report_to="none"
)

In [None]:
# ✨ Trainer setup
data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, model=model)

trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train,
    eval_dataset=tokenized_test,
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics
)

In [None]:
# ✨ Train!
trainer.train()

In [None]:
# ✅ Save model
model.save_pretrained("models/t5-ticket-model")
tokenizer.save_pretrained("models/t5-ticket-model")

## ✅ Predict Example
Use the model to generate a ticket from text:

In [None]:
def predict_ticket(issue_text):
    input_text = f"Generate ticket from: {issue_text}"
    inputs = tokenizer(input_text, return_tensors="pt", truncation=True, padding=True).to(model.device)
    output = model.generate(**inputs, max_length=512)
    return tokenizer.decode(output[0], skip_special_tokens=True)

predict_ticket("My electricity has been out since last night. Please help.")