In [1]:
import json
from datasets import Dataset

# Load JSON data
with open("testData.json") as f:
    data = json.load(f)

# Convert to HuggingFace Dataset and split
dataset = Dataset.from_list(data)
dataset = dataset.train_test_split(test_size=0.1)  # 80/20 split

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
from transformers import T5Tokenizer, AutoModelForSeq2SeqLM

model_name = "google/flan-t5-base"
tokenizer = T5Tokenizer.from_pretrained(model_name)
model = AutoModelForSeq2SeqLM.from_pretrained(model_name)


You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565


In [3]:
max_input_length = 1024
max_output_length = 128

def preprocess(example):
    inputs = tokenizer(
        example["input"],
        truncation=True,
        padding="max_length",
        max_length=max_input_length
    )
    labels = tokenizer(
        example["output"],
        truncation=True,
        padding="max_length",
        max_length=max_output_length
    )
    inputs["labels"] = labels["input_ids"]
    return inputs

tokenized_datasets = dataset.map(preprocess, batched=False)
tokenized_datasets


Map:   0%|          | 0/18 [00:00<?, ? examples/s]

Map: 100%|██████████| 18/18 [00:00<00:00, 285.22 examples/s]
Map: 100%|██████████| 2/2 [00:00<00:00, 362.03 examples/s]


DatasetDict({
    train: Dataset({
        features: ['input', 'output', 'input_ids', 'attention_mask', 'labels'],
        num_rows: 18
    })
    test: Dataset({
        features: ['input', 'output', 'input_ids', 'attention_mask', 'labels'],
        num_rows: 2
    })
})

In [4]:
from peft import LoraConfig, get_peft_model, TaskType

lora_config = LoraConfig(
    r=8,
    lora_alpha=32,
    target_modules=["q", "v"],
    lora_dropout=0.1,
    bias="none",
    task_type=TaskType.SEQ_2_SEQ_LM
)

model = get_peft_model(model, lora_config)


In [6]:
from transformers import TrainingArguments

training_args = TrainingArguments(
    output_dir="./flan-t5-lora",
    per_device_train_batch_size=1,
    per_device_eval_batch_size=1,
    num_train_epochs=3,
    logging_steps=5,
    # Use 'evaluation_strategy' only if supported:
    # evaluation_strategy="epoch", 
    # save_strategy="epoch",
    learning_rate=1e-4,
    save_total_limit=2,
    fp16=False,
    push_to_hub=False
)





In [7]:
from transformers import Trainer

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["test"]
)


In [8]:
trainer.train()




Step,Training Loss
5,14.3855
10,19.0981
15,8.5242
20,18.7915
25,7.8679
30,13.7465
35,23.6146
40,12.916
45,24.5595
50,2.5776


TrainOutput(global_step=54, training_loss=14.652636351408782, metrics={'train_runtime': 289.8449, 'train_samples_per_second': 0.186, 'train_steps_per_second': 0.186, 'total_flos': 74247328235520.0, 'train_loss': 14.652636351408782, 'epoch': 3.0})

In [9]:
model.eval()

example_input = dataset["train"][0]["input"]
inputs = tokenizer(example_input, return_tensors="pt", truncation=True, max_length=max_input_length)
outputs = model.generate(**inputs, max_length=max_output_length)
decoded = tokenizer.decode(outputs[0], skip_special_tokens=True)

print("Input:", example_input[:100], "...")
print("Prediction:", decoded)

Input: REAL ESTATE PURCHASE AND SALE AGREEMENT

Date of Execution: November 9, 2025
Contract Number: REPSA- ...
Prediction: REPSA-2025-7821


In [10]:
# Save the full model including LoRA adapters
model.save_pretrained("./flan-t5-lora-finetuned")
tokenizer.save_pretrained("./flan-t5-lora-finetuned")

('./flan-t5-lora-finetuned\\tokenizer_config.json',
 './flan-t5-lora-finetuned\\special_tokens_map.json',
 './flan-t5-lora-finetuned\\spiece.model',
 './flan-t5-lora-finetuned\\added_tokens.json')