In [None]:
import torch
from transformers import T5ForConditionalGeneration, T5Tokenizer, Trainer, TrainingArguments
from datasets import Dataset

# 1. Mock dataset for legal textual entailment.
# A real project would use a far larger, carefully curated corpus.
# Each record pairs a legal clause (premise) with a legal statement
# (hypothesis) and a label: "entailment", "contradiction" or "neutral".
# These fields are later rendered as "mnli premise: ... hypothesis: ..." text.
entailment_data = [
    dict(zip(("premise", "hypothesis", "label"), row))
    for row in (
        (
            "This agreement shall be governed by the laws of the State of New York.",
            "The contract is subject to the jurisdiction of New York.",
            "entailment",
        ),
        (
            "The Company shall provide a guarantee.",
            "The Company is not required to provide a guarantee.",
            "contradiction",
        ),
        (
            "This is a confidentiality clause.",
            "The contract is about a sale of goods.",
            "neutral",
        ),
    )
]

# 2. Load the small language model (SLM) and its matching tokenizer.
# Both are loaded from the same checkpoint name.
model_name = "t5-small"
model = T5ForConditionalGeneration.from_pretrained(model_name)
tokenizer = T5Tokenizer.from_pretrained(model_name)

# 3. Pre-process the data for T5 fine-tuning.
def preprocess_entailment_data(examples):
    """Tokenize a batch of entailment examples for T5 text-to-text fine-tuning.

    Args:
        examples: A batch mapping with parallel 'premise', 'hypothesis' and
            'label' sequences of strings (as passed by a batched ``map``).

    Returns:
        The tokenizer output for the formatted inputs, with an added
        "labels" entry holding the tokenized label strings. Padding
        positions in "labels" are set to -100.
    """
    # T5 uses a text-to-text format, so both the input and the target are strings.
    inputs = [
        f"mnli premise: {p} hypothesis: {h}"
        for p, h in zip(examples['premise'], examples['hypothesis'])
    ]
    outputs = list(examples['label'])

    # Tokenize the inputs and the targets to fixed lengths.
    model_inputs = tokenizer(inputs, max_length=128, truncation=True, padding="max_length")
    labels = tokenizer(outputs, max_length=16, truncation=True, padding="max_length")

    # Targets are padded to max_length; replace pad ids with -100 so the
    # loss ignores padding positions instead of training the model on them.
    pad_id = tokenizer.pad_token_id
    model_inputs["labels"] = [
        [token_id if token_id != pad_id else -100 for token_id in sequence]
        for sequence in labels["input_ids"]
    ]
    return model_inputs

# Convert data to a Hugging Face Dataset object, one column per field.
dataset = Dataset.from_dict({
    field: [example[field] for example in entailment_data]
    for field in ("premise", "hypothesis", "label")
})

# Preprocess the dataset. The raw text columns are dropped so that only the
# tokenized features remain: Trainer does not strip a column named "label",
# and its string values cannot be converted to tensors by the data collator.
tokenized_dataset = dataset.map(
    preprocess_entailment_data,
    batched=True,
    remove_columns=dataset.column_names,
)

# 4. Configure the training run and build the Trainer.
training_args = TrainingArguments(
    num_train_epochs=5,
    # The dataset is tiny, so a small per-device batch is sufficient.
    per_device_train_batch_size=2,
    save_total_limit=1,
    output_dir="./entailment_model",
)

# Bundle the model, the tokenized data and the tokenizer into a Trainer.
trainer = Trainer(
    args=training_args,
    model=model,
    tokenizer=tokenizer,
    train_dataset=tokenized_dataset,
)




In [None]:
# 5. Fine-tune the model.
# trainer.train()

# --- Example of inference after training ---
# To use the fine-tuned model for inference, first save it after training
# (e.g. trainer.save_model("./entailment_model")), then load it:
# load_model = T5ForConditionalGeneration.from_pretrained("./entailment_model")
# load_tokenizer = T5Tokenizer.from_pretrained("./entailment_model")
#
# def predict_entailment(premise, hypothesis):
#     input_text = f"mnli premise: {premise} hypothesis: {hypothesis}"
#     input_ids = load_tokenizer(input_text, return_tensors="pt").input_ids
#     outputs = load_model.generate(input_ids)
#     return load_tokenizer.decode(outputs[0], skip_special_tokens=True)
#
# result = predict_entailment("The contract requires arbitration.", "The parties must resolve disputes via arbitration.")
# print(f"Prediction: {result}")