# In [None]:
import torch
from transformers import T5ForConditionalGeneration, T5Tokenizer, Trainer, TrainingArguments
from datasets import Dataset

# 1. Data Augmentation
# This function demonstrates a simple form of data augmentation:
# paraphrasing to create new training examples.
def augment_data(premise, hypothesis, label):
    """Create paraphrased variants of one premise/hypothesis pair.

    Two fixed phrase swaps are attempted: "governed by the laws of" is
    replaced by "subject to the jurisdiction of" in the premise, and the
    reverse swap is applied to the hypothesis. The label is carried over
    unchanged.

    Args:
        premise: Premise sentence.
        hypothesis: Hypothesis sentence.
        label: Label copied onto every generated example.

    Returns:
        A list of dicts with keys "premise", "hypothesis" and "label", one
        per paraphrase that actually changed the text (0 to 2 entries).
    """
    augmented_data = []

    # Paraphrase the premise
    paraphrased_premise = premise.replace(
        "governed by the laws of", "subject to the jurisdiction of"
    )
    # A no-op replace would only yield a duplicate of the input pair, which
    # is not augmentation, so it is skipped.
    if paraphrased_premise != premise:
        augmented_data.append(
            {"premise": paraphrased_premise, "hypothesis": hypothesis, "label": label}
        )

    # Paraphrase the hypothesis
    paraphrased_hypothesis = hypothesis.replace(
        "subject to the jurisdiction of", "governed by the laws of"
    )
    if paraphrased_hypothesis != hypothesis:
        augmented_data.append(
            {"premise": premise, "hypothesis": paraphrased_hypothesis, "label": label}
        )

    return augmented_data

# Seed examples: hand-written premise/hypothesis pairs with their labels
entailment_data = [
    {
        "premise": "This agreement shall be governed by the laws of the State of New York.",
        "hypothesis": "The contract is subject to the jurisdiction of New York.",
        "label": "entailment",
    },
]

# Expand every seed example into its paraphrased variants
print("--- 🧬 Data Augmentation for Entailment ---")
augmented_dataset = []
for item in entailment_data:
    # The dict keys match augment_data's parameter names, so unpack directly.
    augmented_dataset += augment_data(**item)

print("Original Data:", entailment_data[0])
print("Augmented Data:", augmented_dataset)

# 2. Domain-Specific Fine-Tuning
# Fine-tuning on in-domain data is a key technique for improving accuracy:
# by training on legal text, the model learns the specific nuances of
# legal language.
def preprocess_entailment_data(examples):
    """Tokenize a batch of premise/hypothesis/label columns for T5.

    Each pair is rendered as "mnli premise: <p> hypothesis: <h>" and
    tokenized to a fixed length of 128; the label strings are tokenized to a
    fixed length of 16 and stored under "labels". Uses the module-level
    ``tokenizer``.

    Args:
        examples: Batched mapping with "premise", "hypothesis" and "label"
            columns (as passed by ``Dataset.map(..., batched=True)``).

    Returns:
        The tokenizer output for the inputs, with an added "labels" entry
        holding the target token ids (padding positions set to -100).
    """
    inputs = [
        f"mnli premise: {p} hypothesis: {h}"
        for p, h in zip(examples['premise'], examples['hypothesis'])
    ]
    outputs = list(examples['label'])
    model_inputs = tokenizer(inputs, max_length=128, truncation=True, padding="max_length")
    labels = tokenizer(outputs, max_length=16, truncation=True, padding="max_length")

    # Targets are padded to max_length; replace pad ids with -100 so the
    # loss ignores padding positions instead of training the model to emit
    # pad tokens.
    pad_token_id = tokenizer.pad_token_id
    model_inputs["labels"] = [
        [-100 if token_id == pad_token_id else token_id for token_id in label_ids]
        for label_ids in labels["input_ids"]
    ]
    return model_inputs

# Pretrained checkpoint used for both the tokenizer and the model
model_name = "t5-small"
tokenizer = T5Tokenizer.from_pretrained(model_name)
model = T5ForConditionalGeneration.from_pretrained(model_name)

# Create a dataset (combining original and augmented data)
# augmented_dataset is built only from augment_data() output, so the original
# examples have to be added explicitly or they would never be trained on.
training_examples = entailment_data + augmented_dataset
dataset = Dataset.from_dict({"premise": [d['premise'] for d in training_examples],
                             "hypothesis": [d['hypothesis'] for d in training_examples],
                             "label": [d['label'] for d in training_examples]})

# Tokenize in batches; the preprocessing function receives columns as lists
tokenized_dataset = dataset.map(preprocess_entailment_data, batched=True)

# Hyperparameters for the fine-tuning run, collected in one place
fine_tuning_options = {
    "output_dir": "./entailment_model_fine_tuned",
    "per_device_train_batch_size": 2,
    "num_train_epochs": 10,  # More epochs for better fine-tuning
    "save_total_limit": 1,
}
training_args = TrainingArguments(**fine_tuning_options)

# Trainer
# trainer = Trainer(
#     model=model,
#     args=training_args,
#     train_dataset=tokenized_dataset,
#     tokenizer=tokenizer,
# )

# Start the fine-tuning process
# print("\n--- ⚖️ Domain-Specific Fine-Tuning ---")
# print("Starting fine-tuning with a larger, augmented dataset...")
# trainer.train()
# print("Fine-tuning complete. Model saved to './entailment_model_fine_tuned'.")
