In [3]:
# !pip install datasets

# Imports

In [2]:
import pandas as pd
from datasets import load_dataset, Dataset
from transformers import T5Tokenizer, T5ForConditionalGeneration, Trainer, TrainingArguments, TrainerCallback
import os
from google.colab import drive


# Mount Google Drive
drive.mount('/content/drive')

Mounted at /content/drive


# Data Loading And Preprocessing

In [4]:
# Load the PAWS dataset
dataset = load_dataset("paws", "labeled_final")

# Convert to DataFrame and filter paraphrases (label = 1)
def preprocess_paws(dataset, label=1):
    df = pd.DataFrame(dataset)
    df = df[df['label'] == label]
    df['input_text'] = "paraphrase: " + df['sentence1']
    df['target_text'] = df['sentence2']
    return df[['input_text', 'target_text']]

train_data = preprocess_paws(dataset['train'])
test_data = preprocess_paws(dataset['test'])
validation_data = preprocess_paws(dataset['validation'])

# Convert to Hugging Face Dataset
train_dataset = Dataset.from_pandas(train_data)
test_dataset = Dataset.from_pandas(test_data)
validation_dataset = Dataset.from_pandas(validation_data)

README.md:   0%|          | 0.00/9.79k [00:00<?, ?B/s]

train-00000-of-00001.parquet:   0%|          | 0.00/8.43M [00:00<?, ?B/s]

test-00000-of-00001.parquet:   0%|          | 0.00/1.24M [00:00<?, ?B/s]

validation-00000-of-00001.parquet:   0%|          | 0.00/1.23M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/49401 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/8000 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/8000 [00:00<?, ? examples/s]

# Tokenization

In [5]:
# Initialize tokenizer and model
model_name = "t5-base"
tokenizer = T5Tokenizer.from_pretrained(model_name)
model = T5ForConditionalGeneration.from_pretrained(model_name)

# Tokenization function
def tokenize_function(examples):
    inputs = tokenizer(examples['input_text'], max_length=128, truncation=True, padding="max_length")
    targets = tokenizer(examples['target_text'], max_length=128, truncation=True, padding="max_length")
    inputs['labels'] = targets['input_ids']
    return inputs

# Tokenize datasets
train_dataset = train_dataset.map(tokenize_function, batched=True)
validation_dataset = validation_dataset.map(tokenize_function, batched=True)

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.39M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.21k [00:00<?, ?B/s]

You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565


model.safetensors:   0%|          | 0.00/892M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

Map:   0%|          | 0/3000 [00:00<?, ? examples/s]

Map:   0%|          | 0/3539 [00:00<?, ? examples/s]

# T5 Model Fine Tuning

In [6]:
# Set directories in Google Drive
results_dir = "/content/drive/MyDrive/results"
model_dir = "/content/drive/MyDrive/saved_t5_model"

# Create the directories if they don't exist
os.makedirs(results_dir, exist_ok=True)
os.makedirs(model_dir, exist_ok=True)

In [8]:

# Define training arguments
training_args = TrainingArguments(
    output_dir=results_dir,
    overwrite_output_dir=True,
    evaluation_strategy="steps",
    eval_steps=500,  # Evaluate every 500 steps
    logging_dir='./logs',
    logging_steps=100,
    learning_rate=3e-5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=5,
    weight_decay=0.01,
    save_strategy="steps",
    save_steps=500,  # Save model every 500 steps
    fp16=True,  # Mixed precision training
    report_to="none"  # Avoid unwanted logging
)

# Initialize Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=validation_dataset,
)

# Start training
trainer.train()

Passing a tuple of `past_key_values` is deprecated and will be removed in Transformers v4.48.0. You should pass an instance of `EncoderDecoderCache` instead, e.g. `past_key_values=EncoderDecoderCache.from_legacy_cache(past_key_values)`.


Step,Training Loss,Validation Loss
500,0.1377,0.125782
1000,0.1216,0.118837
1500,0.1117,0.116899


TrainOutput(global_step=1875, training_loss=0.26194957275390623, metrics={'train_runtime': 693.4524, 'train_samples_per_second': 21.631, 'train_steps_per_second': 2.704, 'total_flos': 2283592089600000.0, 'train_loss': 0.26194957275390623, 'epoch': 5.0})

# Save Model and Tokenizer

In [9]:
# Save the final model
trainer.save_model(model_dir)
tokenizer.save_pretrained(model_dir)

('/content/drive/MyDrive/saved_t5_model/tokenizer_config.json',
 '/content/drive/MyDrive/saved_t5_model/special_tokens_map.json',
 '/content/drive/MyDrive/saved_t5_model/spiece.model',
 '/content/drive/MyDrive/saved_t5_model/added_tokens.json')

# Evaluation

In [12]:
# !pip install evaluate
# !pip install rouge_score

In [27]:
import os
from transformers import T5ForConditionalGeneration, T5Tokenizer

# Load the model and tokenizer from the saved directory
model = T5ForConditionalGeneration.from_pretrained(model_dir)
tokenizer = T5Tokenizer.from_pretrained(model_dir)

In [28]:
import torch
import torch
import evaluate


device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = model.to(device)

# Define evaluation function for ROUGE and BLEU
def evaluate_model(test_data, model, tokenizer, batch_size=16, max_length=128, num_beams=5):
    # Initialize metrics
    rouge = evaluate.load("rouge")
    bleu = evaluate.load("bleu")

    # Store references and predictions
    references = []
    predictions = []

    # Iterate through the test dataset in batches
    for start_idx in range(0, len(test_data), batch_size):
        end_idx = min(start_idx + batch_size, len(test_data))
        batch = test_data[start_idx:end_idx]

        # Prepare the batch input
        input_texts = batch["input_text"].tolist()
        target_texts = batch["target_text"].tolist()

        # Tokenize the inputs in batch
        inputs = tokenizer(input_texts, return_tensors="pt", truncation=True, max_length=max_length, padding=True)
        inputs = inputs.to(device)

        # Generate predictions in batch
        with torch.no_grad():
            outputs = model.generate(
                input_ids=inputs["input_ids"],
                attention_mask=inputs["attention_mask"],
                max_length=max_length,
                num_beams=num_beams,
                early_stopping=True
            )

        # Decode and store predictions
        for i in range(len(outputs)):
            predicted_text = tokenizer.decode(outputs[i], skip_special_tokens=True)
            references.append(target_texts[i])  # ROUGE expects reference as strings, not tokenized
            predictions.append(predicted_text)  # ROUGE expects prediction as a string, not tokenized

            # Add to ROUGE metric
            rouge.add(prediction=predicted_text, reference=target_texts[i])

    # Calculate BLEU and ROUGE scores
    bleu_score = bleu.compute(predictions=predictions, references=[r.split() for r in references])  # BLEU expects tokenized references
    rouge_score = rouge.compute()

    # Print results
    print("ROUGE Score:", rouge_score)
    print("BLEU Score:", bleu_score["bleu"])


# Evaluate on the test dataset
evaluate_model(test_data, model, tokenizer)


ROUGE Score: {'rouge1': 0.9337398926014302, 'rouge2': 0.7653266433732168, 'rougeL': 0.850961382384321, 'rougeLsum': 0.8508104928620868}
BLEU Score: 0.0067185102602188825


# Inference (Paraphrase Generation System)

In [43]:
import torch
from transformers import T5ForConditionalGeneration, T5Tokenizer

# Load the model and tokenizer from the saved directory
model = T5ForConditionalGeneration.from_pretrained(model_dir)
tokenizer = T5Tokenizer.from_pretrained(model_dir)

# Set the device (GPU if available, otherwise CPU)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = model.to(device)


# Preprocessing function for inference
def preprocess_input(sentence):
    return "paraphrase: " + sentence

# Generate paraphrases with corrected num_beams and num_return_sequences
def generate_paraphrase(input_text, model, tokenizer, max_length=128, num_beams=5, num_return_sequences=4, top_k=100, top_p=0.9, temperature=1.0):
    # Preprocess input
    input_text = preprocess_input(input_text)

    # Tokenize input
    inputs = tokenizer(input_text, return_tensors="pt", truncation=True, max_length=max_length, padding="max_length")

    # Move inputs to the same device as the model
    inputs = {key: value.to(device) for key, value in inputs.items()}

    # Generate paraphrases
    outputs = model.generate(
        input_ids=inputs["input_ids"],
        attention_mask=inputs["attention_mask"],
        max_length=max_length + 20,  # Increase max_length for more room in paraphrases
        num_beams=num_beams,
        num_return_sequences=num_return_sequences,
        top_k=top_k,              # Use top-k sampling for diversity
        top_p=top_p,              # Use top-p sampling for nucleus-based sampling
        temperature=temperature,  # Encourage more exploratory generation
        do_sample=True,           # Enable sampling for top-k and top-p
        early_stopping=True
    )

    # Decode generated outputs
    paraphrased_texts = [tokenizer.decode(output, skip_special_tokens=True) for output in outputs]
    return paraphrased_texts

# Example sentence
input_sentence = "The quick brown fox jumps over the lazy dog."

# Generate paraphrases
paraphrased_sentences = generate_paraphrase(
    input_sentence, model, tokenizer, num_return_sequences=4
)

# Display results
print(f"Original: {input_sentence}")
for i, paraphrase in enumerate(paraphrased_sentences, 1):
    print(f"Paraphrase {i}: {paraphrase}")

Original: The quick brown fox jumps over the lazy dog.
Paraphrase 1: The quick brown fox jumps over the lazy dog.
Paraphrase 2: The quick brown fox leaps over the lazy dog.
Paraphrase 3: The fast brown fox jumps over the lazy dog.
Paraphrase 4: Quick brown fox jumps over the lazy dog.


In [44]:
# Example sentence
input_sentence = "She enjoys reading books on rainy afternoons."

# Generate paraphrases
paraphrased_sentences = generate_paraphrase(
    input_sentence, model, tokenizer, num_return_sequences=4
)

# Display results
print(f"Original: {input_sentence}")
for i, paraphrase in enumerate(paraphrased_sentences, 1):
    print(f"Paraphrase {i}: {paraphrase}")

Original: She enjoys reading books on rainy afternoons.
Paraphrase 1: She enjoys reading books on rainy afternoons.
Paraphrase 2: She enjoys reading on rainy afternoons.
Paraphrase 3: On rainy afternoons she enjoys reading books.
Paraphrase 4: She loves reading books on rainy afternoons.


In [45]:
# Example sentence
input_sentence = "The dog barked loudly at the stranger outside the house."


# Generate paraphrases
paraphrased_sentences = generate_paraphrase(
    input_sentence, model, tokenizer, num_return_sequences=4
)

# Display results
print(f"Original: {input_sentence}")
for i, paraphrase in enumerate(paraphrased_sentences, 1):
    print(f"Paraphrase {i}: {paraphrase}")

Original: The dog barked loudly at the stranger outside the house.
Paraphrase 1: The dog barked loudly at the stranger outside the house.
Paraphrase 2: The dog loudly barked at the stranger outside the house.
Paraphrase 3: The dog barked loudly at a stranger outside the house.
Paraphrase 4: The dog barked loudly at the stranger outside the house .


In [46]:
# Example sentence
input_sentence = "Climate change is one of the most pressing issues of our time."


# Generate paraphrases
paraphrased_sentences = generate_paraphrase(
    input_sentence, model, tokenizer, num_return_sequences=4
)

# Display results
print(f"Original: {input_sentence}")
for i, paraphrase in enumerate(paraphrased_sentences, 1):
    print(f"Paraphrase {i}: {paraphrase}")

Original: Climate change is one of the most pressing issues of our time.
Paraphrase 1: Climate change is one of the most pressing issues of our time.
Paraphrase 2: Climate change is one of the most pressing issues of our time .
Paraphrase 3: The climate change is one of the most pressing issues of our time.
Paraphrase 4: Climate Change is one of the most pressing issues of our time.
