In [5]:
!pip install transformers
!pip install datasets==2.19.0
!pip install torch



In [6]:
# !pip install accelerate -U

In [7]:
import torch
from datasets import load_dataset
from transformers import T5ForConditionalGeneration, T5Tokenizer, Seq2SeqTrainer, Seq2SeqTrainingArguments, DataCollatorForSeq2Seq

In [8]:

# Load PAWS and reshape its sentence pairs into source/target text records
def load_and_prepare_paws():
    """Return the (train, validation) PAWS splits as 'source'/'target' text pairs.

    Only rows whose ``label`` equals 1 are kept. The shape of each filtered
    split is printed before the original columns are replaced by:

    * ``source`` -- ``sentence1`` wrapped as ``"paraphrase: ... </s>"``
    * ``target`` -- ``sentence2`` unchanged
    """
    original_columns = ['sentence1', 'sentence2', 'id', 'label']

    def has_positive_label(example):
        return example['label'] == 1

    # Prepare datasets by formatting them to suit the T5 input requirements
    def prepare_examples(example):
        return {
            'source': f"paraphrase: {example['sentence1']} </s>",
            'target': example['sentence2'],
        }

    paws = load_dataset("paws", "labeled_final")

    # Filter both splits first, report their sizes, then rewrite the columns.
    kept_splits = [paws[split_name].filter(has_positive_label) for split_name in ('train', 'validation')]
    for subset in kept_splits:
        print(subset.shape)

    train_pairs, valid_pairs = [
        subset.map(prepare_examples, remove_columns=original_columns)
        for subset in kept_splits
    ]
    return train_pairs, valid_pairs

train_dataset, valid_dataset = load_and_prepare_paws()

(21829, 4)
(3539, 4)


In [9]:
def print_first_few(data, num_examples=3):
    """Print the 'source' and 'target' fields of the leading records of ``data``.

    Args:
        data: Integer-indexable collection that supports ``len()`` and whose
            items expose 'source' and 'target' keys.
        num_examples: Maximum number of records to print. When ``data`` holds
            fewer records, only the available ones are printed.
    """
    # Clamp to the collection size so short or empty inputs do not raise IndexError.
    shown = min(num_examples, len(data))
    for i in range(shown):
        record = data[i]  # fetch the row once instead of once per printed field
        print(f"Example {i+1}:")
        print(f"Source: {record['source']}")
        print(f"Target: {record['target']}\n")

# Display the first few examples of the prepared training split
# (print_first_few is called with its default num_examples)
print("First few training examples:")
print_first_few(train_dataset)

First few training examples:
Example 1:
Source: paraphrase: The NBA season of 1975 -- 76 was the 30th season of the National Basketball Association . </s>
Target: The 1975 -- 76 season of the National Basketball Association was the 30th season of the NBA .

Example 2:
Source: paraphrase: When comparable rates of flow can be maintained , the results are high . </s>
Target: The results are high when comparable flow rates can be maintained .

Example 3:
Source: paraphrase: It is the seat of Zerendi District in Akmola Region . </s>
Target: It is the seat of the district of Zerendi in Akmola region .



In [10]:
# Load the pretrained T5 checkpoint and its matching tokenizer
model_name = 't5-base'
tokenizer = T5Tokenizer.from_pretrained(model_name)
model = T5ForConditionalGeneration.from_pretrained(model_name)
# Move the model to the GPU when CUDA is available; otherwise leave it where it was loaded
model = model.to('cuda') if torch.cuda.is_available() else model

You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [11]:
def tokenize_function(examples, max_source_length=512, max_target_length=512):
    """Tokenize a batch of 'source'/'target' strings for seq2seq training.

    No padding is applied here: the DataCollatorForSeq2Seq handed to the
    trainer pads every batch when it is assembled. Padding at this stage
    would store pad-token ids inside ``labels``, where they would be scored
    as ordinary target tokens by the loss.

    Args:
        examples: Batch mapping with 'source' and 'target' lists of strings.
        max_source_length: Token limit for the inputs; longer ones are truncated.
        max_target_length: Token limit for the labels; longer ones are truncated.

    Returns:
        The tokenizer output for the sources, with the target token ids
        added under the 'labels' key.
    """
    # An explicit max_length is what makes truncation=True take effect.
    model_inputs = tokenizer(examples['source'], max_length=max_source_length, truncation=True)
    # text_target= replaces the deprecated as_target_tokenizer() context manager.
    labels = tokenizer(text_target=examples['target'], max_length=max_target_length, truncation=True)
    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

train_dataset = train_dataset.map(tokenize_function, batched=True)
valid_dataset = valid_dataset.map(tokenize_function, batched=True)

Map:   0%|          | 0/3539 [00:00<?, ? examples/s]

Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


In [12]:
# Collator the trainer uses to assemble (and pad) each batch
data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)

# Fine-tuning configuration: optimisation settings first, then batching,
# then the evaluation / checkpoint / generation options
training_args = Seq2SeqTrainingArguments(
    output_dir="./results",
    num_train_epochs=3,
    learning_rate=2e-5,
    weight_decay=0.01,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    eval_strategy="epoch",
    save_total_limit=3,
    predict_with_generate=True,
)

# Wire the model, data, tokenizer and collator into the trainer
trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=train_dataset,
    eval_dataset=valid_dataset,
    tokenizer=tokenizer,
)

# Run fine-tuning; as the cell's last expression the TrainOutput is displayed
trainer.train()


Epoch,Training Loss,Validation Loss
1,0.2195,0.211036
2,0.2024,0.203322
3,0.1888,0.202362


TrainOutput(global_step=8187, training_loss=0.23990501524022578, metrics={'train_runtime': 3653.2466, 'train_samples_per_second': 17.926, 'train_steps_per_second': 2.241, 'total_flos': 6228390381004800.0, 'train_loss': 0.23990501524022578, 'epoch': 3.0})

In [14]:

# Training is started by the trainer.train() call in the training cell;
# this cell only writes the resulting artifacts to disk.

# Saving the fine-tuned model and tokenizer
model_path = "./paraphrase_t5_model"  # one directory receives both the model and the tokenizer files
model.save_pretrained(model_path)
tokenizer.save_pretrained(model_path)





('./paraphrase_t5_model/tokenizer_config.json',
 './paraphrase_t5_model/special_tokens_map.json',
 './paraphrase_t5_model/spiece.model',
 './paraphrase_t5_model/added_tokens.json')

In [15]:
# Function to use the model for paraphrasing
def paraphrase(input_text, max_new_tokens=128, num_beams=1):
    """Generate a paraphrase of ``input_text`` with the fine-tuned model.

    Args:
        input_text: Sentence to paraphrase.
        max_new_tokens: Upper bound on the number of generated tokens. Set
            explicitly because the library's default generation length is
            short and cuts longer paraphrases off mid-sentence.
        num_beams: Beam width passed to ``generate``; 1 keeps greedy decoding.

    Returns:
        The decoded output string with special tokens removed.
    """
    model.eval()
    # Same "paraphrase: ... </s>" layout as the training sources.
    prompt = "paraphrase: " + input_text + " </s>"
    # Follow the model's actual device instead of assuming it sits on CUDA
    # whenever a GPU happens to be available.
    encoded = tokenizer(prompt, return_tensors="pt").to(model.device)
    with torch.no_grad():
        outputs = model.generate(
            **encoded,  # passes input_ids together with the attention mask
            max_new_tokens=max_new_tokens,
            num_beams=num_beams,
        )
    paraphrased_text = tokenizer.decode(outputs[0], skip_special_tokens=True)
    return paraphrased_text

In [18]:

# Example usage: paraphrase one sample sentence and print the returned text
print(paraphrase("The quick brown fox jumps over the lazy dog."))

The quick brown fox jumps over the lazy dog.


Mounted at /content/drive
