# Importing DataSet & Libs

In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import torch
import os

from datasets import load_dataset
from transformers import AutoTokenizer, T5Tokenizer, T5ForConditionalGeneration, TrainingArguments, Trainer
from sklearn.metrics import classification_report, confusion_matrix

In [2]:
dataset = load_dataset('paws', 'labeled_final')

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


README.md: 0.00B [00:00, ?B/s]

labeled_final/train-00000-of-00001.parqu(…):   0%|          | 0.00/8.43M [00:00<?, ?B/s]

labeled_final/test-00000-of-00001.parque(…):   0%|          | 0.00/1.24M [00:00<?, ?B/s]

labeled_final/validation-00000-of-00001.(…):   0%|          | 0.00/1.23M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/49401 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/8000 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/8000 [00:00<?, ? examples/s]

# Preprocessing DataSet

In [3]:
def preprocess_paws(dataset, label= 1):
    df = dataset.to_pandas()
    df = df[df['label'] == label] # keep only label rows.

    df['input_text'] = 'paraphrase:' + df['sentence1']
    df['target_text'] = df['sentence2']

    return df[['input_text', 'target_text']]

train_df = preprocess_paws(dataset['train']).sample(3000, random_state= 42)
test_df = preprocess_paws(dataset['test']).sample(300, random_state= 42)
validation_df = preprocess_paws(dataset['validation']).sample(300, random_state= 42)

In [4]:
from datasets import Dataset

train_dataset = Dataset.from_pandas(train_df)
test_dataset = Dataset.from_pandas(test_df)
validation_dataset = Dataset.from_pandas(validation_df)

# Loading Model & Tokenization

In [5]:
model_name = 't5-base'
tokenizer = T5Tokenizer.from_pretrained(model_name)
model = T5ForConditionalGeneration.from_pretrained(model_name)
max_len = max([len(tokenizer.encode(text))for text in train_df['input_text']])

def tokenization(demo):
    inputs = tokenizer(demo['input_text'], max_length= max_len, padding= 'max_length', truncation= True)
    targets = tokenizer(demo['target_text'], max_length= max_len, padding= 'max_length', truncation= True)
    inputs['labels'] = targets['input_ids']
    return inputs

train_dataset = train_dataset.map(tokenization, batched= True)
test_dataset = test_dataset.map(tokenization, batched= True)
validation_dataset = validation_dataset.map(tokenization, batched= True)

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.39M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.21k [00:00<?, ?B/s]

You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565


model.safetensors:   0%|          | 0.00/892M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

Map:   0%|          | 0/3000 [00:00<?, ? examples/s]

Map:   0%|          | 0/300 [00:00<?, ? examples/s]

Map:   0%|          | 0/300 [00:00<?, ? examples/s]

In [6]:
train_dataset[0]

{'input_text': 'paraphrase:William died in 1859 and Elizabeth died the following year .',
 'target_text': 'In 1859 , William and Elizabeth died the following year .',
 '__index_level_0__': 28667,
 'input_ids': [3856,
  27111,
  10,
  518,
  1092,
  23,
  265,
  3977,
  16,
  507,
  3390,
  11,
  9066,
  3977,
  8,
  826,
  215,
  3,
  5,
  1,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0],
 'attention_mask': [1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
 

# TraningArguments of Model

In [7]:
traning_args = TrainingArguments(
    output_dir= './results',
    eval_strategy = 'epoch',
    save_strategy = 'epoch',
    learning_rate = 5e-5,
    per_device_train_batch_size = 8,
    per_device_eval_batch_size = 8,
    num_train_epochs = 5,
    weight_decay = 0.01,
    warmup_steps = 500,
    load_best_model_at_end = True,
    metric_for_best_model = 'eval_loss',
    logging_steps = 10,
    logging_dir = './logs'
    )

trainer = Trainer(
    model = model,
    args = traning_args,
    train_dataset = train_dataset,
    eval_dataset = validation_dataset
)

trainer.train()

  | |_| | '_ \/ _` / _` |  _/ -_)


<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize?ref=models
wandb: Paste an API key from your profile and hit enter:

 ··········


[34m[1mwandb[0m: No netrc file found, creating one.
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc
[34m[1mwandb[0m: Currently logged in as: [33marsalbaig5145[0m ([33marsalbaig5145-arsal[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


Epoch,Training Loss,Validation Loss
1,0.276,0.239749
2,0.1912,0.210659
3,0.1855,0.208882
4,0.1724,0.208094
5,0.1551,0.208468


There were missing keys in the checkpoint model loaded: ['encoder.embed_tokens.weight', 'decoder.embed_tokens.weight', 'lm_head.weight'].


TrainOutput(global_step=1875, training_loss=0.651139306640625, metrics={'train_runtime': 881.6901, 'train_samples_per_second': 17.013, 'train_steps_per_second': 2.127, 'total_flos': 1338042240000000.0, 'train_loss': 0.651139306640625, 'epoch': 5.0})

# Saving Model & Tokenizer

In [8]:
save_dir = './saved_model'
model.save_pretrained(save_dir)
tokenizer.save_pretrained(save_dir)

('./saved_model/tokenizer_config.json',
 './saved_model/special_tokens_map.json',
 './saved_model/spiece.model',
 './saved_model/added_tokens.json')

In [20]:
!zip -r /content/saved_model.zip /content/saved_model/

  adding: content/saved_model/ (stored 0%)
  adding: content/saved_model/config.json (deflated 63%)
  adding: content/saved_model/added_tokens.json (deflated 83%)
  adding: content/saved_model/model.safetensors (deflated 8%)
  adding: content/saved_model/special_tokens_map.json (deflated 85%)
  adding: content/saved_model/tokenizer_config.json (deflated 94%)
  adding: content/saved_model/spiece.model (deflated 48%)
  adding: content/saved_model/generation_config.json (deflated 29%)


# Prefixing Input

In [13]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f"Using device: {device}")

model.to(device)
def prefixing_input(demo):

    model = T5ForConditionalGeneration.from_pretrained(save_dir)
    tokenizer = T5Tokenizer.from_pretrained(save_dir)
    return 'paraphrase' + demo

Using device: cuda


# Paraphrase Generation

In [14]:
def generate_paraphrase(inp_text, model, tokenizer, device, max_len= 75, num_beams= 5, num_return_sequences= 4, top_k= 100, top_p= 1, temperature= 2.5):

    model.to(device)
    inp_text = prefixing_input(inp_text)
    inputs = tokenizer(inp_text, return_tensors= 'pt', truncation= True, max_length= max_len, padding= 'max_length')
    inputs = {key : value.to(device) for key, value in inputs.items()} # moves the result to the same device as the model.

    outputs = model.generate(
        input_ids = inputs['input_ids'],
        attention_mask = inputs['attention_mask'],
        max_length = max_len + 20,
        num_beams = num_beams, # Beam Search Decoding Strategy.
        num_return_sequences = num_return_sequences, # no of results.
        top_k = top_k, # model consider the set 'top_k' next words only.
        top_p = top_p, # model selects k likely next words whose cumulative freq exced threshold.
        temperature = temperature, # randomness.
        do_sample = True,
        early_stopping = True
    )

    paraphrase_text = [tokenizer.decode(output, skip_special_tokens= True) for output in outputs]
    return paraphrase_text

# Examples

In [15]:
# Example 1
input_sen = 'The quick brown fox jumps over the lazy dog.'
paraphrase_sen = generate_paraphrase(input_sen, model, tokenizer, device, max_len, num_return_sequences=4)

print(f'Original Sentence: {input_sen}')
for i, paraphrase in enumerate(paraphrase_sen, 1):
    print(f'Paraphrase {i}: {paraphrase}')

Original Sentence: The quick brown fox jumps over the lazy dog.
Paraphrase 1: The quick brown fox jumps over the lazy dog.
Paraphrase 2: The quick brown fox jumps over the lazy dog
Paraphrase 3: The quick brown fox jumps over the lazy dog to the surprise.
Paraphrase 4: The quick brown fox jumps over the lazy dog to the top of the field.


In [16]:
# Example 2
input_sen = 'The dog barked loudly at the stranger outside the house.'
paraphrase_sen = generate_paraphrase(input_sen, model, tokenizer, device, max_len, num_return_sequences= 4)

print(f'Original Sentence: {input_sen}')
for i, paraphrase in enumerate(paraphrase_sen, 1):
    print(f'Paraphrase {i}: {paraphrase}')

Original Sentence: The dog barked loudly at the stranger outside the house.
Paraphrase 1: The dog barked loudly at the stranger outside the house.
Paraphrase 2: The dog barked loudly at the stranger outside the house .
Paraphrase 3: The dog barked loudly at the stranger outside.
Paraphrase 4: The dog barked loudly at the stranger outside the house.
