In [1]:
! pip install -q datasets evaluate transformers rouge-score nltk

In [2]:
import pandas as pd
from sklearn.model_selection import train_test_split
from datasets import DatasetDict, Dataset
import transformers 

# Record the installed transformers version alongside the results.
print(transformers.__version__)



4.36.2


In [3]:
# Pretrained checkpoint name; reused below to load the tokenizer and the model,
# to choose the task prefix, and to name the training output directory.
model_checkpoint = "t5-small"

In [4]:
from evaluate import load
# ROUGE scorer; compute_metrics calls metric.compute(...) on decoded text.
metric = load("rouge")

Downloading builder script:   0%|          | 0.00/6.27k [00:00<?, ?B/s]

In [5]:
# Download the paraphrase pairs (two columns) straight from GitHub and report
# the frame's (rows, columns) shape as the cell output.
DATA_URL = 'https://raw.githubusercontent.com/KAILASHVenkat/Paraphrasing_model/main/filtered_data.csv'
df = pd.read_csv(DATA_URL)
df.shape

(9389, 2)

In [6]:
# Longest source and target strings, measured in characters (not tokens).
max_length_input_text = df['input_text'].str.len().max()
max_length_target_text = df['target_text'].str.len().max()
print(max_length_input_text, max_length_target_text, sep="\n")

642
573


In [7]:
# Step 1: Hold out 20% of the rows; the remaining 80% is the training set.
train_data, temp_data = train_test_split(df, test_size=0.2, random_state=42)

# Step 2: Split the held-out rows evenly into validation and test sets.
validation_data, test_data = train_test_split(temp_data, test_size=0.5, random_state=42)

# Drop the shuffled original index so every split is numbered from 0.
train_data = train_data.reset_index(drop=True)
validation_data = validation_data.reset_index(drop=True)
test_data = test_data.reset_index(drop=True)

# Step 3: Wrap the two text columns of each split in a DatasetDict named raw_datasets.
raw_datasets = DatasetDict({
    split_name: Dataset.from_pandas(frame[['input_text', 'target_text']])
    for split_name, frame in (
        ('train', train_data),
        ('validation', validation_data),
        ('test', test_data),
    )
})

In [8]:
# Display the splits with their column names and row counts.
raw_datasets

DatasetDict({
    train: Dataset({
        features: ['input_text', 'target_text'],
        num_rows: 7511
    })
    validation: Dataset({
        features: ['input_text', 'target_text'],
        num_rows: 939
    })
    test: Dataset({
        features: ['input_text', 'target_text'],
        num_rows: 939
    })
})

In [9]:
import datasets
import random
import pandas as pd
from IPython.display import display, HTML

def show_random_elements(dataset, num_examples=5):
    """Render a random sample of distinct dataset rows as an HTML table.

    Args:
        dataset: object supporting ``len()``, indexing with a list of row
            positions, and a ``features`` mapping of column name to feature
            type.
        num_examples: number of distinct rows to show.

    Raises:
        AssertionError: if ``num_examples`` exceeds the dataset size.
    """
    assert num_examples <= len(dataset), "Can't pick more elements than there are in the dataset."
    # Sampling without replacement yields distinct row positions directly.
    picks = random.sample(range(len(dataset)), num_examples)

    frame = pd.DataFrame(dataset[picks])
    for column, typ in dataset.features.items():
        # Show class-label columns by name instead of by integer id.
        if isinstance(typ, datasets.ClassLabel):
            frame[column] = frame[column].transform(lambda i: typ.names[i])
    display(HTML(frame.to_html()))

In [10]:
# Eyeball a few random training pairs (default: 5 rows).
show_random_elements(raw_datasets["train"])

Unnamed: 0,input_text,target_text
0,"admiration to approval: However, the forward has also received praise from outside the Stadio Renzo Barbera.","Yet, the forward has also been recognized positively by those outside the Stadio Renzo Barbera."
1,gratitude to approval: The success of the Ofeq program has made Israel one of seven countries capable of launching such satellites.,The success of the Ofeq program has made Israel one of seven countries capable of launching such satellites.
2,confusion to curiosity: Are most conspiracy theories themselves conspiracies?,Are all theories about conspiracies conspiracy theories?
3,confusion to curiosity: Why does keyboard keys are random and not in alphabetical order?,Why aren't the letters on the keyboard in alphabetical order?
4,confusion to curiosity: Why it is necessary to do MBA after engineering?,Is it good to pursue MBA after engineering?


In [11]:
# Display the loaded metric's description and usage notes.
metric

EvaluationModule(name: "rouge", module_type: "metric", features: [{'predictions': Value(dtype='string', id='sequence'), 'references': Sequence(feature=Value(dtype='string', id='sequence'), length=-1, id=None)}, {'predictions': Value(dtype='string', id='sequence'), 'references': Value(dtype='string', id='sequence')}], usage: """
Calculates average rouge scores for a list of hypotheses and references
Args:
    predictions: list of predictions to score. Each prediction
        should be a string with tokens separated by spaces.
    references: list of reference for each prediction. Each
        reference should be a string with tokens separated by spaces.
    rouge_types: A list of rouge types to calculate.
        Valid names:
        `"rouge{n}"` (e.g. `"rouge1"`, `"rouge2"`) where: {n} is the n-gram based scoring,
        `"rougeL"`: Longest common subsequence based scoring.
        `"rougeLsum"`: rougeLsum splits text using `"
"`.
        See details in https://github.com/huggingface/

In [12]:
from transformers import AutoTokenizer
    
# Tokenizer matching the chosen checkpoint; used for preprocessing, metric decoding and inference.
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)

tokenizer_config.json:   0%|          | 0.00/2.32k [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.39M [00:00<?, ?B/s]

In [13]:
# Select a task prefix for the original T5 checkpoints; any other checkpoint
# gets an empty prefix.
# NOTE(review): preprocess_function does not prepend `prefix`; the dataset rows
# already start with their own "<emotion> to <emotion>: " tag — confirm that
# leaving `prefix` unused is intentional.
if model_checkpoint in ["t5-small", "t5-base", "t5-large", "t5-3b", "t5-11b"]:
    prefix = "paraphrase: "
else:
    prefix = ""

In [14]:
# Truncation limits handed to the tokenizer as `max_length` in preprocess_function
# (the input limit is reused by the inference cell).
# NOTE(review): these sit just above the longest input/target measured earlier
# (642 / 573), but those were character counts while `max_length` counts tokens.
max_input_length = 650
max_target_length = 580

def preprocess_function(examples):
    """Tokenize a batch of source/target text pairs for seq2seq training.

    Args:
        examples: batch mapping with "input_text" and "target_text" columns.

    Returns:
        The tokenizer output for the inputs, with the target token ids stored
        under "labels". Both sides are truncated to the module-level
        ``max_input_length`` / ``max_target_length``.
    """
    source_texts = list(examples["input_text"])
    encoded = tokenizer(source_texts, max_length=max_input_length, truncation=True)

    # Targets are encoded through the tokenizer's `text_target` argument.
    target_encoding = tokenizer(
        text_target=examples["target_text"],
        max_length=max_target_length,
        truncation=True,
    )

    encoded["labels"] = target_encoding["input_ids"]
    return encoded

In [15]:
# Sanity check: tokenize the first two training rows and inspect the ids.
preprocess_function(raw_datasets['train'][:2])

{'input_ids': [[17142, 12, 18967, 10, 571, 186, 1440, 33, 132, 16, 8, 296, 58, 1333, 55, 1], [3922, 12, 5142, 10, 12433, 11895, 29, 31, 7, 4806, 6, 15364, 6, 243, 160, 384, 47, 10693, 28, 2089, 31, 7, 23173, 5, 1]], 'attention_mask': [[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]], 'labels': [[571, 186, 1440, 33, 132, 58, 1], [2150, 12, 15364, 6, 12433, 11895, 29, 31, 7, 4806, 6, 2089, 31, 7, 23173, 1940, 248, 5044, 12, 70, 384, 5, 1]]}

In [16]:
# Tokenize every split; batched=True hands preprocess_function batches of rows rather than single rows.
tokenized_datasets = raw_datasets.map(preprocess_function, batched=True)

  0%|          | 0/8 [00:00<?, ?ba/s]

  0%|          | 0/1 [00:00<?, ?ba/s]

  0%|          | 0/1 [00:00<?, ?ba/s]

In [17]:
from transformers import AutoModelForSeq2SeqLM, DataCollatorForSeq2Seq, Seq2SeqTrainingArguments, Seq2SeqTrainer

# Load the pretrained seq2seq weights for the chosen checkpoint; this is the model that gets fine-tuned.
model = AutoModelForSeq2SeqLM.from_pretrained(model_checkpoint)

config.json:   0%|          | 0.00/1.21k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/242M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

In [18]:
# Training configuration: evaluate once per epoch, generate text during
# evaluation so text metrics can be computed, and keep at most 3 checkpoints.
batch_size = 8
model_name = model_checkpoint.rsplit("/", 1)[-1]
args = Seq2SeqTrainingArguments(
    output_dir=f"{model_name}-finetuned-T5",
    # schedule
    num_train_epochs=5,
    evaluation_strategy="epoch",
    save_total_limit=3,
    # optimisation
    learning_rate=7e-3,
    weight_decay=0.01,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    fp16=True,
    # evaluation / publishing
    predict_with_generate=True,
    push_to_hub=False,
)

In [19]:
# Batch collator for the trainer, built from the tokenizer and the model.
data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)

In [20]:
import nltk
import numpy as np
from nltk.translate.bleu_score import SmoothingFunction, sentence_bleu, corpus_bleu

def compute_metrics(eval_pred):
    """Score generated paraphrases against their references.

    Args:
        eval_pred: ``(predictions, labels)`` pair of token-id arrays.
            ``predictions`` may also arrive wrapped in a tuple, in which case
            its first element is used.

    Returns:
        dict mapping metric name to a value rounded to 4 decimals:
        ``gen_len`` (mean count of non-pad prediction tokens), ``bleu``,
        ``exact_sr`` (share of predictions identical to their reference),
        ``exact_fe`` (harmonic mean of ``exact_sr`` and BLEU), plus every key
        returned by the ROUGE metric. All except ``gen_len`` are scaled by 100.
    """
    predictions, labels = eval_pred
    if isinstance(predictions, tuple):
        predictions = predictions[0]

    # -100 is a fill value, not a vocabulary id, so it cannot be decoded.
    # Swap it for the pad token in both arrays (predictions can carry it too
    # when generated batches of different lengths are gathered).
    predictions = np.where(predictions != -100, predictions, tokenizer.pad_token_id)
    labels = np.where(labels != -100, labels, tokenizer.pad_token_id)

    decoded_preds = tokenizer.batch_decode(predictions, skip_special_tokens=True)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    # Rouge expects a newline after each sentence
    decoded_preds = ["\n".join(nltk.sent_tokenize(pred.strip())) for pred in decoded_preds]
    decoded_labels = ["\n".join(nltk.sent_tokenize(label.strip())) for label in decoded_labels]

    # Corpus-level BLEU on whitespace tokens, one reference per prediction.
    smoothing = SmoothingFunction().method1
    bleu_score = corpus_bleu(
        [[ref.split()] for ref in decoded_labels],
        [pred.split() for pred in decoded_preds],
        smoothing_function=smoothing,
    )

    # exact_sr: fraction of predictions that equal their reference string.
    # exact_fe: harmonic mean of exact_sr and BLEU (0.0 when both are zero).
    exact_matches = sum(label == pred for label, pred in zip(decoded_labels, decoded_preds))
    exact_sr = exact_matches / len(decoded_labels)
    exact_fe = 2 * (exact_sr * bleu_score) / (exact_sr + bleu_score) if (exact_sr + bleu_score) > 0 else 0.0

    # ROUGE scores, scaled to percentages
    rouge_output = metric.compute(predictions=decoded_preds, references=decoded_labels, use_stemmer=True, use_aggregator=True)
    rouge_scores = {key: value * 100 for key, value in rouge_output.items()}

    # Mean generated length, counted in non-pad tokens
    prediction_lens = [np.count_nonzero(pred != tokenizer.pad_token_id) for pred in predictions]
    gen_len = np.mean(prediction_lens)

    result = {
        "gen_len": gen_len,
        "bleu": bleu_score * 100,
        "exact_sr": exact_sr * 100,
        "exact_fe": exact_fe * 100,
        **rouge_scores,
    }

    return {k: round(v, 4) for k, v in result.items()}

In [21]:
# Wire model, training arguments, tokenized splits, collator and metric
# function into a single trainer; the validation split is used for evaluation.
trainer = Seq2SeqTrainer(
    model=model,
    args=args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["validation"],
    data_collator=data_collator,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)

In [22]:
# Run fine-tuning; the returned TrainOutput (steps, loss, runtime stats) is shown as the cell output.
trainer.train()

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
[34m[1mwandb[0m: Paste an API key from your profile and hit enter, or press ctrl+c to quit:

  ········································


[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


You're using a T5TokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Epoch,Training Loss,Validation Loss,Gen Len,Bleu,Exact Sr,Exact Fe,Rouge1,Rouge2,Rougel,Rougelsum
1,No log,1.787308,14.0224,25.2184,5.1118,8.5006,57.7313,36.6448,54.0227,54.2219
2,2.040800,1.638021,14.1565,25.8662,5.3248,8.8316,57.9187,37.2205,54.1583,54.3375
3,1.556500,1.541077,14.2375,26.0557,5.5378,9.1342,58.1161,37.1592,54.1156,54.2697
4,1.190200,1.524991,14.4345,26.7641,5.1118,8.5841,59.07,38.6348,55.1924,55.3979
5,0.859400,1.58614,14.524,27.7158,6.0703,9.9593,60.0628,39.6954,56.0108,56.2041




TrainOutput(global_step=2350, training_loss=1.297681358824385, metrics={'train_runtime': 547.3652, 'train_samples_per_second': 68.611, 'train_steps_per_second': 4.293, 'total_flos': 522769867800576.0, 'train_loss': 1.297681358824385, 'epoch': 5.0})

In [23]:
import torch

In [25]:
import torch

# Take one example from the mapped test split; its raw "input_text" is read below.
sample_input = tokenized_datasets["test"][5]

# Tokenize the input
tokenized_input = tokenizer(sample_input["input_text"], return_tensors="pt", max_length=max_input_length, truncation=True)

# Put the model and the input tensors on the same device (GPU if available),
# and switch to eval mode so dropout is disabled, instead of relying on
# whatever state the training run left the model in.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)
model.eval()
tokenized_input = {key: value.to(device) for key, value in tokenized_input.items()}

# Generate output
with torch.no_grad():
    generated_output = model.generate(
        **tokenized_input,
        max_length=400,  # Upper bound on the generated sequence length
        num_beams=4,     # Beam search width
    )

# Postprocess the Output
decoded_output = tokenizer.batch_decode(generated_output, skip_special_tokens=True)[0]

# Print input text and generated output
print("Input Text:")
print(sample_input["input_text"])
print("\nGenerated Output:")
print(decoded_output)


Input Text:
confusion to curiosity: How do I know if a girl likes me back or not?

Generated Output:
How do I know if this girl likes me?


In [26]:
# Write the fine-tuned model and the tokenizer files into the same directory
# so both can be reloaded from that one path.
# NOTE(review): this relative path looks like a placeholder ("path/to/save")
# joined with the Kaggle working dir — confirm the intended destination.
model.save_pretrained("path/to/save/kaggle/working/")
tokenizer.save_pretrained("path/to/save/kaggle/working/")

('path/to/save/kaggle/working/tokenizer_config.json',
 'path/to/save/kaggle/working/special_tokens_map.json',
 'path/to/save/kaggle/working/spiece.model',
 'path/to/save/kaggle/working/added_tokens.json',
 'path/to/save/kaggle/working/tokenizer.json')

In [None]:
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer

# Load the saved model and tokenizer back from the directory written by the
# save cell, under new names so the in-memory `model`/`tokenizer` stay untouched.
loaded_model = AutoModelForSeq2SeqLM.from_pretrained("path/to/save/kaggle/working/")
loaded_tokenizer = AutoTokenizer.from_pretrained("path/to/save/kaggle/working/")