# Paraphrase classification

Watch the following course and finetune a model to build a paraphrase classifier [Fine-tuning a pretrained model](https://huggingface.co/learn/nlp-course/chapter3/1?fw=pt)

from videos

In [None]:
!pip install datasets evaluate transformers[sentencepiece]



In [None]:
from datasets import load_dataset

raw_datasets = load_dataset("glue", "mrpc")


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


In [None]:
from transformers import AutoTokenizer

checkpoint = "bert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
tokenized_sentences_1 = tokenizer(raw_datasets["train"]["sentence1"])
tokenized_sentences_2 = tokenizer(raw_datasets["train"]["sentence2"])

In [None]:
inputs = tokenizer("This is the first sentence.", "This is the second one.")
inputs

{'input_ids': [101, 2023, 2003, 1996, 2034, 6251, 1012, 102, 2023, 2003, 1996, 2117, 2028, 1012, 102], 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}

In [None]:
tokenizer.convert_ids_to_tokens(inputs["input_ids"])

['[CLS]',
 'this',
 'is',
 'the',
 'first',
 'sentence',
 '.',
 '[SEP]',
 'this',
 'is',
 'the',
 'second',
 'one',
 '.',
 '[SEP]']

In [9]:
import pandas as pd

data = {
    'sentence1': ["The quick brown fox jumps over the lazy dog.", "A fast, dark-colored fox leaps over a sleepy dog."],
    'sentence2': ["A speedy fox jumps over the dog.", "A quick, brown fox jumps over the lazy dog."],
    'label': [1, 1]  # 1 for paraphrases, 0 for non-paraphrases
}

df = pd.DataFrame(data)


In [10]:
from datasets import Dataset
from transformers import AutoTokenizer

dataset = Dataset.from_pandas(df)
model_name = 'bert-base-uncased'
tokenizer = AutoTokenizer.from_pretrained(model_name)

In [11]:
def tokenize_function(examples):
    return tokenizer(examples['sentence1'], examples['sentence2'], padding='max_length', truncation=True)

In [12]:
tokenized_dataset = dataset.map(tokenize_function, batched=True)

Map:   0%|          | 0/2 [00:00<?, ? examples/s]

In [13]:
from transformers import AutoModelForSequenceClassification

model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=2)


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [14]:
from transformers import Trainer, TrainingArguments

# Define the training arguments
training_args = TrainingArguments(
    output_dir='./results',
    num_train_epochs=3,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    warmup_steps=500,
    weight_decay=0.01,
    logging_dir='./logs',
    logging_steps=10,
)

# Initialize the Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset,
    eval_dataset=tokenized_dataset,
)

# Train the model
trainer.train()


Step,Training Loss


TrainOutput(global_step=3, training_loss=0.9821305274963379, metrics={'train_runtime': 57.8092, 'train_samples_per_second': 0.104, 'train_steps_per_second': 0.052, 'total_flos': 1578666332160.0, 'train_loss': 0.9821305274963379, 'epoch': 3.0})

In [15]:
results = trainer.evaluate()
print(results)

{'eval_loss': 0.949331521987915, 'eval_runtime': 6.8248, 'eval_samples_per_second': 0.293, 'eval_steps_per_second': 0.147, 'epoch': 3.0}


pretrained

In [16]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import torch

# Load the tokenizer and model
model_name = 'bert-base-uncased'
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSequenceClassification.from_pretrained(model_name)

# Define the input sentences
sentence1 = "The quick brown fox jumps over the lazy dog."
sentence2 = "A fast, dark-colored fox leaps over a sleepy dog."

# Tokenize the sentences
inputs = tokenizer(sentence1, sentence2, return_tensors='pt', padding=True, truncation=True)

# Get the model's output
outputs = model(**inputs)
logits = outputs.logits

# Apply softmax to get probabilities
probs = torch.softmax(logits, dim=1)
paraphrase_prob = probs[0][1].item()
non_paraphrase_prob = probs[0][0].item()

# Print the results
print(f"Paraphrase probability: {paraphrase_prob}")
print(f"Non-paraphrase probability: {non_paraphrase_prob}")

# Determine if the sentences are paraphrases
if paraphrase_prob > non_paraphrase_prob:
    print("The sentences are paraphrases.")
else:
    print("The sentences are not paraphrases.")


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Paraphrase probability: 0.4411696791648865
Non-paraphrase probability: 0.5588303804397583
The sentences are not paraphrases.
