In [3]:
import torch
from datasets import load_dataset
from torch import GradScaler
from transformers import AutoTokenizer, DataCollatorWithPadding

from Pytorch.CNN import outputs



In [4]:
raw_datasets = load_dataset('glue', 'mrpc')

In [5]:
checkpoint = 'bert-base-uncased'
tokenizer = AutoTokenizer.from_pretrained(checkpoint)

In [6]:
def tokenize_function (example):
    return tokenizer(list(example['sentence1']),list(example['sentence2']), truncation=True, padding = 'max_length')

In [7]:
tokenized_datasets = raw_datasets.map(tokenize_function, batched=True)

Map:   0%|          | 0/3668 [00:00<?, ? examples/s]

Map:   0%|          | 0/408 [00:00<?, ? examples/s]

Map:   0%|          | 0/1725 [00:00<?, ? examples/s]

In [8]:
data_collator = DataCollatorWithPadding(tokenizer = tokenizer)

In [9]:
raw_datasets

DatasetDict({
    train: Dataset({
        features: ['sentence1', 'sentence2', 'label', 'idx'],
        num_rows: 3668
    })
    validation: Dataset({
        features: ['sentence1', 'sentence2', 'label', 'idx'],
        num_rows: 408
    })
    test: Dataset({
        features: ['sentence1', 'sentence2', 'label', 'idx'],
        num_rows: 1725
    })
})

In [10]:
raw_datasets['train'].column_names

['sentence1', 'sentence2', 'label', 'idx']

In [11]:
# Now we will do preprocessing before going into the training loop
tokenized_datasets = tokenized_datasets.remove_columns(['sentence1', 'sentence2', 'idx'])

In [12]:
tokenized_datasets = tokenized_datasets.rename_column('label','labels')# model accepts 'labels' not 'label'

In [13]:
tokenized_datasets.set_format('torch')

In [14]:
tokenized_datasets['train'].column_names # These are the columns that the model will accept

['labels', 'input_ids', 'token_type_ids', 'attention_mask']

In [15]:
# now we will define teh dataloader of pytorch
from torch.utils.data import DataLoader

In [16]:
train_dataloader = DataLoader(
    tokenized_datasets['train'],shuffle = True, batch_size = 16, collate_fn = data_collator
)

In [17]:
eval_dataloader = DataLoader(
    tokenized_datasets['validation'], batch_size = 16, collate_fn = data_collator
)

In [18]:
# Checking if everything is alright
for batch in train_dataloader:
    break
{k: v.shape for k ,v in batch.items()}

{'labels': torch.Size([16]),
 'input_ids': torch.Size([16, 512]),
 'token_type_ids': torch.Size([16, 512]),
 'attention_mask': torch.Size([16, 512])}

In [40]:
# Now we instantiate the model
from transformers import AutoModelForSequenceClassification
model = AutoModelForSequenceClassification.from_pretrained(checkpoint, num_labels=2)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [41]:
outputs = model(**batch)

In [42]:
print(outputs.loss, outputs.logits.shape)

tensor(0.5700, grad_fn=<NllLossBackward0>) torch.Size([16, 2])


In [44]:
# here we will use the AdamW opeimizer from pytorch
from torch.optim import AdamW
optimizer = AdamW(model.parameters(), lr = 5e-5, weight_decay=0.01)

In [47]:
# here we will defice the learning rate scheduler
from transformers import get_scheduler

num_epochs = 3
num_training_steps = num_epochs*len(train_dataloader)
lr_scheduler = get_scheduler(
    'linear',
    optimizer = optimizer,
    num_warmup_steps = 0,
    num_training_steps = num_training_steps
)
print(num_training_steps)

690


In [48]:
# I will use the GPU (RTX 3070) here (if you donot have one then donot run this cell)
import torch
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)
device

device(type='cuda')

In [49]:
# now we will make the training Loop
from tqdm.auto import tqdm

progress_bar = tqdm(range(num_training_steps))

model.train()
for epoch in range(num_epochs):
    for batch in train_dataloader:
        batch = {k: v.to(device) for k,v in batch.items()}
        outputs = model(**batch)
        loss = outputs.loss
        loss.backward()

        optimizer.step()
        lr_scheduler.step()
        optimizer.zero_grad()
        progress_bar.update()

  0%|          | 0/690 [00:00<?, ?it/s]

In [51]:
# now we will make the evaluation loop
import evaluate

metric  = evaluate.load('glue', 'mrpc')
model.eval()
for batch in eval_dataloader:
    batch = {k:v.to(device) for k,v in batch.items()}
    with torch.no_grad():
        outputs = model(**batch)

    logits= outputs.logits
    predictions = torch.argmax(logits, dim = -1)
    metric.add_batch(predictions = predictions ,references =  batch['labels'])

metric.compute()

{'accuracy': 0.8504901960784313, 'f1': 0.893542757417103}

Now Using Accelerator if there are multiple GPUs

In [19]:
from accelerate import Accelerator
from torch.optim import AdamW
from transformers import AutoModelForSequenceClassification, get_scheduler

In [20]:
accelerator = Accelerator()

In [21]:
model = AutoModelForSequenceClassification.from_pretrained(checkpoint, num_labels=2)
optimizer = AdamW(model.parameters(), lr  = 3e-5)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [22]:
train_dl, eval_dl, model, optimizer = accelerator.prepare(train_dataloader,eval_dataloader,model,optimizer
)

In [23]:
num_epochs = 3
num_training_steps = num_epochs*len(train_dl)
lr_scheduler = get_scheduler(
    'linear',
    optimizer = optimizer,
    num_warmup_steps = 0,
    num_training_steps = num_training_steps
)

In [25]:
from tqdm.auto import tqdm
progress_bar = tqdm(range(num_training_steps))

  0%|          | 0/690 [00:00<?, ?it/s]

In [26]:
model.train()
for epoch in range(num_epochs):
    for batch in train_dl:
        outputs = model(**batch)
        loss = outputs.loss
        accelerator.backward(loss)

        optimizer.step()
        lr_scheduler.step()
        optimizer.zero_grad()
        progress_bar.update(1)


In [29]:
import torch
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [27]:
import evaluate

In [30]:
metric = evaluate.load('glue', 'mrpc')
model.eval()
for batch in eval_dl:
    batch = {k:v.to(device) for k,v in batch.items()}
    with torch.no_grad():
        outputs = model(**batch)

    logits = outputs.logits
    predictions = torch.argmax(logits, dim=-1)
    metric.add_batch(predictions = predictions ,references = batch['labels'])

metric.compute()

{'accuracy': 0.8529411764705882, 'f1': 0.8983050847457628}

In [33]:
from transformers import AutoTokenizer

# Replace with your model name if needed
model_name = "bert-base-uncased"

# Save both model and tokenizer
model.save_pretrained("./bert_mrpc_model")
tokenizer = AutoTokenizer.from_pretrained(model_name)
tokenizer.save_pretrained("./bert_mrpc_model")

print("✅ Model and tokenizer saved to './bert_mrpc_model'")

✅ Model and tokenizer saved to './bert_mrpc_model'


In [37]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import torch
import torch.nn.functional as F

# Load model and tokenizer
model = AutoModelForSequenceClassification.from_pretrained("./bert_mrpc_model")
tokenizer = AutoTokenizer.from_pretrained("./bert_mrpc_model")

model.eval()

def check_paraphrase(sentence1, sentence2):
    inputs = tokenizer(sentence1, sentence2, return_tensors="pt", truncation=True, padding=True)
    with torch.no_grad():
        outputs = model(**inputs)
        probs = F.softmax(outputs.logits, dim=-1)
        pred = torch.argmax(probs, dim=-1).item()
        conf = probs[0, pred].item()

    label = "Paraphrase" if pred == 1 else "Not Paraphrase"
    return f"{label} (confidence: {conf:.2f})"

# Try it
print(check_paraphrase("The boy is playing football.", "I am training LLMs."))


Not Paraphrase (confidence: 0.98)
