In [1]:
import torch
from datasets import load_dataset
from torch import GradScaler
from transformers import AutoTokenizer, DataCollatorWithPadding

from Pytorch.CNN import outputs

In [13]:
raw_datasets = load_dataset('glue', 'mrpc')

In [14]:
checkpoint = 'bert-base-uncased'
tokenizer = AutoTokenizer.from_pretrained(checkpoint)

In [26]:
def tokenize_function (example):
    return tokenizer(list(example['sentence1']),list(example['sentence2']), truncation=True, padding = 'max_length')

In [27]:
tokenized_datasets = raw_datasets.map(tokenize_function, batched=True)

Map:   0%|          | 0/3668 [00:00<?, ? examples/s]

Map:   0%|          | 0/408 [00:00<?, ? examples/s]

Map:   0%|          | 0/1725 [00:00<?, ? examples/s]

In [28]:
data_collator = DataCollatorWithPadding(tokenizer = tokenizer)

In [29]:
raw_datasets

DatasetDict({
    train: Dataset({
        features: ['sentence1', 'sentence2', 'label', 'idx'],
        num_rows: 3668
    })
    validation: Dataset({
        features: ['sentence1', 'sentence2', 'label', 'idx'],
        num_rows: 408
    })
    test: Dataset({
        features: ['sentence1', 'sentence2', 'label', 'idx'],
        num_rows: 1725
    })
})

In [30]:
raw_datasets['train'].column_names

['sentence1', 'sentence2', 'label', 'idx']

In [31]:
# Now we will do preprocessing before going into the training loop
tokenized_datasets = tokenized_datasets.remove_columns(['sentence1', 'sentence2', 'idx'])

In [32]:
tokenized_datasets = tokenized_datasets.rename_column('label','labels')# model accepts 'labels' not 'label'

In [33]:
tokenized_datasets.set_format('torch')

In [34]:
tokenized_datasets['train'].column_names # These are the columns that the model will accept

['labels', 'input_ids', 'token_type_ids', 'attention_mask']

In [35]:
# now we will define teh dataloader of pytorch
from torch.utils.data import DataLoader

In [36]:
train_dataloader = DataLoader(
    tokenized_datasets['train'],shuffle = True, batch_size = 16, collate_fn = data_collator
)

In [37]:
eval_dataloader = DataLoader(
    tokenized_datasets['validation'], batch_size = 16, collate_fn = data_collator
)

In [38]:
# Checking if everything is alright
for batch in train_dataloader:
    break
{k: v.shape for k ,v in batch.items()}

{'labels': torch.Size([16]),
 'input_ids': torch.Size([16, 512]),
 'token_type_ids': torch.Size([16, 512]),
 'attention_mask': torch.Size([16, 512])}

In [40]:
# Now we instantiate the model
from transformers import AutoModelForSequenceClassification
model = AutoModelForSequenceClassification.from_pretrained(checkpoint, num_labels=2)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [41]:
outputs = model(**batch)

In [42]:
print(outputs.loss, outputs.logits.shape)

tensor(0.5700, grad_fn=<NllLossBackward0>) torch.Size([16, 2])


In [44]:
# here we will use the AdamW opeimizer from pytorch
from torch.optim import AdamW
optimizer = AdamW(model.parameters(), lr = 5e-5, weight_decay=0.01)

In [47]:
# here we will defice the learning rate scheduler
from transformers import get_scheduler

num_epochs = 3
num_training_steps = num_epochs*len(train_dataloader)
lr_scheduler = get_scheduler(
    'linear',
    optimizer = optimizer,
    num_warmup_steps = 0,
    num_training_steps = num_training_steps
)
print(num_training_steps)

690


In [48]:
# I will use the GPU (RTX 3070) here (if you donot have one then donot run this cell)
import torch
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)
device

device(type='cuda')

In [49]:
# now we will make the training Loop
from tqdm.auto import tqdm

progress_bar = tqdm(range(num_training_steps))

model.train()
for epoch in range(num_epochs):
    for batch in train_dataloader:
        batch = {k: v.to(device) for k,v in batch.items()}
        outputs = model(**batch)
        loss = outputs.loss
        loss.backward()

        optimizer.step()
        lr_scheduler.step()
        optimizer.zero_grad()
        progress_bar.update()

  0%|          | 0/690 [00:00<?, ?it/s]

In [51]:
# now we will make the evaluation loop
import evaluate

metric  = evaluate.load('glue', 'mrpc')
model.eval()
for batch in eval_dataloader:
    batch = {k:v.to(device) for k,v in batch.items()}
    with torch.no_grad():
        outputs = model(**batch)

    logits= outputs.logits
    predictions = torch.argmax(logits, dim = -1)
    metric.add_batch(predictions = predictions ,references =  batch['labels'])

metric.compute()

{'accuracy': 0.8504901960784313, 'f1': 0.893542757417103}