<a href="https://colab.research.google.com/github/AliEbadi110/Natural-Language-Processing-Text-Classification-Sample-Projects/blob/main/08_NLP_Transformers_Sentence_Entailment_Glue_Dataset.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

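# **NLP - Transformers - Sentence Entailment - GLUE Dataset**

Fine-tunes `bert-base-cased` on the GLUE RTE (Recognizing Textual Entailment) sentence-pair task and evaluates it on the validation split.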

In [None]:
!pip install datasets
!pip install transformers[torch]

In [None]:
import numpy as np
import torch
from sklearn.metrics import classification_report, confusion_matrix

from datasets import load_dataset, load_metric
from transformers import AutoTokenizer, DataCollatorWithPadding
from transformers import AutoModelForSequenceClassification
from transformers import TrainingArguments, Trainer

## 1. Loading Data

In [None]:
# Load the 'rte' configuration of the 'glue' dataset; the result is a
# DatasetDict with train / validation / test splits (displayed below).
raw_datasets = load_dataset('glue', 'rte')

Downloading builder script:   0%|          | 0.00/28.8k [00:00<?, ?B/s]

Downloading metadata:   0%|          | 0.00/28.7k [00:00<?, ?B/s]

Downloading readme:   0%|          | 0.00/27.9k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/697k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/2490 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/277 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/3000 [00:00<?, ? examples/s]

In [None]:
# Display the splits with their column names and row counts.
raw_datasets

DatasetDict({
    train: Dataset({
        features: ['sentence1', 'sentence2', 'label', 'idx'],
        num_rows: 2490
    })
    validation: Dataset({
        features: ['sentence1', 'sentence2', 'label', 'idx'],
        num_rows: 277
    })
    test: Dataset({
        features: ['sentence1', 'sentence2', 'label', 'idx'],
        num_rows: 3000
    })
})

In [None]:
# Look at one raw training row: two sentences, an integer label and an index.
raw_datasets['train'][0]

{'sentence1': 'No Weapons of Mass Destruction Found in Iraq Yet.',
 'sentence2': 'Weapons of Mass Destruction Found in Iraq.',
 'label': 1,
 'idx': 0}

In [None]:
# Column schema; 'label' is a ClassLabel, so its `names` list gives the
# meaning of each integer label.
raw_datasets['train'].features

{'sentence1': Value(dtype='string', id=None),
 'sentence2': Value(dtype='string', id=None),
 'label': ClassLabel(names=['entailment', 'not_entailment'], id=None),
 'idx': Value(dtype='int32', id=None)}

## 2. Preprocessing

In [None]:
# Pretrained checkpoint name; the tokenizer and the model are both loaded from it.
checkpoint = 'bert-base-cased'

In [None]:
# Tokenizer matching the checkpoint, so the token ids line up with the model's vocabulary.
tokenizer = AutoTokenizer.from_pretrained(checkpoint)

In [None]:
# Encode the first sentence pair as a preview. In the output, token_type_ids
# is 0 for the first sentence and 1 for the second.
tokenizer(raw_datasets['train'][0]['sentence1'], raw_datasets['train'][0]['sentence2'], truncation=True)

{'input_ids': [101, 1302, 20263, 1104, 8718, 14177, 17993, 17107, 1107, 5008, 6355, 119, 102, 20263, 1104, 8718, 14177, 17993, 17107, 1107, 5008, 119, 102], 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}

In [None]:
def tokenize_func(example):
  """Encode the 'sentence1' / 'sentence2' fields of `example` as a pair.

  `example` is indexed by column name; the two columns are handed to the
  notebook-level `tokenizer` with truncation enabled and its output is
  returned unchanged.
  """
  first, second = example['sentence1'], example['sentence2']
  encoded = tokenizer(first, second, truncation=True)
  return encoded

In [None]:
# Apply tokenize_func to every split. With batched=True the function receives
# a batch of rows per call rather than a single row.
tokenized_datasets = raw_datasets.map(tokenize_func, batched=True)
# Display the result: the tokenizer's columns are added next to the originals.
tokenized_datasets

Map:   0%|          | 0/2490 [00:00<?, ? examples/s]

Map:   0%|          | 0/277 [00:00<?, ? examples/s]

Map:   0%|          | 0/3000 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['sentence1', 'sentence2', 'label', 'idx', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 2490
    })
    validation: Dataset({
        features: ['sentence1', 'sentence2', 'label', 'idx', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 277
    })
    test: Dataset({
        features: ['sentence1', 'sentence2', 'label', 'idx', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 3000
    })
})

## 3. Train Model

In [None]:
# The classification head is newly initialised on top of the pretrained
# encoder (see the warning printed below). Size it from the dataset's label
# feature instead of relying on the library's default number of labels.
num_labels = raw_datasets['train'].features['label'].num_classes
model = AutoModelForSequenceClassification.from_pretrained(checkpoint, num_labels=num_labels)

Downloading model.safetensors:   0%|          | 0.00/436M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-cased and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
def compute_metrics(eval_preds):
  """Compute accuracy and binary F1 from a `(logits, labels)` pair.

  The metrics are calculated directly with numpy rather than through the
  deprecated `datasets.load_metric` loader, so no metric script has to be
  downloaded at run time.

  Args:
    eval_preds: pair `(logits, labels)`. The predicted class of each row is
      the argmax of `logits` over its last axis; `labels` holds the reference
      class ids.

  Returns:
    dict with float entries "accuracy" (fraction of exact matches) and "f1"
    (F1 score of class 1, i.e. 2*TP / (2*TP + FP + FN); 0.0 when class 1
    occurs in neither the predictions nor the labels).
  """
  logits, labels = eval_preds
  predictions = np.argmax(logits, axis=-1)
  labels = np.asarray(labels)

  # Guard the mean so an empty evaluation set yields 0.0 instead of NaN.
  accuracy = float(np.mean(predictions == labels)) if labels.size else 0.0

  # Binary F1 with class 1 treated as the positive class.
  true_pos = int(np.sum((predictions == 1) & (labels == 1)))
  false_pos = int(np.sum((predictions == 1) & (labels != 1)))
  false_neg = int(np.sum((predictions != 1) & (labels == 1)))
  denominator = 2 * true_pos + false_pos + false_neg
  f1 = 2 * true_pos / denominator if denominator else 0.0

  return {"accuracy": accuracy, "f1": f1}

  metric1 = load_metric('accuracy')


Downloading builder script:   0%|          | 0.00/1.65k [00:00<?, ?B/s]

Downloading builder script:   0%|          | 0.00/2.32k [00:00<?, ?B/s]

In [None]:
# Evaluate and checkpoint once per epoch, then reload the best checkpoint
# when training finishes.
# NOTE(review): no metric_for_best_model is given; the evaluation run after
# training reports the epoch-1 numbers (the lowest validation loss), so the
# "best" checkpoint appears to be picked by loss rather than accuracy.
# Confirm this is intended.
training_args = TrainingArguments(
    output_dir='trainer_dir',
    num_train_epochs=3,
    per_device_train_batch_size=16,
    logging_steps=200,
    evaluation_strategy='epoch',
    save_strategy='epoch',
    load_best_model_at_end=True,
)

In [None]:
# Wire the model, the hyper-parameters, the tokenized splits and the metric
# callback together. The tokenizer is passed along so the Trainer has it
# available when it assembles batches.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets['train'],
    eval_dataset=tokenized_datasets['validation'],
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)

In [None]:
# Fine-tune the model; the table below lists validation loss, accuracy and F1 after each epoch.
trainer.train()

You're using a BertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Epoch,Training Loss,Validation Loss,Accuracy,F1
1,No log,0.618665,0.649819,0.561086
2,0.619400,0.767496,0.66787,0.666667
3,0.296300,0.926134,0.67509,0.637097


TrainOutput(global_step=468, training_loss=0.4186351197397607, metrics={'train_runtime': 259.7704, 'train_samples_per_second': 28.756, 'train_steps_per_second': 1.802, 'total_flos': 644912946681480.0, 'train_loss': 0.4186351197397607, 'epoch': 3.0})

## 4. Evaluate

In [None]:
# Score the model currently held by the Trainer on the validation split.
# The numbers below equal the epoch-1 row of the training table.
trainer.evaluate()

{'eval_loss': 0.6186646819114685,
 'eval_accuracy': 0.6498194945848376,
 'eval_f1': 0.5610859728506788,
 'eval_runtime': 2.6052,
 'eval_samples_per_second': 106.325,
 'eval_steps_per_second': 13.435,
 'epoch': 3.0}

In [None]:
# Run the model over the validation split; `.predictions` (used below) holds its raw outputs.
val_predictions = trainer.predict(tokenized_datasets['validation'])

In [None]:
# Normalise the raw model outputs over the last axis so that each row sums to 1.
val_preds = torch.softmax(torch.Tensor(val_predictions.predictions), dim=-1)

In [None]:
# Take the predicted class straight from the raw model outputs. Softmax is
# monotonic, so the argmax is the same as on the probabilities, and reading
# from `val_predictions` instead of overwriting `val_preds` in terms of itself
# makes this cell safe to re-run.
val_preds = torch.argmax(torch.as_tensor(val_predictions.predictions), dim=-1)

In [None]:
# Label the report rows with the dataset's class names instead of bare ids.
# `labels` pins the row order to the ClassLabel ids so names and ids stay aligned.
label_names = raw_datasets['validation'].features['label'].names
print(classification_report(
    raw_datasets['validation']['label'],
    val_preds.numpy(),
    labels=list(range(len(label_names))),
    target_names=label_names,
))

              precision    recall  f1-score   support

           0       0.63      0.81      0.71       146
           1       0.69      0.47      0.56       131

    accuracy                           0.65       277
   macro avg       0.66      0.64      0.63       277
weighted avg       0.66      0.65      0.64       277



In [None]:
# Confusion matrix with the reference labels passed first and the predictions
# second: rows are true classes, columns are predicted classes.
print(confusion_matrix(raw_datasets['validation']['label'], val_preds.numpy()))

[[118  28]
 [ 69  62]]
