#Evaluation of Pretrained Model (NSP and MLM tasks for BERT)

In [None]:
!pip install -U accelerate
!pip install transformers
!pip install datasets

In [1]:
from datasets import load_dataset
from transformers import AutoTokenizer
import random

In [2]:
datasets = load_dataset('wikitext', 'wikitext-2-raw-v1')

In [3]:
datasets_cleaned = datasets.filter(lambda example: len(example['text'])  > 0 and not example["text"].startswith(" ="))

In [4]:
from transformers import AutoTokenizer
tokenizer = AutoTokenizer.from_pretrained("Dhairya/nlp-bert-wordWeavers-tokenizer")

In [5]:
def tokenization(example):
    return tokenizer(example["text"], add_special_tokens=False)

datasets_tokenized = datasets_cleaned.map(
    tokenization, batched=True, num_proc=4)

In [6]:
datasets_tokenized = datasets_tokenized.map(batched= True, remove_columns='text')

In [7]:
def concatenation(example):
    BLOCK_SIZE= 255
    # concatenating the inputs_ids, token_ids, attention_mask respectively to a list to create a single list of all the tokens
    concatenated_examples = {}
    for keys in example.keys(): # inputs_ids, token_ids, attention_mask
        concatenated_examples[keys] = sum(example[keys], [])

    # total length same across all the keys
    n = len(concatenated_examples[list(example.keys())[0]])
    n = (n//BLOCK_SIZE) * BLOCK_SIZE

    # breaking the total combined list to get BLOCK SIZE chunks
    result = {}
    for keys, token_type in concatenated_examples.items():
        result[keys] = []
        for i in range(0, n, BLOCK_SIZE):
            result[keys].append(token_type[i: i+BLOCK_SIZE])
    return result

datasets_block_size = datasets_tokenized.map(
    concatenation, batched=True, batch_size=1000, num_proc=4)

In [8]:
def preparing_NSP_dataset(example, ind, dataset, n):
    # NSP dataset is created such that 50% sentence 2 comes after sentence 1. Rest 50% it is random
    sent_1 = example['input_ids']
    attention_mask = [1] * 512
    next_sentence_label = 1

    if ind % 2 == 0:
        next_ind = ind + 1
        if next_ind < len(dataset['input_ids']):
            sent_2 = dataset['input_ids'][next_ind]
        else: # last sentence has no next sentence
            next_ind = random.randint(0, n-1)
            sent_2 = dataset['input_ids'][next_ind]
            next_sentence_label = 0

    else:
        next_sentence_label = 0
        next_ind = random.randint(0, n-1) # randomly choosing the next index
        if next_ind == ind + 1: # if randomly choosed the next sentence then changing the next sentence label
            next_sentence_label = 1
        sent_2 = dataset['input_ids'][next_ind]

    # input  =  [cls] + sent1 + [sep] + sent2
    input_ids = [tokenizer.cls_token_id] + sent_1 + [tokenizer.sep_token_id] + sent_2
    token_type_ids = [0] * (257) + [1] * (255)
    attention_mask = [1] + example['attention_mask'] + [1] + dataset[next_ind]['attention_mask']

    return {
        'input_ids': input_ids,
        'token_type_ids': token_type_ids,
        'attention_mask': attention_mask,
        'next_sentence_label': next_sentence_label
    }

In [9]:
dataset_validation = datasets_block_size['validation']
dataset_test = datasets_block_size['test']

In [11]:
dataset_NSP_test = dataset_test.map(
    lambda example, ind: preparing_NSP_dataset(example, ind, dataset_test, n =len(dataset_test)),with_indices=True, num_proc=32)

In [13]:
from transformers import BertConfig, BertForPreTraining, Trainer, TrainingArguments
from transformers import DataCollatorForLanguageModeling
from torch.utils.data import DataLoader

In [14]:
pretrained_model = BertForPreTraining.from_pretrained("Dhairya/nlp-bert-wordWeavers-pretrained")
collater_pt = DataCollatorForLanguageModeling(
    tokenizer=tokenizer, mlm=True, mlm_probability=0.15, return_tensors="pt"
)

In [16]:
from transformers import Trainer, TrainingArguments

# Define the Trainer and TrainingArguments
trainer = Trainer(
    model=pretrained_model,  # your pretrained model
    args=TrainingArguments(
        output_dir="./results",  # directory to save results
        per_device_eval_batch_size=16,
    ),
    data_collator=collater_pt,  # your data collator
)
# Evaluate on the validation set (replace with your test dataset if needed)
results = trainer.evaluate(eval_dataset=dataset_NSP_test)

You're using a BertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


{'eval_loss': 8.028535842895508, 'eval_runtime': 3107.5866, 'eval_samples_per_second': 0.342, 'eval_steps_per_second': 0.022}


In [37]:
print(results)

{'eval_loss': 8.028535842895508, 'eval_runtime': 3107.5866, 'eval_samples_per_second': 0.342, 'eval_steps_per_second': 0.022}


In [38]:
import math
log_perplexity = results['eval_loss']
perplexity = math.exp(log_perplexity)

print("Log-Perplexity on Test Dataset: ", log_perplexity)
print("Perplexity on Test Dataset: ", perplexity)

Log-Perplexity on Test Dataset:  8.028535842895508
Perplexity on Test Dataset:  3067.247451804034


#Evaluation of Finetuned Model (Classification on SST2 dataset)

In [None]:
!pip install transformers
!pip install datasets

In [3]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, recall_score, precision_score, f1_score
import torch
from transformers import TrainingArguments, Trainer, BertTokenizer, BertForSequenceClassification, EarlyStoppingCallback

In [None]:
from datasets import load_dataset, concatenate_datasets
from torch.utils.data import DataLoader
from transformers import AdamW
from transformers import DistilBertTokenizer, DistilBertForSequenceClassification

sst2_dataset = load_dataset('glue', 'sst2')

In [None]:
from transformers import AutoTokenizer
from transformers import AutoModelForSequenceClassification

model = AutoModelForSequenceClassification.from_pretrained("Dhairya/bert-wordWeavers-ft-sst2", num_labels=2)
tokenizer = AutoTokenizer.from_pretrained("Dhairya/nlp-bert-wordWeavers-tokenizer")

In [6]:
import torch
from torch.utils.data import DataLoader
from transformers import BertTokenizer, BertForSequenceClassification
from sklearn.metrics import accuracy_score, f1_score, recall_score, precision_score
from tqdm import tqdm


# Define a DataLoader for the test dataset
batch_size = 32  # Adjust as needed
test_dataloader = DataLoader(sst2_dataset['validation'], batch_size=batch_size, shuffle=False)

# Set the model to evaluation mode
model.eval()

# Lists to store true labels and predicted labels
true_labels = []
predicted_labels = []

# Iterate through the test dataset and make predictions
for batch in tqdm(test_dataloader, desc="Evaluating"):
    inputs = tokenizer(batch["sentence"], return_tensors="pt", padding=True, truncation=True)
    with torch.no_grad():
        logits = model(**inputs).logits

    # Get predicted labels
    preds = torch.argmax(logits, dim=1).tolist()
    predicted_labels.extend(preds)

    # Get true labels
    true_labels.extend(batch["label"].tolist())

# Calculate evaluation metrics
accuracy = accuracy_score(true_labels, predicted_labels)
precision = precision_score(true_labels, predicted_labels, average="weighted")
recall = recall_score(true_labels, predicted_labels, average="weighted")
f1 = f1_score(true_labels, predicted_labels, average="weighted")

# Print the metrics
print(f"Accuracy: {accuracy:.4f}")
print(f"Precision: {precision:.4f}")
print(f"Recall: {recall:.4f}")
print(f"F1 Score: {f1:.4f}")


Evaluating: 100%|██████████| 28/28 [02:15<00:00,  4.85s/it]

Accuracy: 0.5092
Precision: 0.2593
Recall: 0.5092
F1 Score: 0.3436



  _warn_prf(average, modifier, msg_start, len(result))


In [8]:
# Print the metrics
print(f"Accuracy: {accuracy:.4f}")
print(f"Precision: {precision:.4f}")
print(f"Recall: {recall:.4f}")
print(f"F1 Score: {f1:.4f}")


Accuracy: 0.5092
Precision: 0.2593
Recall: 0.5092
F1 Score: 0.3436


#Evaluation of Finetuned Model (Question-Answering on SQuAD)

In [17]:
from transformers import BertForQuestionAnswering
model = BertForQuestionAnswering.from_pretrained("Dhairya/bert-wordweavers-ft-squad")

In [18]:
from transformers import BertTokenizer
tokenizer = BertTokenizer.from_pretrained("Dhairya/nlp-bert-wordweavers-tokenizer")

In [19]:
from datasets import load_dataset
squad = load_dataset("squad_v2")

In [20]:
from datasets import load_dataset, concatenate_datasets
complete_data = concatenate_datasets([squad['train'], squad['validation']])

In [21]:
def filter_no_answers(example):
    return bool(example['answers']['text'])
def filter_no_question(example):
    return bool(example['question'])
# Filter out examples with no answers
filtered_dataset = complete_data.filter(filter_no_answers)
filtered_dataset = filtered_dataset.filter(filter_no_question)
complete_data = filtered_dataset

In [22]:
dataset_train_test = complete_data.class_encode_column("title").train_test_split(test_size=0.2, stratify_by_column="title", seed = 1)

In [23]:
!pip install rouge_score



In [24]:
from datasets import list_metrics
from datasets import load_metric

metric_sq_v2 = load_metric('squad_v2')
metric_bleu = load_metric('bleu')
metric_meteor = load_metric('meteor')
metric_rouge = load_metric('rouge')

  metric_sq_v2 = load_metric('squad_v2')
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


In [25]:
from transformers import pipeline
question_answerer = pipeline("question-answering", model=model, tokenizer = tokenizer)

In [26]:
!pip install tqdm
from tqdm import tqdm



The code below will test for 400 samples from the test set. Set the MAX_EXAMPLES_TO_TEST variable to equal the num_testing_examples in order to test on the full test set (very time consuming)

In [None]:
predictions = []
references = []
predicted_answers = []
reference_answers = []
count = 0
num_testing_examples = dataset_train_test['test'].num_rows

MAX_EXAMPLES_TO_TEST = 400

#Uncomment this variable to a smaller value like num_testing_examples/10 if you want to evaluate on a subset of the test set.
# MAX_EXAMPLES_TO_TEST = num_testing_examples

for example in dataset_train_test['test']:
    count = count + 1
    if(count>MAX_EXAMPLES_TO_TEST):
      # print(count)
      break
    context = example["context"]
    question = example["question"]
    expected_answer_list = example['answers']['text']

    if len(expected_answer_list) == 0:
        expected_answer = ''
    else:
        expected_answer = expected_answer_list[0]
    reference_answers.append(expected_answer)
    predicted_answer = question_answerer(question=question, context=context)
    predicted_answers.append(predicted_answer['answer'])
    predictions.append({'prediction_text': predicted_answer, 'id': example['id'], 'no_answer_probability': 0})

    # Append references in the required format
    references.append({'answers': {'answer_start': [context.find(expected_answer)], 'text': [expected_answer]}, 'id': example['id']})
    print(f"Evaluating- {count}/{MAX_EXAMPLES_TO_TEST} complete")

In [30]:
import nltk

def find_met(ref, pred):
  res = 0
  for i in range(len(ref)):
    res += nltk.translate.meteor_score.meteor_score([ref[i].split()], pred[i].split(), gamma=1)
  res = res/len(ref)
  return res

In [31]:
import nltk
nltk.download('wordnet')
!pip install rouge

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!




In [32]:
import nltk
from nltk.translate.bleu_score import corpus_bleu
from nltk.translate.meteor_score import meteor_score
from rouge import Rouge
from sklearn.metrics import accuracy_score

# Calculate BLEU score
bleu_score = corpus_bleu([[ref.split()] for ref in reference_answers], [pred.split() for pred in predicted_answers])

# Calculate METEOR score
meteor_score = find_met(reference_answers, predicted_answers)

predicted_answers = ["  " if not answer else answer for answer in predicted_answers]
# Calculate ROUGE score
rouge = Rouge()
rouge_score = rouge.get_scores(predicted_answers, reference_answers, avg=True)

# Calculate exact-match score
exact_match = accuracy_score(reference_answers, predicted_answers)

# Calculate F1 score
f1_score = 2 * (exact_match * bleu_score) / (exact_match + bleu_score)

In [33]:
results_sq_v2 = metric_sq_v2.compute(predictions=predictions, references=references)

In [34]:
print(f'F1 Score: {f1_score:.4f}')
print(f'BLEU Score: {bleu_score:.4f}')
print(f'Meteor Score: {meteor_score:.4f}')
print(f'Exact Match Score: {exact_match:.4f}')

F1 Score: 0.0092
BLEU Score: 0.0118
Meteor Score: 0.0195
Exact Match Score: 0.0075


In [35]:
print("Rouge scores: ",rouge_score)

Rouge scores:  {'rouge-1': {'r': 0.09190275228998057, 'p': 0.04448469585969586, 'f': 0.04664135568935447}, 'rouge-2': {'r': 0.03417839105339105, 'p': 0.01077888777888778, 'f': 0.013060369158305038}, 'rouge-l': {'r': 0.09190275228998057, 'p': 0.04448469585969586, 'f': 0.04664135568935447}}


In [36]:
print("SQuAD v2 scores: ", results_sq_v2)

SQuAD v2 scores:  {'exact': 0.0, 'f1': 2.7265609279686203, 'total': 400, 'HasAns_exact': 0.0, 'HasAns_f1': 2.7265609279686203, 'HasAns_total': 400, 'best_exact': 0.0, 'best_exact_thresh': 0.0, 'best_f1': 2.7265609279686203, 'best_f1_thresh': 0.0}
