---

<div align='center'>
<font size="+2">

Text Mining and Natural Language Processing  
2023-2024

<b>SelectWise</b>

Alessandro Ghiotto 513944

</font>
</div>

---

# Notebook 5 - Evaluation on the Test Set:

Here I evaluate the two best methods on the test dataset:

- BERT - combined method
- DeciLM - few-shot prompting

--- 

Data

In [1]:
from datasets import load_dataset
import numpy as np
import random
import torch
from datasets import Dataset

# SEED
def set_seed(seed):
    """Seed every random number generator used in this notebook.

    Seeds Python's ``random`` module, NumPy's global generator and PyTorch
    with the same value so that runs are repeatable.

    Args:
        seed: Integer seed shared by all the generators.
    """
    random.seed(seed)
    np.random.seed(seed)
    # torch.random.manual_seed is the very same function as torch.manual_seed,
    # so one call is enough.
    torch.manual_seed(seed)
    # Explicitly seed the generators of all CUDA devices as well.
    torch.cuda.manual_seed_all(seed)
# Single seed reused by every set_seed() call in the notebook
seed = 8
set_seed(seed)

# DEVICE and DTYPE
# Use the GPU when one is available, otherwise fall back to the CPU
mydevice = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
# From here on, tensors created without an explicit device land on `mydevice`
torch.set_default_device(mydevice) # default tensor device
# torch.set_default_dtype(torch.float32) # default tensor dtype

# DATASET
dataset = load_dataset("allenai/qasc")
# The original 'train' split is divided in two: the first `n_train_sample`
# examples are used for training, the remaining ones for validation.
n_train_sample = 7323
dataset_train = dataset['train'].select(range(n_train_sample))
dataset_val = dataset['train'].select(range(n_train_sample, len(dataset['train'])))
# The original 'validation' split is the one used as test set in this notebook
dataset_test = dataset['validation']

def format_choices(example):
    """Flatten the choices of one example and add the integer answer key.

    When the choice labels are exactly A..H in this order, ``example['choices']``
    is replaced by the plain list of choice texts; otherwise a warning is
    printed and the choices are left as they are. In both cases
    ``example['answerKey_int']`` is set to the 0-based position of the answer
    letter ('A' -> 0, ..., 'H' -> 7).

    Args:
        example: Dict with a 'choices' entry (holding 'label' and 'text') and
            an 'answerKey' letter. It is modified in place.

    Returns:
        The same ``example`` dict.
    """
    expected_labels = list("ABCDEFGH")
    choices = example['choices']
    if choices['label'] != expected_labels:
        print("The order of the choices is not the same for all the examples")
    else:
        example['choices'] = choices['text']
    example['answerKey_int'] = ord(example['answerKey']) - ord('A')
    return example

# Apply format_choices to the three splits: afterwards 'choices' is a plain
# list of texts and every example carries the extra 'answerKey_int' field.
dataset_train = dataset_train.map(format_choices)
dataset_val = dataset_val.map(format_choices)
dataset_test = dataset_test.map(format_choices)

# Display the dataset
# (this first training example is also the one hard-coded as the solved
# example inside the few-shot prompt used below)
dataset_train[0]

{'id': '3E7TUJ2EGCLQNOV1WEAJ2NN9ROPD9K',
 'question': 'What type of water formation is formed by clouds?',
 'choices': ['pearls',
  'streams',
  'shells',
  'diamonds',
  'rain',
  'beads',
  'cooled',
  'liquid'],
 'answerKey': 'F',
 'fact1': 'beads of water are formed by water vapor condensing',
 'fact2': 'Clouds are made of water vapor.',
 'combinedfact': 'Beads of water can be formed by clouds.',
 'formatted_question': 'What type of water formation is formed by clouds? (A) pearls (B) streams (C) shells (D) diamonds (E) rain (F) beads (G) cooled (H) liquid',
 'answerKey_int': 5}

---

# **DeciLM**

`'DeciLM-7B-instruct'`, with few shot prompting. Accuracy on validation dataset = 0.97411.

In [2]:
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig, pipeline

# Hugging Face model id of the instruction-tuned DeciLM checkpoint
model_name = "Deci/DeciLM-7B-instruct"

# NOTE(review): `device` is not referenced again in the visible part of the
# notebook; model placement is requested through device_map="auto" below.
device = "cuda" 

# Quantization settings: weights loaded in 4 bit, computations in bfloat16
dtype_kwargs = dict(
    quantization_config=BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_compute_dtype=torch.bfloat16
))

# trust_remote_code=True allows the custom modeling code shipped with the
# checkpoint to be executed.
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    device_map="auto",
    trust_remote_code=True,
    **dtype_kwargs
)

tokenizer = AutoTokenizer.from_pretrained(model_name)
# Reuse the end-of-sequence token as padding token
tokenizer.pad_token = tokenizer.eos_token

# Text-generation pipeline used by get_response().
# - do_sample=True with temperature=0.1 / top_p=0.9: the answer is sampled, so
#   results are presumably reproducible only thanks to the fixed seed - verify.
# - return_full_text=False: only the generated continuation is returned,
#   not the prompt.
# NOTE(review): device_map is passed again although `model` is already
# instantiated - likely redundant, confirm against the transformers docs.
pipe = pipeline(
    "text-generation",
    model=model,
    tokenizer=tokenizer,
    temperature=0.1,
    top_p=0.9,
    do_sample=True,
    device_map="auto",
    max_new_tokens=256,
    return_full_text=False
)


Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

In [6]:
def get_response(user_prompt, pipeline=pipe):
    """Ask the chat model a single question and return its generated text.

    The user prompt is wrapped, together with a fixed system prompt, in the
    chat template of the pipeline's tokenizer; the resulting string is passed
    to the pipeline.

    Args:
        user_prompt: Text of the user message.
        pipeline: Text-generation pipeline to query (defaults to ``pipe``).

    Returns:
        The 'generated_text' field of the first generation.
    """
    messages = [
        {
            "role": "system",
            "content": "You are an AI assistant that follows instruction extremely well. Help as much as you can.",
        },
        {"role": "user", "content": user_prompt},
    ]
    chat_text = pipeline.tokenizer.apply_chat_template(
        messages,
        tokenize=False,
        add_generation_prompt=True,
    )
    generations = pipeline(chat_text)
    first_generation = generations[0]
    return first_generation['generated_text']

import timeit

def evaluate_model(dataset, question_format, pipeline=pipe):
    """Evaluate the generative model on a multiple-choice dataset.

    For every item the prompt is built with ``question_format`` and sent to
    the model through ``get_response``. The first non-blank character of the
    upper-cased reply is taken as the predicted label. Replies that do not
    start with one of the letters A-H (empty replies included) are printed and
    counted as skipped.

    Note that skipped items are excluded from the accuracy denominator: the
    printed accuracy is ``correct / (number of answered items)``.

    Args:
        dataset: Iterable of items with an 'answerKey' field; must support
            ``len()``.
        question_format: Callable mapping an item to the prompt string.
        pipeline: Text-generation pipeline forwarded to ``get_response``.

    Returns:
        List of ``(index, predicted_label)`` tuples, one per wrong answer.
    """
    # A set, not a string: `"" in "ABCDEFGH"` would be True for empty replies
    valid_labels = frozenset("ABCDEFGH")
    total = 0
    correct = 0
    skipped = 0
    wrong_answers = []
    t0 = timeit.default_timer()
    # loop over all the questions
    for i, item in enumerate(dataset):
        true_label = item["answerKey"]
        prompt = question_format(item)
        answer = get_response(prompt, pipeline)
        # Slicing instead of indexing: an empty / whitespace-only reply gives
        # "" (handled as skipped below) instead of raising IndexError.
        output_label = answer.upper().replace("\n", " ").strip()[:1]

        # if the answer is not one of the choices, we skip the item
        if output_label not in valid_labels:
            print(answer)
            skipped += 1
            continue

        if output_label == true_label:
            correct += 1
        else:
            wrong_answers.append((i, output_label))
        total += 1

    delta_t = timeit.default_timer() - t0
    n_items = len(dataset)
    # Guard the two ratios against an empty dataset / everything skipped
    time_per_item = delta_t / n_items if n_items else 0.0
    accuracy = correct / total if total else float("nan")
    print(f"elapsed : {delta_t:.2f} seconds")
    print(f"elapsed/iter : {time_per_item:.5f} seconds")
    print(f"skipped : {skipped}")
    print(f"correct : {correct}")
    print(f"accuracy: {accuracy:.5f}")
    return wrong_answers


In [7]:
# dataset_train[0] as context

def question_format(item):
    """Build the one-shot prompt for a multiple-choice item.

    The prompt starts with a fully solved example (the first training sample,
    answer included), followed by a blank line and by the facts, the question
    and the eight choices of ``item``; it ends with an open ``Answer:`` line.
    Every non-blank line is indented by four spaces.

    Args:
        item: Mapping with 'fact1', 'fact2', 'question' and a 'choices'
            sequence indexable from 0 to 7.

    Returns:
        The prompt string.
    """
    solved_example = (
        "    fact1: beads of water are formed by water vapor condensing\n"
        "    fact2: Clouds are made of water vapor.\n"
        "    Question: What type of water formation is formed by clouds?\n"
        "    A) pearls\n"
        "    B) streams\n"
        "    C) shells\n"
        "    D) diamonds\n"
        "    E) rain\n"
        "    F) beads\n"
        "    G) cooled\n"
        "    H) liquid\n"
        "    Answer: F\n"
    )
    header = (
        f"    fact1: {item['fact1']}\n"
        f"    fact2: {item['fact2']}\n"
        f"    Question: {item['question']}\n"
    )
    options = "".join(
        f"    {letter}) {item['choices'][position]}\n"
        for position, letter in enumerate("ABCDEFGH")
    )
    return f"{solved_example}\n{header}{options}    Answer:"

# Evaluate DeciLM on the test split; keep the (index, predicted label) pairs
# of the wrong answers for the error analysis below.
wrong_answers = evaluate_model(dataset_test, question_format)

  self.gen = func(*args, **kwds)


elapsed : 206.88 seconds
elapsed/iter : 0.22341 seconds
skipped : 0
correct : 917
accuracy: 0.99028


In [13]:
def print_item(item):
    """Print the facts, the question, the eight choices and the gold answer.

    Every printed line is indented by four spaces.

    Args:
        item: Mapping with 'fact1', 'fact2', 'question', 'answerKey' and a
            'choices' sequence indexable from 0 to 7.
    """
    rows = [
        f"fact1: {item['fact1']}",
        f"fact2: {item['fact2']}",
        f"Question: {item['question']}",
    ]
    rows.extend(
        f"{letter}) {item['choices'][position]}"
        for position, letter in enumerate("ABCDEFGH")
    )
    rows.append(f"Answer: {item['answerKey']}")
    print("\n".join("    " + row for row in rows))

# Print every misclassified test sample together with the label predicted by
# the model.
# NOTE: `i` and `output_label` stay defined at notebook level after the loop
# and hold the values of the last wrong answer (if there was any).
for i, output_label in wrong_answers:
    print(f"INDEX: {i}")
    print_item(dataset_test[i])
    print(f"OUTPUT LABEL: {output_label}")
    print("\n\n")

INDEX: 35
    fact1: All cnidarians are aquatic.
    fact2: Cnidarians include jellyfish and anemones.
    Question: What kind of animal are jellyfish?
    A) protozoa
    B) adult
    C) paramecium
    D) dry
    E) land-based
    F) Porifera
    G) anemones
    H) aquatic
    Answer: H
OUTPUT LABEL: G



INDEX: 103
    fact1: when a gas in an open container evaporates , that gas spreads out into the air
    fact2: Deadly carbon monoxide gas from the generator s exhaust can spread throughout enclosed spaces.
    Question: carbon monoxide gas from a generator's exhaust has been what
    A) transportation
    B) air pollution
    C) projectiles
    D) destroyed crops
    E) destruction
    F) evaporated
    G) danger
    H) Car accidents
    Answer: F
OUTPUT LABEL: G



INDEX: 104
    fact1: Bone is less flexible than cartilage but stronger.
    fact2: Sharks have a cartilage skeleton.
    Question: What are less flexible than shark skeletons but stronger
    A) holding nutrients
    B)

Look at the sample 35

In [14]:
i = 35
# Look up the prediction recorded for this specific sample. The notebook-level
# `output_label` still holds the value of the LAST wrong answer printed by the
# previous loop, which is not necessarily the one of sample `i`.
# The result is None when sample `i` was answered correctly.
output_label_i = dict(wrong_answers).get(i)
print(f"INDEX: {i}")
print_item(dataset_test[i])
print(f"OUTPUT LABEL: {output_label_i}")

INDEX: 35
    fact1: All cnidarians are aquatic.
    fact2: Cnidarians include jellyfish and anemones.
    Question: What kind of animal are jellyfish?
    A) protozoa
    B) adult
    C) paramecium
    D) dry
    E) land-based
    F) Porifera
    G) anemones
    H) aquatic
    Answer: H
OUTPUT LABEL: G


I find this example interesting because, given the facts, it is a very simple question.

fact1: cnidarians -> aquatic  
fact2: jellyfish -> cnidarians  
=> jellyfish -> aquatic

But the model is kind of tricked by the presence of 'anemones' after the word jellyfish.

In [20]:
# Same sample as above (`i` is expected to still be 35 from the previous
# cell), but with 'jellyfish' and 'anemones' swapped in fact2.
# NOTE(review): the rewritten fact also drops the trailing period that the
# original sentence has.
example = dataset_test[i]
example['fact2'] = 'Cnidarians include anemones and jellyfish'
prompt = question_format(example)
answer = get_response(prompt, pipe)
print_item(example)
# The raw generated text is printed here, without the upper-casing / stripping
# applied inside evaluate_model.
print(f"OUTPUT LABEL: {answer}")

    fact1: All cnidarians are aquatic.
    fact2: Cnidarians include anemones and jellyfish
    Question: What kind of animal are jellyfish?
    A) protozoa
    B) adult
    C) paramecium
    D) dry
    E) land-based
    F) Porifera
    G) anemones
    H) aquatic
    Answer: H
OUTPUT LABEL:  H


By swapping the words 'anemones' and 'jellyfish' I got the correct answer. Maybe the previous answer was wrong because the sentence "jellyfish are anemones" follows a very common pattern (NOUN to be NOUN).

# **BERT**

`'bert-base-uncased'`, first trained with linear probing (all weights frozen except the classifier head), then trained on the rest of the architecture (all the weights active except the classifier head). Trained on the train dataset.

In [2]:
from transformers import AutoTokenizer, AutoModelForMultipleChoice

# Device as a plain string this time ('cuda' when a GPU is available)
mydevice = 'cuda' if torch.cuda.is_available() else 'cpu'
set_seed(seed)

# From here on `model_name`, `tokenizer` and `model` refer to BERT
model_name = 'bert-base-uncased'

# TOKENIZER
tokenizer = AutoTokenizer.from_pretrained(model_name, use_fast=True)

# MODEL
# Fine-tuned weights are read from a local directory relative to the notebook
model = AutoModelForMultipleChoice.from_pretrained(f"../models/{model_name}-MultipleChoice-combinedmethod")
# Last expression of the cell: moves the model to `mydevice` and displays it
model.to(mydevice)

BertForMultipleChoice(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, ele

In [3]:
# PREPROCESS THE DATASET

def preprocess_function_MultipleChoice(examples):
    """Tokenize a batch of examples for the multiple-choice model.

    Each example is expanded into 8 sentence pairs, one per choice:
    the first sentence is ``"<fact1> <fact2>"`` and the second one is
    ``"<question> [SEP] <choice>"``. The pairs are tokenized in a single call
    to the notebook-level ``tokenizer`` and then regrouped 8 by 8.

    Args:
        examples: Batch in column format (column name -> list of values), as
            provided by ``Dataset.map(..., batched=True)``. The columns
            'fact1', 'fact2', 'question', 'choices' and 'answerKey' are read.

    Returns:
        Dict where every tokenizer output key maps to a list with one entry
        per example, each entry holding the 8 per-choice sequences, plus a
        'labels' list with the answer as an integer ('A' -> 0, ..., 'H' -> 7).
    """
    num_choices = 8
    first_sentences = []
    second_sentences = []
    for fact1, fact2, question, choices in zip(
        examples["fact1"], examples["fact2"], examples["question"], examples["choices"]
    ):
        # attach fact1 and fact2, repeated once per choice
        context = f"{fact1} {fact2}"
        first_sentences.extend([context] * num_choices)
        # the question followed by each of the choices
        second_sentences.extend(
            f"{question} [SEP] {choices[choice_idx]}" for choice_idx in range(num_choices)
        )

    # Tokenize (the two lists are already flat: 8 consecutive pairs per example)
    tokenized_examples = tokenizer(first_sentences, second_sentences, truncation=True)
    # Un-flatten -> each example has 8 choices
    tokenized_examples = {
        key: [values[start:start + num_choices] for start in range(0, len(values), num_choices)]
        for key, values in tokenized_examples.items()
    }

    # Create the labels
    # ['A','B','C','D','E','F','G','H'] -> [0, 1, 2, 3, 4, 5, 6, 7]
    tokenized_examples['labels'] = [ord(answer_key) - ord('A') for answer_key in examples['answerKey']]

    return tokenized_examples

# Apply the preprocessing function to the dataset
# Tokenize the test split in batches; the original columns are dropped so that
# only the tokenizer outputs and 'labels' remain. `dataset_test` itself is
# kept untouched and is still used below to print the raw examples.
dataset_test_encoded = dataset_test.map(preprocess_function_MultipleChoice, 
                                        batched=True, remove_columns=dataset_test.column_names)

In [4]:
from dataclasses import dataclass
from transformers.tokenization_utils_base import PreTrainedTokenizerBase, PaddingStrategy
from typing import Optional, Union
import torch

# CREATE THE DATACOLLATOR

@dataclass
class DataCollatorForMultipleChoice:
    """
    Data collator that dynamically pads the inputs of a multiple-choice batch.

    Each feature is expected to hold, for every tokenizer key, a list with one
    sequence per choice, plus an integer under "labels". The sequences of all
    the choices of all the examples are padded together with
    ``tokenizer.pad`` and then reshaped to (batch_size, num_choices, -1).
    """

    # Tokenizer whose `pad` method is used to pad the batch
    tokenizer: PreTrainedTokenizerBase
    # The remaining fields are forwarded unchanged to `tokenizer.pad`
    padding: Union[bool, str, PaddingStrategy] = True
    max_length: Optional[int] = None
    pad_to_multiple_of: Optional[int] = None

    def __call__(self, features):
        """Collate a list of features into a dict of padded tensors.

        Args:
            features: Non-empty list of dicts as described in the class
                docstring. The dicts are not modified.

        Returns:
            Dict of tensors shaped (batch_size, num_choices, -1), plus an
            int64 "labels" tensor with one entry per example.
        """
        label_name = "labels"
        # Read the labels without popping them, so that the caller's feature
        # dicts are left untouched and can be collated again.
        labels = [feature[label_name] for feature in features]

        # flatten (each example holds a list of num_choices sequences)
        batch_size = len(features)
        num_choices = len(features[0]["input_ids"])
        flattened_features = [
            {k: v[i] for k, v in feature.items() if k != label_name}
            for feature in features
            for i in range(num_choices)
        ]

        # pad
        batch = self.tokenizer.pad(
            flattened_features,
            padding=self.padding,
            max_length=self.max_length,
            pad_to_multiple_of=self.pad_to_multiple_of,
            return_tensors="pt",
        )

        # Un-flatten
        batch = {k: v.view(batch_size, num_choices, -1) for k, v in batch.items()}
        # Add back labels
        batch["labels"] = torch.tensor(labels, dtype=torch.int64)
        return batch


In [22]:
from torch.utils.data import DataLoader

# CREATE THE DATALOADER

batch_size = 16
# NOTE(review): presumably created on `mydevice` because the default tensor
# device was changed with torch.set_default_device at the top of the
# notebook - confirm.
generator = torch.Generator(device=mydevice)
# shuffle=False keeps the batches in dataset order; the evaluation loop relies
# on this to map batch positions back to indices of `dataset_test`.
test_dataloader = DataLoader(dataset_test_encoded, batch_size=batch_size, shuffle=False,
                            collate_fn=DataCollatorForMultipleChoice(tokenizer), generator=generator)

In [24]:
import evaluate
import timeit

# EVALUATE ON THE TEST SET

# Switch the model to evaluation mode
model.eval()

accuracy_metric = evaluate.load("accuracy")

# (dataset index, predicted letter) of every misclassified example
incorrect_indices = []
# Number of examples processed in the previous batches. It is used as offset
# instead of `batch_idx * len(current batch)`, which gives wrong indices when
# the last batch is smaller than the others.
num_seen = 0
t0 = timeit.default_timer()
for batch in test_dataloader:
    batch = {k: v.to(mydevice) for k, v in batch.items()}
    with torch.no_grad():
        outputs = model(**batch)
    logits = outputs.logits
    predictions = torch.argmax(logits, dim=-1)
    accuracy_metric.add_batch(predictions=predictions, references=batch["labels"])

    # Identify incorrect predictions
    incorrect_predictions = predictions != batch["labels"]

    # Collect (idx, output_label) of incorrect predictions
    if incorrect_predictions.any():
        incorrect_indices.extend([(num_seen + i, chr(predictions[i].item() + ord('A')))
                                  for i, incorrect in enumerate(incorrect_predictions) if incorrect])

    num_seen += len(batch["labels"])

accuracy_result = accuracy_metric.compute()
delta_t = timeit.default_timer()-t0
print(f"elapsed : {delta_t:.2f} seconds")
print(f"elapsed/iter : {delta_t/len(dataset_test):.5f} seconds")
print(f"Test Accuracy: {accuracy_result['accuracy']: .5f}")

elapsed : 5.77 seconds
elapsed/iter : 0.00623 seconds
Test Accuracy:  0.97192


In [27]:
def print_item(item):
    """Print one example: facts, question, choices A-H and the gold answer.

    All the lines are printed with a four-space indentation.

    Args:
        item: Mapping with 'fact1', 'fact2', 'question', 'answerKey' and a
            'choices' sequence indexable from 0 to 7.
    """
    indent = "    "
    lines = [
        f"{indent}fact1: {item['fact1']}",
        f"{indent}fact2: {item['fact2']}",
        f"{indent}Question: {item['question']}",
    ]
    for position, letter in enumerate("ABCDEFGH"):
        lines.append(f"{indent}{letter}) {item['choices'][position]}")
    lines.append(f"{indent}Answer: {item['answerKey']}")
    print("\n".join(lines))

# Print every test sample misclassified by BERT with the predicted label.
# NOTE: `i` and `output_label` stay defined at notebook level after the loop.
for i, output_label in incorrect_indices:
    print(f"INDEX: {i}")
    print_item(dataset_test[i])
    print(f"OUTPUT LABEL: {output_label}")
    print("\n")

INDEX: 11
    fact1: a vehicle is used for transportation
    fact2: Cars and busses are both examples of vehicles.
    Question: What are busses used for?
    A) Protective shelter
    B) Transporting humans
    C) help other species benefit
    D) Transporting airplanes
    E) A backbone
    F) Communication
    G) safe operation
    H) safe driving
    Answer: B
OUTPUT LABEL: H


INDEX: 35
    fact1: All cnidarians are aquatic.
    fact2: Cnidarians include jellyfish and anemones.
    Question: What kind of animal are jellyfish?
    A) protozoa
    B) adult
    C) paramecium
    D) dry
    E) land-based
    F) Porifera
    G) anemones
    H) aquatic
    Answer: H
OUTPUT LABEL: G


INDEX: 106
    fact1: Bone is less flexible than cartilage but stronger.
    fact2: Most fish have bony skeletons, but sharks have skeletons made of cartilage .
    Question: What has a less flexible but stronger skeleton compared to sharks?
    A) fish
    B) Mohs
    C) Type O
    D) bacteria
    E) cart

BERT also makes the same error that we have seen before with the LLM.

In [64]:
i = 35
# Prediction recorded for sample `i`. The result is None when the sample is
# not among the misclassified ones, instead of leaving `output_label_i`
# undefined (or holding a value from an earlier run of the cell).
output_label_i = dict(incorrect_indices).get(i)
print(f"INDEX: {i}")
print_item(dataset_test[i])
print(f"OUTPUT LABEL: {output_label_i}")

INDEX: 35
    fact1: All cnidarians are aquatic.
    fact2: Cnidarians include jellyfish and anemones.
    Question: What kind of animal are jellyfish?
    A) protozoa
    B) adult
    C) paramecium
    D) dry
    E) land-based
    F) Porifera
    G) anemones
    H) aquatic
    Answer: H
OUTPUT LABEL: G


In [67]:
import torch

# Same sample (index `i`), with 'jellyfish' and 'anemones' swapped in fact2
# NOTE(review): the rewritten fact also drops the trailing period that the
# original sentence has.
example = dataset_test[i]
example['fact2'] = 'Cnidarians include anemones and jellyfish'

def pipeline_multiplechoice(example):
    """Predict the answer letter of a single example with the BERT model.

    The example is encoded as in ``preprocess_function_MultipleChoice``: one
    ``"<fact1> <fact2>"`` / ``"<question> [SEP] <choice>"`` pair per choice.
    The notebook-level ``model``, ``tokenizer`` and ``mydevice`` are used.

    Args:
        example: Mapping with 'fact1', 'fact2', 'question' and a 'choices'
            sequence indexable from 0 to 7.

    Returns:
        The predicted label as a letter ('A' for choice 0, ..., 'H' for 7).
    """
    model.eval()

    batch_size = 1
    num_choices = 8
    context = f"{example['fact1']} {example['fact2']}"
    first_sentences = [context] * num_choices
    second_sentences = [
        f"{example['question']} [SEP] {example['choices'][choice_idx]}"
        for choice_idx in range(num_choices)
    ]
    # Tokenize
    tokenized_example = tokenizer(first_sentences, second_sentences, truncation=True,
                                  padding=True, return_tensors="pt")

    # Reshape to (batch, choices, tokens) and move to the device of the model:
    # `mydevice` rather than a hard-coded 'cuda', so that this also works on a
    # CPU-only machine.
    model_inputs = {
        k: v.view(batch_size, num_choices, -1).to(mydevice)
        for k, v in tokenized_example.items()
    }

    with torch.no_grad():
        outputs = model(**model_inputs)
    logits = outputs.logits
    predictions = torch.argmax(logits, dim=-1)
    return chr(predictions[0].item() + ord('A'))

# Show the modified example and the label predicted by BERT for it
print_item(example)
print(f"OUTPUT LABEL: {pipeline_multiplechoice(example)}")

    fact1: All cnidarians are aquatic.
    fact2: Cnidarians include anemones and jellyfish
    Question: What kind of animal are jellyfish?
    A) protozoa
    B) adult
    C) paramecium
    D) dry
    E) land-based
    F) Porifera
    G) anemones
    H) aquatic
    Answer: H
OUTPUT LABEL: G


Unlike DeciLM, the BERT model still makes the same error, even after swapping 'anemones' and 'jellyfish'.

In [70]:
# Same sample (index `i`) again, this time with 'anemones' removed from fact2
example = dataset_test[i]
example['fact2'] = 'Cnidarians include jellyfish'

# Show the modified example and the label predicted by BERT for it
print_item(example)
print(f"OUTPUT LABEL: {pipeline_multiplechoice(example)}")

    fact1: All cnidarians are aquatic.
    fact2: Cnidarians include jellyfish
    Question: What kind of animal are jellyfish?
    A) protozoa
    B) adult
    C) paramecium
    D) dry
    E) land-based
    F) Porifera
    G) anemones
    H) aquatic
    Answer: H
OUTPUT LABEL: H


So we know that the error comes from composing the facts wrongly, and not from some wrong prior information learned by the model during pretraining, because after removing the word 'anemones' the model classifies the example correctly.

---

# **Results**

| Metric                         |bert-base-uncased           |DeciLM-7B-instruct    |
|--------------------------------|:--------------------------:|:--------------------:|
| Test Accuracy                  | $0.97192$                  | $\bf{0.99028}$       |
| Time Elapsed (seconds)         | $\bf{5.77}$                | $206.88$             |
| avg time elapsed for one sample| $\bf{6.23 \times 10^{-3}}$ | $2.23 \times 10^{-1}$|

The accuracy obtained with the LLM with few-shot prompting is a bit higher, but the time required is considerably higher. BERT, instead, is incredibly fast.
