# ECE 592: Course Project
---
## Exploring and Fine-tuning Extractive and Generative LLMs for Question Answering

---

Name: Atharva Pansare
<br>
Unity ID: aspansar
<br>
Student ID: 200535507

## Base-BERT (without Fine-Tuning)

In [None]:
!pip install datasets

In [None]:
from datasets import load_dataset
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import torch
from sklearn.metrics import accuracy_score, precision_score, f1_score

# BoolQ as packaged in the SuperGLUE benchmark; the evaluation loop reads the
# 'question', 'passage' and 'label' fields of each example.
boolq = load_dataset(path="super_glue", name="boolq")

# Off-the-shelf BERT checkpoint with a two-way classification head on top.
# No fine-tuning happens in this cell, so this is the untrained baseline.
model_name = "bert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSequenceClassification.from_pretrained(
    model_name,
    num_labels=2,
)

# Single-example inference helper
def run_inference(model, tokenizer, question, passage):
    """Classify one (question, passage) pair and return the winning class index.

    Args:
        model: Callable that accepts the tokenizer output as keyword arguments
            and returns an object with a ``logits`` attribute.
        tokenizer: Callable that encodes the question/passage pair.
        question: Question text.
        passage: Passage text paired with the question.

    Returns:
        The index of the largest logit along dim 1, as a Python int.
    """
    encoded = tokenizer(
        question,
        passage,
        return_tensors="pt",
        truncation=True,
        padding=True,
    )
    # Forward pass only; gradient tracking is switched off.
    with torch.no_grad():
        scores = model(**encoded).logits
    return scores.argmax(dim=1).item()

# Evaluate on the validation set
true_labels = []
predicted_labels = []

for example in boolq['validation']:
    # Predict first so both lists stay the same length if inference fails.
    predicted_label = run_inference(
        model, tokenizer, example['question'], example['passage']
    )
    true_labels.append(example['label'])
    predicted_labels.append(predicted_label)

# Calculate metrics
accuracy = accuracy_score(true_labels, predicted_labels)
# Precision and F1 are undefined when there are no true positives to divide by
# (e.g. the classifier never predicts the positive class). zero_division=0
# reports 0.0 for that case explicitly instead of emitting
# UndefinedMetricWarning.
precision = precision_score(true_labels, predicted_labels, zero_division=0)
f1 = f1_score(true_labels, predicted_labels, zero_division=0)

print(f"Accuracy: {accuracy:.4f}")
print(f"Precision: {precision:.4f}")
print(f"F1 Score: {f1:.4f}")

README.md:   0%|          | 0.00/18.2k [00:00<?, ?B/s]

super_glue.py:   0%|          | 0.00/30.7k [00:00<?, ?B/s]

The repository for super_glue contains custom code which must be executed to correctly load the dataset. You can inspect the repository content at https://hf.co/datasets/super_glue.
You can avoid this prompt in future by passing the argument `trust_remote_code=True`.

Do you wish to run the custom code? [y/N] y


Downloading data:   0%|          | 0.00/4.12M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/9427 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/3270 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/3245 [00:00<?, ? examples/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Accuracy: 0.3783
Precision: 0.0000
F1 Score: 0.0000


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


## Fine-Tuning BERT

In [None]:
!pip install transformers datasets torch

Collecting datasets
  Downloading datasets-3.1.0-py3-none-any.whl.metadata (20 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py310-none-any.whl.metadata (7.2 kB)
Collecting fsspec<=2024.9.0,>=2023.1.0 (from fsspec[http]<=2024.9.0,>=2023.1.0->datasets)
  Downloading fsspec-2024.9.0-py3-none-any.whl.metadata (11 kB)
Downloading datasets-3.1.0-py3-none-any.whl (480 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m480.6/480.6 kB[0m [31m8.6 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m7.9 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading fsspec-2024.9.0-py3-none-any.whl (1

In [None]:
from datasets import load_dataset
from transformers import BertTokenizer, BertForSequenceClassification
import torch
from torch.utils.data import DataLoader

# BoolQ from the Hugging Face hub; the preprocessing below reads its
# 'question', 'passage' and 'answer' columns.
boolq_dataset = load_dataset(path='boolq')

# Tokenizer matching the bert-base-uncased checkpoint fine-tuned below
tokenizer = BertTokenizer.from_pretrained(
    pretrained_model_name_or_path='bert-base-uncased',
)

# Batch preprocessing for BERT
def preprocess_boolq(examples):
    """Tokenize a batch of BoolQ rows into BERT sequence-pair features.

    Args:
        examples: Batch mapping with 'question', 'passage' and 'answer'
            entries, each holding one item per row.

    Returns:
        dict with 'input_ids', 'attention_mask' and 'token_type_ids' taken
        from the tokenizer output (each row padded/truncated to 512 tokens)
        and 'labels', a list of 0/1 ints derived from 'answer'.
    """
    encoded = tokenizer(
        examples['question'],
        examples['passage'],
        add_special_tokens=True,
        max_length=512,
        padding='max_length',
        truncation=True,
        return_tensors='pt',
    )

    features = {
        name: encoded[name]
        for name in ('input_ids', 'attention_mask', 'token_type_ids')
    }
    # Truthy answers map to class 1, everything else to class 0.
    features['labels'] = [1 if flag else 0 for flag in examples['answer']]
    return features

# Mini-batch size for the data loaders
batch_size = 16

# Tokenize the training split batch-wise and drop the raw columns so only the
# fields returned by preprocess_boolq remain
train_dataset = boolq_dataset['train'].map(
    function=preprocess_boolq,
    batched=True,
    remove_columns=boolq_dataset['train'].column_names,
)

# Have the dataset return torch tensors for the model-input columns
train_dataset.set_format(
    type='torch',
    columns=['input_ids', 'attention_mask', 'token_type_ids', 'labels'],
)

# Shuffled loader used by the training loop
train_dataloader = DataLoader(
    dataset=train_dataset,
    batch_size=batch_size,
    shuffle=True,
)

# Initialize model: BertForSequenceClassification with a 2-class head
model = BertForSequenceClassification.from_pretrained(
    'bert-base-uncased',
    num_labels=2,
)

# Training loop (simplified): AdamW at a fixed learning rate, no scheduler
optimizer = torch.optim.AdamW(model.parameters(), lr=2e-5)
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')
model.to(device)

num_epochs = 3
for epoch in range(num_epochs):
    model.train()
    for batch in train_dataloader:
        # Move every tensor of the batch onto the training device
        batch = {name: tensor.to(device) for name, tensor in batch.items()}

        optimizer.zero_grad()
        # The batch includes 'labels'; the loss is read from the model output
        loss = model(**batch).loss
        loss.backward()
        optimizer.step()

    print(f"Epoch {epoch+1}/{num_epochs} completed")

# Persist the fine-tuned weights
model.save_pretrained('./boolq_bert_model')

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


README.md:   0%|          | 0.00/6.57k [00:00<?, ?B/s]

train-00000-of-00001.parquet:   0%|          | 0.00/3.69M [00:00<?, ?B/s]

validation-00000-of-00001.parquet:   0%|          | 0.00/1.26M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/9427 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/3270 [00:00<?, ? examples/s]

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

Map:   0%|          | 0/9427 [00:00<?, ? examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pai

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch 1/3 completed
Epoch 2/3 completed
Epoch 3/3 completed


In [None]:
import numpy as np
from sklearn.metrics import accuracy_score, precision_recall_fscore_support

# Tokenize the validation split with the same preprocessing as training,
# keeping only the fields returned by preprocess_boolq
val_dataset = boolq_dataset['validation'].map(
    function=preprocess_boolq,
    batched=True,
    remove_columns=boolq_dataset['validation'].column_names,
)
val_dataset.set_format(
    type='torch',
    columns=['input_ids', 'attention_mask', 'token_type_ids', 'labels'],
)
# Fixed order (no shuffling) for evaluation
val_dataloader = DataLoader(
    dataset=val_dataset,
    batch_size=batch_size,
    shuffle=False,
)

# Dataset-level evaluation
def evaluate_model(model, dataloader):
    """Run the model over a dataloader and compute binary classification metrics.

    Args:
        model: Model called with each batch as keyword arguments; its output
            must expose ``logits``.
        dataloader: Iterable of dict batches of tensors that include 'labels'.

    Returns:
        Tuple ``(accuracy, precision, recall, f1)``; precision, recall and F1
        use ``average='binary'``.
    """
    model.eval()
    predicted, gold = [], []

    with torch.no_grad():
        for raw_batch in dataloader:
            # Tensors are moved to the module-level `device` before the forward pass
            on_device = {name: tensor.to(device) for name, tensor in raw_batch.items()}
            class_ids = torch.argmax(model(**on_device).logits, dim=-1)
            predicted.extend(class_ids.cpu().numpy())
            gold.extend(on_device['labels'].cpu().numpy())

    accuracy = accuracy_score(gold, predicted)
    precision, recall, f1 = precision_recall_fscore_support(
        gold, predicted, average='binary'
    )[:3]

    return accuracy, precision, recall, f1

# Score the fine-tuned model on the validation loader and report each metric
accuracy, precision, recall, f1 = evaluate_model(model, val_dataloader)
print(
    f"Accuracy: {accuracy:.4f}",
    f"Precision: {precision:.4f}",
    f"Recall: {recall:.4f}",
    f"F1 Score: {f1:.4f}",
    sep="\n",
)

# Single-example inference with the fine-tuned model
def run_inference(question, passage):
    """Answer one yes/no question about a passage.

    Uses the module-level ``tokenizer``, ``model`` and ``device``.

    Args:
        question: Question text.
        passage: Passage text paired with the question.

    Returns:
        Tuple ``(answer, confidence)``: answer is "Yes" when class 1 wins and
        "No" otherwise; confidence is the softmax probability of that class.
    """
    encoded = tokenizer(
        question,
        passage,
        return_tensors="pt",
        truncation=True,
        padding="max_length",
        max_length=512,
    )
    encoded = {name: tensor.to(device) for name, tensor in encoded.items()}

    model.eval()
    with torch.no_grad():
        result = model(**encoded)

    class_probs = torch.nn.functional.softmax(result.logits, dim=-1)
    winner = torch.argmax(class_probs, dim=-1).item()
    confidence = class_probs[0][winner].item()

    if winner == 1:
        return "Yes", confidence
    return "No", confidence

# Try the inference helper on one hand-written example
question = "Is the sky blue?"
passage = "The sky appears blue to the human eye as the result of a phenomenon called Rayleigh scattering."
answer, confidence = run_inference(question, passage)
print(
    f"Question: {question}",
    f"Answer: {answer}",
    f"Confidence: {confidence:.2f}",
    sep="\n",
)

Map:   0%|          | 0/3270 [00:00<?, ? examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pai

Accuracy: 0.7046
Precision: 0.7751
Recall: 0.7393
F1 Score: 0.7568
Question: Is the sky blue?
Answer: Yes
Confidence: 0.75


### Samples for Inference: BERT

In [None]:
# More examples for inference
# Written as (question, passage) pairs and expanded into dicts with
# "question" and "passage" keys.
examples = [
    {"question": question_text, "passage": passage_text}
    for question_text, passage_text in (
        (
            "Is Mount Everest the tallest mountain in the world?",
            "Mount Everest is Earth's highest mountain above sea level, located in the Mahalangur Himal sub-range of the Himalayas. The China–Nepal border runs across its summit point.",
        ),
        (
            "Do plants perform photosynthesis at night?",
            "Photosynthesis is a process used by plants and other organisms to convert light energy into chemical energy. The process primarily occurs during daylight hours, as it requires sunlight to function.",
        ),
        (
            "Was Albert Einstein born in Germany?",
            "Albert Einstein was born on March 14, 1879, in Ulm, Germany. He was a theoretical physicist who developed the theory of relativity, one of the two pillars of modern physics.",
        ),
        (
            "Is the Great Wall of China visible from space?",
            "Contrary to popular belief, the Great Wall of China is not visible from space with the naked eye. It can be seen from low Earth orbit under certain conditions, but it's not uniquely visible compared to other human-made structures.",
        ),
        (
            "Are all species of sharks dangerous to humans?",
            "While sharks have a reputation as dangerous predators, only a few species of sharks are known to pose a significant threat to humans. Out of more than 500 species of sharks, only about a dozen have been involved in attacks on humans.",
        ),
    )
]

# Answer every example and print it followed by a 50-dash separator
for example in examples:
    question, passage = example["question"], example["passage"]
    answer, confidence = run_inference(question, passage)
    print(
        f"Question: {question}",
        f"Passage: {passage}",
        f"Answer: {answer}",
        f"Confidence: {confidence:.2f}",
        "-" * 50,
        sep="\n",
    )

Question: Is Mount Everest the tallest mountain in the world?
Passage: Mount Everest is Earth's highest mountain above sea level, located in the Mahalangur Himal sub-range of the Himalayas. The China–Nepal border runs across its summit point.
Answer: No
Confidence: 0.71
--------------------------------------------------
Question: Do plants perform photosynthesis at night?
Passage: Photosynthesis is a process used by plants and other organisms to convert light energy into chemical energy. The process primarily occurs during daylight hours, as it requires sunlight to function.
Answer: Yes
Confidence: 0.65
--------------------------------------------------
Question: Was Albert Einstein born in Germany?
Passage: Albert Einstein was born on March 14, 1879, in Ulm, Germany. He was a theoretical physicist who developed the theory of relativity, one of the two pillars of modern physics.
Answer: Yes
Confidence: 0.88
--------------------------------------------------
Question: Is the Great Wall 

## Fine-Tuning T5

In [None]:
from datasets import load_dataset

# BoolQ from the Hugging Face hub
dataset = load_dataset(path="boolq")

# Preview the first rows of the training split as a pandas DataFrame
print(dataset['train'].to_pandas().head())

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


README.md:   0%|          | 0.00/6.57k [00:00<?, ?B/s]

train-00000-of-00001.parquet:   0%|          | 0.00/3.69M [00:00<?, ?B/s]

validation-00000-of-00001.parquet:   0%|          | 0.00/1.26M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/9427 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/3270 [00:00<?, ? examples/s]

                                            question  answer  \
0    do iran and afghanistan speak the same language    True   
1  do good samaritan laws protect those who help ...    True   
2  is windows movie maker part of windows essentials    True   
3  is confectionary sugar the same as powdered sugar    True   
4         is elder scrolls online the same as skyrim   False   

                                             passage  
0  Persian (/ˈpɜːrʒən, -ʃən/), also known by its ...  
1  Good Samaritan laws offer legal protection to ...  
2  Windows Movie Maker (formerly known as Windows...  
3  Powdered sugar, also called confectioners' sug...  
4  As with other games in The Elder Scrolls serie...  


In [None]:
from transformers import T5Tokenizer, T5ForConditionalGeneration, Trainer, TrainingArguments

# T5-small tokenizer and pretrained weights, both from the same checkpoint name
tokenizer = T5Tokenizer.from_pretrained(
    pretrained_model_name_or_path="t5-small",
)
model = T5ForConditionalGeneration.from_pretrained(
    pretrained_model_name_or_path="t5-small",
)

# Preprocessing the dataset: Prepare input-output pairs for T5
def preprocess_function(examples):
    """Turn a batch of BoolQ rows into tokenized T5 inputs and labels.

    Args:
        examples: Batch mapping with 'question', 'passage' and 'answer'
            entries, each holding one item per row.

    Returns:
        The tokenizer output for the prompts (padded/truncated to 512 tokens)
        with an added 'labels' entry: the token ids of 'true'/'false' padded
        to 10 positions, where padding positions hold -100.
    """
    inputs = [
        f"Question: {question}  Passage: {passage}"
        for question, passage in zip(examples['question'], examples['passage'])
    ]
    targets = ['true' if answer else 'false' for answer in examples['answer']]

    # Tokenize inputs and outputs
    model_inputs = tokenizer(inputs, max_length=512, truncation=True, padding='max_length')
    labels = tokenizer(targets, max_length=10, truncation=True, padding='max_length')

    # The targets are only a couple of tokens long, so most of the 10 label
    # positions are padding. -100 is the index the loss ignores; leaving the
    # pad id in place would make the model also train on predicting padding.
    pad_id = tokenizer.pad_token_id
    model_inputs["labels"] = [
        [token_id if token_id != pad_id else -100 for token_id in sequence]
        for sequence in labels["input_ids"]
    ]

    return model_inputs

# Apply the T5 preprocessing batch-wise to every split of the dataset
tokenized_dataset = dataset.map(function=preprocess_function, batched=True)

The cache for model files in Transformers v4.22.0 has been updated. Migrating your old cache. This is a one-time only operation. You can interrupt this and resume the migration later on by calling `transformers.utils.move_cache()`.


0it [00:00, ?it/s]

tokenizer_config.json:   0%|          | 0.00/2.32k [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.39M [00:00<?, ?B/s]

You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565


config.json:   0%|          | 0.00/1.21k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/242M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

Map:   0%|          | 0/9427 [00:00<?, ? examples/s]

Map:   0%|          | 0/3270 [00:00<?, ? examples/s]

In [None]:
from transformers import Trainer, TrainingArguments

# Hyper-parameters for fine-tuning; checkpoints are written under ./results
training_args = TrainingArguments(
    output_dir="./results",
    evaluation_strategy="no",
    learning_rate=2e-5,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    num_train_epochs=3,
    weight_decay=0.01,
    logging_dir="./logs",
    logging_steps=10,
)

# Initialize the Trainer.
# No compute_metrics callback is passed: no such function is defined in the
# visible notebook cells, so referencing it raises NameError on a fresh run,
# and with evaluation_strategy="no" no evaluation pass happens during training
# that could call it. Metrics are computed separately after training.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["validation"],
)

# Fine-tune the model
trainer.train()

[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.


<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
wandb: Paste an API key from your profile and hit enter, or press ctrl+c to quit:

 ··········


[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


Passing a tuple of `past_key_values` is deprecated and will be removed in Transformers v4.48.0. You should pass an instance of `EncoderDecoderCache` instead, e.g. `past_key_values=EncoderDecoderCache.from_legacy_cache(past_key_values)`.


Step,Training Loss
10,16.5873
20,13.2062
30,11.2439
40,7.8223
50,6.205
60,4.7855
70,3.4796
80,3.0077
90,2.7863
100,2.566


TrainOutput(global_step=7071, training_loss=0.2024255228822748, metrics={'train_runtime': 1445.9567, 'train_samples_per_second': 19.559, 'train_steps_per_second': 4.89, 'total_flos': 3827601487429632.0, 'train_loss': 0.2024255228822748, 'epoch': 3.0})

### Model Evaluation: T5

In [None]:
from transformers import T5Tokenizer, T5ForConditionalGeneration
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# Load the fine-tuned model weights from a saved checkpoint (the tokenizer
# created earlier in the notebook is reused, not reloaded).
# NOTE(review): this is the step-3500 checkpoint, while the training output
# above reports global_step=7071 — confirm a mid-training checkpoint is intended.
model = T5ForConditionalGeneration.from_pretrained(
    pretrained_model_name_or_path="./results/checkpoint-3500",
)

# Function to run inference and calculate metrics
def run_inference_and_evaluate(model, tokenizer, eval_dataset, max_input_length=512):
    """Generate an answer for every example and score it against the gold label.

    Args:
        model: Fine-tuned sequence-to-sequence model; must provide ``eval()``,
            ``device`` and ``generate()``.
        tokenizer: Tokenizer used to encode the prompt and decode the output.
        eval_dataset: Iterable of examples with 'question', 'passage' and
            'answer' entries.
        max_input_length: Prompts longer than this many tokens are truncated.

    Returns:
        dict with 'accuracy', 'precision', 'recall' and 'f1'. A prediction
        counts as positive only when the decoded text is exactly "true";
        any other generated text counts as negative.
    """
    # Make sure dropout is disabled even if a model still in train mode is passed.
    model.eval()

    decoded_preds_binary = []
    decoded_labels_binary = []

    for example in eval_dataset:
        # The prompt must match the template the model was fine-tuned on
        # (see preprocess_function); a different wording feeds the model
        # inputs it never saw during training.
        input_text = f"Question: {example['question']}  Passage: {example['passage']}"

        # Truncate long passages to the length used in training instead of
        # feeding sequences beyond the model's maximum length.
        encoded = tokenizer(
            input_text,
            return_tensors="pt",
            truncation=True,
            max_length=max_input_length,
        ).to(model.device)

        # Generate the answer using the model
        output_ids = model.generate(
            input_ids=encoded.input_ids,
            attention_mask=encoded.attention_mask,
        )

        # Decode the predicted answer
        predicted_answer = tokenizer.decode(output_ids[0], skip_special_tokens=True).strip().lower()

        # 1 for "true", 0 for anything else; gold label from the boolean answer
        decoded_preds_binary.append(1 if predicted_answer == "true" else 0)
        decoded_labels_binary.append(1 if example['answer'] else 0)

    # Calculate metrics
    return {
        "accuracy": accuracy_score(decoded_labels_binary, decoded_preds_binary),
        "precision": precision_score(decoded_labels_binary, decoded_preds_binary),
        "recall": recall_score(decoded_labels_binary, decoded_preds_binary),
        "f1": f1_score(decoded_labels_binary, decoded_preds_binary),
    }

# Score the loaded checkpoint on the validation split
metrics = run_inference_and_evaluate(
    model=model,
    tokenizer=tokenizer,
    eval_dataset=tokenized_dataset["validation"],
)

# Show the resulting metric dict
print(metrics)


Token indices sequence length is longer than the specified maximum sequence length for this model (823 > 512). Running this sequence through the model will result in indexing errors


{'accuracy': 0.600611620795107, 'precision': 0.6602027324812693, 'recall': 0.7368421052631579, 'f1': 0.696420269642027}
