<a href="https://colab.research.google.com/github/AmirKage/TDS-GroupProject24-25/blob/Group_work_AmirKage/Testing_01.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [45]:
# Step 1: Install Required Libraries
# Ensure you have all necessary libraries installed.

!pip install transformers datasets torch evaluate pandas scikit-learn






In [46]:
# Step 2: Load and Fix the Dataset
# We ensure that short_answer is always inside long_answer.

import pandas as pd
from sklearn.model_selection import train_test_split
from datasets import load_dataset

# Load the expanded dataset
expanded_qa_df = pd.read_csv("qa_dataset_expanded.csv")

# Split dataset into 80% training and 20% evaluation
train_df, eval_df = train_test_split(expanded_qa_df, test_size=0.2, random_state=42)

# Save train and evaluation datasets separately
train_df.to_csv("qa_train.csv", index=False)
eval_df.to_csv("qa_eval.csv", index=False)

# Load the split datasets into HuggingFace format
dataset = load_dataset('csv', data_files={'train': 'qa_train.csv', 'eval': 'qa_eval.csv'})



Generating train split: 0 examples [00:00, ? examples/s]

Generating eval split: 0 examples [00:00, ? examples/s]

In [47]:
# Step 3: Load DistilBERT and Tokenizer
# We use DistilBERT because it's faster and lightweight.

from transformers import DistilBertTokenizerFast, DistilBertForQuestionAnswering

# Load tokenizer
tokenizer = DistilBertTokenizerFast.from_pretrained("distilbert-base-uncased")

# Load model
model = DistilBertForQuestionAnswering.from_pretrained("distilbert-base-uncased")




Some weights of DistilBertForQuestionAnswering were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['qa_outputs.bias', 'qa_outputs.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [48]:
# Step 4: Tokenize Data and Add Start/End Positions
# We map character positions to token positions.

def add_token_positions(example):
    encoding = tokenizer(
        example["question"], example["answer"], truncation=True, padding="max_length", return_offsets_mapping=True
    )

    start_positions = []
    end_positions = []

    for i, (answer, context) in enumerate(zip(example["answer"], example["answer"])):
        start_char = context.find(answer)

        # Handle cases where the answer isn't found
        if start_char == -1:
            start_positions.append(0)
            end_positions.append(0)
            continue

        end_char = start_char + len(answer)

        # Convert character positions to token positions
        offsets = encoding["offset_mapping"][i]

        token_start_index = next((idx for idx, (start, end) in enumerate(offsets) if start == start_char), None)
        token_end_index = next((idx for idx, (start, end) in enumerate(offsets) if end == end_char), None)

        # Ensure valid positions
        if token_start_index is None:
            token_start_index = 0
        if token_end_index is None:
            token_end_index = 0

        start_positions.append(token_start_index)
        end_positions.append(token_end_index)

    encoding["start_positions"] = start_positions
    encoding["end_positions"] = end_positions
    encoding.pop("offset_mapping")  # Remove offset mapping after use

    return encoding

# Apply preprocessing function with fixed token positions
tokenized_dataset = dataset.map(add_token_positions, batched=True)


Map:   0%|          | 0/144 [00:00<?, ? examples/s]

Map:   0%|          | 0/36 [00:00<?, ? examples/s]

In [39]:
sample = tokenized_dataset["train"][0]
print("Question:", dataset["train"][0]["question"])
print("Long Answer:", dataset["train"][0]["long_answer"])
print("Short Answer:", dataset["train"][0]["short_answer"])
print("Start Token:", sample["start_positions"])
print("End Token:", sample["end_positions"])


Question: Do you consent to the processing of your data?
Long Answer: Yes, I consent to the processing of my personal data for purposes outlined in the agreement. Yes.
Short Answer: Yes.
Start Token: 30
End Token: 31


In [49]:
# Step 5: Fine-Tune DistilBERT
# We train DistilBERT on our dataset.

from transformers import TrainingArguments

training_args = TrainingArguments(
    output_dir="./results",
    evaluation_strategy="epoch",
    save_strategy="epoch",
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=10,  # Longer training to account for dataset expansion
    weight_decay=0.01,
    learning_rate=3e-5,  # Lower learning rate for better convergence
    logging_steps=10,  # More frequent logs
)




In [None]:
# Step 6: Step 6: Fine-Tune DistilBERT
# We fine-tune the model using our expanded and properly split dataset.

from transformers import Trainer

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["eval"],
)

# Train the model
trainer.train()


Epoch,Training Loss,Validation Loss
1,5.1227,1.105942
2,0.6973,0.246537
3,0.321,0.202601


In [42]:
# Step 7: Step 7: Evaluate the Model Using RoBERTa
# To avoid bias, we evaluate the dataset using RoBERTa instead of DistilBERT.

from transformers import RobertaTokenizerFast, RobertaForQuestionAnswering

# Load RoBERTa model for evaluation
roberta_tokenizer = RobertaTokenizerFast.from_pretrained("deepset/roberta-base-squad2")
roberta_model = RobertaForQuestionAnswering.from_pretrained("deepset/roberta-base-squad2")

# Tokenize evaluation data
tokenized_eval = dataset["eval"].map(
    lambda x: roberta_tokenizer(x["question"], x["answer"], truncation=True, padding="max_length"),
    batched=True
)





Device set to use cpu


Exact Match: 0.00
F1 Score: 23.60
