<a href="https://colab.research.google.com/github/AmirKage/TDS-GroupProject24-25/blob/Group_work_AmirKage/Testing_01.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Step 1: Install Required Libraries
# Ensure you have all necessary libraries installed.

!pip install transformers datasets torch evaluate pandas scikit-learn






In [None]:
# Step 2: Load and Fix the Dataset
# We ensure that short_answer is always inside long_answer.

import pandas as pd
from sklearn.model_selection import train_test_split
from datasets import load_dataset

# Load the expanded dataset
expanded_qa_df = pd.read_csv("qa_dataset_expanded.csv")

# Split dataset into 80% training and 20% evaluation
train_df, eval_df = train_test_split(expanded_qa_df, test_size=0.2, random_state=42)

# Save train and evaluation datasets separately
train_df.to_csv("qa_train.csv", index=False)
eval_df.to_csv("qa_eval.csv", index=False)

# Load the split datasets into HuggingFace format
dataset = load_dataset('csv', data_files={'train': 'qa_train.csv', 'eval': 'qa_eval.csv'})



Generating train split: 0 examples [00:00, ? examples/s]

Generating eval split: 0 examples [00:00, ? examples/s]

In [None]:
# Step 3: Load DistilBERT and Tokenizer
# We use DistilBERT because it's faster and lightweight.

from transformers import DistilBertTokenizerFast, DistilBertForQuestionAnswering

# Load tokenizer
tokenizer = DistilBertTokenizerFast.from_pretrained("distilbert-base-uncased")

# Load model
model = DistilBertForQuestionAnswering.from_pretrained("distilbert-base-uncased")




Some weights of DistilBertForQuestionAnswering were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['qa_outputs.bias', 'qa_outputs.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [69]:
# Step 4: Tokenize Data and Add Start/End Positions
# We map character positions to token positions.

def add_token_positions(example):
    # Tokenize the question and context (long_answer)
    encoding = tokenizer(
        example["question"],
        example["answer"],
        truncation=True,
        padding="max_length",
        return_offsets_mapping=True
    )

    start_positions = []
    end_positions = []

    for i, (answer, context) in enumerate(zip(example["answer"], example["answer"])):
        start_char = context.find(answer)

        # Handle cases where the answer isn't found
        if start_char == -1:
            start_positions.append(0)
            end_positions.append(0)
            continue

        end_char = start_char + len(answer)

        # Convert character positions to token positions
        offsets = encoding["offset_mapping"][i]

        token_start_index = None
        token_end_index = None

        for idx, (start, end) in enumerate(offsets):
            if start <= start_char < end:  # Find first token that includes the start_char
                token_start_index = idx
            if start < end_char <= end:  # Find last token that includes the end_char
                token_end_index = idx

        # If the token positions are not found, default them to the first token
        if token_start_index is None:
            token_start_index = 0
        if token_end_index is None:
            token_end_index = token_start_index  # Avoid misalignment

        start_positions.append(token_start_index)
        end_positions.append(token_end_index)

    encoding["start_positions"] = start_positions
    encoding["end_positions"] = end_positions
    encoding.pop("offset_mapping")  # Remove offset mapping after use

    return encoding


from datasets import load_dataset

# Load dataset
dataset = load_dataset('csv', data_files={'train': 'qa_train.csv', 'eval': 'qa_eval.csv'})

# Apply preprocessing function with the fixed tokenization alignment
tokenized_dataset = dataset.map(add_token_positions, batched=True)


Map:   0%|          | 0/144 [00:00<?, ? examples/s]

Map:   0%|          | 0/36 [00:00<?, ? examples/s]

In [None]:
# Step 5: Fine-Tune DistilBERT
# We train DistilBERT on our dataset.

from transformers import TrainingArguments

training_args = TrainingArguments(
    output_dir="./results",
    evaluation_strategy="epoch",
    save_strategy="epoch",
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=5,  # Keep the full 10 but apply early stopping
    weight_decay=0.01,
    learning_rate=2e-5,
    logging_steps=5,
    load_best_model_at_end=True,  # 🚀 Loads the best model at the lowest validation loss
    save_total_limit=2,  # 🚀 Keeps only the best 2 models to avoid overfitting
    metric_for_best_model="eval_loss",  # 🚀 Stops when eval loss stops improving
    greater_is_better=False,  # 🚀 Lower loss is better
)

# In previous attempts, The validation loss starts increasing after epoch 5, meaning the model might be overfitting.

In [None]:
# Step 6: Step 6: Fine-Tune DistilBERT
# We fine-tune the model using our expanded and properly split dataset.

from transformers import Trainer

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["eval"],
)

# Train the model
trainer.train()


Epoch,Training Loss,Validation Loss
1,0.1025,0.244184
2,0.0728,0.341373
3,0.0799,0.311941
4,0.0213,0.349428
5,0.0893,0.353093


TrainOutput(global_step=90, training_loss=0.08974114076958763, metrics={'train_runtime': 2830.0122, 'train_samples_per_second': 0.254, 'train_steps_per_second': 0.032, 'total_flos': 94070232023040.0, 'train_loss': 0.08974114076958763, 'epoch': 5.0})

In [None]:
# Step 7: Step 7: Evaluate the Model Using RoBERTa
# To avoid bias, we evaluate the dataset using RoBERTa instead of DistilBERT.

from transformers import RobertaTokenizerFast, RobertaForQuestionAnswering

# Load RoBERTa model for evaluation
roberta_tokenizer = RobertaTokenizerFast.from_pretrained("deepset/roberta-base-squad2")
roberta_model = RobertaForQuestionAnswering.from_pretrained("deepset/roberta-base-squad2")

# Tokenize evaluation data
tokenized_eval = dataset["eval"].map(
    lambda x: roberta_tokenizer(x["question"], x["answer"], truncation=True, padding="max_length"),
    batched=True
)



In [None]:
# Step 8: Evaluate Model Performance (Exact Match & F1 Score)
# We measure performance using Exact Match (EM) and F1-score.

import evaluate
from transformers import pipeline

# Load evaluation metric
metric = evaluate.load("squad")

# Define QA pipeline
qa_pipeline = pipeline("question-answering", model=roberta_model, tokenizer=roberta_tokenizer)

# Run evaluation
predictions = []
references = []

for row in dataset["eval"]:
    result = qa_pipeline(question=row["question"], context=row["answer"])
    predictions.append({"id": str(row["question"]), "prediction_text": result["answer"]})
    references.append({"id": str(row["question"]), "answers": {"text": [row["answer"]], "answer_start": [0]}})

# Compute evaluation scores
results = metric.compute(predictions=predictions, references=references)

print(f"Exact Match: {results['exact_match']:.2f}")
print(f"F1 Score: {results['f1']:.2f}")


Device set to use cpu


Exact Match: 5.56
F1 Score: 41.17
