In [None]:
!pip install -q transformers datasets torch evaluate

[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m491.2/491.2 kB[0m [31m13.6 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m363.4/363.4 MB[0m [31m1.5 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m13.8/13.8 MB[0m [31m67.3 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m24.6/24.6 MB[0m [31m54.1 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m883.7/883.7 kB[0m [31m29.3 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m664.8/664.8 MB[0m [31m2.1 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m211.5/211.5 MB[0m [31m4.8 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m56.3/56.3 MB[0m [31m14.7 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━

In [None]:
import os
import json
import numpy as np
import torch
from datasets import load_dataset
from transformers import (
    AlbertTokenizer,
    AlbertForQuestionAnswering,
    AlbertTokenizerFast,
    AlbertForSequenceClassification,
    TrainingArguments,
    Trainer,
    default_data_collator
)



In [None]:
def load_coqa_dataset():
    """Fetch the CoQA dataset ("stanfordnlp/coqa") via `datasets.load_dataset`.

    Returns:
        Whatever `load_dataset` returns for this dataset id; the notebook
        indexes it by the split names "train" and "validation".
    """
    coqa = load_dataset("stanfordnlp/coqa")
    return coqa


# Load once and report how many rows each split holds
dataset = load_coqa_dataset()
print(f"{len(dataset['train'])} training examples, {len(dataset['validation'])} validation examples")


README.md:   0%|          | 0.00/8.17k [00:00<?, ?B/s]

train-00000-of-00001.parquet:   0%|          | 0.00/11.4M [00:00<?, ?B/s]

validation-00000-of-00001.parquet:   0%|          | 0.00/793k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/7199 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/500 [00:00<?, ? examples/s]

7199 training examples, 500 validation examples


In [None]:
def prepare_coqa_features(dataset_split):
    """Flatten CoQA rows into one record per question/answer pair.

    Args:
        dataset_split: Iterable of rows. Each row is a mapping with a
            "story" string, a "questions" list and an "answers" mapping whose
            "input_text" list is index-aligned with "questions".

    Returns:
        list[dict]: One record per pair, in row order and then question
        order, shaped as
        ``{"question": str, "story": str, "answer": {"input_text": str}}``.
        Questions that have no answer at the same index are skipped.
    """
    processed_examples = []

    # Walk the rows once. Looking up `dataset_split[i]` again for every field
    # and every question repeats the row access needlessly (for a Hugging Face
    # Dataset each lookup presumably rebuilds the row dict).
    for row in dataset_split:
        story = row["story"]
        answer_texts = row["answers"]["input_text"]

        # zip() stops at the shorter sequence, so a question without a
        # corresponding answer is dropped.
        for question, answer_text in zip(row["questions"], answer_texts):
            processed_examples.append({
                "question": question,
                "story": story,
                "answer": {"input_text": answer_text},
            })

    return processed_examples

In [None]:
def convert_albert_classifier_to_qa(albert_qnli_path):
    """Build an ALBERT question-answering model from a classifier checkpoint.

    The checkpoint at `albert_qnli_path` is loaded directly into
    `AlbertForQuestionAnswering`, so only the weights that exist in the QA
    architecture are reused. The span-prediction head is not part of a
    sequence-classification checkpoint, so it starts from fresh
    initialisation and must be trained.

    `num_labels=2` is set explicitly because the QA head needs exactly two
    outputs per token (start and end logits). Without it the head width would
    follow whatever label count the classifier checkpoint's config carries.

    Args:
        albert_qnli_path: Path or model id of the saved
            `AlbertForSequenceClassification` checkpoint.

    Returns:
        tuple: ``(qa_model, config)`` where `config` is the configuration
        object attached to `qa_model`.
    """
    qa_model = AlbertForQuestionAnswering.from_pretrained(
        albert_qnli_path,
        num_labels=2,
    )

    return qa_model, qa_model.config


In [None]:
# Location of the classifier checkpoint that seeds the QA model
albert_qnli_path = "./drive/MyDrive/w266/project/model_checkpoints/albert_qnli"

# QA model (and its config) built from that checkpoint
model, config = convert_albert_classifier_to_qa(albert_qnli_path)

# AlbertTokenizerFast is already the fast tokenizer; `use_fast` is an
# AutoTokenizer option and has no meaning here, so it is not passed.
tokenizer = AlbertTokenizerFast.from_pretrained("albert-base-v2")

tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/760k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.31M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/684 [00:00<?, ?B/s]

In [None]:
# Flatten both splits into per-question records (train first, then validation)
train_prepared, val_prepared = (
    prepare_coqa_features(dataset[split_name])
    for split_name in ("train", "validation")
)

In [None]:
def _find_answer_token_span(offset_mapping, sequence_ids, answer_start, answer_end):
    """Map a context character span to (start_token, end_token), or (0, 0) if it is not fully inside the encoded context."""
    # Only tokens of the second sequence (the context) are candidates: the
    # offsets of question tokens are relative to the question string and must
    # not be compared with character positions in the context.
    context_tokens = [idx for idx, seq_id in enumerate(sequence_ids) if seq_id == 1]
    if not context_tokens:
        return 0, 0
    first_ctx, last_ctx = context_tokens[0], context_tokens[-1]

    # The answer was (partly) cut off by truncation: no valid span in this window.
    if offset_mapping[first_ctx][0] > answer_start or offset_mapping[last_ctx][1] < answer_end:
        return 0, 0

    # Last context token that starts at or before the answer start.
    start_token = first_ctx
    while start_token <= last_ctx and offset_mapping[start_token][0] <= answer_start:
        start_token += 1
    start_token -= 1

    # First context token that ends at or after the answer end.
    end_token = last_ctx
    while end_token >= first_ctx and offset_mapping[end_token][1] >= answer_end:
        end_token -= 1
    end_token += 1

    if end_token < start_token:
        return 0, 0
    return start_token, end_token


def tokenize_coqa_examples(examples, tokenizer=None, max_length=512):
    """Encode (question, story) pairs and label the answer's token span.

    Args:
        examples: Iterable of records shaped as
            ``{"question": str, "story": str, "answer": {"input_text": str}}``.
        tokenizer: Callable tokenizer. It is called with
            ``return_offsets_mapping=True`` and its result must provide
            "input_ids", "attention_mask", "offset_mapping" and a
            ``sequence_ids()`` method (as Hugging Face fast tokenizers do).
        max_length: Length every encoded example is truncated/padded to.

    Returns:
        list[dict]: One dict per example with "input_ids", "attention_mask",
        "start_positions" and "end_positions". Both positions are 0 when the
        answer text is empty, is not a literal substring of the story, or
        does not lie completely inside the encoded context.

    Raises:
        TypeError: If `tokenizer` is None.
    """
    if tokenizer is None:
        raise TypeError("tokenize_coqa_examples() requires a tokenizer, got None")

    tokenized_examples = []

    for example in examples:
        question = example["question"]
        context = example["story"]
        answer_text = example["answer"]["input_text"]

        # Encode question + story as one padded, fixed-length pair
        inputs = tokenizer(
            question,
            context,
            max_length=max_length,
            truncation=True,
            return_offsets_mapping=True,
            padding="max_length",
        )

        # Locate the answer as a literal (case-sensitive) substring of the story
        answer_start = context.find(answer_text) if answer_text else -1
        if answer_start == -1:
            # No usable span: both labels point at index 0, the first token of
            # the encoded pair. These examples are still part of the loss.
            start_positions, end_positions = 0, 0
        else:
            start_positions, end_positions = _find_answer_token_span(
                inputs["offset_mapping"],
                inputs.sequence_ids(),
                answer_start,
                answer_start + len(answer_text),
            )

        tokenized_examples.append({
            "input_ids": inputs["input_ids"],
            "attention_mask": inputs["attention_mask"],
            "start_positions": start_positions,
            "end_positions": end_positions,
        })

    return tokenized_examples


In [None]:
# Encode the flattened train and validation records with the same tokenizer
tokenized_data, tokenized_data_val = (
    tokenize_coqa_examples(split_examples, tokenizer=tokenizer)
    for split_examples in (train_prepared, val_prepared)
)


In [None]:
from datasets import Dataset

# Wrap each list of encoded examples in a Hugging Face Dataset
tokenized_dataset = Dataset.from_list(tokenized_data)
tokenized_dataset_val = Dataset.from_list(tokenized_data_val)

# Have both datasets return the model inputs and span labels as torch tensors
for _split_dataset in (tokenized_dataset, tokenized_dataset_val):
    _split_dataset.set_format(
        type="torch",
        columns=["input_ids", "attention_mask", "start_positions", "end_positions"],
    )
del _split_dataset  # keep the loop variable out of the notebook namespace

In [None]:
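# Unused: tokenize_coqa_examples already pads every example to max_length,
# so a separate padding collator is not constructed here.
# from transformers import DataCollatorWithPadding
# data_collator = DataCollatorWithPadding(tokenizer=tokenizer, padding=True)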

In [None]:
# Train/evaluate on the first N encoded examples. The requested sizes are
# capped at the dataset length so `select` never indexes past the end.
train_subset = tokenized_dataset.select(range(min(10000, len(tokenized_dataset))))
eval_subset = tokenized_dataset_val.select(range(min(800, len(tokenized_dataset_val))))

In [None]:
def compute_metrics(eval_pred):
    """Score start/end predictions by exact token-index accuracy.

    Args:
        eval_pred: Pair ``(predictions, labels)`` where `predictions` is
            ``(start_logits, end_logits)`` and `labels` is
            ``(start_labels, end_labels)``. The logits are reduced with
            argmax over axis 1.

    Returns:
        dict: "start_accuracy" and "end_accuracy", each the fraction of
        examples whose argmax index equals the corresponding label.
    """
    (start_logits, end_logits), (start_labels, end_labels) = eval_pred

    def _index_accuracy(logits, gold):
        # Highest-scoring position per example, compared with the gold index
        return np.mean(np.argmax(logits, axis=1) == gold)

    return {
        "start_accuracy": _index_accuracy(start_logits, start_labels),
        "end_accuracy": _index_accuracy(end_logits, end_labels),
    }


In [None]:
# Define training arguments
training_args = TrainingArguments(
    output_dir="./results",
    eval_strategy="epoch",
    save_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=3,
    weight_decay=0.01,
    report_to="none"
)


# Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_subset,
    eval_dataset=eval_subset,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics
    )

# Train
trainer.train()

  trainer = Trainer(


Epoch,Training Loss,Validation Loss,Start Accuracy,End Accuracy
1,4.9665,3.167006,0.42,0.425
2,3.1672,2.793087,0.435,0.4425
3,2.6964,2.795002,0.43,0.41625


TrainOutput(global_step=1875, training_loss=3.384952018229167, metrics={'train_runtime': 3089.6951, 'train_samples_per_second': 9.71, 'train_steps_per_second': 0.607, 'total_flos': 716943052800000.0, 'train_loss': 3.384952018229167, 'epoch': 3.0})

In [None]:
# Save model and tokenizer into the SAME directory so the checkpoint can be
# reloaded from one path. The model was previously written to a different
# folder (missing "project") than the tokenizer. `from_pt` is a loading
# option, not a `save_pretrained` argument, so it is not passed.
coqa_checkpoint_dir = './drive/MyDrive/w266/project/model_checkpoints/coqa'
model.save_pretrained(coqa_checkpoint_dir)
tokenizer.save_pretrained(coqa_checkpoint_dir)

('./drive/MyDrive/w266/project/model_checkpoints/coqa/tokenizer_config.json',
 './drive/MyDrive/w266/project/model_checkpoints/coqa/special_tokens_map.json',
 './drive/MyDrive/w266/project/model_checkpoints/coqa/spiece.model',
 './drive/MyDrive/w266/project/model_checkpoints/coqa/added_tokens.json',
 './drive/MyDrive/w266/project/model_checkpoints/coqa/tokenizer.json')