In [19]:
from transformers import AutoTokenizer, AutoModelForCausalLM
import os

In [15]:
import os
from getpass import getpass

from huggingface_hub import login

# SECURITY: never paste an access token into a notebook cell -- it ends up in
# version control and in every shared copy of the file. The token that used
# to be hardcoded here (reported as having write permission) must be treated
# as compromised and revoked in the Hugging Face account settings.
#
# Read the token from the HUGGINGFACE_TOKEN environment variable (the same
# variable the tokenizer cell reads) and fall back to an interactive,
# non-echoing prompt when it is unset or empty.
huggingface_token = os.getenv("HUGGINGFACE_TOKEN") or getpass("Hugging Face token: ")

# Log in using the token
login(token=huggingface_token)


The token has not been saved to the git credentials helper. Pass `add_to_git_credential=True` in this function directly or `--add-to-git-credential` if using via `huggingface-cli` if you want to set the git credential as well.
Token is valid (permission: write).
Your token has been saved to /home/eshwar/.cache/huggingface/token
Login successful


In [20]:
# Load the tokenizer that matches the "google/gemma-2-2b-it" checkpoint.
# The access token is read from the HUGGINGFACE_TOKEN environment variable;
# os.getenv returns None when the variable is unset.
# NOTE(review): with token=None the library presumably falls back to the
# credentials cached by an earlier login -- confirm.
tokenizer = AutoTokenizer.from_pretrained(
    "google/gemma-2-2b-it",
    token=os.getenv("HUGGINGFACE_TOKEN"),
)

In [10]:
from datasets import load_dataset
# Fetch SQuAD v2 by its dataset identifier and show which splits and
# columns it provides.
squad_dataset = load_dataset(path="squad_v2")
print(squad_dataset)

DatasetDict({
    train: Dataset({
        features: ['id', 'title', 'context', 'question', 'answers'],
        num_rows: 130319
    })
    validation: Dataset({
        features: ['id', 'title', 'context', 'question', 'answers'],
        num_rows: 11873
    })
})


In [11]:
from datasets import concatenate_datasets
# Pool the "train" and "validation" splits into a single dataset so that a
# custom split can be drawn from all available examples.
combined_squad = concatenate_datasets(
    [squad_dataset[split_name] for split_name in ("train", "validation")]
)

# Show the columns and row count of the pooled dataset
print(combined_squad)

Dataset({
    features: ['id', 'title', 'context', 'question', 'answers'],
    num_rows: 142192
})


In [12]:
from datasets import DatasetDict

# Draw a reproducible (seed=1) 80:20 train/test split from the pooled data.
# NOTE(review): this rebinds `combined_squad` from the pooled dataset to the
# object returned by train_test_split (indexed by "train"/"test" below), so
# the cell is not idempotent -- re-running it on a live kernel would operate
# on the split result rather than on the pooled dataset. Consider a separate
# name once it is confirmed that no later cell relies on the rebinding.
combined_squad = combined_squad.train_test_split(seed=1, test_size=0.2)

# Pull out the two partitions under their own names
train_squad, test_squad = combined_squad['train'], combined_squad['test']

# Show the columns and row count of each partition
print(f"Train Split: {train_squad}")
print(f"Test Split: {test_squad}")


Train Split: Dataset({
    features: ['id', 'title', 'context', 'question', 'answers'],
    num_rows: 113753
})
Test Split: Dataset({
    features: ['id', 'title', 'context', 'question', 'answers'],
    num_rows: 28439
})


In [32]:
def _find_answer_token_span(offsets, sequence_ids, answer_start, answer_end):
    """Locate the context tokens covering characters [answer_start, answer_end).

    Only tokens whose sequence id is 1 (the second sequence of the pair, i.e.
    the context) are considered. Returns ``(start_idx, end_idx)``, or ``None``
    when either end of the answer is not covered by a context token.
    """
    token_start = token_end = None
    for idx, (start, end) in enumerate(offsets):
        # Skip special/padding tokens (None) and question tokens (0): their
        # character offsets are relative to a different string.
        if sequence_ids[idx] != 1:
            continue
        if token_start is None and start <= answer_start < end:
            token_start = idx
        if start < answer_end <= end:
            token_end = idx
            break

    if token_start is None or token_end is None:
        return None
    return token_start, token_end


def preprocess_qa(examples, max_length=384):
    """Tokenize a batch of question/context pairs and add answer token labels.

    Args:
        examples: Batch mapping with "question" and "context" (lists of
            strings) and "answers" (list of dicts with "text" and
            "answer_start" lists, where "answer_start" holds character
            offsets into the context).
        max_length: Length each encoded pair is truncated/padded to.

    Returns:
        The tokenizer output with "start_positions" and "end_positions"
        added (one token index each per example) and "offset_mapping"
        removed.

    Notes:
        * Uses the module-level ``tokenizer``. It must be a fast tokenizer,
          since both the offset mapping and ``sequence_ids`` are needed.
        * Offsets of question tokens and context tokens both start at
          character 0, so the answer is searched for among context tokens
          only; otherwise the span is frequently matched inside the question.
        * Only the first listed answer of each example is used.
        * Examples with no answer, or whose answer is not fully covered by
          the (possibly truncated) context tokens, are labelled (0, 0).
          NOTE(review): whether index 0 is a meaningful "no answer" position
          depends on the tokenizer's special tokens and padding side --
          confirm for this model.
    """
    # Strip spaces from questions
    questions = [q.strip() for q in examples["question"]]

    # Tokenize questions and contexts
    inputs = tokenizer(
        questions,
        examples["context"],
        truncation=True,
        max_length=max_length,
        return_overflowing_tokens=False,
        return_offsets_mapping=True,
        padding="max_length",
    )

    start_positions = []
    end_positions = []

    # return_overflowing_tokens=False yields exactly one encoding per example,
    # so index i addresses both the encoding and its answers entry.
    for i, offsets in enumerate(inputs["offset_mapping"]):
        answer = examples["answers"][i]

        span = None
        if len(answer["text"]) > 0:  # unanswerable examples have no texts
            answer_start = answer["answer_start"][0]
            answer_end = answer_start + len(answer["text"][0])
            span = _find_answer_token_span(
                offsets, inputs.sequence_ids(i), answer_start, answer_end
            )

        # Never emit a half-found span such as (start, 0): use (0, 0) instead.
        token_start, token_end = span if span is not None else (0, 0)
        start_positions.append(token_start)
        end_positions.append(token_end)

    # Add start and end positions to the tokenized inputs
    inputs["start_positions"] = start_positions
    inputs["end_positions"] = end_positions

    # Remove offset mapping to save memory
    inputs.pop("offset_mapping", None)

    return inputs


In [34]:
# Both splits are tokenized with identical settings: preprocess_qa receives
# batches of 100 examples at a time.
tokenize_kwargs = dict(batched=True, batch_size=100)

train_squad_tokenized = train_squad.map(preprocess_qa, **tokenize_kwargs)
test_squad_tokenized = test_squad.map(preprocess_qa, **tokenize_kwargs)

# Show the columns and row count of each tokenized split
print(f"Preprocessed Train Split: {train_squad_tokenized}")
print(f"Preprocessed Test Split: {test_squad_tokenized}")


Map:   0%|          | 0/113753 [00:00<?, ? examples/s]

Map:   0%|          | 0/28439 [00:00<?, ? examples/s]

Preprocessed Train Split: Dataset({
    features: ['id', 'title', 'context', 'question', 'answers', 'input_ids', 'attention_mask', 'start_positions', 'end_positions'],
    num_rows: 113753
})
Preprocessed Test Split: Dataset({
    features: ['id', 'title', 'context', 'question', 'answers', 'input_ids', 'attention_mask', 'start_positions', 'end_positions'],
    num_rows: 28439
})
