In [19]:
from transformers import AutoTokenizer, AutoModelForCausalLM
import os

In [15]:
import os
from getpass import getpass

from huggingface_hub import login

# Never paste the access token into the notebook: it ends up in version control
# and in every shared copy. Read it from the HUGGINGFACE_TOKEN environment
# variable (the same variable the tokenizer cell reads) and fall back to an
# interactive prompt, so the secret is never stored in a cell.
# NOTE: the token that used to be written in this cell must be treated as
# leaked and should be revoked in the Hugging Face account settings.
huggingface_token = os.getenv("HUGGINGFACE_TOKEN") or getpass("Hugging Face token: ")

# Log in using the token
login(token=huggingface_token)


The token has not been saved to the git credentials helper. Pass `add_to_git_credential=True` in this function directly or `--add-to-git-credential` if using via `huggingface-cli` if you want to set the git credential as well.
Token is valid (permission: write).
Your token has been saved to /home/eshwar/.cache/huggingface/token
Login successful


In [20]:
# Load the tokenizer of the google/gemma-2-2b-it checkpoint. The access token is
# taken from the HUGGINGFACE_TOKEN environment variable (None when it is unset).
tokenizer = AutoTokenizer.from_pretrained(
    "google/gemma-2-2b-it",
    token=os.environ.get("HUGGINGFACE_TOKEN"),
)

In [10]:
from datasets import load_dataset
# Load the SQuAD v2 dataset (dataset id "squad_v2")
squad_dataset = load_dataset("squad_v2")
# Show the available splits with their columns and row counts
print(squad_dataset)

DatasetDict({
    train: Dataset({
        features: ['id', 'title', 'context', 'question', 'answers'],
        num_rows: 130319
    })
    validation: Dataset({
        features: ['id', 'title', 'context', 'question', 'answers'],
        num_rows: 11873
    })
})


In [11]:
from datasets import concatenate_datasets
# Merge the "train" and "validation" splits (in that order) into one dataset
combined_squad = concatenate_datasets(
    [squad_dataset[split_name] for split_name in ("train", "validation")]
)

# Show the columns and the row count of the merged dataset
print(combined_squad)

Dataset({
    features: ['id', 'title', 'context', 'question', 'answers'],
    num_rows: 142192
})


In [12]:
from datasets import DatasetDict

# Split the combined dataset into train and test (80:20 split).
# The result is bound to a new name instead of overwriting `combined_squad`:
# `train_test_split` returns a DatasetDict, which has no `train_test_split`
# method of its own, so reassigning the input made this cell fail as soon as it
# was run a second time. The fixed seed keeps the split reproducible.
squad_splits = combined_squad.train_test_split(test_size=0.2, seed=1)

# Create train and test splits
train_squad = squad_splits['train']
test_squad = squad_splits['test']

# Display the structure of the splits
print(f"Train Split: {train_squad}")
print(f"Test Split: {test_squad}")


Train Split: Dataset({
    features: ['id', 'title', 'context', 'question', 'answers'],
    num_rows: 113753
})
Test Split: Dataset({
    features: ['id', 'title', 'context', 'question', 'answers'],
    num_rows: 28439
})


In [32]:
def preprocess_qa(examples):
    """Tokenize a batch of question/context pairs and label the answer span.

    Args:
        examples: Batch with the columns ``question``, ``context`` and
            ``answers``; each ``answers`` entry holds the parallel lists
            ``text`` and ``answer_start`` (character offset in the context).
            Only the first listed answer of an example is used.

    Returns:
        The tokenizer output, without ``offset_mapping`` and extended by
        ``start_positions`` / ``end_positions``: the indices of the tokens that
        contain the first and the last character of the answer. Both are 0 when
        the example has no answer or when the answer cannot be located
        completely inside the (possibly truncated) context tokens.
    """
    # Strip spaces from questions
    questions = [question.strip() for question in examples["question"]]

    # Tokenize questions and contexts as pairs, padded/truncated to 384 tokens
    inputs = tokenizer(
        questions,
        examples["context"],
        truncation=True,
        max_length=384,
        return_overflowing_tokens=False,
        return_offsets_mapping=True,
        padding="max_length",
    )

    start_positions = []
    end_positions = []

    # Without overflowing tokens there is exactly one feature per example, so
    # feature i always belongs to example i.
    for i, offsets in enumerate(inputs["offset_mapping"]):
        answers = examples["answers"][i]

        # Unanswerable question: label the span as (0, 0)
        if len(answers["text"]) == 0:
            start_positions.append(0)
            end_positions.append(0)
            continue

        answer_start = answers["answer_start"][0]
        answer_end = answer_start + len(answers["text"][0])

        # Offsets are relative to the string a token came from, so a question
        # token can cover the same character range as the answer does in the
        # context. Only tokens of the second sequence (the context) may match.
        sequence_ids = inputs.sequence_ids(i)

        token_start = token_end = None
        for idx, (start, end) in enumerate(offsets):
            if sequence_ids[idx] != 1:
                continue
            if token_start is None and start <= answer_start < end:
                token_start = idx
            if start < answer_end <= end:
                token_end = idx
                break

        # A span is only usable when both ends were found; a half-located
        # answer (e.g. its end was truncated away) is labelled as (0, 0).
        if token_start is None or token_end is None:
            token_start = token_end = 0
        start_positions.append(token_start)
        end_positions.append(token_end)

    # Add start and end positions to the tokenized inputs
    inputs["start_positions"] = start_positions
    inputs["end_positions"] = end_positions

    # Remove offset mapping to save memory
    inputs.pop("offset_mapping", None)

    return inputs


In [34]:
# Run the preprocessing over both splits, 100 examples per call
train_squad_tokenized = train_squad.map(function=preprocess_qa, batched=True, batch_size=100)
test_squad_tokenized = test_squad.map(function=preprocess_qa, batched=True, batch_size=100)

# Show the columns and row counts after preprocessing
print(f"Preprocessed Train Split: {train_squad_tokenized}")
print(f"Preprocessed Test Split: {test_squad_tokenized}")


Map:   0%|          | 0/113753 [00:00<?, ? examples/s]

Map:   0%|          | 0/28439 [00:00<?, ? examples/s]

Preprocessed Train Split: Dataset({
    features: ['id', 'title', 'context', 'question', 'answers', 'input_ids', 'attention_mask', 'start_positions', 'end_positions'],
    num_rows: 113753
})
Preprocessed Test Split: Dataset({
    features: ['id', 'title', 'context', 'question', 'answers', 'input_ids', 'attention_mask', 'start_positions', 'end_positions'],
    num_rows: 28439
})


# QA Training

In [35]:
!pip install evaluate rouge_score nltk


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Collecting rouge_score
  Downloading rouge_score-0.1.2.tar.gz (17 kB)
  Preparing metadata (setup.py) ... [?25ldone
Collecting absl-py (from rouge_score)
  Downloading absl_py-2.1.0-py3-none-any.whl.metadata (2.3 kB)
Downloading absl_py-2.1.0-py3-none-any.whl (133 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m133.7/133.7 kB[0m [31m1.6 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
[?25hBuilding wheels for collected packages: rouge_score
  Building wheel for rouge_score (setup.py) ... [?25ldone
[?25h  Created wheel for rouge_score: filename=rouge_score-0.1.2-py3-none-any.whl size=24934 sha256=a274cdcde33f25aeba4b4a3968bb0e4f7c458638e05db82c6f3f7b3b504b17d2
  Stored in directory: /home/eshwar/.cache/pip/wheels/85/9d/af/01feefbe7d55ef5468796f0c68225b6788e85d9d0a281e7a70
Successfully built rouge_score
Installing collected packages: absl-py, rouge_score
Successfully installed absl-py-2.1.0 rouge_score-0.1.2


In [36]:
import evaluate
import nltk
from rouge_score import rouge_scorer
from nltk.translate.meteor_score import meteor_score
from sklearn.metrics import f1_score

# Load the metrics
# (`squad` holds the "squad_v2" metric; the other names match their metric ids)
squad = evaluate.load("squad_v2")
bleu = evaluate.load("bleu")
rouge = evaluate.load("rouge")
meteor = evaluate.load("meteor")

# Function to compute the metrics
def compute_metrics(pred):
    """Score decoded predictions against decoded labels.

    Args:
        pred: Object exposing ``predictions`` and ``label_ids``. Both are passed
            to ``tokenizer.batch_decode``, so both have to be batches of token
            ids.

    Returns:
        dict with the keys ``exact_match`` and ``f1`` (SQuAD v2 exact match and
        token-level F1, reported by the metric as percentages), ``bleu``,
        ``rouge1``, ``rouge2``, ``rougeL`` and ``meteor``.
    """
    # NOTE(review): ExtractiveQAModel in this notebook returns start/end logits
    # and the preprocessing produces start/end token positions, neither of which
    # is a token-id sequence. Confirm what the Trainer hands to this function
    # before relying on the numbers.

    # Decode the predictions and labels into text
    decoded_preds = tokenizer.batch_decode(pred.predictions, skip_special_tokens=True)
    decoded_labels = tokenizer.batch_decode(pred.label_ids, skip_special_tokens=True)

    # The squad_v2 metric does not accept plain strings: every prediction and
    # reference is a record that is matched by id. An empty label text stands
    # for an unanswerable question and is encoded as an empty answer list.
    squad_predictions = [
        {"id": str(idx), "prediction_text": text, "no_answer_probability": 0.0}
        for idx, text in enumerate(decoded_preds)
    ]
    squad_references = [
        {
            "id": str(idx),
            "answers": {
                "text": [text] if text else [],
                "answer_start": [0] if text else [],
            },
        }
        for idx, text in enumerate(decoded_labels)
    ]
    squad_scores = squad.compute(predictions=squad_predictions, references=squad_references)

    # Compute BLEU score
    bleu_score = bleu.compute(predictions=decoded_preds, references=decoded_labels)

    # Compute ROUGE score
    rouge_score = rouge.compute(predictions=decoded_preds, references=decoded_labels)

    # Compute METEOR score
    meteor_score_value = meteor.compute(predictions=decoded_preds, references=decoded_labels)

    return {
        # squad_v2 reports exact match under "exact", not "exact_match"
        "exact_match": squad_scores["exact"],
        # Token-overlap F1 of the SQuAD metric; a classification F1 over whole
        # answer strings would only count exact string matches.
        "f1": squad_scores["f1"],
        "bleu": bleu_score["bleu"],
        "rouge1": rouge_score["rouge1"],
        "rouge2": rouge_score["rouge2"],
        "rougeL": rouge_score["rougeL"],
        # meteor.compute returns a dict; report the score, not the dict
        "meteor": meteor_score_value["meteor"],
    }


Downloading builder script:   0%|          | 0.00/6.47k [00:00<?, ?B/s]

Downloading extra modules:   0%|          | 0.00/11.3k [00:00<?, ?B/s]

Downloading builder script:   0%|          | 0.00/5.94k [00:00<?, ?B/s]

Downloading extra modules:   0%|          | 0.00/1.55k [00:00<?, ?B/s]

Downloading extra modules:   0%|          | 0.00/3.34k [00:00<?, ?B/s]

Downloading builder script:   0%|          | 0.00/6.27k [00:00<?, ?B/s]

Downloading builder script:   0%|          | 0.00/7.02k [00:00<?, ?B/s]

[nltk_data] Downloading package wordnet to /home/eshwar/nltk_data...
[nltk_data] Downloading package punkt_tab to /home/eshwar/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.
[nltk_data] Downloading package omw-1.4 to /home/eshwar/nltk_data...


In [None]:
# TODO: build `model_qa` here. The Trainer cell below uses it, but it is currently only defined further down the notebook.

In [None]:
import logging
from transformers import Trainer, TrainingArguments, default_data_collator

# Set up logging to output to console
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(message)s')

# Define the training arguments with logging at the end of each epoch
training_args_qa = TrainingArguments(
    output_dir="./results_squad",  # Directory where results will be saved
    evaluation_strategy="epoch",  # Evaluate at the end of every epoch
    save_strategy="epoch",        # Save checkpoints at the end of every epoch
    learning_rate=2e-5,           # Learning rate for fine-tuning
    per_device_train_batch_size=8,  # Batch size for training
    per_device_eval_batch_size=8,   # Batch size for evaluation
    num_train_epochs=3,             # Number of training epochs
    weight_decay=0.01,              # Weight decay to prevent overfitting
    logging_dir="./logs",           # Directory for logging
    logging_strategy="epoch",      # Log at the end of each epoch
)

# Define the Trainer.
# NOTE(review): `model_qa` must already exist when this cell runs; in the
# current cell order it is created further down, so on a fresh kernel the model
# cells have to be executed first.
trainer_qa = Trainer(
    model=model_qa,
    args=training_args_qa,
    # The tokenized splits produced by the preprocessing cell; the previously
    # used `encoded_squad` is not defined anywhere in the notebook.
    train_dataset=train_squad_tokenized,
    eval_dataset=test_squad_tokenized,
    tokenizer=tokenizer,                  # Passed along to the Trainer
    # Stacks the features into batch tensors. It does not pad; the features are
    # already padded to a fixed length by the preprocessing.
    data_collator=default_data_collator,
    compute_metrics=compute_metrics,
)



In [None]:
from transformers import AutoModelForCausalLM

# Load the pretrained checkpoint that the QA model is built around
# (only the model is loaded here, not a tokenizer)
base_model = AutoModelForCausalLM.from_pretrained(
    "google/gemma-2-2b-it",
)


In [None]:
import torch
import torch.nn as nn

class ExtractiveQAModel(nn.Module):
    """Span-extraction head on top of a pretrained transformer.

    A linear layer maps the final hidden state of every token to two scores:
    one for "the answer starts here" and one for "the answer ends here".

    Args:
        base_model: Module that exposes ``config.hidden_size`` and, when called
            with ``output_hidden_states=True``, returns an output object with a
            ``hidden_states`` sequence.
    """

    def __init__(self, base_model):
        super().__init__()
        self.base_model = base_model
        self.qa_outputs = nn.Linear(base_model.config.hidden_size, 2)  # Start and end logits

    def forward(self, input_ids, attention_mask=None, token_type_ids=None,
                start_positions=None, end_positions=None):
        """Compute start/end logits and, when labels are given, the span loss.

        Args:
            input_ids: Token ids, shape (batch_size, seq_length).
            attention_mask: Optional mask forwarded to the base model.
            token_type_ids: Accepted for interface compatibility; not used.
            start_positions: Optional gold start token index per example.
            end_positions: Optional gold end token index per example.

        Returns:
            ``(start_logits, end_logits)``, each of shape
            (batch_size, seq_length), when no labels are passed. With both
            label tensors the tuple is ``(loss, start_logits, end_logits)``,
            so a training loop can read the loss from the first element.
        """
        # Causal-LM outputs carry no `last_hidden_state` attribute; asking for
        # all hidden states and taking the last entry works for models with and
        # without a language-modelling head.
        outputs = self.base_model(
            input_ids=input_ids,
            attention_mask=attention_mask,
            output_hidden_states=True,
            return_dict=True,
        )
        hidden_states = outputs.hidden_states[-1]  # Shape: (batch_size, seq_length, hidden_size)

        # Compute start and end logits
        logits = self.qa_outputs(hidden_states)  # Shape: (batch_size, seq_length, 2)
        start_logits, end_logits = logits.split(1, dim=-1)  # Shape: (batch_size, seq_length, 1)
        start_logits = start_logits.squeeze(-1)  # Shape: (batch_size, seq_length)
        end_logits = end_logits.squeeze(-1)      # Shape: (batch_size, seq_length)

        if start_positions is None or end_positions is None:
            return start_logits, end_logits

        # Positions outside the sequence are clamped to `seq_length`, which the
        # loss ignores, so an out-of-range label cannot raise an index error.
        ignored_index = start_logits.size(1)
        start_positions = start_positions.clamp(0, ignored_index)
        end_positions = end_positions.clamp(0, ignored_index)

        loss_fct = nn.CrossEntropyLoss(ignore_index=ignored_index)
        start_loss = loss_fct(start_logits, start_positions)
        end_loss = loss_fct(end_logits, end_positions)
        loss = (start_loss + end_loss) / 2

        return loss, start_logits, end_logits


In [None]:
import torch

# Define the model
model_qa = ExtractiveQAModel(base_model)

# Use the GPU when one is available and stay on the CPU otherwise, instead of
# raising on machines without CUDA
device = "cuda" if torch.cuda.is_available() else "cpu"
model_qa = model_qa.to(device)

In [None]:
# Run the training loop. `training_args_qa` asks for logging, evaluation and a
# checkpoint at the end of every epoch.
trainer_qa.train()


In [None]:
# Evaluate the model after training (uses the eval_dataset given to the Trainer)
results_qa = trainer_qa.evaluate()

# Print the evaluation results
print(results_qa)