In [1]:
# Install the necessary packages
!pip install accelerate transformers datasets evaluate torch




In [2]:
import torch
import os
from torch.utils.data import DataLoader
from transformers import AutoTokenizer, AutoModelForCausalLM
from transformers import Trainer, TrainingArguments
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
import numpy as np
from huggingface_hub import login
from transformers import AutoModelForSequenceClassification, TrainingArguments, Trainer
from datasets import load_dataset, Dataset, DatasetDict, concatenate_datasets
import numpy as np
from sklearn.metrics import accuracy_score, precision_recall_fscore_support


In [3]:
from huggingface_hub import login

# Directly input your Hugging Face token here
huggingface_token = "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"

# Log in using the token
login(token=huggingface_token)


The token has not been saved to the git credentials helper. Pass `add_to_git_credential=True` in this function directly or `--add-to-git-credential` if using via `huggingface-cli` if you want to set the git credential as well.
Token is valid (permission: write).
Your token has been saved to /home/eshwar/.cache/huggingface/token
Login successful


In [None]:
import os
from transformers import AutoTokenizer, AutoModelForSequenceClassification

# Define model and tokenizer path or name
model_name = "google/gemma-2-2b-it"

# Load the Hugging Face token from environment variables
token = os.getenv("HUGGINGFACE_TOKEN")

# Load the tokenizer with authentication token
tokenizer = AutoTokenizer.from_pretrained(model_name, use_auth_token=token)


In [None]:

# Load the model for sequence classification with the token
model_st = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=2, use_auth_token=token)


In [None]:
# Sum the total number of parameters in the model
total_params = sum(p.numel() for p in modelPT.parameters())

# Display the total count of parameters
print(f"Total number of parameters in the modelPT: {total_params:,}")


In [None]:
# Print the architecture of the modelPT
print(modelPT)


## Model Parameter Comparison: Reported vs Calculated

In this section, we compare the **reported parameters** for the Gemma 2B model (as per the technical paper) with the **calculated parameters** based on the loaded model `modelPT`.

### Reported Parameters for Gemma 2B Model:
According to the technical paper, the parameters for the **Gemma 2B model** are split into **Embedding Parameters** and **Non-embedding Parameters**:

- **Embedding Parameters**: 590,118,912
- **Non-embedding Parameters**: 2,024,517,888
- **Total Parameters**: 2,614,636,800

These values are the ones reported in the paper and give an overview of the architecture of the model. The **embedding parameters** come from the `embed_tokens` layer, and the **non-embedding parameters** account for the rest of the layers, including attention mechanisms and feedforward layers.

### Calculated Parameters for `modelPT`:
Upon loading the `Gemma 2B` model (`modelPT`), we performed calculations to estimate the parameters based on the architecture. Here’s how they break down:

1. **Embedding Parameters**:
   - From the `embed_tokens` layer, the calculation is:
     \[
     256000 \times 2304 = 590,118,912
     \]
   This matches the **reported embedding parameters**.

2. **Non-embedding Parameters**:
   - Based on the number of layers and components (such as attention, feedforward, and normalization layers), we calculate the **non-embedding parameters** as approximately:
     \[
     2,024,222,976
     \]
   This is very close to the reported **non-embedding parameters** of **2,024,517,888**.

3. **Total Parameters**:
   - The **total parameters** calculated from the `modelPT` are:
     \[
     590,118,912 \text{ (Embedding)} + 2,024,222,976 \text{ (Non-embedding)} = 2,614,341,888
     \]
   This is very close to the **reported total parameters** of **2,614,636,800**.

### Difference Between Reported and Calculated Parameters:
The **small difference** between the reported and calculated total parameters is approximately **295,000** parameters:
\[
2,614,636,800 - 2,614,341,888 = 295,000
\]
This discrepancy is minimal and could be due to rounding errors or slight variations in how the model was initialized or structured in the implementation.

### Conclusion:
- **Reported Total Parameters**: 2,614,636,800
- **Calculated Total Parameters**: 2,614,341,888

The calculated parameters for the `modelPT` are almost identical to the reported parameters, with a small difference of approximately 295,000. This is a negligible difference, indicating that the loaded model closely follows the architecture described in the paper.



# SST-2

In [None]:
# Load the SST-2 dataset
sst2 = load_dataset("glue", "sst2")

print(sst2)

In [10]:
# Combine the train, validation, and test splits into a single dataset
combined_sst2 = concatenate_datasets([sst2['train'], sst2['validation'], sst2['test']])

# Print the structure of the combined dataset
print(combined_sst2)

Dataset({
    features: ['sentence', 'label', 'idx'],
    num_rows: 70042
})


In [11]:
# Split the combined SST-2 dataset into 80% train and 20% test
train_test_split = combined_sst2.train_test_split(test_size=0.2, seed=1)

# Access the train and test splits
train_sst2 = train_test_split['train']
test_sst2 = train_test_split['test']

# Print the structure of both train and test splits
print(f"Train Split: {train_sst2}")
print(f"Test Split: {test_sst2}")


Train Split: Dataset({
    features: ['sentence', 'label', 'idx'],
    num_rows: 56033
})
Test Split: Dataset({
    features: ['sentence', 'label', 'idx'],
    num_rows: 14009
})


In [None]:
# Preprocess the data
def preprocess_data(examples):
    return tokenizer(examples["sentence"], truncation=True, padding=True)

# Apply preprocessing to both train and test datasets
train_sst2_tokenzd = train_sst2.map(preprocess_data, batched=True)
test_sst2_tokenzd= test_sst2.map(preprocess_data, batched=True)

# Print the structure of the preprocessed datasets
print(f"Preprocessed Train Split: {train_sst2_tokenzd}")
print(f"Preprocessed Test Split: {test_sst2_tokenzd}")


In [None]:
# Custom dataset to wrap around Hugging Face datasets
class TextDataset(Dataset):
    def __init__(self, dataset):
        self.dataset = dataset

    def __getitem__(self, idx):
        # Extract the tokenized data and label
        item = self.dataset[idx]
        return {
            'input_ids': torch.tensor(item['input_ids'], dtype=torch.long),
            'attention_mask': torch.tensor(item['attention_mask'], dtype=torch.long),
            'labels': torch.tensor(item['label'], dtype=torch.long)  # The target label
        }

    def __len__(self):
        return len(self.dataset)

# Create DataLoader instances
train_dataset = TextDataset(train_sst2_tokenzd)
test_dataset = TextDataset(test_sst2_tokenzd)

train_dataloader = DataLoader(train_dataset, batch_size=16, shuffle=True)
test_dataloader = DataLoader(test_dataset, batch_size=16, shuffle=False)

In [None]:
# import torch
# import torch.nn as nn

# # Define the custom model for sequence classification
# class Gemma2ForSequenceClassification(nn.Module):
#     def __init__(self, base_model, num_labels=2):
#         super().__init__()
#         self.base_model = base_model
#         self.classifier = nn.Linear(base_model.config.hidden_size, num_labels)

#     def forward(self, input_ids, attention_mask=None, labels=None):
#         # Get the outputs from the base model
#         outputs = self.base_model(input_ids, attention_mask=attention_mask)
#         # Extract the hidden state of the last token (classification token)
#         hidden_state = outputs.last_hidden_state[:, 0, :]
#         logits = self.classifier(hidden_state)
#         return logits

# # Load the pre-trained Gemma2 model
# base_model = modelPT  # Assuming modelPT is already loaded

# # Initialize the custom model
# modelFT = Gemma2ForSequenceClassification(base_model)


In [None]:


# Define the compute metrics function
def compute_metrics(pred):
    labels = pred.label_ids
    preds = np.argmax(pred.predictions, axis=1)
    acc = accuracy_score(labels, preds)
    precision, recall, f1, _ = precision_recall_fscore_support(labels, preds, average="binary")
    return {"accuracy": acc, "precision": precision, "recall": recall, "f1": f1}

# Set up training arguments
training_args_cls = TrainingArguments(
    output_dir="./results_sst2",                 # Directory to save the results
    evaluation_strategy="epoch",                 # Evaluate at the end of each epoch
    save_strategy="epoch",                       # Save model checkpoint after each epoch
    learning_rate=2e-5,                          # Learning rate
    per_device_train_batch_size=8,               # Batch size for training
    per_device_eval_batch_size=8,                # Batch size for evaluation
    num_train_epochs=3,                          # Number of epochs
    weight_decay=0.01,                           # Weight decay for regularization
)

# Initialize Trainer
trainer_cls = Trainer(
    model=modelFT,                                 # The model to fine-tune (your Gemma2ForSequenceClassification model)
    args=training_args_cls,                      # Training arguments defined above
    train_dataset=train_dataloader,                 # The preprocessed training dataset
    eval_dataset=test_dataloader,                   # The preprocessed test dataset
    compute_metrics=compute_metrics,             # Metrics function to compute during evaluation
)



In [None]:
# # Fine-tune the model
# trainer_cls.train()



In [None]:
# # Evaluate on the test set
# results_cls = trainer_cls.evaluate()
# print(results_cls)

# SQUAD

In [5]:
from datasets import load_dataset
# Load the SQuAD v2 dataset
squad_dataset = load_dataset("squad_v2")
print(squad_dataset)

DatasetDict({
    train: Dataset({
        features: ['id', 'title', 'context', 'question', 'answers'],
        num_rows: 130319
    })
    validation: Dataset({
        features: ['id', 'title', 'context', 'question', 'answers'],
        num_rows: 11873
    })
})


In [7]:
from datasets import concatenate_datasets
# Combine train and validation splits
combined_squad = concatenate_datasets([squad_dataset['train'], squad_dataset['validation']])

# Display the structure of the combined dataset
print(combined_squad)

Dataset({
    features: ['id', 'title', 'context', 'question', 'answers'],
    num_rows: 142192
})


In [8]:
from datasets import DatasetDict

# Split the combined dataset into train and test (80:20 split)
combined_squad = combined_squad.train_test_split(test_size=0.2, seed=1)

# Create train and test splits
train_squad = combined_squad['train']
test_squad = combined_squad['test']

# Display the structure of the splits
print(f"Train Split: {train_squad}")
print(f"Test Split: {test_squad}")


Train Split: Dataset({
    features: ['id', 'title', 'context', 'question', 'answers'],
    num_rows: 113753
})
Test Split: Dataset({
    features: ['id', 'title', 'context', 'question', 'answers'],
    num_rows: 28439
})


In [9]:
example = train_squad[0]  # Accessing the first example in the train dataset
answers = example['answers']
print(answers['text'])  # Output: List of answers
print(answers['answer_start'])  # Output: List of start positions


['underground']
[32]


In [10]:
# Get the first few examples to inspect the answers field
def inspect_answers(dataset, num_samples=5):
    for i in range(num_samples):
        example = dataset[i]
        print(f"Example {i+1}:")
        print(f"Question: {example['question']}")
        print(f"Context: {example['context'][:500]}...")  # Print the first 500 characters of the context
        print(f"Answers: {example['answers']}")
        print("-" * 50)

# Call the function for the 'train_squad' dataset to get the first 5 examples' answers
inspect_answers(train_squad, num_samples=5)


Example 1:
Question: Where do many rodents live?
Context: Many rodents such as voles live underground. Marmots live almost exclusively above the tree line as high as 2,700 m (8,858 ft). They hibernate in large groups to provide warmth, and can be found in all areas of the Alps, in large colonies they build beneath the alpine pastures. Golden eagles and bearded vultures are the largest birds to be found in the Alps; they nest high on rocky ledges and can be found at altitudes of 2,400 m (7,874 ft). The most common bird is the alpine chough which can be ...
Answers: {'text': ['underground'], 'answer_start': [32]}
--------------------------------------------------
Example 2:
Question: How many people attend the Tulsa State Fair each year?
Context: During a 10-day run in Oklahoma City, the State Fair of Oklahoma attracts roughly one million people along with the annual Festival of the Arts. Large national pow-wows, various Latin and Asian heritage festivals, and cultural festivals such as 

In [16]:
def preprocess_qa(examples):
    questions = [q.strip() for q in examples["question"]]  # Ensure questions are stripped of extra spaces
    inputs = tokenizer(
        questions,
        examples["context"],
        truncation="only_second",
        max_length=384,
        stride=128,
        return_overflowing_tokens=True,
        return_offsets_mapping=True,
        padding="max_length",  # Ensure consistent length
    )

    # Map overflow to original examples
    sample_mapping = inputs.pop("overflow_to_sample_mapping")
    offset_mapping = inputs.pop("offset_mapping")

    # Initialize lists to store start and end positions
    start_positions = []
    end_positions = []

    for i, offsets in enumerate(offset_mapping):
        # Get the corresponding example index for this split
        sample_index = sample_mapping[i]
        answer = examples["answers"][sample_index]

        if len(answer["text"]) > 0:
            # Get the first answer's start and end positions
            start_char = answer["answer_start"][0]
            end_char = start_char + len(answer["text"][0])

            # Convert character positions to token positions
            sequence_ids = inputs.sequence_ids(i)
            context_start = sequence_ids.index(1)  # Context starts at the first token with sequence ID 1
            context_end = len(sequence_ids) - 1 - sequence_ids[::-1].index(1)

            # Check if the answer lies within the context window
            if offsets[context_start][0] <= start_char < offsets[context_end][1]:
                start_positions.append(next(idx for idx, (start, end) in enumerate(offsets) if start <= start_char < end))
                end_positions.append(next(idx for idx, (start, end) in enumerate(offsets) if start < end_char <= end))
            else:
                start_positions.append(0)
                end_positions.append(0)
        else:
            # No answer case
            start_positions.append(0)
            end_positions.append(0)

    inputs["start_positions"] = start_positions
    inputs["end_positions"] = end_positions

    return inputs


In [17]:
# Tokenize the train and test datasets
train_squad_tokenized = train_squad.map(preprocess_qa, batched=True)
test_squad_tokenized = test_squad.map(preprocess_qa, batched=True)

# Print structure
print(f"Preprocessed Train Split: {train_squad_tokenized}")
print(f"Preprocessed Test Split: {test_squad_tokenized}")


Map:   0%|          | 0/113753 [00:00<?, ? examples/s]

ArrowInvalid: Column 5 named input_ids expected length 1000 but got length 1011

In [31]:
!pip install evaluate rouge_score nltk


Collecting rouge_score
  Downloading rouge_score-0.1.2.tar.gz (17 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: rouge_score
  Building wheel for rouge_score (setup.py) ... [?25l[?25hdone
  Created wheel for rouge_score: filename=rouge_score-0.1.2-py3-none-any.whl size=24935 sha256=059697446cbb87320ec0ab27592bc87177994e51e6a43cc8f56208b609a9af5c
  Stored in directory: /root/.cache/pip/wheels/5f/dd/89/461065a73be61a532ff8599a28e9beef17985c9e9c31e541b4
Successfully built rouge_score
Installing collected packages: rouge_score
Successfully installed rouge_score-0.1.2


In [32]:
import evaluate
import nltk
from rouge_score import rouge_scorer
from nltk.translate.meteor_score import meteor_score
from sklearn.metrics import f1_score

# Load the metrics
squad = evaluate.load("squad_v2")
bleu = evaluate.load("bleu")
rouge = evaluate.load("rouge")
meteor = evaluate.load("meteor")

# Function to compute the metrics
def compute_metrics(pred):
    # Get the true labels and predictions
    labels = pred.label_ids
    preds = pred.predictions

    # Decode the predictions and labels (the tokenizer will help with this)
    decoded_preds = tokenizer.batch_decode(preds, skip_special_tokens=True)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    # Compute exact match
    exact_match = squad.compute(predictions=decoded_preds, references=decoded_labels)

    # Compute F1 score (using sklearn)
    f1 = f1_score(decoded_labels, decoded_preds, average='weighted')

    # Compute BLEU score
    bleu_score = bleu.compute(predictions=decoded_preds, references=decoded_labels)

    # Compute ROUGE score
    rouge_score = rouge.compute(predictions=decoded_preds, references=decoded_labels)

    # Compute METEOR score
    meteor_score_value = meteor.compute(predictions=decoded_preds, references=decoded_labels)

    return {
        "exact_match": exact_match["exact_match"],
        "f1": f1,
        "bleu": bleu_score["bleu"],
        "rouge1": rouge_score["rouge1"],
        "rouge2": rouge_score["rouge2"],
        "rougeL": rouge_score["rougeL"],
        "meteor": meteor_score_value,
    }


Downloading builder script:   0%|          | 0.00/6.47k [00:00<?, ?B/s]

Downloading extra modules:   0%|          | 0.00/11.3k [00:00<?, ?B/s]

Downloading builder script:   0%|          | 0.00/5.94k [00:00<?, ?B/s]

Downloading extra modules:   0%|          | 0.00/1.55k [00:00<?, ?B/s]

Downloading extra modules:   0%|          | 0.00/3.34k [00:00<?, ?B/s]

Downloading builder script:   0%|          | 0.00/6.27k [00:00<?, ?B/s]

Downloading builder script:   0%|          | 0.00/7.02k [00:00<?, ?B/s]

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...


In [None]:
import logging
from transformers import Trainer, TrainingArguments

# Set up logging to output to console
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(message)s')

# Define the training arguments with logging at the end of each epoch
training_args_qa = TrainingArguments(
    output_dir="./results_squad",  # Directory where results will be saved
    evaluation_strategy="epoch",  # Evaluate at the end of every epoch
    save_strategy="epoch",        # Save checkpoints at the end of every epoch
    learning_rate=2e-5,           # Learning rate for fine-tuning
    per_device_train_batch_size=8,  # Batch size for training
    per_device_eval_batch_size=8,   # Batch size for evaluation
    num_train_epochs=3,             # Number of training epochs
    weight_decay=0.01,              # Weight decay to prevent overfitting
    logging_dir="./logs",           # Directory for logging
    logging_strategy="epoch",      # Log at the end of each epoch
)

# Define the Trainer
trainer_qa = Trainer(
    model=model_qa,
    args=training_args_qa,
    train_dataset=encoded_squad["train"],  # Train dataset
    eval_dataset=encoded_squad["test"],   # Test dataset
    tokenizer=tokenizer,                  # Use the tokenizer for encoding
    data_collator=default_data_collator,  # Handles padding for variable-length sequences
    compute_metrics=compute_metrics,
)



In [None]:
# Start training with logging at the end of each epoch
trainer_qa.train()


In [None]:
# Evaluate the model after training
results_qa = trainer_qa.evaluate()

# Print the evaluation results
print(results_qa)