In [2]:
pip install datasets

Collecting datasets
  Downloading datasets-3.0.2-py3-none-any.whl.metadata (20 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py310-none-any.whl.metadata (7.2 kB)
Downloading datasets-3.0.2-py3-none-any.whl (472 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m472.7/472.7 kB[0m [31m18.5 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m10.1 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading multiprocess-0.70.16-py310-none-any.whl (134 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m134.8/134.8 kB[0m [31m12.1 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading

In [3]:
# Import necessary libraries
import torch
import torch.nn.functional as F
from transformers import AutoModelForQuestionAnswering, AutoTokenizer, pipeline, AdamW
from tqdm import tqdm
from datasets import load_dataset

In [4]:
# Load the teacher model (BERT large, fine-tuned on SQuAD)
teacher_model = AutoModelForQuestionAnswering.from_pretrained("bert-large-uncased-whole-word-masking-finetuned-squad")
tokenizer = AutoTokenizer.from_pretrained("bert-large-uncased-whole-word-masking-finetuned-squad")

# Example question and context for testing the teacher model
context = "The nearest gas station is 5 kilometers away on your right after the next intersection."
question = "How far is the nearest gas station?"

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/443 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.34G [00:00<?, ?B/s]

Some weights of the model checkpoint at bert-large-uncased-whole-word-masking-finetuned-squad were not used when initializing BertForQuestionAnswering: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight']
- This IS expected if you are initializing BertForQuestionAnswering from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForQuestionAnswering from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]



In [5]:
# Create a question-answering pipeline for the teacher model
qa_pipeline = pipeline("question-answering", model=teacher_model, tokenizer=tokenizer)

# Get the teacher model's output (soft labels for distillation)
teacher_output = qa_pipeline(question=question, context=context)
print("Teacher Model Output:", teacher_output)


Hardware accelerator e.g. GPU is available in the environment, but no `device` argument is passed to the `Pipeline` object. Model will be on CPU.


Teacher Model Output: {'score': 0.8629446029663086, 'start': 27, 'end': 39, 'answer': '5 kilometers'}


In [6]:
# Load the student model (DistilBERT) which is not fine-tuned on SQuAD
student_model = AutoModelForQuestionAnswering.from_pretrained("distilbert-base-uncased")
student_tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")

# Create a pipeline for the student model (for testing post-training)
student_qa_pipeline = pipeline("question-answering", model=student_model, tokenizer=student_tokenizer)


config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

Some weights of DistilBertForQuestionAnswering were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['qa_outputs.bias', 'qa_outputs.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

Hardware accelerator e.g. GPU is available in the environment, but no `device` argument is passed to the `Pipeline` object. Model will be on CPU.


In [7]:
# Define distillation loss function using KL Divergence
def distill_loss(teacher_logits, student_logits, temperature=2.0):
    """
    Function to compute the knowledge distillation loss.
    Uses temperature scaling and KL divergence between teacher and student logits.
    """
    # Apply temperature scaling
    teacher_probs = F.softmax(teacher_logits / temperature, dim=-1)
    student_probs = F.log_softmax(student_logits / temperature, dim=-1)

    # Compute KL divergence loss between teacher and student logits
    loss = F.kl_div(student_probs, teacher_probs, reduction='batchmean') * (temperature ** 2)
    return loss


In [8]:
# Load a small subset of the SQuAD dataset for demonstration purposes
dataset = load_dataset("squad", split="train[:1%]")  # Only 1% of the training data


README.md:   0%|          | 0.00/7.62k [00:00<?, ?B/s]

train-00000-of-00001.parquet:   0%|          | 0.00/14.5M [00:00<?, ?B/s]

validation-00000-of-00001.parquet:   0%|          | 0.00/1.82M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/87599 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/10570 [00:00<?, ? examples/s]

In [9]:
# Set up AdamW optimizer for training the student model
optimizer = AdamW(student_model.parameters(), lr=5e-5)

# Training loop over 3 epochs
for epoch in range(3):  # Training for 3 epochs for demo
    print(f"Epoch {epoch + 1}")
    epoch_loss = 0  # Initialize epoch loss

    # Loop through each data sample in the dataset
    for data in tqdm(dataset):
        context = data["context"]  # Extract context (passage)
        question = data["question"]  # Extract question

        # Tokenize inputs for both models
        inputs = tokenizer(context, question, return_tensors="pt", truncation=True, padding=True)

        # Remove 'token_type_ids' (not used by DistilBERT)
        if 'token_type_ids' in inputs:
            del inputs['token_type_ids']

        # Get teacher model logits (start and end positions)
        with torch.no_grad():  # Teacher model is not updated, so no gradient is calculated
            teacher_outputs = teacher_model(**inputs)
        teacher_start_logits = teacher_outputs.start_logits
        teacher_end_logits = teacher_outputs.end_logits

        # Get student model logits (start and end positions)
        student_outputs = student_model(**inputs)
        student_start_logits = student_outputs.start_logits
        student_end_logits = student_outputs.end_logits

        # Compute distillation loss for both start and end logits
        loss_start = distill_loss(teacher_start_logits, student_start_logits)
        loss_end = distill_loss(teacher_end_logits, student_end_logits)

        # Combine the start and end losses for total loss
        total_loss = loss_start + loss_end

        # Backpropagation and optimization
        optimizer.zero_grad()  # Clear gradients
        total_loss.backward()  # Backpropagate the loss
        optimizer.step()  # Update the student model parameters

        epoch_loss += total_loss.item()  # Accumulate loss for the epoch

    print(f"Epoch Loss: {epoch_loss:.4f}")  # Print the loss for the epoch

print("Training Completed.")




Epoch 1


100%|██████████| 876/876 [36:55<00:00,  2.53s/it]


Epoch Loss: 6353.0845
Epoch 2


100%|██████████| 876/876 [36:34<00:00,  2.51s/it]


Epoch Loss: 3109.6936
Epoch 3


100%|██████████| 876/876 [36:48<00:00,  2.52s/it]

Epoch Loss: 1649.0561
Training Completed.





In [15]:
# Evaluate the student model after distillation using a test example
student_output = student_qa_pipeline(
    question="what is the capital city of France?",
    context="The capital city of France is Paris"
)
print("Student Model Output:", student_output)

Student Model Output: {'score': 0.18261368572711945, 'start': 30, 'end': 35, 'answer': 'Paris'}


In [16]:
# Save the fine-tuned student model and tokenizer to a directory
student_model.save_pretrained("/content/drive/MyDrive/D_student_model")
student_tokenizer.save_pretrained("/content/drive/MyDrive/D_student_model_T")

('/content/drive/MyDrive/D_student_model_T/tokenizer_config.json',
 '/content/drive/MyDrive/D_student_model_T/special_tokens_map.json',
 '/content/drive/MyDrive/D_student_model_T/vocab.txt',
 '/content/drive/MyDrive/D_student_model_T/added_tokens.json',
 '/content/drive/MyDrive/D_student_model_T/tokenizer.json')

In [6]:
from transformers import AutoModelForQuestionAnswering, AutoTokenizer, pipeline

teacher_model = AutoModelForQuestionAnswering.from_pretrained("bert-large-uncased-whole-word-masking-finetuned-squad")
teacher_tokenizer = AutoTokenizer.from_pretrained("bert-large-uncased-whole-word-masking-finetuned-squad")

# Load the student model and tokenizer from saved location
student_model = AutoModelForQuestionAnswering.from_pretrained("/content/drive/MyDrive/D_student_model")
student_tokenizer = AutoTokenizer.from_pretrained("/content/drive/MyDrive/D_student_model_T")


# Create QA pipelines for both models
teacher_qa_pipeline = pipeline("question-answering", model=teacher_model, tokenizer=teacher_tokenizer)
student_qa_pipeline = pipeline("question-answering", model=student_model, tokenizer=student_tokenizer)

# Example questions and context
context = "The Eiffel Tower is located in Paris. It was completed in 1889."
questions = [
     "Where is the Eiffel Tower located?",
    "When was the Eiffel Tower completed?"
]

# Compare outputs
for question in questions:
    # Get teacher model's answer
    teacher_output = teacher_qa_pipeline(question=question, context=context)
    teacher_answer = teacher_output['answer']

    # Get student model's answer
    student_output = student_qa_pipeline(question=question, context=context)
    student_answer = student_output['answer']

    # Print results
    print(f"Question: {question}")
    print(f"Teacher Model Answer: {teacher_answer}")
    print(f"Student Model Answer: {student_answer}")
    print("-" * 50)

Some weights of the model checkpoint at bert-large-uncased-whole-word-masking-finetuned-squad were not used when initializing BertForQuestionAnswering: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight']
- This IS expected if you are initializing BertForQuestionAnswering from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForQuestionAnswering from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Question: Where is the Eiffel Tower located?
Teacher Model Answer: Paris
Student Model Answer: Paris
--------------------------------------------------
Question: When was the Eiffel Tower completed?
Teacher Model Answer: 1889
Student Model Answer: 1889
--------------------------------------------------
