In [2]:
!pip install transformers
!pip install torch
!pip install accelerate
!pip install numpy
!pip install tensorboardx
!pip install datasets

from transformers import AutoModelForQuestionAnswering, AutoTokenizer, Trainer, TrainingArguments, TrainerCallback
from datasets import load_dataset
import torch
import numpy as np

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"device: {device}") 

# Load the SQuAD dataset
squad = load_dataset("squad")

[0mCollecting datasets
  Downloading datasets-3.5.0-py3-none-any.whl.metadata (19 kB)
Collecting pyarrow>=15.0.0 (from datasets)
  Downloading pyarrow-19.0.1-cp311-cp311-manylinux_2_28_x86_64.whl.metadata (3.3 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting pandas (from datasets)
  Downloading pandas-2.2.3-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (89 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py311-none-any.whl.metadata (7.2 kB)
Collecting aiohttp (from datasets)
  Downloading aiohttp-3.11.16-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (7.7 kB)
Collecting aiohappyeyeballs>=2.3.0 (from aiohttp->datasets)
  Downloading aiohappyeyeballs-2.6.1-py3-none-any.whl.metadata (5.9 kB)
Collecting a

README.md:   0%|          | 0.00/7.62k [00:00<?, ?B/s]

train-00000-of-00001.parquet:   0%|          | 0.00/14.5M [00:00<?, ?B/s]

validation-00000-of-00001.parquet:   0%|          | 0.00/1.82M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/87599 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/10570 [00:00<?, ? examples/s]

In [3]:
# Replace original teacher model initialization
teacher_model_name = "deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B"
teacher_model = AutoModelForQuestionAnswering.from_pretrained(
    teacher_model_name,
    trust_remote_code=True  # Required for DeepSeek models
)
teacher_tokenizer = AutoTokenizer.from_pretrained(
    teacher_model_name,
    trust_remote_code=True,
    use_fast=False  # Recommended for DeepSeek models
)

# Modify preprocessing for DeepSeek's tokenization
def preprocess_teacher_train(example):
    inputs = teacher_tokenizer(
        example["question"],
        example["context"],
        truncation=True,
        max_length=512,  # Matches DeepSeek's context window
        stride=96,
        return_overflowing_tokens=True,
        return_offsets_mapping=True,
        padding="max_length",
        add_special_tokens=True  # Explicitly enable special tokens
    )
    

config.json:   0%|          | 0.00/679 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/3.55G [00:00<?, ?B/s]

Sliding Window Attention is enabled but not implemented for `sdpa`; unexpected results may be encountered.
Some weights of Qwen2ForQuestionAnswering were not initialized from the model checkpoint at deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B and are newly initialized: ['qa_outputs.bias', 'qa_outputs.weight', 'transformer.embed_tokens.weight', 'transformer.layers.0.input_layernorm.weight', 'transformer.layers.0.mlp.down_proj.weight', 'transformer.layers.0.mlp.gate_proj.weight', 'transformer.layers.0.mlp.up_proj.weight', 'transformer.layers.0.post_attention_layernorm.weight', 'transformer.layers.0.self_attn.k_proj.bias', 'transformer.layers.0.self_attn.k_proj.weight', 'transformer.layers.0.self_attn.o_proj.weight', 'transformer.layers.0.self_attn.q_proj.bias', 'transformer.layers.0.self_attn.q_proj.weight', 'transformer.layers.0.self_attn.v_proj.bias', 'transformer.layers.0.self_attn.v_proj.weight', 'transformer.layers.1.input_layernorm.weight', 'transformer.layers.1.mlp.down_proj.weight'

tokenizer_config.json:   0%|          | 0.00/3.07k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/7.03M [00:00<?, ?B/s]

In [4]:
# Should do a training pass on SQuAD1.1 to initialise the weights in the Teacher model.
# This will also stop the warning from displaying

# New function to preprocess training data for teacher
def preprocess_teacher_train(example):
    # roberta-base-squad-v1
    #inputs = teacher_tokenizer(
    #    example["question"],
    #    example["context"],
    #    truncation=True,
    #    max_length=384,
    #    stride=128,
    #    return_overflowing_tokens=True,
    #    return_offsets_mapping=True,
    #    padding="max_length"
    #)
    
    # DeepSeek-R1-Distill-Qwen-1.5B
    inputs = teacher_tokenizer(
        example["question"],
        example["context"],
        truncation=True,
        max_length=512,
        stride=96,
        return_overflowing_tokens=True,
        return_offsets_mapping=True,
        padding="max_length",
        add_special_tokens=True  # Explicitly enable special tokens
    )
    
    offset_mapping = inputs.pop("offset_mapping")
    sample_map = inputs.pop("overflow_to_sample_mapping")
    answers = example["answers"]
    start_positions = []
    end_positions = []

    for i, offset in enumerate(offset_mapping):
        sample_idx = sample_map[i]
        answer = answers[sample_idx]
        start_char = answer["answer_start"][0]
        end_char = start_char + len(answer["text"][0])
        
        sequence_ids = inputs.sequence_ids(i)

        # Handle empty sequence_ids case
        if not sequence_ids:
            start_positions.append(0)
            end_positions.append(0)
            continue

        # Find context start with boundary checks
        idx = 0
        while idx < len(sequence_ids) and sequence_ids[idx] != 1:
            idx += 1
        context_start = idx if idx < len(sequence_ids) else 0

        # Find context end with boundary checks
        while idx < len(sequence_ids) and sequence_ids[idx] == 1:
            idx += 1
        context_end = idx - 1 if idx > 0 else 0

        # Handle answer position calculation
        if (context_start >= len(offset) or 
            context_end >= len(offset) or
            offset[context_start][0] > end_char or 
            offset[context_end][1] < start_char):
            start_positions.append(0)
            end_positions.append(0)
        else:
            # Find start position with bounds checking
            idx = context_start
            while idx <= context_end and idx < len(offset) and offset[idx][0] <= start_char:
                idx += 1
            start_positions.append(min(idx - 1, len(offset)-1))
            
            # Find end position with bounds checking
            idx = context_end
            while idx >= context_start and idx < len(offset) and offset[idx][1] >= end_char:
                idx -= 1
            end_positions.append(min(idx + 1, len(offset)-1))
    
    inputs["start_positions"] = start_positions
    inputs["end_positions"] = end_positions
    return inputs

# Teacher Training Arguments
teacher_training_args = TrainingArguments(
    output_dir="./teacher_train",
    num_train_epochs=3,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=64,
    warmup_steps=500,
    weight_decay=0.01,
    logging_dir="./logs",
    logging_steps=50,  # Increased logging frequency
    eval_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    metric_for_best_model="f1",
    learning_rate=1e-5,
    report_to="tensorboard",
    fp16=True,  # Enable mixed precision training
    dataloader_num_workers=4,
)

# Custom progress callback
class TeacherTrainingProgress(TrainerCallback):
    def on_train_begin(self, args, state, control, **kwargs):
        print(f"🚀 Starting training with {args.num_train_epochs} epochs")
        print(f"📊 Batch size: {args.per_device_train_batch_size}")
        print(f"🔍 Evaluation every {args.eval_steps} steps")

    def on_epoch_begin(self, args, state, control, **kwargs):
        print(f"\n⏳ Starting epoch {state.epoch}/{args.num_train_epochs}")
        
    def on_log(self, args, state, control, logs=None, **kwargs):
        if logs and 'loss' in logs:
            print(f"Step {state.global_step}: Loss {logs['loss']:.4f}")
        if logs and 'eval_loss' in logs:
            print(f"Validation Loss: {logs['eval_loss']:.4f}")
            print(f"Exact Match: {logs['eval_exact_match']:.2f}%")
            print(f"F1 Score: {logs['eval_f1']:.2f}%")

# Add metrics computation to Trainer
def compute_metrics(p):

    # Convert logits to predictions
    start_pred = np.argmax(p.predictions[0], axis=1)
    end_pred = np.argmax(p.predictions[1], axis=1)
    
    # Get true positions
    start_true = p.label_ids[0]
    end_true = p.label_ids[1]
    
    # Calculate exact match
    exact_matches = np.logical_and(
        start_pred == start_true,
        end_pred == end_true
    )

    # Calculate span F1
    def overlap_f1(p_start, p_end, t_start, t_end):
        pred_span = set(range(p_start, p_end+1))
        true_span = set(range(t_start, t_end+1))
        overlap = len(pred_span & true_span)
        precision = overlap / len(pred_span) if pred_span else 0
        recall = overlap / len(true_span) if true_span else 0
        return 2*(precision*recall)/(precision+recall) if (precision+recall) else 0
    
    f1_scores = [
        overlap_f1(sp, ep, st, et)
        for sp, ep, st, et in zip(start_pred, end_pred, start_true, end_true)
    ]
    
    return {
        "exact_match": np.mean(exact_matches) * 100,
        "f1": np.mean(f1_scores) * 100
    }


# Create Trainer for teacher
teacher_trainer = Trainer(
    model=teacher_model,
    args=teacher_training_args,
    train_dataset=squad["train"].map(preprocess_teacher_train, batched=True, remove_columns=squad["train"].column_names),
    eval_dataset=squad["validation"].map(preprocess_teacher_train, batched=True, remove_columns=squad["validation"].column_names),
    tokenizer=teacher_tokenizer,
    compute_metrics=compute_metrics,
    callbacks=[TeacherTrainingProgress()]
)


Map:   0%|          | 0/87599 [00:00<?, ? examples/s]

Map:   0%|          | 0/10570 [00:00<?, ? examples/s]

  teacher_trainer = Trainer(


In [5]:
# Train teacher model
print("\nTraining Teacher Model on SQuAD1.1...")
teacher_trainer.train()
teacher_model.save_pretrained("./DeepSeek-R1-Distill-Qwen-1.5B-trained")
teacher_tokenizer.save_pretrained("./DeepSeek-R1-Distill-Qwen-1.5B-trained")


Training Teacher Model on SQuAD1.1...
🚀 Starting training with 3 epochs
📊 Batch size: 16
🔍 Evaluation every None steps

⏳ Starting epoch 0/3


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Av

Epoch,Training Loss,Validation Loss,Exact Match,F1
1,0.0,,0.479819,0.479819
2,0.0,,0.479819,0.479819
3,0.0,,0.479819,0.479819


NaN or Inf found in input tensor.


Step 50: Loss 0.0000


NaN or Inf found in input tensor.


Step 100: Loss 0.0000


NaN or Inf found in input tensor.


Step 150: Loss 0.0000


NaN or Inf found in input tensor.


Step 200: Loss 0.0000


NaN or Inf found in input tensor.


Step 250: Loss 0.0000


NaN or Inf found in input tensor.


Step 300: Loss 0.0000


NaN or Inf found in input tensor.


Step 350: Loss 0.0000


NaN or Inf found in input tensor.


Step 400: Loss 0.0000


NaN or Inf found in input tensor.


Step 450: Loss 0.0000


NaN or Inf found in input tensor.


Step 500: Loss 0.0000


NaN or Inf found in input tensor.


Step 550: Loss 0.0000


NaN or Inf found in input tensor.


Step 600: Loss 0.0000


NaN or Inf found in input tensor.


Step 650: Loss 0.0000


NaN or Inf found in input tensor.


Step 700: Loss 0.0000


NaN or Inf found in input tensor.


Step 750: Loss 0.0000


NaN or Inf found in input tensor.


Step 800: Loss 0.0000


NaN or Inf found in input tensor.


Step 850: Loss 0.0000


NaN or Inf found in input tensor.


Step 900: Loss 0.0000


NaN or Inf found in input tensor.


Step 950: Loss 0.0000


NaN or Inf found in input tensor.


Step 1000: Loss 0.0000


NaN or Inf found in input tensor.


Step 1050: Loss 0.0000


NaN or Inf found in input tensor.


Step 1100: Loss 0.0000


NaN or Inf found in input tensor.


Step 1150: Loss 0.0000


NaN or Inf found in input tensor.


Step 1200: Loss 0.0000


NaN or Inf found in input tensor.


Step 1250: Loss 0.0000


NaN or Inf found in input tensor.


Step 1300: Loss 0.0000


NaN or Inf found in input tensor.


Step 1350: Loss 0.0000


NaN or Inf found in input tensor.


Step 1400: Loss 0.0000


NaN or Inf found in input tensor.


Step 1450: Loss 0.0000


NaN or Inf found in input tensor.


Step 1500: Loss 0.0000


NaN or Inf found in input tensor.


Step 1550: Loss 0.0000


NaN or Inf found in input tensor.


Step 1600: Loss 0.0000


NaN or Inf found in input tensor.


Step 1650: Loss 0.0000


NaN or Inf found in input tensor.


Step 1700: Loss 0.0000


NaN or Inf found in input tensor.


Step 1750: Loss 0.0000


NaN or Inf found in input tensor.


Step 1800: Loss 0.0000


NaN or Inf found in input tensor.


Step 1850: Loss 0.0000


NaN or Inf found in input tensor.


Step 1900: Loss 0.0000


NaN or Inf found in input tensor.


Step 1950: Loss 0.0000


NaN or Inf found in input tensor.


Step 2000: Loss 0.0000


NaN or Inf found in input tensor.


Step 2050: Loss 0.0000


NaN or Inf found in input tensor.


Step 2100: Loss 0.0000


NaN or Inf found in input tensor.


Step 2150: Loss 0.0000


NaN or Inf found in input tensor.


Step 2200: Loss 0.0000


NaN or Inf found in input tensor.


Step 2250: Loss 0.0000


NaN or Inf found in input tensor.


Step 2300: Loss 0.0000


NaN or Inf found in input tensor.


Step 2350: Loss 0.0000


NaN or Inf found in input tensor.


Step 2400: Loss 0.0000


NaN or Inf found in input tensor.


Step 2450: Loss 0.0000


NaN or Inf found in input tensor.


Step 2500: Loss 0.0000


NaN or Inf found in input tensor.


Step 2550: Loss 0.0000


NaN or Inf found in input tensor.


Step 2600: Loss 0.0000


NaN or Inf found in input tensor.


Step 2650: Loss 0.0000


NaN or Inf found in input tensor.


Step 2700: Loss 0.0000


NaN or Inf found in input tensor.


Step 2750: Loss 0.0000


NaN or Inf found in input tensor.


Step 2800: Loss 0.0000


NaN or Inf found in input tensor.


Step 2850: Loss 0.0000


NaN or Inf found in input tensor.


Step 2900: Loss 0.0000


NaN or Inf found in input tensor.


Step 2950: Loss 0.0000


NaN or Inf found in input tensor.


Step 3000: Loss 0.0000


NaN or Inf found in input tensor.


Step 3050: Loss 0.0000


NaN or Inf found in input tensor.


Step 3100: Loss 0.0000


NaN or Inf found in input tensor.


Step 3150: Loss 0.0000


NaN or Inf found in input tensor.


Step 3200: Loss 0.0000


NaN or Inf found in input tensor.


Step 3250: Loss 0.0000


NaN or Inf found in input tensor.


Step 3300: Loss 0.0000


NaN or Inf found in input tensor.


Step 3350: Loss 0.0000


NaN or Inf found in input tensor.


Step 3400: Loss 0.0000


NaN or Inf found in input tensor.


Step 3450: Loss 0.0000


NaN or Inf found in input tensor.


Step 3500: Loss 0.0000


NaN or Inf found in input tensor.


Step 3550: Loss 0.0000


NaN or Inf found in input tensor.


Step 3600: Loss 0.0000


NaN or Inf found in input tensor.


Step 3650: Loss 0.0000


NaN or Inf found in input tensor.


Step 3700: Loss 0.0000


NaN or Inf found in input tensor.


Step 3750: Loss 0.0000


NaN or Inf found in input tensor.


Step 3800: Loss 0.0000


NaN or Inf found in input tensor.


Step 3850: Loss 0.0000


NaN or Inf found in input tensor.


Step 3900: Loss 0.0000


NaN or Inf found in input tensor.


Step 3950: Loss 0.0000


NaN or Inf found in input tensor.


Step 4000: Loss 0.0000


NaN or Inf found in input tensor.


Step 4050: Loss 0.0000


NaN or Inf found in input tensor.


Step 4100: Loss 0.0000


NaN or Inf found in input tensor.


Step 4150: Loss 0.0000


NaN or Inf found in input tensor.


Step 4200: Loss 0.0000


NaN or Inf found in input tensor.


Step 4250: Loss 0.0000


NaN or Inf found in input tensor.


Step 4300: Loss 0.0000


NaN or Inf found in input tensor.


Step 4350: Loss 0.0000


NaN or Inf found in input tensor.


Step 4400: Loss 0.0000


NaN or Inf found in input tensor.


Step 4450: Loss 0.0000


NaN or Inf found in input tensor.


Step 4500: Loss 0.0000


NaN or Inf found in input tensor.


Step 4550: Loss 0.0000


NaN or Inf found in input tensor.


Step 4600: Loss 0.0000


NaN or Inf found in input tensor.


Step 4650: Loss 0.0000


NaN or Inf found in input tensor.


Step 4700: Loss 0.0000


NaN or Inf found in input tensor.


Step 4750: Loss 0.0000


NaN or Inf found in input tensor.


Step 4800: Loss 0.0000


NaN or Inf found in input tensor.


Step 4850: Loss 0.0000


NaN or Inf found in input tensor.


Step 4900: Loss 0.0000


NaN or Inf found in input tensor.


Step 4950: Loss 0.0000


NaN or Inf found in input tensor.


Step 5000: Loss 0.0000


NaN or Inf found in input tensor.


Step 5050: Loss 0.0000


NaN or Inf found in input tensor.


Step 5100: Loss 0.0000


NaN or Inf found in input tensor.


Step 5150: Loss 0.0000


NaN or Inf found in input tensor.


Step 5200: Loss 0.0000


NaN or Inf found in input tensor.


Step 5250: Loss 0.0000


NaN or Inf found in input tensor.


Step 5300: Loss 0.0000


NaN or Inf found in input tensor.


Step 5350: Loss 0.0000


NaN or Inf found in input tensor.


Step 5400: Loss 0.0000


NaN or Inf found in input tensor.


Step 5450: Loss 0.0000


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Av

Validation Loss: nan
Exact Match: 0.48%
F1 Score: 0.48%

⏳ Starting epoch 1.0/3


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Av

Step 5500: Loss 0.0000


NaN or Inf found in input tensor.


Step 5550: Loss 0.0000


NaN or Inf found in input tensor.


Step 5600: Loss 0.0000


NaN or Inf found in input tensor.


Step 5650: Loss 0.0000


NaN or Inf found in input tensor.


Step 5700: Loss 0.0000


NaN or Inf found in input tensor.


Step 5750: Loss 0.0000


NaN or Inf found in input tensor.


Step 5800: Loss 0.0000


NaN or Inf found in input tensor.


Step 5850: Loss 0.0000


NaN or Inf found in input tensor.


Step 5900: Loss 0.0000


NaN or Inf found in input tensor.


Step 5950: Loss 0.0000


NaN or Inf found in input tensor.


Step 6000: Loss 0.0000


NaN or Inf found in input tensor.


Step 6050: Loss 0.0000


NaN or Inf found in input tensor.


Step 6100: Loss 0.0000


NaN or Inf found in input tensor.


Step 6150: Loss 0.0000


NaN or Inf found in input tensor.


Step 6200: Loss 0.0000


NaN or Inf found in input tensor.


Step 6250: Loss 0.0000


NaN or Inf found in input tensor.


Step 6300: Loss 0.0000


NaN or Inf found in input tensor.


Step 6350: Loss 0.0000


NaN or Inf found in input tensor.


Step 6400: Loss 0.0000


NaN or Inf found in input tensor.


Step 6450: Loss 0.0000


NaN or Inf found in input tensor.


Step 6500: Loss 0.0000


NaN or Inf found in input tensor.


Step 6550: Loss 0.0000


NaN or Inf found in input tensor.


Step 6600: Loss 0.0000


NaN or Inf found in input tensor.


Step 6650: Loss 0.0000


NaN or Inf found in input tensor.


Step 6700: Loss 0.0000


NaN or Inf found in input tensor.


Step 6750: Loss 0.0000


NaN or Inf found in input tensor.


Step 6800: Loss 0.0000


NaN or Inf found in input tensor.


Step 6850: Loss 0.0000


NaN or Inf found in input tensor.


Step 6900: Loss 0.0000


NaN or Inf found in input tensor.


Step 6950: Loss 0.0000


NaN or Inf found in input tensor.


Step 7000: Loss 0.0000


NaN or Inf found in input tensor.


Step 7050: Loss 0.0000


NaN or Inf found in input tensor.


Step 7100: Loss 0.0000


NaN or Inf found in input tensor.


Step 7150: Loss 0.0000


NaN or Inf found in input tensor.


Step 7200: Loss 0.0000


NaN or Inf found in input tensor.


Step 7250: Loss 0.0000


NaN or Inf found in input tensor.


Step 7300: Loss 0.0000


NaN or Inf found in input tensor.


Step 7350: Loss 0.0000


NaN or Inf found in input tensor.


Step 7400: Loss 0.0000


NaN or Inf found in input tensor.


Step 7450: Loss 0.0000


NaN or Inf found in input tensor.


Step 7500: Loss 0.0000


NaN or Inf found in input tensor.


Step 7550: Loss 0.0000


NaN or Inf found in input tensor.


Step 7600: Loss 0.0000


NaN or Inf found in input tensor.


Step 7650: Loss 0.0000


NaN or Inf found in input tensor.


Step 7700: Loss 0.0000


NaN or Inf found in input tensor.


Step 7750: Loss 0.0000


NaN or Inf found in input tensor.


Step 7800: Loss 0.0000


NaN or Inf found in input tensor.


Step 7850: Loss 0.0000


NaN or Inf found in input tensor.


Step 7900: Loss 0.0000


NaN or Inf found in input tensor.


Step 7950: Loss 0.0000


NaN or Inf found in input tensor.


Step 8000: Loss 0.0000


NaN or Inf found in input tensor.


Step 8050: Loss 0.0000


NaN or Inf found in input tensor.


Step 8100: Loss 0.0000


NaN or Inf found in input tensor.


Step 8150: Loss 0.0000


NaN or Inf found in input tensor.


Step 8200: Loss 0.0000


NaN or Inf found in input tensor.


Step 8250: Loss 0.0000


NaN or Inf found in input tensor.


Step 8300: Loss 0.0000


NaN or Inf found in input tensor.


Step 8350: Loss 0.0000


NaN or Inf found in input tensor.


Step 8400: Loss 0.0000


NaN or Inf found in input tensor.


Step 8450: Loss 0.0000


NaN or Inf found in input tensor.


Step 8500: Loss 0.0000


NaN or Inf found in input tensor.


Step 8550: Loss 0.0000


NaN or Inf found in input tensor.


Step 8600: Loss 0.0000


NaN or Inf found in input tensor.


Step 8650: Loss 0.0000


NaN or Inf found in input tensor.


Step 8700: Loss 0.0000


NaN or Inf found in input tensor.


Step 8750: Loss 0.0000


NaN or Inf found in input tensor.


Step 8800: Loss 0.0000


NaN or Inf found in input tensor.


Step 8850: Loss 0.0000


NaN or Inf found in input tensor.


Step 8900: Loss 0.0000


NaN or Inf found in input tensor.


Step 8950: Loss 0.0000


NaN or Inf found in input tensor.


Step 9000: Loss 0.0000


NaN or Inf found in input tensor.


Step 9050: Loss 0.0000


NaN or Inf found in input tensor.


Step 9100: Loss 0.0000


NaN or Inf found in input tensor.


Step 9150: Loss 0.0000


NaN or Inf found in input tensor.


Step 9200: Loss 0.0000


NaN or Inf found in input tensor.


Step 9250: Loss 0.0000


NaN or Inf found in input tensor.


Step 9300: Loss 0.0000


NaN or Inf found in input tensor.


Step 9350: Loss 0.0000


NaN or Inf found in input tensor.


Step 9400: Loss 0.0000


NaN or Inf found in input tensor.


Step 9450: Loss 0.0000


NaN or Inf found in input tensor.


Step 9500: Loss 0.0000


NaN or Inf found in input tensor.


Step 9550: Loss 0.0000


NaN or Inf found in input tensor.


Step 9600: Loss 0.0000


NaN or Inf found in input tensor.


Step 9650: Loss 0.0000


NaN or Inf found in input tensor.


Step 9700: Loss 0.0000


NaN or Inf found in input tensor.


Step 9750: Loss 0.0000


NaN or Inf found in input tensor.


Step 9800: Loss 0.0000


NaN or Inf found in input tensor.


Step 9850: Loss 0.0000


NaN or Inf found in input tensor.


Step 9900: Loss 0.0000


NaN or Inf found in input tensor.


Step 9950: Loss 0.0000


NaN or Inf found in input tensor.


Step 10000: Loss 0.0000


NaN or Inf found in input tensor.


Step 10050: Loss 0.0000


NaN or Inf found in input tensor.


Step 10100: Loss 0.0000


NaN or Inf found in input tensor.


Step 10150: Loss 0.0000


NaN or Inf found in input tensor.


Step 10200: Loss 0.0000


NaN or Inf found in input tensor.


Step 10250: Loss 0.0000


NaN or Inf found in input tensor.


Step 10300: Loss 0.0000


NaN or Inf found in input tensor.


Step 10350: Loss 0.0000


NaN or Inf found in input tensor.


Step 10400: Loss 0.0000


NaN or Inf found in input tensor.


Step 10450: Loss 0.0000


NaN or Inf found in input tensor.


Step 10500: Loss 0.0000


NaN or Inf found in input tensor.


Step 10550: Loss 0.0000


NaN or Inf found in input tensor.


Step 10600: Loss 0.0000


NaN or Inf found in input tensor.


Step 10650: Loss 0.0000


NaN or Inf found in input tensor.


Step 10700: Loss 0.0000


NaN or Inf found in input tensor.


Step 10750: Loss 0.0000


NaN or Inf found in input tensor.


Step 10800: Loss 0.0000


NaN or Inf found in input tensor.


Step 10850: Loss 0.0000


NaN or Inf found in input tensor.


Step 10900: Loss 0.0000


NaN or Inf found in input tensor.


Step 10950: Loss 0.0000


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Av

Validation Loss: nan
Exact Match: 0.48%
F1 Score: 0.48%

⏳ Starting epoch 2.0/3


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Av

Step 11000: Loss 0.0000


NaN or Inf found in input tensor.


Step 11050: Loss 0.0000


NaN or Inf found in input tensor.


Step 11100: Loss 0.0000


NaN or Inf found in input tensor.


Step 11150: Loss 0.0000


NaN or Inf found in input tensor.


Step 11200: Loss 0.0000


NaN or Inf found in input tensor.


Step 11250: Loss 0.0000


NaN or Inf found in input tensor.


Step 11300: Loss 0.0000


NaN or Inf found in input tensor.


Step 11350: Loss 0.0000


NaN or Inf found in input tensor.


Step 11400: Loss 0.0000


NaN or Inf found in input tensor.


Step 11450: Loss 0.0000


NaN or Inf found in input tensor.


Step 11500: Loss 0.0000


NaN or Inf found in input tensor.


Step 11550: Loss 0.0000


NaN or Inf found in input tensor.


Step 11600: Loss 0.0000


NaN or Inf found in input tensor.


Step 11650: Loss 0.0000


NaN or Inf found in input tensor.


Step 11700: Loss 0.0000


NaN or Inf found in input tensor.


Step 11750: Loss 0.0000


NaN or Inf found in input tensor.


Step 11800: Loss 0.0000


NaN or Inf found in input tensor.


Step 11850: Loss 0.0000


NaN or Inf found in input tensor.


Step 11900: Loss 0.0000


NaN or Inf found in input tensor.


Step 11950: Loss 0.0000


NaN or Inf found in input tensor.


Step 12000: Loss 0.0000


NaN or Inf found in input tensor.


Step 12050: Loss 0.0000


NaN or Inf found in input tensor.


Step 12100: Loss 0.0000


NaN or Inf found in input tensor.


Step 12150: Loss 0.0000


NaN or Inf found in input tensor.


Step 12200: Loss 0.0000


NaN or Inf found in input tensor.


Step 12250: Loss 0.0000


NaN or Inf found in input tensor.


Step 12300: Loss 0.0000


NaN or Inf found in input tensor.


Step 12350: Loss 0.0000


NaN or Inf found in input tensor.


Step 12400: Loss 0.0000


NaN or Inf found in input tensor.


Step 12450: Loss 0.0000


NaN or Inf found in input tensor.


Step 12500: Loss 0.0000


NaN or Inf found in input tensor.


Step 12550: Loss 0.0000


NaN or Inf found in input tensor.


Step 12600: Loss 0.0000


NaN or Inf found in input tensor.


Step 12650: Loss 0.0000


NaN or Inf found in input tensor.


Step 12700: Loss 0.0000


NaN or Inf found in input tensor.


Step 12750: Loss 0.0000


NaN or Inf found in input tensor.


Step 12800: Loss 0.0000


NaN or Inf found in input tensor.


Step 12850: Loss 0.0000


NaN or Inf found in input tensor.


Step 12900: Loss 0.0000


NaN or Inf found in input tensor.


Step 12950: Loss 0.0000


NaN or Inf found in input tensor.


Step 13000: Loss 0.0000


NaN or Inf found in input tensor.


Step 13050: Loss 0.0000


NaN or Inf found in input tensor.


Step 13100: Loss 0.0000


NaN or Inf found in input tensor.


Step 13150: Loss 0.0000


NaN or Inf found in input tensor.


Step 13200: Loss 0.0000


NaN or Inf found in input tensor.


Step 13250: Loss 0.0000


NaN or Inf found in input tensor.


Step 13300: Loss 0.0000


NaN or Inf found in input tensor.


Step 13350: Loss 0.0000


NaN or Inf found in input tensor.


Step 13400: Loss 0.0000


NaN or Inf found in input tensor.


Step 13450: Loss 0.0000


NaN or Inf found in input tensor.


Step 13500: Loss 0.0000


NaN or Inf found in input tensor.


Step 13550: Loss 0.0000


NaN or Inf found in input tensor.


Step 13600: Loss 0.0000


NaN or Inf found in input tensor.


Step 13650: Loss 0.0000


NaN or Inf found in input tensor.


Step 13700: Loss 0.0000


NaN or Inf found in input tensor.


Step 13750: Loss 0.0000


NaN or Inf found in input tensor.


Step 13800: Loss 0.0000


NaN or Inf found in input tensor.


Step 13850: Loss 0.0000


NaN or Inf found in input tensor.


Step 13900: Loss 0.0000


NaN or Inf found in input tensor.


Step 13950: Loss 0.0000


NaN or Inf found in input tensor.


Step 14000: Loss 0.0000


NaN or Inf found in input tensor.


Step 14050: Loss 0.0000


NaN or Inf found in input tensor.


Step 14100: Loss 0.0000


NaN or Inf found in input tensor.


Step 14150: Loss 0.0000


NaN or Inf found in input tensor.


Step 14200: Loss 0.0000


NaN or Inf found in input tensor.


Step 14250: Loss 0.0000


NaN or Inf found in input tensor.


Step 14300: Loss 0.0000


NaN or Inf found in input tensor.


Step 14350: Loss 0.0000


NaN or Inf found in input tensor.


Step 14400: Loss 0.0000


NaN or Inf found in input tensor.


Step 14450: Loss 0.0000


NaN or Inf found in input tensor.


Step 14500: Loss 0.0000


NaN or Inf found in input tensor.


Step 14550: Loss 0.0000


NaN or Inf found in input tensor.


Step 14600: Loss 0.0000


NaN or Inf found in input tensor.


Step 14650: Loss 0.0000


NaN or Inf found in input tensor.


Step 14700: Loss 0.0000


NaN or Inf found in input tensor.


Step 14750: Loss 0.0000


NaN or Inf found in input tensor.


Step 14800: Loss 0.0000


NaN or Inf found in input tensor.


Step 14850: Loss 0.0000


NaN or Inf found in input tensor.


Step 14900: Loss 0.0000


NaN or Inf found in input tensor.


Step 14950: Loss 0.0000


NaN or Inf found in input tensor.


Step 15000: Loss 0.0000


NaN or Inf found in input tensor.


Step 15050: Loss 0.0000


NaN or Inf found in input tensor.


Step 15100: Loss 0.0000


NaN or Inf found in input tensor.


Step 15150: Loss 0.0000


NaN or Inf found in input tensor.


Step 15200: Loss 0.0000


NaN or Inf found in input tensor.


Step 15250: Loss 0.0000


NaN or Inf found in input tensor.


Step 15300: Loss 0.0000


NaN or Inf found in input tensor.


Step 15350: Loss 0.0000


NaN or Inf found in input tensor.


Step 15400: Loss 0.0000


NaN or Inf found in input tensor.


Step 15450: Loss 0.0000


NaN or Inf found in input tensor.


Step 15500: Loss 0.0000


NaN or Inf found in input tensor.


Step 15550: Loss 0.0000


NaN or Inf found in input tensor.


Step 15600: Loss 0.0000


NaN or Inf found in input tensor.


Step 15650: Loss 0.0000


NaN or Inf found in input tensor.


Step 15700: Loss 0.0000


NaN or Inf found in input tensor.


Step 15750: Loss 0.0000


NaN or Inf found in input tensor.


Step 15800: Loss 0.0000


NaN or Inf found in input tensor.


Step 15850: Loss 0.0000


NaN or Inf found in input tensor.


Step 15900: Loss 0.0000


NaN or Inf found in input tensor.


Step 15950: Loss 0.0000


NaN or Inf found in input tensor.


Step 16000: Loss 0.0000


NaN or Inf found in input tensor.


Step 16050: Loss 0.0000


NaN or Inf found in input tensor.


Step 16100: Loss 0.0000


NaN or Inf found in input tensor.


Step 16150: Loss 0.0000


NaN or Inf found in input tensor.


Step 16200: Loss 0.0000


NaN or Inf found in input tensor.


Step 16250: Loss 0.0000


NaN or Inf found in input tensor.


Step 16300: Loss 0.0000


NaN or Inf found in input tensor.


Step 16350: Loss 0.0000


NaN or Inf found in input tensor.


Step 16400: Loss 0.0000


NaN or Inf found in input tensor.


Step 16450: Loss 0.0000


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Av

Validation Loss: nan
Exact Match: 0.48%
F1 Score: 0.48%


('./DeepSeek-R1-Distill-Qwen-1.5B-trained/tokenizer_config.json',
 './DeepSeek-R1-Distill-Qwen-1.5B-trained/special_tokens_map.json',
 './DeepSeek-R1-Distill-Qwen-1.5B-trained/tokenizer.json')

In [6]:

print("\nRe-loading optimized teacher model")
teacher_model = AutoModelForQuestionAnswering.from_pretrained(".//DeepSeek-R1-Distill-Qwen-1.5B-trained")
teacher_tokenizer = AutoTokenizer.from_pretrained(".//DeepSeek-R1-Distill-Qwen-1.5B-trained")


Re-loading optimized teacher model


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [7]:
# Define evaluation function
def evaluate_model(model, tokenizer, dataset):
    model.to(device)
    
    metric = evaluate.load("squad")
    predictions = []
    references = []

    for example in tqdm(dataset, desc="Evaluating"):
        # Tokenize inputs
        inputs = tokenizer(
            example["context"], example["question"], truncation=True, padding=True, return_tensors="pt"
        )

        # Move inputs to the same device as the model
        inputs = {key: value.to(device) for key, value in inputs.items()}
        
        # Get model outputs
        outputs = model(**inputs)
        start_logits, end_logits = outputs.start_logits, outputs.end_logits
        start_idx = torch.argmax(start_logits, dim=-1).item()
        end_idx = torch.argmax(end_logits, dim=-1).item()
        
        # Decode prediction
        prediction = tokenizer.decode(inputs["input_ids"][0][start_idx:end_idx + 1])
        
        # Append to predictions
        predictions.append({
            "id": example["id"],
            "prediction_text": prediction
        })

        # Append to references (ground truth)
        references.append({
            "id": example["id"],
            "answers": example["answers"]
        })

    # Compute metrics
    result = metric.compute(predictions=predictions, references=references)
    print(f"Exact Match: {result['exact_match']:.2f}%")
    print(f"F1 Score: {result['f1']:.2f}%\n")
    
    return result
    

In [8]:
def preprocess_validation_data(example):
    # Tokenize context and question
    inputs = student_tokenizer(
        example["context"],
        example["question"],
        truncation=True,
        padding="max_length",
        max_length=384,
    )
    return inputs

In [11]:
# Evaluate teacher model on validation set
import evaluate
from tqdm import tqdm

validation_dataset = squad["validation"]

#if debugging: 
#    validation_set = validation_dataset.select(range(5000))
#    validation_dataset = validation_set

print("Teacher Model Evaluation")
result = evaluate_model(teacher_model, teacher_tokenizer, validation_dataset)

Teacher Model Evaluation


Evaluating: 100%|██████████| 10570/10570 [04:18<00:00, 40.88it/s]


Exact Match: 0.00%
F1 Score: 0.00%

