In [None]:
!pip install transformers datasets torch torchvision torchaudio
!pip install scikit-learn numpy pandas tqdm
!pip install accelerate

In [2]:
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import DataLoader
from transformers import (
    AutoTokenizer, AutoModelForSequenceClassification,
    TrainingArguments, Trainer, DataCollatorWithPadding
)
from datasets import load_dataset
import numpy as np
import pandas as pd
from sklearn.metrics import accuracy_score, classification_report
from tqdm import tqdm
import gc
import warnings
warnings.filterwarnings('ignore')

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [3]:
# Memory-optimized configuration for T4 GPU
CONFIG = {
    # Model names
    'teacher_model': 'bert-base-uncased',
    'student_model': 'distilbert-base-uncased',

    # Memory-optimized settings
    'max_length': 128,  # Reduced sequence length
    'batch_size': 16,   # Moderate batch size for T4
    'gradient_accumulation_steps': 2,
    'learning_rate': 2e-5,
    'teacher_lr': 2e-5,
    'num_epochs': 3,
    'warmup_steps': 500,

    # Distillation parameters
    'temperature': 4.0,
    'alpha': 0.7,  # Weight for distillation loss
    'beta': 0.3,   # Weight for task loss

    # Data settings
    'dataset_name': 'sst2',
    'num_labels': 2,
    'max_train_samples': 5000,  # Reduced for faster training
    'max_eval_samples': 1000,

    # Output settings
    'teacher_output_dir': './teacher_model',
    'student_output_dir': './distilled_student_model',
    'logging_steps': 100,
    'save_steps': 500,
}


In [4]:
dataset = load_dataset('glue', 'sst2')

README.md: 0.00B [00:00, ?B/s]

sst2/train-00000-of-00001.parquet:   0%|          | 0.00/3.11M [00:00<?, ?B/s]

sst2/validation-00000-of-00001.parquet:   0%|          | 0.00/72.8k [00:00<?, ?B/s]

sst2/test-00000-of-00001.parquet:   0%|          | 0.00/148k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/67349 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/872 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/1821 [00:00<?, ? examples/s]

In [5]:
train_dataset = dataset['train'].select(range(min(CONFIG['max_train_samples'], len(dataset['train']))))
eval_dataset = dataset['validation'].select(range(min(CONFIG['max_eval_samples'], len(dataset['validation']))))

In [6]:
tokenizer = AutoTokenizer.from_pretrained(CONFIG['teacher_model'])

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

In [7]:
def preprocess_function(examples):
    """Tokenize the examples"""
    return tokenizer(
        examples['sentence'],
        truncation=True,
        padding=True,
        max_length=CONFIG['max_length'],
        return_tensors='pt'
    )

In [8]:
tokenized_train = train_dataset.map(
    preprocess_function,
    batched=True,
    remove_columns=['sentence', 'idx']
)

tokenized_eval = eval_dataset.map(
    preprocess_function,
    batched=True,
    remove_columns=['sentence', 'idx']
)

Map:   0%|          | 0/5000 [00:00<?, ? examples/s]

Map:   0%|          | 0/872 [00:00<?, ? examples/s]

In [9]:
teacher_model = AutoModelForSequenceClassification.from_pretrained(
    CONFIG['teacher_model'],
    num_labels=CONFIG['num_labels'],
    torch_dtype=torch.float32
)

`torch_dtype` is deprecated! Use `dtype` instead!


model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [10]:
teacher_model.to(device)

BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e

In [15]:
# Training arguments for teacher
teacher_training_args = TrainingArguments(
    output_dir=CONFIG['teacher_output_dir'],
    num_train_epochs=CONFIG['num_epochs'],
    per_device_train_batch_size=CONFIG['batch_size'],
    per_device_eval_batch_size=CONFIG['batch_size'],
    gradient_accumulation_steps=CONFIG['gradient_accumulation_steps'],
    warmup_steps=CONFIG['warmup_steps'],
    learning_rate=CONFIG['teacher_lr'],
    logging_steps=CONFIG['logging_steps'],
    save_steps=CONFIG['save_steps'],
    eval_strategy="steps",  # Fixed: changed from evaluation_strategy
    eval_steps=CONFIG['save_steps'],
    save_total_limit=2,
    load_best_model_at_end=True,
    metric_for_best_model="accuracy",
    fp16=False,  # Disable for stability
    dataloader_num_workers=0,
    remove_unused_columns=False,
    report_to=[],  # Disable wandb/tensorboard
)

In [16]:
def compute_metrics(eval_pred):
    """Compute accuracy for evaluation"""
    predictions, labels = eval_pred
    predictions = np.argmax(predictions, axis=1)
    accuracy = accuracy_score(labels, predictions)
    return {'accuracy': accuracy}

In [17]:
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

In [18]:
teacher_trainer = Trainer(
    model=teacher_model,
    args=teacher_training_args,
    train_dataset=tokenized_train,
    eval_dataset=tokenized_eval,
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

In [19]:
torch.cuda.empty_cache()
gc.collect()

try:
    teacher_result = teacher_trainer.train()
    print("\n✅ Teacher training completed successfully!")
    print(f"Final training loss: {teacher_result.training_loss:.4f}")

    # Evaluate teacher model
    teacher_eval_result = teacher_trainer.evaluate()
    print(f"Teacher accuracy: {teacher_eval_result['eval_accuracy']:.4f}")

    # Save teacher model
    teacher_trainer.save_model()
    print(f"Teacher model saved to: {CONFIG['teacher_output_dir']}")

except Exception as e:
    print(f"❌ Teacher training failed: {e}")
    raise e

Step,Training Loss,Validation Loss



✅ Teacher training completed successfully!
Final training loss: 0.3990


Teacher accuracy: 0.9106
Teacher model saved to: ./teacher_model


In [20]:
student_model = AutoModelForSequenceClassification.from_pretrained(
    CONFIG['student_model'],
    num_labels=CONFIG['num_labels'],
    torch_dtype=torch.float32
)

config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [21]:
student_model.to(device)

DistilBertForSequenceClassification(
  (distilbert): DistilBertModel(
    (embeddings): Embeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (transformer): Transformer(
      (layer): ModuleList(
        (0-5): 6 x TransformerBlock(
          (attention): DistilBertSdpaAttention(
            (dropout): Dropout(p=0.1, inplace=False)
            (q_lin): Linear(in_features=768, out_features=768, bias=True)
            (k_lin): Linear(in_features=768, out_features=768, bias=True)
            (v_lin): Linear(in_features=768, out_features=768, bias=True)
            (out_lin): Linear(in_features=768, out_features=768, bias=True)
          )
          (sa_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
          (ffn): FFN(
            (dropout): Dropout(p=0.1, inplace=False)


In [22]:
# Knowledge distillation loss function (no classes, pure functions)
def distillation_loss(student_logits, teacher_logits, labels, temperature, alpha, beta):
    """
    Compute knowledge distillation loss

    Args:
        student_logits: Logits from student model
        teacher_logits: Logits from teacher model
        labels: Ground truth labels
        temperature: Temperature for softmax
        alpha: Weight for distillation loss
        beta: Weight for task loss
    """
    # Task loss (standard cross-entropy)
    task_loss = F.cross_entropy(student_logits, labels)

    # Distillation loss (KL divergence with temperature scaling)
    student_soft = F.log_softmax(student_logits / temperature, dim=-1)
    teacher_soft = F.softmax(teacher_logits / temperature, dim=-1)

    kd_loss = F.kl_div(student_soft, teacher_soft, reduction='batchmean') * (temperature ** 2)

    # Combined loss
    total_loss = alpha * kd_loss + beta * task_loss

    return {
        'total_loss': total_loss,
        'task_loss': task_loss,
        'distillation_loss': kd_loss
    }

In [27]:
# Custom trainer for knowledge distillation (no classes, inheritance-based approach)
class DistillationTrainer(Trainer):
    def __init__(self, teacher_model, temperature, alpha, beta, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self.teacher_model = teacher_model
        self.temperature = temperature
        self.alpha = alpha
        self.beta = beta

        # Set teacher to evaluation mode
        self.teacher_model.eval()
        for param in self.teacher_model.parameters():
            param.requires_grad = False

    def compute_loss(self, model, inputs, return_outputs=False, **kwargs):
        """Custom loss computation with knowledge distillation"""
        labels = inputs.get("labels")

        # Filter inputs for student model (DistilBERT doesn't use token_type_ids)
        student_inputs = {k: v for k, v in inputs.items() if k != 'token_type_ids'}

        # Student forward pass
        student_outputs = model(**student_inputs)
        student_logits = student_outputs.logits

        # Teacher forward pass (no gradients) - BERT can use all inputs
        with torch.no_grad():
            teacher_outputs = self.teacher_model(**inputs)
            teacher_logits = teacher_outputs.logits

        # Compute distillation loss
        loss_dict = distillation_loss(
            student_logits=student_logits,
            teacher_logits=teacher_logits,
            labels=labels,
            temperature=self.temperature,
            alpha=self.alpha,
            beta=self.beta
        )

        # Log individual losses
        if self.state.log_history:
            try:
                self.log({
                    "task_loss": loss_dict['task_loss'].item(),
                    "distillation_loss": loss_dict['distillation_loss'].item(),
                })
            except:
                pass

        return (loss_dict['total_loss'], student_outputs) if return_outputs else loss_dict['total_loss']

In [28]:
# Training arguments for student (distillation)
distillation_training_args = TrainingArguments(
    output_dir=CONFIG['student_output_dir'],
    num_train_epochs=CONFIG['num_epochs'],
    per_device_train_batch_size=CONFIG['batch_size'],
    per_device_eval_batch_size=CONFIG['batch_size'],
    gradient_accumulation_steps=CONFIG['gradient_accumulation_steps'],
    warmup_steps=CONFIG['warmup_steps'],
    learning_rate=CONFIG['learning_rate'],
    logging_steps=CONFIG['logging_steps'],
    save_steps=CONFIG['save_steps'],
    eval_strategy="steps",  # Fixed: changed from evaluation_strategy
    eval_steps=CONFIG['save_steps'],
    save_total_limit=2,
    load_best_model_at_end=True,
    metric_for_best_model="accuracy",
    fp16=False,  # Disable for stability
    dataloader_num_workers=0,
    remove_unused_columns=False,
    report_to=[],  # Disable external logging
)


In [29]:
# Create distillation trainer
distillation_trainer = DistillationTrainer(
    teacher_model=teacher_model,
    temperature=CONFIG['temperature'],
    alpha=CONFIG['alpha'],
    beta=CONFIG['beta'],
    model=student_model,
    args=distillation_training_args,
    train_dataset=tokenized_train,
    eval_dataset=tokenized_eval,
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

In [30]:
torch.cuda.empty_cache()
gc.collect()

try:
    student_result = distillation_trainer.train()
    print("\n✅ Knowledge distillation training completed successfully!")
    print(f"Final training loss: {student_result.training_loss:.4f}")

    # Evaluate student model
    student_eval_result = distillation_trainer.evaluate()
    print(f"Student accuracy: {student_eval_result['eval_accuracy']:.4f}")

    # Save student model
    distillation_trainer.save_model()
    print(f"Distilled student model saved to: {CONFIG['student_output_dir']}")

    # Performance comparison
    print(f"\n📊 Performance Comparison:")
    print(f"Teacher accuracy: {teacher_eval_result['eval_accuracy']:.4f}")
    print(f"Student accuracy: {student_eval_result['eval_accuracy']:.4f}")

    accuracy_retention = (student_eval_result['eval_accuracy'] / teacher_eval_result['eval_accuracy']) * 100
    print(f"Accuracy retention: {accuracy_retention:.1f}%")

except Exception as e:
    print(f"❌ Distillation training failed: {e}")
    raise e

Step,Training Loss,Validation Loss



✅ Knowledge distillation training completed successfully!
Final training loss: 0.6075


Student accuracy: 0.8876
Distilled student model saved to: ./distilled_student_model

📊 Performance Comparison:
Teacher accuracy: 0.9106
Student accuracy: 0.8876
Accuracy retention: 97.5%


In [31]:
# Fixed inference function (no repetition issues)
def predict_sentiment(text, model, tokenizer, max_length=128):
    """
    Predict sentiment for a given text (FIXED - no repetition issues)

    Args:
        text: Input text
        model: Trained model
        tokenizer: Tokenizer
        max_length: Maximum sequence length

    Returns:
        prediction: 'Positive' or 'Negative'
        confidence: Confidence score (0-1)
        logits: Raw model outputs
    """
    # Set model to evaluation mode
    model.eval()

    # Tokenize input (proper tokenization without issues)
    inputs = tokenizer(
        text,
        return_tensors='pt',
        truncation=True,
        padding=True,
        max_length=max_length
    )

    # Filter inputs based on model type (DistilBERT doesn't use token_type_ids)
    model_name = model.__class__.__name__
    if 'DistilBert' in model_name and 'token_type_ids' in inputs:
        del inputs['token_type_ids']

    # Move to device
    inputs = {k: v.to(device) for k, v in inputs.items()}

    # Predict
    with torch.no_grad():
        outputs = model(**inputs)
        logits = outputs.logits

        # Get probabilities
        probabilities = F.softmax(logits, dim=-1)

        # Get prediction
        predicted_label = torch.argmax(probabilities, dim=-1).item()
        confidence = torch.max(probabilities, dim=-1).values.item()

    # Convert to readable format
    prediction = "Positive" if predicted_label == 1 else "Negative"

    return prediction, confidence, logits.cpu().numpy()

In [32]:
# Test examples for inference
test_examples = [
    "This movie was absolutely fantastic and amazing!",
    "I hated this film, it was terrible and boring.",
    "The movie was okay, nothing special but watchable.",
    "Best movie I've ever seen in my entire life!",
    "Worst experience ever, completely disappointing.",
    "The acting was brilliant and the story was compelling.",
    "Poor direction and weak screenplay ruined it.",
    "A masterpiece of cinema with excellent performances."
]

In [33]:
print("🧑‍🏫 Testing Teacher Model (BERT) Predictions:")
print("=" * 60)

teacher_predictions = []
teacher_model.eval()

for i, text in enumerate(test_examples):
    prediction, confidence, logits = predict_sentiment(text, teacher_model, tokenizer)
    teacher_predictions.append((prediction, confidence))

    print(f"\nExample {i+1}: '{text}'")
    print(f"Prediction: {prediction} (confidence: {confidence:.4f})")

🧑‍🏫 Testing Teacher Model (BERT) Predictions:

Example 1: 'This movie was absolutely fantastic and amazing!'
Prediction: Positive (confidence: 0.9893)

Example 2: 'I hated this film, it was terrible and boring.'
Prediction: Negative (confidence: 0.9801)

Example 3: 'The movie was okay, nothing special but watchable.'
Prediction: Positive (confidence: 0.9700)

Example 4: 'Best movie I've ever seen in my entire life!'
Prediction: Positive (confidence: 0.9649)

Example 5: 'Worst experience ever, completely disappointing.'
Prediction: Negative (confidence: 0.9824)

Example 6: 'The acting was brilliant and the story was compelling.'
Prediction: Positive (confidence: 0.9894)

Example 7: 'Poor direction and weak screenplay ruined it.'
Prediction: Negative (confidence: 0.9799)

Example 8: 'A masterpiece of cinema with excellent performances.'
Prediction: Positive (confidence: 0.9897)


In [35]:
print("🎓 Testing Student Model (DistilBERT) Predictions:")
print("=" * 60)

student_predictions = []
student_model.eval()

for i, text in enumerate(test_examples):
    prediction, confidence, logits = predict_sentiment(text, student_model, tokenizer)
    student_predictions.append((prediction, confidence))

    print(f"\nExample {i+1}: '{text}'")
    print(f"Prediction: {prediction} (confidence: {confidence:.4f})")

🎓 Testing Student Model (DistilBERT) Predictions:

Example 1: 'This movie was absolutely fantastic and amazing!'
Prediction: Positive (confidence: 0.9850)

Example 2: 'I hated this film, it was terrible and boring.'
Prediction: Negative (confidence: 0.9820)

Example 3: 'The movie was okay, nothing special but watchable.'
Prediction: Positive (confidence: 0.9766)

Example 4: 'Best movie I've ever seen in my entire life!'
Prediction: Positive (confidence: 0.9655)

Example 5: 'Worst experience ever, completely disappointing.'
Prediction: Negative (confidence: 0.9856)

Example 6: 'The acting was brilliant and the story was compelling.'
Prediction: Positive (confidence: 0.9901)

Example 7: 'Poor direction and weak screenplay ruined it.'
Prediction: Negative (confidence: 0.9859)

Example 8: 'A masterpiece of cinema with excellent performances.'
Prediction: Positive (confidence: 0.9912)
