# Pre-trained Model Evaluation: DistilBERT on IMDB

This notebook evaluates the pre-trained DistilBERT model (without fine-tuning) on the IMDB sentiment classification task.

**Configuration:**
- Base Model: DistilBERT (pre-trained, no fine-tuning)
- Dataset: IMDB (Movie Reviews)
- Split Ratio: Train:Val:Test = 8:1:1
- Random Seed: 42

**Metrics Tracked:**
- Test Accuracy & F1
- Validation Accuracy & F1
- Inference time
- GPU memory used
- Total parameters

In [1]:
# Import necessary libraries
import torch
import numpy as np
import random
import time
import json
import warnings
import os
from datasets import load_dataset
from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    Trainer,
    TrainingArguments
)
from sklearn.metrics import accuracy_score, f1_score
import gc

# Filter out specific warnings
warnings.filterwarnings('ignore', category=UserWarning, module='torch.nn.parallel._functions')

# Check if GPU is available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")
if torch.cuda.is_available():
    print(f"GPU: {torch.cuda.get_device_name(0)}")
    print(f"Initial GPU Memory: {torch.cuda.memory_allocated(0) / 1024**2:.2f} MB")

  from .autonotebook import tqdm as notebook_tqdm


Using device: cuda
GPU: NVIDIA H200 MIG 1g.18gb
Initial GPU Memory: 0.00 MB


In [2]:
# Set random seed for reproducibility
def set_seed(seed=42):
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False

set_seed(42)
print("Random seed set to 42")

Random seed set to 42


In [3]:
# Load and prepare IMDB dataset
print("Loading IMDB dataset...")
dataset = load_dataset("imdb")

# Get full training data
train_data = dataset["train"]
total_samples = len(train_data)

# Calculate split sizes (8:1:1)
train_size = int(0.8 * total_samples)
val_size = int(0.1 * total_samples)
test_size = total_samples - train_size - val_size

# Split the dataset
train_dataset = train_data.select(range(train_size))
val_dataset = train_data.select(range(train_size, train_size + val_size))
test_dataset = train_data.select(range(train_size + val_size, total_samples))

print(f"Dataset split complete:")
print(f"  Train: {len(train_dataset)} samples")
print(f"  Validation: {len(val_dataset)} samples")
print(f"  Test: {len(test_dataset)} samples")

Loading SST-2 dataset...
Dataset split complete:
  Train: 53879 samples
  Validation: 6734 samples
  Test: 6736 samples
Dataset split complete:
  Train: 53879 samples
  Validation: 6734 samples
  Test: 6736 samples


In [5]:
# Tokenize the dataset
print("Loading tokenizer...")
tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")

def tokenize_function(examples):
    return tokenizer(examples["text"], padding="max_length", truncation=True, max_length=512)

print("Tokenizing datasets...")
tokenized_train = train_dataset.map(tokenize_function, batched=True)
tokenized_val = val_dataset.map(tokenize_function, batched=True)
tokenized_test = test_dataset.map(tokenize_function, batched=True)

print("Tokenization complete!")

Loading tokenizer...
Tokenizing datasets...
Tokenization complete!
Tokenizing datasets...
Tokenization complete!


In [6]:
# Define compute metrics function
def compute_metrics(eval_pred):
    predictions, labels = eval_pred
    predictions = np.argmax(predictions, axis=1)
    return {
        "accuracy": accuracy_score(labels, predictions),
        "f1": f1_score(labels, predictions, average="binary")
    }

In [7]:
# Function to evaluate pre-trained model (no training)
def evaluate_pretrained_model():
    print(f"\n{'='*80}")
    print(f"Evaluating Pre-trained DistilBERT Model")
    print(f"{'='*80}\n")
    
    # Reset seed
    set_seed(42)
    
    # Clear GPU memory
    if torch.cuda.is_available():
        torch.cuda.empty_cache()
        gc.collect()
    
    # Load pre-trained model (no fine-tuning)
    print("Loading pre-trained DistilBERT model...")
    model = AutoModelForSequenceClassification.from_pretrained(
        "distilbert-base-uncased",
        num_labels=2
    )
    
    # Print model parameters
    total_params = sum(p.numel() for p in model.parameters())
    print(f"\nModel Parameters:")
    print(f"  Total parameters: {total_params:,}\n")
    
    # Move model to device
    model.to(device)
    
    # Record GPU memory
    if torch.cuda.is_available():
        torch.cuda.reset_peak_memory_stats()
        initial_memory = torch.cuda.memory_allocated() / 1024**2
        print(f"GPU Memory after loading model: {initial_memory:.2f} MB")
    
    # Create trainer for evaluation only (no training)
    training_args = TrainingArguments(
        output_dir="./results_pretrained_imdb",
        per_device_eval_batch_size=16,
        seed=42,
        report_to="none",
    )
    
    trainer = Trainer(
        model=model,
        args=training_args,
        compute_metrics=compute_metrics,
    )
    
    # Evaluate on validation set
    print("\nEvaluating on validation set...")
    start_time = time.time()
    val_results = trainer.evaluate(tokenized_val)
    val_time = time.time() - start_time
    val_accuracy = val_results['eval_accuracy']
    val_f1 = val_results.get('eval_f1', 0.0)
    
    # Evaluate on test set
    print("Evaluating on test set...")
    start_time = time.time()
    test_results = trainer.evaluate(tokenized_test)
    test_time = time.time() - start_time
    test_accuracy = test_results['eval_accuracy']
    test_f1 = test_results.get('eval_f1', 0.0)
    
    # Get peak GPU memory
    if torch.cuda.is_available():
        peak_memory = torch.cuda.max_memory_allocated() / 1024**2
        print(f"\nPeak GPU Memory during evaluation: {peak_memory:.2f} MB")
    else:
        peak_memory = 0
    
    # Compile results
    results = {
        "model_type": "pre-trained (no fine-tuning)",
        "test_accuracy": test_accuracy,
        "test_f1": test_f1,
        "val_accuracy": val_accuracy,
        "val_f1": val_f1,
        "total_parameters": total_params,
        "test_inference_time": test_time,
        "val_inference_time": val_time,
        "peak_gpu_memory_mb": peak_memory,
    }
    
    print(f"\n{'='*80}")
    print(f"Pre-trained Model Results:")
    print(f"  Test Accuracy: {test_accuracy:.4f}")
    print(f"  Test F1: {test_f1:.4f}")
    print(f"  Validation Accuracy: {val_accuracy:.4f}")
    print(f"  Validation F1: {val_f1:.4f}")
    print(f"  Test Inference Time: {test_time:.2f}s")
    print(f"  Val Inference Time: {val_time:.2f}s")
    print(f"  Peak GPU Memory: {peak_memory:.2f} MB")
    print(f"{'='*80}\n")
    
    # Clean up
    del model
    del trainer
    if torch.cuda.is_available():
        torch.cuda.empty_cache()
    gc.collect()
    
    return results
    print(f"  Time to Convergence: {time_to_convergence:.2f}s")
    print(f"  Peak GPU Memory: {peak_memory:.2f} MB")
    print(f"{'='*80}\n")
    
    # Clean up
    del model
    del trainer
    if torch.cuda.is_available():
        torch.cuda.empty_cache()
    gc.collect()
    
    return results

In [8]:
# Evaluate pre-trained model (no training/fine-tuning)
print("Evaluating Pre-trained DistilBERT Model on IMDB...")
print(f"Random seed: 42\n")

results = evaluate_pretrained_model()

# Save results
with open("pretrained_imdb_results.json", "w") as f:
    json.dump(results, f, indent=2)

print("\n" + "="*80)
print("Evaluation completed!")
print("="*80)

Evaluating Pre-trained DistilBERT Model...
Random seed: 42


Evaluating Pre-trained DistilBERT Model



Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Loading pre-trained DistilBERT model...

Model Parameters:
  Total parameters: 66,955,010

GPU Memory after loading model: 256.50 MB

Evaluating on validation set...
GPU Memory after loading model: 256.50 MB

Evaluating on validation set...


Evaluating on test set...

Peak GPU Memory during evaluation: 331.82 MB

Pre-trained Model Results:
  Test Accuracy: 0.4210
  Test F1: 0.0714
  Validation Accuracy: 0.4253
  Validation F1: 0.0547
  Test Inference Time: 14.14s
  Val Inference Time: 14.27s
  Peak GPU Memory: 331.82 MB


Evaluation completed!

Peak GPU Memory during evaluation: 331.82 MB

Pre-trained Model Results:
  Test Accuracy: 0.4210
  Test F1: 0.0714
  Validation Accuracy: 0.4253
  Validation F1: 0.0547
  Test Inference Time: 14.14s
  Val Inference Time: 14.27s
  Peak GPU Memory: 331.82 MB


Evaluation completed!


In [10]:
# Display results summary
import pandas as pd

# Create summary
summary_data = {
    "Model Type": results["model_type"],
    "Test Accuracy": f"{results['test_accuracy']:.4f}",
    "Test F1": f"{results['test_f1']:.4f}",
    "Val Accuracy": f"{results['val_accuracy']:.4f}",
    "Val F1": f"{results['val_f1']:.4f}",
    "Total Params": f"{results['total_parameters']:,}",
    "Test Inference Time (s)": f"{results['test_inference_time']:.2f}",
    "Val Inference Time (s)": f"{results['val_inference_time']:.2f}",
    "Peak GPU Memory (MB)": f"{results['peak_gpu_memory_mb']:.2f}"
}

summary_df = pd.DataFrame([summary_data])
print("\n" + "="*80)
print("PRE-TRAINED MODEL SUMMARY")
print("="*80)
print(summary_df.to_string(index=False))

# Save summary to CSV
summary_df.to_csv("pretrained_imdb_summary.csv", index=False)
print("\n✓ Summary saved to 'pretrained_imdb_summary.csv'")
print("✓ Complete results saved to 'pretrained_imdb_results.json'")


PRE-TRAINED MODEL SUMMARY
                  Model Type Test Accuracy Test F1 Val Accuracy Val F1 Total Params Test Inference Time (s) Val Inference Time (s) Peak GPU Memory (MB)
pre-trained (no fine-tuning)        0.4210  0.0714       0.4253 0.0547   66,955,010                   14.14                  14.27               331.82

✓ Summary saved to 'pretrained_summary.csv'
✓ Complete results saved to 'pretrained_results.json'
