In [None]:
#Max's original validation code

import os
import time
import torch
import torch.nn as nn
import torchvision
import torchvision.transforms as transforms
from torch.utils.data import DataLoader
from transformers import AutoModelForImageClassification

# Load CIFAR-100 test dataset
transform = transforms.Compose([
    transforms.Resize((224, 224)),
    transforms.ToTensor(),
    transforms.Normalize((0.5071, 0.4867, 0.4408), (0.2675, 0.2565, 0.2761))
])

test_dataset = torchvision.datasets.CIFAR100(
    root='./data', train=False, download=True, transform=transform
)
test_loader = DataLoader(test_dataset, batch_size=64, shuffle=False, num_workers=2)

# Load the model
model_name = "jialicheng/cifar100-resnet-50"
model = AutoModelForImageClassification.from_pretrained(model_name)

# Move the model to the appropriate device
device = 'cuda' if torch.cuda.is_available() else 'cpu'
model.to(device)  # Move model to GPU if available
model.eval()  # Set the model to evaluation mode

criterion = nn.CrossEntropyLoss()

def test_model_performance(model, test_loader):
    total = 0
    correct = 0
    running_loss = 0.0
    inference_times = []

    model.eval()
    with torch.no_grad():
        for images, labels in test_loader:
            images, labels = images.to(device), labels.to(device)  

            start_time = time.time()
            outputs = model(images).logits
            end_time = time.time()

            inference_times.append(end_time - start_time)

            loss = criterion(outputs, labels)
            running_loss += loss.item()

            _, predicted = torch.max(outputs.data, 1)
            total += labels.size(0)
            correct += (predicted == labels).sum().item()

    accuracy = 100 * correct / total
    avg_loss = running_loss / len(test_loader)
    avg_inference_time = sum(inference_times) / len(inference_times)

    print(f'Accuracy: {accuracy:.2f}%, Average Loss: {avg_loss:.4f}')

    return accuracy, avg_loss, avg_inference_time

def get_model_size(model):
    temp_path = "temp.pth"
    torch.save(model.state_dict(), temp_path)
    size_in_mb = os.path.getsize(temp_path) / (1024 * 1024)
    os.remove(temp_path)  # Clean up the temporary file
    return size_in_mb

#def save_model(model, save_path):
    #"""Save the model's state dictionary to the specified path."""
    #torch.save(model.state_dict(), save_path)
    #print(f'Model saved to {save_path}')

print("Testing the compressed model...")
accuracy, avg_loss, avg_inference_time = test_model_performance(model, test_loader)

model_size = get_model_size(model)
print(f'Model Size: {model_size:.2f} MB')

# Example baseline values (replace with your actual baseline results)
baseline_accuracy = 75.00
baseline_inference_time = 0.08
baseline_model_size = 100.0

accuracy_drop = baseline_accuracy - accuracy
speed_improvement = (baseline_inference_time - avg_inference_time) / baseline_inference_time * 100
size_reduction = (baseline_model_size - model_size) / baseline_model_size * 100

print("\nPerformance Comparison:")
print(f'Accuracy Drop: {accuracy_drop:.2f}%')
print(f'Speed Improvement: {speed_improvement:.2f}%')
print(f'Size Reduction: {size_reduction:.2f}%')

# Save the model after testing
#save_path = "cifar100_resnet50_model.pth" 
#save_model(model, save_path)

In [1]:
import torch
import torch.nn as nn
import torch.optim as optim
from torchvision import models, datasets, transforms
from torch.utils.data import DataLoader
import timm
import os
import time
import torch
import torch.nn as nn
import torchvision
import torchvision.transforms as transforms
from torch.utils.data import DataLoader
from transformers import AutoModelForImageClassification

# ====================================================
# Dataset Preparation: CIFAR-100
# ====================================================
transform = transforms.Compose([
    transforms.Resize((224,224)),
    transforms.RandomHorizontalFlip(),
    transforms.ToTensor(),
    transforms.Normalize((0.5071, 0.4867, 0.4408), (0.2675, 0.2565, 0.2761)),
])

train_dataset = datasets.CIFAR100(root='./data', train=True, transform=transform, download=True)
test_dataset = datasets.CIFAR100(root='./data', train=False, transform=transform, download=True)

train_loader = DataLoader(train_dataset, batch_size=128, shuffle=True, num_workers=4)
test_loader = DataLoader(test_dataset, batch_size=128, shuffle=False, num_workers=4)

Files already downloaded and verified
Files already downloaded and verified


In [2]:
class DistillationLoss(nn.Module):
    def __init__(self, temperature=4.0, alpha=0.5):
        super(DistillationLoss, self).__init__()
        self.temperature = temperature
        self.alpha = alpha
        self.criterion_ce = nn.CrossEntropyLoss()

    def forward(self, student_logits, teacher_logits, labels):
        # Soft loss
        soft_loss = nn.KLDivLoss(reduction='batchmean')(
            torch.log_softmax(student_logits / self.temperature, dim=1),
            torch.softmax(teacher_logits / self.temperature, dim=1)
        )
        # Hard loss
        hard_loss = self.criterion_ce(student_logits, labels)
        return self.alpha * soft_loss * (self.temperature ** 2) + (1.0 - self.alpha) * hard_loss

In [3]:
#when all layers are trained
def train_kd_res_all(teacher_model, student_model, train_loader, epochs=10, lr=1e-3, temperature=4.0, alpha=0.5):
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    teacher_model.to(device).eval()
    student_model.to(device).train()

    optimizer = optim.Adam(student_model.parameters(), lr=lr)
    distillation_loss_fn = DistillationLoss(temperature, alpha)

    for epoch in range(epochs):
        total_loss = 0
        total_correct = 0
        total_samples = 0

        for images, labels in train_loader:
            images, labels = images.to(device), labels.to(device)

            # Teacher model predictions (no gradient needed)
            with torch.no_grad():
                teacher_logits = teacher_model(images).logits

            # Student model predictions
            student_logits = student_model(images)

            # Calculate loss
            loss = distillation_loss_fn(student_logits, teacher_logits, labels)

            # Update optimizer
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

            # Accumulate loss
            total_loss += loss.item()

            # Calculate accuracy
            _, predicted = torch.max(student_logits, 1)
            total_correct += (predicted == labels).sum().item()
            total_samples += labels.size(0)

        # Calculate average loss and accuracy for the epoch
        epoch_loss = total_loss / len(train_loader)
        epoch_accuracy = 100 * total_correct / total_samples

        print(f"Epoch [{epoch+1}/{epochs}], Loss: {epoch_loss:.4f}, Accuracy: {epoch_accuracy:.2f}%")

In [4]:
#when all layers are trained
def train_kd_net_all(teacher_model, student_model, train_loader, epochs=10, lr=1e-3, temperature=4.0, alpha=0.5):
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    teacher_model.to(device).eval()
    student_model.to(device).train()

    optimizer = optim.Adam(student_model.parameters(), lr=lr)
    distillation_loss_fn = DistillationLoss(temperature, alpha)

    for epoch in range(epochs):
        total_loss = 0
        total_correct = 0
        total_samples = 0

        for images, labels in train_loader:
            images, labels = images.to(device), labels.to(device)

            # Teacher model predictions (no gradient needed)
            with torch.no_grad():
                teacher_logits = teacher_model(images)

            # Student model predictions
            student_logits = student_model(images)

            # Calculate loss
            loss = distillation_loss_fn(student_logits, teacher_logits, labels)

            # Update optimizer
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

            # Accumulate loss
            total_loss += loss.item()

            # Calculate accuracy
            _, predicted = torch.max(student_logits, 1)
            total_correct += (predicted == labels).sum().item()
            total_samples += labels.size(0)

        # Calculate average loss and accuracy for the epoch
        epoch_loss = total_loss / len(train_loader)
        epoch_accuracy = 100 * total_correct / total_samples

        print(f"Epoch [{epoch+1}/{epochs}], Loss: {epoch_loss:.4f}, Accuracy: {epoch_accuracy:.2f}%")

In [5]:
def train_kd_res_classifier(teacher_model, student_model, train_loader, epochs=10, lr=1e-3, temperature=4.0, alpha=0.5):
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    teacher_model.to(device).eval()
    student_model.to(device).train()

    # ================================
    # Freeze all layers except the classifier head
    # ================================
    for name, param in student_model.named_parameters():
        if "fc" not in name and "classifier" not in name:  # Adjust for different models
            param.requires_grad = False

    # Use an optimizer that only updates trainable parameters
    optimizer = optim.Adam(filter(lambda p: p.requires_grad, student_model.parameters()), lr=lr)
    
    distillation_loss_fn = DistillationLoss(temperature, alpha)

    for epoch in range(epochs):
        total_loss = 0
        total_correct = 0
        total_samples = 0

        for images, labels in train_loader:
            images, labels = images.to(device), labels.to(device)

            # Teacher model predictions (no gradient needed)
            with torch.no_grad():
                teacher_logits = teacher_model(images).logits

            # Student model predictions
            student_logits = student_model(images)

            # Calculate loss
            loss = distillation_loss_fn(student_logits, teacher_logits, labels)

            # Update optimizer
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

            # Accumulate loss
            total_loss += loss.item()

            # Calculate accuracy
            _, predicted = torch.max(student_logits, 1)
            total_correct += (predicted == labels).sum().item()
            total_samples += labels.size(0)

        # Calculate average loss and accuracy for the epoch
        epoch_loss = total_loss / len(train_loader)
        epoch_accuracy = 100 * total_correct / total_samples

        print(f"Epoch [{epoch+1}/{epochs}], Loss: {epoch_loss:.4f}, Accuracy: {epoch_accuracy:.2f}%")

In [6]:
def train_kd_net_classifier(teacher_model, student_model, train_loader, epochs=10, lr=1e-3, temperature=4.0, alpha=0.5):
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    teacher_model.to(device).eval()
    student_model.to(device).train()

    # ================================
    # Freeze all layers except the classifier head
    # ================================
    for name, param in student_model.named_parameters():
        if "fc" not in name and "classifier" not in name:  # Adjust for different models
            param.requires_grad = False

    # Use an optimizer that only updates trainable parameters
    optimizer = optim.Adam(filter(lambda p: p.requires_grad, student_model.parameters()), lr=lr)
    
    distillation_loss_fn = DistillationLoss(temperature, alpha)

    for epoch in range(epochs):
        total_loss = 0
        total_correct = 0
        total_samples = 0

        for images, labels in train_loader:
            images, labels = images.to(device), labels.to(device)

            # Teacher model predictions (no gradient needed)
            with torch.no_grad():
                teacher_logits = teacher_model(images)

            # Student model predictions
            student_logits = student_model(images)

            # Calculate loss
            loss = distillation_loss_fn(student_logits, teacher_logits, labels)

            # Update optimizer
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

            # Accumulate loss
            total_loss += loss.item()

            # Calculate accuracy
            _, predicted = torch.max(student_logits, 1)
            total_correct += (predicted == labels).sum().item()
            total_samples += labels.size(0)

        # Calculate average loss and accuracy for the epoch
        epoch_loss = total_loss / len(train_loader)
        epoch_accuracy = 100 * total_correct / total_samples

        print(f"Epoch [{epoch+1}/{epochs}], Loss: {epoch_loss:.4f}, Accuracy: {epoch_accuracy:.2f}%")

In [7]:
def evaluate(model, test_loader):
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    model.to(device).eval()

    correct = 0
    total = 0

    with torch.no_grad():
        for images, labels in test_loader:
            images, labels = images.to(device), labels.to(device)
            outputs = model(images)
            probs = torch.softmax(outputs, dim=1)
            _, predicted = torch.max(probs, 1)
            total += labels.size(0)
            correct += (predicted == labels).sum().item()

    accuracy = 100 * correct / total
    print(f'Test Accuracy: {accuracy:.2f}%')

In [8]:
import os, time, torch, torch.nn as nn

def evaluate_model_performance_res(model, test_loader, baseline_accuracy=75.00, baseline_inference_time=0.10, baseline_model_size=100.0):
    device = 'cuda' if torch.cuda.is_available() else 'cpu'
    model.to(device).eval()
    criterion, total, correct, running_loss, inference_times = nn.CrossEntropyLoss(), 0, 0, 0.0, []

    with torch.no_grad():
        for images, labels in test_loader:
            images, labels = images.to(device), labels.to(device)
            start_time = time.time(); outputs = model(images); end_time = time.time()
            inference_times.append(end_time - start_time)
            running_loss += criterion(outputs, labels).item()
            _, predicted = torch.max(outputs.data, 1); total += labels.size(0); correct += (predicted == labels).sum().item()

    accuracy, avg_loss, avg_inference_time = 100 * correct / total, running_loss / len(test_loader), sum(inference_times) / len(inference_times)
    model_size = os.path.getsize("temp.pth") / (1024 * 1024) if torch.save(model.state_dict(), "temp.pth") or os.path.exists("temp.pth") else 0; os.remove("temp.pth")
    accuracy_drop, speed_improvement, size_reduction = baseline_accuracy - accuracy, (baseline_inference_time - avg_inference_time) / baseline_inference_time * 100, (baseline_model_size - model_size) / baseline_model_size * 100

    print(f"Accuracy: {accuracy}%, Average Loss: {avg_loss}, Model Size: {model_size} MB, Average Inference Time: {avg_inference_time}s\nPerformance Comparison:\nAccuracy Drop: {accuracy_drop}%, Speed Improvement: {speed_improvement}%, Size Reduction: {size_reduction}%")
    return accuracy, avg_loss, avg_inference_time, model_size

#----------------------------------------------------------------------------------------------------------------------------------------

def evaluate_model_performance_mobile(model, test_loader, baseline_accuracy=71.00, baseline_inference_time=0.05, baseline_model_size=20.0):
    device = 'cuda' if torch.cuda.is_available() else 'cpu'
    model.to(device).eval()
    criterion, total, correct, running_loss, inference_times = nn.CrossEntropyLoss(), 0, 0, 0.0, []

    with torch.no_grad():
        for images, labels in test_loader:
            images, labels = images.to(device), labels.to(device)
            start_time = time.time(); outputs = model(images); end_time = time.time()
            inference_times.append(end_time - start_time)
            running_loss += criterion(outputs, labels).item()
            _, predicted = torch.max(outputs.data, 1); total += labels.size(0); correct += (predicted == labels).sum().item()

    accuracy, avg_loss, avg_inference_time = 100 * correct / total, running_loss / len(test_loader), sum(inference_times) / len(inference_times)
    model_size = os.path.getsize("temp.pth") / (1024 * 1024) if torch.save(model.state_dict(), "temp.pth") or os.path.exists("temp.pth") else 0; os.remove("temp.pth")
    accuracy_drop, speed_improvement, size_reduction = baseline_accuracy - accuracy, (baseline_inference_time - avg_inference_time) / baseline_inference_time * 100, (baseline_model_size - model_size) / baseline_model_size * 100

    print(f"Accuracy: {accuracy}%, Average Loss: {avg_loss}, Model Size: {model_size} MB, Average Inference Time: {avg_inference_time}s\nPerformance Comparison:\nAccuracy Drop: {accuracy_drop}%, Speed Improvement: {speed_improvement}%, Size Reduction: {size_reduction}%")
    return accuracy, avg_loss, avg_inference_time, model_size

def save_model(model, save_path):
    """Save the model's state dictionary to the specified path."""
    torch.save(model.state_dict(), save_path)
    print(f'Model saved to {save_path}')


In [9]:
# ====================================================
# Teacher Model: Aznaur's ResNet-50
# ====================================================
model_name = "jialicheng/cifar100-resnet-50"
teacher_model = AutoModelForImageClassification.from_pretrained(model_name)

# ====================================================
# Student Model: ResNet-18
# ====================================================
student_model = models.resnet18(pretrained=True)
student_model.fc = nn.Linear(student_model.fc.in_features, 100)  # Adjust for CIFAR-100
student_model.train()

if __name__ == '__main__':
    train_kd_res_all(teacher_model, student_model, train_loader, epochs=5, lr=3e-4, temperature=4.0, alpha=0.5)
    evaluate(student_model, test_loader)

2025-01-17 20:17:30.161088: I tensorflow/core/util/port.cc:153] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2025-01-17 20:17:32.010005: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1737134252.580133 3796820 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1737134252.761396 3796820 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2025-01-17 20:17:34.362836: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instr

Epoch [1/5], Loss: 1.8602, Accuracy: 62.84%
Epoch [2/5], Loss: 0.8469, Accuracy: 80.40%
Epoch [3/5], Loss: 0.6383, Accuracy: 85.92%
Epoch [4/5], Loss: 0.5176, Accuracy: 89.77%
Epoch [5/5], Loss: 0.4412, Accuracy: 92.25%
Test Accuracy: 80.16%


In [10]:
accuracy, avg_loss, avg_inference_time, model_size = evaluate_model_performance_res(student_model, test_loader)

Accuracy: 80.09%, Average Loss: 0.6681001088287257, Model Size: 42.899356842041016 MB, Average Inference Time: 0.002709077883370315s
Performance Comparison:
Accuracy Drop: -5.090000000000003%, Speed Improvement: 97.29092211662969%, Size Reduction: 57.100643157958984%


In [11]:
save_path = "Student_Models/cifar100_resnet18.pth" 
save_model(student_model, save_path)

Model saved to Student_Models/cifar100_resnet18.pth


In [88]:
# ====================================================
# Teacher Model: EfficientNet B5 (from timm)
# ====================================================
teacher_model = timm.create_model('efficientnet_b5', pretrained=True, num_classes=100)
teacher_model.eval()

# ====================================================
# Student Model: EfficientNet-Lite0 (from timm)
# ====================================================
student_model = timm.create_model('efficientnet_lite0', pretrained=True, num_classes=100)
student_model.train()

if __name__ == '__main__':
    train_kd_net_all(teacher_model, student_model, train_loader, epochs=5, lr=3e-4, temperature=4.0, alpha=0.5)
    evaluate(student_model, test_loader)


Epoch [1/5], Loss: 1.4741, Accuracy: 57.79%
Epoch [2/5], Loss: 0.9509, Accuracy: 80.96%
Epoch [3/5], Loss: 0.8037, Accuracy: 88.11%
Epoch [4/5], Loss: 0.7146, Accuracy: 92.43%
Epoch [5/5], Loss: 0.6546, Accuracy: 95.14%
Test Accuracy: 81.85%


In [89]:
accuracy, avg_loss, avg_inference_time, model_size = evaluate_model_performance_net(student_model, test_loader)

Accuracy: 81.76%, Average Loss: 0.8752514659603939, Model Size: 13.591398239135742 MB, Average Inference Time: 0.005031636998623233s
Performance Comparison:
Accuracy Drop: -10.760000000000005%, Speed Improvement: 89.93672600275355%, Size Reduction: 32.04300880432129%


In [90]:
save_path = "Student_Models/cifar100_efficientnet_lite0.pth" 
save_model(student_model, save_path)

Model saved to Student_Models/cifar100_efficientnet_lite0.pth


In [71]:
# ====================================================
# Teacher Model: EfficientNet-B5 (from timm)
# ====================================================
teacher_model = timm.create_model('efficientnet_b5', pretrained=True, num_classes=100)
teacher_model.eval()

# ====================================================
# Student Model: MobileNetV3-Large (from timm)
# ====================================================
student_model = timm.create_model('mobilenetv3_large_100', pretrained=True, num_classes=100)
student_model.train()

if __name__ == '__main__':
    train_kd_net_all(teacher_model, student_model, train_loader, epochs=5, lr=3e-4, temperature=4.0, alpha=0.5)
    evaluate(student_model, test_loader)

Epoch [1/5], Loss: 1.3664, Accuracy: 63.06%
Epoch [2/5], Loss: 0.9022, Accuracy: 83.13%
Epoch [3/5], Loss: 0.7662, Accuracy: 89.25%
Epoch [4/5], Loss: 0.6856, Accuracy: 93.01%
Epoch [5/5], Loss: 0.6274, Accuracy: 95.65%
Test Accuracy: 82.79%


In [72]:
accuracy, avg_loss, avg_inference_time, model_size = evaluate_model_performance_mobile(student_model, test_loader)

Accuracy: 82.75%, Average Loss: 0.8692005895361116, Model Size: 16.70149040222168 MB, Average Inference Time: 0.006251126905030842s
Performance Comparison:
Accuracy Drop: -11.75%, Speed Improvement: 87.49774618993831%, Size Reduction: 16.4925479888916%


In [73]:
# Save the model after testing
save_path = "Student_Models/cifar100_mobilenetv3_large_100_model.pth" 
save_model(student_model, save_path)

Model saved to Student_Models/cifar100_mobilenetv3_large_100_model.pth


In [74]:
# ====================================================
# Teacher Model: EfficientNet-B5 (from timm)
# ====================================================
teacher_model = timm.create_model('efficientnet_b5', pretrained=True, num_classes=100)
teacher_model.eval()

# ====================================================
# Student Model: MobileNetV3-Small (from timm)
# ====================================================
student_model = timm.create_model('mobilenetv3_small_100', pretrained=True, num_classes=100)
student_model.train()

if __name__ == '__main__':
    train_kd_net_all(teacher_model, student_model, train_loader, epochs=5, lr=3e-4, temperature=4.0, alpha=0.5)
    evaluate(student_model, test_loader)

Epoch [1/5], Loss: 1.5223, Accuracy: 54.60%
Epoch [2/5], Loss: 1.0790, Accuracy: 73.98%
Epoch [3/5], Loss: 0.9515, Accuracy: 79.60%
Epoch [4/5], Loss: 0.8751, Accuracy: 83.32%
Epoch [5/5], Loss: 0.8178, Accuracy: 86.11%
Test Accuracy: 72.97%


In [75]:
accuracy, avg_loss, avg_inference_time, model_size = evaluate_model_performance_mobile(student_model, test_loader)

Accuracy: 72.93%, Average Loss: 1.2232606161998798, Model Size: 6.297517776489258 MB, Average Inference Time: 0.005306020567688761s
Performance Comparison:
Accuracy Drop: -1.9300000000000068%, Speed Improvement: 89.38795886462249%, Size Reduction: 68.51241111755371%


In [76]:
save_path = "Student_Models/cifar100_mobilenetv3_small_100.pth" 
save_model(student_model, save_path)

Model saved to Student_Models/cifar100_mobilenetv3_small_100.pth
