# Fine-tune the pretrained teacher

In [None]:
import torch
import torch.nn as nn
import torchvision.models as models
import torchvision.transforms as transforms
from torch.utils.data import DataLoader
from torchvision.datasets import ImageFolder

# Run on the GPU when PyTorch can see one, otherwise fall back to the CPU.
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')


In [None]:
from torchvision.models import resnet50
from torchvision import models
from torch import nn

In [None]:
# Teacher backbone: ResNet-50 initialised from the IMAGENET1K_V1 weights.
teacher_model = models.resnet50(weights='IMAGENET1K_V1')

# Replace the pretrained head with a fresh 2-way (cat, dog) linear classifier
# that takes the same number of input features.
num_ftrs = teacher_model.fc.in_features
teacher_model.fc = nn.Linear(in_features=num_ftrs, out_features=2)

teacher_model = teacher_model.to(device)


Downloading: "https://download.pytorch.org/models/resnet50-0676ba61.pth" to /root/.cache/torch/hub/checkpoints/resnet50-0676ba61.pth


100%|██████████| 97.8M/97.8M [00:00<00:00, 203MB/s]


In [None]:
# Only the last residual stage (layer4) and the classifier head (fc) stay
# trainable; every other parameter is frozen.
for name, param in teacher_model.named_parameters():
    if not ("layer4" in name or "fc" in name):
        param.requires_grad_(False)

# Two parameter groups: a small learning rate for the pretrained layer4 and
# a larger one for the newly initialised head.
optimizer = torch.optim.Adam(
    [
        dict(params=teacher_model.layer4.parameters(), lr=1e-4),
        dict(params=teacher_model.fc.parameters(), lr=1e-3),
    ],
    weight_decay=1e-4,
)
criterion = nn.CrossEntropyLoss()


In [None]:
# Labeled image folders used for fine-tuning and for validation.
finetune_dir = '/content/drive/MyDrive/pets/finetune_train'
val_dir = '/content/drive/MyDrive/pets/val'

IMG_SIZE = 224  # Standard input size for ResNet models
BATCH_SIZE = 64

# Training pipeline: random crop / flip / rotation / colour jitter, then
# tensor conversion and normalization with ImageNet statistics.
transform_train = transforms.Compose([
    transforms.RandomResizedCrop(size=IMG_SIZE, scale=(0.8, 1.0)),
    transforms.RandomHorizontalFlip(),
    transforms.RandomRotation(degrees=15),
    transforms.ColorJitter(brightness=0.4, contrast=0.4, saturation=0.4, hue=0.2),
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.485, 0.456, 0.406],
                         std=[0.229, 0.224, 0.225]),
])

# Validation pipeline: deterministic resize only, same normalization.
transform_val = transforms.Compose([
    transforms.Resize(size=(IMG_SIZE, IMG_SIZE)),
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.485, 0.456, 0.406],
                         std=[0.229, 0.224, 0.225]),
])

# One dataset / loader pair per split; only the training loader shuffles.
finetune_dataset = ImageFolder(root=finetune_dir, transform=transform_train)
finetune_loader = DataLoader(
    dataset=finetune_dataset,
    batch_size=BATCH_SIZE,
    shuffle=True,
    num_workers=2,
)

val_dataset = ImageFolder(root=val_dir, transform=transform_val)
val_loader = DataLoader(
    dataset=val_dataset,
    batch_size=BATCH_SIZE,
    shuffle=False,
    num_workers=2,
)

print("Finetune dataset size:", len(finetune_dataset))
print("Validation dataset size:", len(val_dataset))

Finetune dataset size: 420
Validation dataset size: 180


In [None]:
# Teacher backbone: ResNet-50 initialised from the IMAGENET1K_V1 weights.
teacher_model = models.resnet50(weights='ResNet50_Weights.IMAGENET1K_V1')

# Replace the pretrained head with a fresh 2-way (cat, dog) linear classifier
# that takes the same number of input features.
num_ftrs = teacher_model.fc.in_features
teacher_model.fc = nn.Linear(in_features=num_ftrs, out_features=2)

teacher_model = teacher_model.to(device)


In [None]:
# Only the last residual stage (layer4) and the classifier head (fc) stay
# trainable; every other parameter is frozen.
for name, param in teacher_model.named_parameters():
    if not ("layer4" in name or "fc" in name):
        param.requires_grad_(False)

# Two parameter groups: a small learning rate for the pretrained layer4 and
# a larger one for the newly initialised head.
optimizer = torch.optim.Adam(
    [
        dict(params=teacher_model.layer4.parameters(), lr=1e-4),
        dict(params=teacher_model.fc.parameters(), lr=1e-3),
    ],
    weight_decay=1e-4,
)
criterion = nn.CrossEntropyLoss()


In [None]:
def train_epoch(model, dataloader, criterion, optimizer, device):
    """Train ``model`` for one pass over ``dataloader``.

    Args:
        model: Network to optimise; it is switched to training mode here.
        dataloader: Iterable yielding ``(images, labels)`` batches.
        criterion: Loss called as ``criterion(outputs, labels)``; its value
            is weighted by the batch size when accumulated.
        optimizer: Optimizer that updates the model's parameters.
        device: Device each batch is moved to before the forward pass.

    Returns:
        ``(epoch_loss, epoch_acc)``: the sample-weighted average loss and
        the fraction of correct predictions. ``(0.0, 0.0)`` is returned
        when the loader yields no samples.
    """
    # Use the model that was passed in, not a notebook-level global, so the
    # function behaves correctly for any network.
    model.train()
    running_loss = 0.0
    correct = 0
    total = 0

    for imgs, labels in dataloader:
        imgs, labels = imgs.to(device), labels.to(device)

        optimizer.zero_grad()
        outputs = model(imgs)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()

        batch_size = labels.size(0)
        # Weight by batch size so a smaller final batch does not skew the mean.
        running_loss += loss.item() * batch_size
        correct += (outputs.argmax(dim=1) == labels).sum().item()
        total += batch_size

    if total == 0:
        # Empty loader: avoid dividing by zero.
        return 0.0, 0.0
    return running_loss / total, correct / total

def validate(model, dataloader, criterion, device):
    """Evaluate ``model`` on ``dataloader`` without updating any weights.

    Args:
        model: Network to evaluate; it is switched to evaluation mode here.
        dataloader: Iterable yielding ``(images, labels)`` batches.
        criterion: Loss called as ``criterion(outputs, labels)``; its value
            is weighted by the batch size when accumulated.
        device: Device each batch is moved to before the forward pass.

    Returns:
        ``(val_loss, val_acc)``: the sample-weighted average loss and the
        fraction of correct predictions. ``(0.0, 0.0)`` is returned when the
        loader yields no samples.
    """
    # Use the model that was passed in, not a notebook-level global, so the
    # function evaluates whichever network the caller supplies.
    model.eval()
    running_loss = 0.0
    correct = 0
    total = 0

    with torch.no_grad():
        for imgs, labels in dataloader:
            imgs, labels = imgs.to(device), labels.to(device)
            outputs = model(imgs)
            loss = criterion(outputs, labels)

            batch_size = labels.size(0)
            running_loss += loss.item() * batch_size
            correct += (outputs.argmax(dim=1) == labels).sum().item()
            total += batch_size

    if total == 0:
        # Empty loader: avoid dividing by zero.
        return 0.0, 0.0
    return running_loss / total, correct / total


In [None]:
# Fine-tune the teacher for 5 epochs, validating after each one.
for epoch in range(5):
    train_loss, train_acc = train_epoch(
        model=teacher_model,
        dataloader=finetune_loader,
        criterion=criterion,
        optimizer=optimizer,
        device=device,
    )
    val_loss, val_acc = validate(
        model=teacher_model,
        dataloader=val_loader,
        criterion=criterion,
        device=device,
    )

    epoch_summary = (
        f'Epoch {epoch+1}: '
        f'Train Loss: {train_loss:.4f} Acc: {train_acc:.4f} | '
        f'Val Loss: {val_loss:.4f} Acc: {val_acc:.4f}'
    )
    print(epoch_summary)


Epoch 1: Train Loss: 0.2912 Acc: 0.8810 | Val Loss: 0.0462 Acc: 0.9778
Epoch 2: Train Loss: 0.0463 Acc: 0.9857 | Val Loss: 0.0556 Acc: 0.9778
Epoch 3: Train Loss: 0.0266 Acc: 0.9905 | Val Loss: 0.1259 Acc: 0.9778
Epoch 4: Train Loss: 0.0108 Acc: 0.9952 | Val Loss: 0.1084 Acc: 0.9778
Epoch 5: Train Loss: 0.0138 Acc: 0.9952 | Val Loss: 0.1097 Acc: 0.9722


In [None]:
import os
# Checkpoint location for the fine-tuned teacher; create its parent folder
# up front so the save in the next cell cannot fail on a missing directory.
save_path = '/content/drive/MyDrive/mods/resnet_finetune_only.pth'
os.makedirs(name=os.path.dirname(save_path), exist_ok=True)

In [None]:
# Persist only the weights (state dict) of the fine-tuned teacher.
torch.save(obj=teacher_model.state_dict(), f=save_path)

# 3000 set

## Distillation

In [None]:
import torchvision.models as models
import torch.nn as nn

# Student network: MobileNetV2 with no pretrained weights loaded.
student = models.mobilenet_v2(weights=None)

# Replace classifier[1] with a 2-class linear head; `last_channel` is the
# number of features feeding the classifier.
num_classes = 2
student.classifier[1] = nn.Linear(
    in_features=student.last_channel,
    out_features=num_classes,
)

print("MobileNetV2 student model defined with classification head.")

MobileNetV2 student model defined with classification head.


In [None]:
import torchvision.models as models
import torch.nn as nn
import torch

# Checkpoint written by the fine-tuning stage.
finetuned_checkpoint_path = save_path

# Choose the compute device first so the model can be moved once it is ready.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

# Rebuild the saved architecture: a plain ResNet-50 with a 2-way head
# (cat / dog) so the state-dict keys and shapes line up.
num_classes = 2
teacher_model = models.resnet50(weights=None)
num_ftrs = teacher_model.fc.in_features
teacher_model.fc = nn.Linear(in_features=num_ftrs, out_features=num_classes)

# Deserialize onto the CPU, then copy the weights into the fresh model.
teacher_state_dict = torch.load(finetuned_checkpoint_path, map_location='cpu')
teacher_model.load_state_dict(teacher_state_dict)

# The teacher is inference-only during distillation: eval mode, no gradients.
teacher_model.eval()
teacher_model.requires_grad_(False)

teacher_model = teacher_model.to(device)

print("Finetuned teacher model loaded for distillation.")

Finetuned teacher model loaded for distillation.


In [None]:
from torchvision import datasets, transforms
from torch.utils.data import Dataset, DataLoader, ConcatDataset
import torch
import torch.nn as nn
import torch.nn.functional as F

# --- transforms ---
# Training: random crop / flip / colour jitter, then tensor + normalization.
train_transform = transforms.Compose([
    transforms.RandomResizedCrop(size=224),
    transforms.RandomHorizontalFlip(),
    transforms.ColorJitter(brightness=0.2, contrast=0.2, saturation=0.2, hue=0.1),
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.485, 0.456, 0.406],
                         std=[0.229, 0.224, 0.225]),
])
# Validation: deterministic resize + centre crop, same normalization.
val_transform = transforms.Compose([
    transforms.Resize(size=224),
    transforms.CenterCrop(size=224),
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.485, 0.456, 0.406],
                         std=[0.229, 0.224, 0.225]),
])

# --- datasets ---
unlabeled = '/content/drive/MyDrive/pets/train3000'
labeled = '/content/drive/MyDrive/pets/finetune_train'
val = '/content/drive/MyDrive/pets/val'

BATCH_SIZE = 64

labeled_dataset = datasets.ImageFolder(root=labeled, transform=train_transform)
unlabeled_dataset = datasets.ImageFolder(root=unlabeled, transform=train_transform)

# Overwrite every folder-derived label with the sentinel -1, which the
# distillation loop uses to recognise unlabeled samples.
unlabeled_dataset.samples = [
    (img_path, -1) for img_path, _folder_label in unlabeled_dataset.samples
]

# Labeled and unlabeled images are mixed into a single shuffled stream.
combined_dataset = ConcatDataset([labeled_dataset, unlabeled_dataset])
train_loader = DataLoader(
    dataset=combined_dataset,
    batch_size=BATCH_SIZE,
    shuffle=True,
    num_workers=2,
)

val_dataset = datasets.ImageFolder(root=val, transform=val_transform)
val_loader = DataLoader(dataset=val_dataset, batch_size=BATCH_SIZE, shuffle=False)

print(f"Number of images in the labeled dataset: {len(labeled_dataset)}")
print(f"Number of images in the unlabeled dataset: {len(unlabeled_dataset)}")
print(f"Number of images in the val dataset: {len(val_dataset)}")

Number of images in the labeled dataset: 420
Number of images in the unlabeled dataset: 3000
Number of images in the val dataset: 180


In [None]:
# Hard-label loss applied to the labeled samples.
ce_loss = nn.CrossEntropyLoss()

def kd_loss(student_logits, teacher_logits, T):
    """Temperature-scaled KL divergence between teacher and student outputs.

    Both logit tensors are divided by ``T`` before the softmax along dim 1.
    The batch-mean KL divergence is multiplied by ``T * T``.
    """
    student_log_probs = F.log_softmax(student_logits / T, dim=1)
    teacher_probs = F.softmax(teacher_logits / T, dim=1)
    divergence = F.kl_div(student_log_probs, teacher_probs, reduction='batchmean')
    return divergence * (T * T)

In [None]:
def train_distillation_epoch(student_model, teacher_model, dataloader, criterion_ce, criterion_kd, optimizer, T, device, alpha):
    """Run one epoch of semi-supervised distillation.

    Samples whose label is ``-1`` are treated as unlabeled and contribute
    only the distillation term; all other samples contribute only the
    cross-entropy term. The per-batch loss is
    ``(1 - alpha) * CE(labeled) + alpha * KD(unlabeled)``.

    Args:
        student_model: Network being trained (switched to train mode).
        teacher_model: Network providing soft targets. It is switched to
            eval mode here so layers such as Dropout/BatchNorm cannot
            perturb the targets.
        dataloader: Iterable yielding ``(inputs, labels)`` batches.
        criterion_ce: Loss called as ``criterion_ce(logits, labels)``.
        criterion_kd: Loss called as
            ``criterion_kd(student_logits, teacher_logits, T)``.
        optimizer: Optimizer over the student's parameters.
        T: Temperature forwarded to ``criterion_kd``.
        device: Device each batch is moved to.
        alpha: Weight of the distillation term (``1 - alpha`` for CE).

    Returns:
        ``(epoch_loss, epoch_acc)``: the loss averaged over all samples
        seen, and the student's accuracy on labeled samples only (``0.0``
        if none were seen). ``(0.0, 0.0)`` is returned for an empty loader.
    """
    student_model.train()
    teacher_model.eval()

    running_loss = 0.0
    correct = 0
    total = 0  # labeled samples seen (accuracy denominator)
    seen = 0   # all samples seen (loss denominator)

    for inputs, labels in dataloader:
        inputs, labels = inputs.to(device), labels.to(device)
        batch_size = labels.size(0)
        if batch_size == 0:
            continue

        # Split the batch once; the counts are reused below.
        labeled_mask = labels != -1
        unlabeled_mask = ~labeled_mask
        n_labeled = int(labeled_mask.sum().item())
        n_unlabeled = batch_size - n_labeled

        optimizer.zero_grad()

        # The teacher is frozen: no graph is needed for its forward pass.
        with torch.no_grad():
            teacher_logits = teacher_model(inputs)
        student_logits = student_model(inputs)

        # Each term is skipped (0.0) when its subset of the batch is empty.
        ce_term = (
            criterion_ce(student_logits[labeled_mask], labels[labeled_mask])
            if n_labeled > 0 else 0.0
        )
        kd_term = (
            criterion_kd(student_logits[unlabeled_mask], teacher_logits[unlabeled_mask], T)
            if n_unlabeled > 0 else 0.0
        )
        loss = (1 - alpha) * ce_term + alpha * kd_term

        loss.backward()
        optimizer.step()

        running_loss += loss.item() * batch_size
        seen += batch_size

        # Accuracy is only defined where ground-truth labels exist.
        if n_labeled > 0:
            preds = student_logits[labeled_mask].argmax(dim=1)
            correct += (preds == labels[labeled_mask]).sum().item()
            total += n_labeled

    epoch_loss = running_loss / seen if seen > 0 else 0.0
    epoch_acc = correct / total if total > 0 else 0.0
    return epoch_loss, epoch_acc

In [None]:
def validate(model, dataloader, criterion, device=None):
    """Evaluate ``model`` on ``dataloader`` without updating any weights.

    Args:
        model: Network to evaluate; it is switched to evaluation mode here.
        dataloader: Iterable yielding ``(images, labels)`` batches.
        criterion: Loss called as ``criterion(outputs, labels)``.
        device: Optional device each batch is moved to. When omitted, the
            device of the model's first parameter is used (CPU if the model
            has no parameters), so the function does not depend on a
            notebook-level ``device`` variable.

    Returns:
        ``(val_loss, val_acc)``: the sample-weighted average loss and the
        fraction of correct predictions. ``(0.0, 0.0)`` is returned when the
        loader yields no samples.
    """
    if device is None:
        first_param = next(model.parameters(), None)
        device = first_param.device if first_param is not None else torch.device('cpu')

    model.eval()
    running_loss = 0.0
    correct = 0
    total = 0

    with torch.no_grad():
        for imgs, labels in dataloader:
            imgs, labels = imgs.to(device), labels.to(device)
            outputs = model(imgs)
            loss = criterion(outputs, labels)

            batch_size = labels.size(0)
            running_loss += loss.item() * batch_size
            correct += (outputs.argmax(dim=1) == labels).sum().item()
            total += batch_size

    if total == 0:
        # Empty loader: avoid dividing by zero.
        return 0.0, 0.0
    return running_loss / total, correct / total

In [None]:
# Place the student on the compute device and set the distillation
# hyperparameters before building its optimizer.
student = student.to(device)
T = 5.0       # Temperature for KL divergence
alpha = 0.7   # weight of the distillation term in the combined loss
optimizer_student = torch.optim.Adam(params=student.parameters(), lr=3e-4)

for epoch in range(10):
    train_loss_student, train_acc_student = train_distillation_epoch(
        student_model=student,
        teacher_model=teacher_model,
        dataloader=train_loader,
        criterion_ce=ce_loss,
        criterion_kd=kd_loss,
        optimizer=optimizer_student,
        T=T,
        device=device,
        alpha=alpha,
    )
    val_loss_student, val_acc_student = validate(
        model=student, dataloader=val_loader, criterion=ce_loss
    )

    epoch_summary = (
        f'Epoch {epoch+1}: '
        f'Train Loss (Student): {train_loss_student:.4f} Acc (Labeled): {train_acc_student:.4f} | '
        f'Val Loss (Student): {val_loss_student:.4f} Acc: {val_acc_student:.4f}'
    )
    print(epoch_summary)



Epoch 1: Train Loss (Student): 5.1931 Acc (Labeled): 0.5214 | Val Loss (Student): 0.7039 Acc: 0.5000
Epoch 2: Train Loss (Student): 5.1687 Acc (Labeled): 0.5714 | Val Loss (Student): 0.7169 Acc: 0.5778
Epoch 3: Train Loss (Student): 5.0559 Acc (Labeled): 0.5857 | Val Loss (Student): 0.8445 Acc: 0.5167
Epoch 4: Train Loss (Student): 4.8943 Acc (Labeled): 0.5833 | Val Loss (Student): 2.1410 Acc: 0.5222
Epoch 5: Train Loss (Student): 4.7934 Acc (Labeled): 0.6381 | Val Loss (Student): 0.7734 Acc: 0.6167
Epoch 6: Train Loss (Student): 4.7095 Acc (Labeled): 0.6286 | Val Loss (Student): 1.2636 Acc: 0.5889
Epoch 7: Train Loss (Student): 4.6460 Acc (Labeled): 0.6357 | Val Loss (Student): 0.7221 Acc: 0.7167
Epoch 8: Train Loss (Student): 4.5148 Acc (Labeled): 0.6095 | Val Loss (Student): 0.7326 Acc: 0.7111
Epoch 9: Train Loss (Student): 4.5226 Acc (Labeled): 0.6500 | Val Loss (Student): 0.9390 Acc: 0.6444
Epoch 10: Train Loss (Student): 4.4434 Acc (Labeled): 0.6810 | Val Loss (Student): 0.7981 A

In [None]:
# Persist the student's weights after the first 10 distillation epochs.
torch.save(obj=student.state_dict(), f='/content/drive/MyDrive/mods/only_distilled_student_3000.pth')

In [None]:
# Continue distillation for 10 more epochs (reported as epochs 11-20).
for epoch in range(10):
    train_loss_student, train_acc_student = train_distillation_epoch(
        student_model=student,
        teacher_model=teacher_model,
        dataloader=train_loader,
        criterion_ce=ce_loss,
        criterion_kd=kd_loss,
        optimizer=optimizer_student,
        T=T,
        device=device,
        alpha=alpha,
    )
    val_loss_student, val_acc_student = validate(
        model=student, dataloader=val_loader, criterion=ce_loss
    )

    epoch_summary = (
        f'Epoch {epoch+11}: '
        f'Train Loss (Student): {train_loss_student:.4f} Acc (Labeled): {train_acc_student:.4f} | '
        f'Val Loss (Student): {val_loss_student:.4f} Acc: {val_acc_student:.4f}'
    )
    print(epoch_summary)



Epoch 11: Train Loss (Student): 4.3120 Acc (Labeled): 0.6619 | Val Loss (Student): 0.7002 Acc: 0.7389
Epoch 12: Train Loss (Student): 4.2053 Acc (Labeled): 0.6667 | Val Loss (Student): 0.6846 Acc: 0.7389
Epoch 13: Train Loss (Student): 4.1170 Acc (Labeled): 0.6881 | Val Loss (Student): 0.6656 Acc: 0.7556
Epoch 14: Train Loss (Student): 4.1401 Acc (Labeled): 0.6833 | Val Loss (Student): 0.7854 Acc: 0.7222
Epoch 15: Train Loss (Student): 3.9562 Acc (Labeled): 0.7143 | Val Loss (Student): 0.9036 Acc: 0.7500
Epoch 16: Train Loss (Student): 3.9139 Acc (Labeled): 0.7000 | Val Loss (Student): 0.9829 Acc: 0.7056
Epoch 17: Train Loss (Student): 3.9514 Acc (Labeled): 0.6786 | Val Loss (Student): 0.6888 Acc: 0.7556
Epoch 18: Train Loss (Student): 3.8844 Acc (Labeled): 0.6881 | Val Loss (Student): 2.1505 Acc: 0.5944
Epoch 19: Train Loss (Student): 3.8501 Acc (Labeled): 0.7476 | Val Loss (Student): 0.8681 Acc: 0.7389
Epoch 20: Train Loss (Student): 3.7870 Acc (Labeled): 0.7333 | Val Loss (Student):

In [None]:
# Continue distillation for 5 more epochs (reported as epochs 21-25).
for epoch in range(5):
    train_loss_student, train_acc_student = train_distillation_epoch(
        student_model=student,
        teacher_model=teacher_model,
        dataloader=train_loader,
        criterion_ce=ce_loss,
        criterion_kd=kd_loss,
        optimizer=optimizer_student,
        T=T,
        device=device,
        alpha=alpha,
    )
    val_loss_student, val_acc_student = validate(
        model=student, dataloader=val_loader, criterion=ce_loss
    )

    epoch_summary = (
        f'Epoch {epoch+21}: '
        f'Train Loss (Student): {train_loss_student:.4f} Acc (Labeled): {train_acc_student:.4f} | '
        f'Val Loss (Student): {val_loss_student:.4f} Acc: {val_acc_student:.4f}'
    )
    print(epoch_summary)

Epoch 21: Train Loss (Student): 3.7003 Acc (Labeled): 0.7333 | Val Loss (Student): 0.8330 Acc: 0.8000
Epoch 22: Train Loss (Student): 3.6804 Acc (Labeled): 0.7119 | Val Loss (Student): 0.6037 Acc: 0.8000
Epoch 23: Train Loss (Student): 3.6028 Acc (Labeled): 0.7190 | Val Loss (Student): 0.5855 Acc: 0.8167
Epoch 24: Train Loss (Student): 3.6279 Acc (Labeled): 0.7667 | Val Loss (Student): 0.5893 Acc: 0.8056
Epoch 25: Train Loss (Student): 3.5635 Acc (Labeled): 0.7452 | Val Loss (Student): 0.7243 Acc: 0.7889


# 6000 set

## Distillation

In [None]:
import torchvision.models as models
import torch.nn as nn

# Student network: MobileNetV2 with no pretrained weights loaded.
student = models.mobilenet_v2(weights=None)

# Replace classifier[1] with a 2-class linear head; `last_channel` is the
# number of features feeding the classifier.
num_classes = 2
student.classifier[1] = nn.Linear(
    in_features=student.last_channel,
    out_features=num_classes,
)

print("MobileNetV2 student model defined with classification head.")

MobileNetV2 student model defined with classification head.


In [None]:
import torchvision.models as models
import torch.nn as nn
import torch

# Checkpoint written by the fine-tuning stage.
finetuned_checkpoint_path = save_path

# Choose the compute device first so the model can be moved once it is ready.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

# Rebuild the saved architecture: a plain ResNet-50 with a 2-way head
# (cat / dog) so the state-dict keys and shapes line up.
num_classes = 2
teacher_model = models.resnet50(weights=None)
num_ftrs = teacher_model.fc.in_features
teacher_model.fc = nn.Linear(in_features=num_ftrs, out_features=num_classes)

# Deserialize onto the CPU, then copy the weights into the fresh model.
teacher_state_dict = torch.load(finetuned_checkpoint_path, map_location='cpu')
teacher_model.load_state_dict(teacher_state_dict)

# The teacher is inference-only during distillation: eval mode, no gradients.
teacher_model.eval()
teacher_model.requires_grad_(False)

teacher_model = teacher_model.to(device)

print("Finetuned teacher model loaded for distillation.")

Finetuned teacher model loaded for distillation.


In [None]:
from torchvision import datasets, transforms
from torch.utils.data import Dataset, DataLoader, ConcatDataset
import torch
import torch.nn as nn
import torch.nn.functional as F

# --- transforms ---
# Training: random crop / flip / colour jitter, then tensor + normalization.
train_transform = transforms.Compose([
    transforms.RandomResizedCrop(224),
    transforms.RandomHorizontalFlip(),
    transforms.ColorJitter(0.2, 0.2, 0.2, 0.1),
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.485, 0.456, 0.406],
                         std=[0.229, 0.224, 0.225])
])
# Validation: deterministic resize + centre crop, same normalization.
val_transform = transforms.Compose([
    transforms.Resize(224),
    transforms.CenterCrop(224),
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.485, 0.456, 0.406],
                         std=[0.229, 0.224, 0.225])
])

# --- datasets ---
# Every dataset this section uses is (re)built here, so the cell no longer
# depends on `labeled_dataset`, `val_dataset` or `val_loader` left over
# from the 3000-image section and runs correctly on its own.
unlabeled = '/content/drive/MyDrive/pets/train6000'
labeled = '/content/drive/MyDrive/pets/finetune_train'
val = '/content/drive/MyDrive/pets/val'

# Load datasets
labeled_dataset = datasets.ImageFolder(labeled, transform=train_transform)
unlabeled_dataset = datasets.ImageFolder(unlabeled, transform=train_transform)


# Replace labels for unlabeled samples with -1, the sentinel the
# distillation loop uses to recognise unlabeled samples.
unlabeled_dataset.samples = [(path, -1) for (path, _) in unlabeled_dataset.samples]

BATCH_SIZE = 64

# Combine labeled and unlabeled images into a single shuffled stream.
combined_dataset = ConcatDataset([labeled_dataset, unlabeled_dataset])
train_loader = DataLoader(combined_dataset, batch_size=BATCH_SIZE, shuffle=True, num_workers=2)

val_dataset = datasets.ImageFolder(val, transform=val_transform)
val_loader = DataLoader(val_dataset, batch_size=BATCH_SIZE, shuffle=False)

print(f"Number of images in the labeled dataset: {len(labeled_dataset)}")
print(f"Number of images in the unlabeled dataset: {len(unlabeled_dataset)}")
print(f"Number of images in the val dataset: {len(val_dataset)}")

Number of images in the labeled dataset: 420
Number of images in the unlabeled dataset: 6000
Number of images in the val dataset: 180


In [None]:
# Hard-label loss applied to the labeled samples.
ce_loss = nn.CrossEntropyLoss()

def kd_loss(student_logits, teacher_logits, T):
    """Temperature-scaled KL divergence between teacher and student outputs.

    Both logit tensors are divided by ``T`` before the softmax along dim 1.
    The batch-mean KL divergence is multiplied by ``T * T``.
    """
    student_log_probs = F.log_softmax(student_logits / T, dim=1)
    teacher_probs = F.softmax(teacher_logits / T, dim=1)
    divergence = F.kl_div(student_log_probs, teacher_probs, reduction='batchmean')
    return divergence * (T * T)

In [None]:
def train_distillation_epoch(student_model, teacher_model, dataloader, criterion_ce, criterion_kd, optimizer, T, device, alpha):
    """Run one epoch of semi-supervised distillation.

    Samples whose label is ``-1`` are treated as unlabeled and contribute
    only the distillation term; all other samples contribute only the
    cross-entropy term. The per-batch loss is
    ``(1 - alpha) * CE(labeled) + alpha * KD(unlabeled)``.

    Args:
        student_model: Network being trained (switched to train mode).
        teacher_model: Network providing soft targets. It is switched to
            eval mode here so layers such as Dropout/BatchNorm cannot
            perturb the targets.
        dataloader: Iterable yielding ``(inputs, labels)`` batches.
        criterion_ce: Loss called as ``criterion_ce(logits, labels)``.
        criterion_kd: Loss called as
            ``criterion_kd(student_logits, teacher_logits, T)``.
        optimizer: Optimizer over the student's parameters.
        T: Temperature forwarded to ``criterion_kd``.
        device: Device each batch is moved to.
        alpha: Weight of the distillation term (``1 - alpha`` for CE).

    Returns:
        ``(epoch_loss, epoch_acc)``: the loss averaged over all samples
        seen, and the student's accuracy on labeled samples only (``0.0``
        if none were seen). ``(0.0, 0.0)`` is returned for an empty loader.
    """
    student_model.train()
    teacher_model.eval()

    running_loss = 0.0
    correct = 0
    total = 0  # labeled samples seen (accuracy denominator)
    seen = 0   # all samples seen (loss denominator)

    for inputs, labels in dataloader:
        inputs, labels = inputs.to(device), labels.to(device)
        batch_size = labels.size(0)
        if batch_size == 0:
            continue

        # Split the batch once; the counts are reused below.
        labeled_mask = labels != -1
        unlabeled_mask = ~labeled_mask
        n_labeled = int(labeled_mask.sum().item())
        n_unlabeled = batch_size - n_labeled

        optimizer.zero_grad()

        # The teacher is frozen: no graph is needed for its forward pass.
        with torch.no_grad():
            teacher_logits = teacher_model(inputs)
        student_logits = student_model(inputs)

        # Each term is skipped (0.0) when its subset of the batch is empty.
        ce_term = (
            criterion_ce(student_logits[labeled_mask], labels[labeled_mask])
            if n_labeled > 0 else 0.0
        )
        kd_term = (
            criterion_kd(student_logits[unlabeled_mask], teacher_logits[unlabeled_mask], T)
            if n_unlabeled > 0 else 0.0
        )
        loss = (1 - alpha) * ce_term + alpha * kd_term

        loss.backward()
        optimizer.step()

        running_loss += loss.item() * batch_size
        seen += batch_size

        # Accuracy is only defined where ground-truth labels exist.
        if n_labeled > 0:
            preds = student_logits[labeled_mask].argmax(dim=1)
            correct += (preds == labels[labeled_mask]).sum().item()
            total += n_labeled

    epoch_loss = running_loss / seen if seen > 0 else 0.0
    epoch_acc = correct / total if total > 0 else 0.0
    return epoch_loss, epoch_acc

In [None]:
def validate(model, dataloader, criterion, device=None):
    """Evaluate ``model`` on ``dataloader`` without updating any weights.

    Args:
        model: Network to evaluate; it is switched to evaluation mode here.
        dataloader: Iterable yielding ``(images, labels)`` batches.
        criterion: Loss called as ``criterion(outputs, labels)``.
        device: Optional device each batch is moved to. When omitted, the
            device of the model's first parameter is used (CPU if the model
            has no parameters), so the function does not depend on a
            notebook-level ``device`` variable.

    Returns:
        ``(val_loss, val_acc)``: the sample-weighted average loss and the
        fraction of correct predictions. ``(0.0, 0.0)`` is returned when the
        loader yields no samples.
    """
    if device is None:
        first_param = next(model.parameters(), None)
        device = first_param.device if first_param is not None else torch.device('cpu')

    model.eval()
    running_loss = 0.0
    correct = 0
    total = 0

    with torch.no_grad():
        for imgs, labels in dataloader:
            imgs, labels = imgs.to(device), labels.to(device)
            outputs = model(imgs)
            loss = criterion(outputs, labels)

            batch_size = labels.size(0)
            running_loss += loss.item() * batch_size
            correct += (outputs.argmax(dim=1) == labels).sum().item()
            total += batch_size

    if total == 0:
        # Empty loader: avoid dividing by zero.
        return 0.0, 0.0
    return running_loss / total, correct / total

In [None]:
# Place the student on the compute device and set the distillation
# hyperparameters before building its optimizer.
student = student.to(device)
T = 5.0       # Temperature for KL divergence
alpha = 0.7   # weight of the distillation term in the combined loss
optimizer_student = torch.optim.Adam(params=student.parameters(), lr=3e-4)

for epoch in range(10):
    train_loss_student, train_acc_student = train_distillation_epoch(
        student_model=student,
        teacher_model=teacher_model,
        dataloader=train_loader,
        criterion_ce=ce_loss,
        criterion_kd=kd_loss,
        optimizer=optimizer_student,
        T=T,
        device=device,
        alpha=alpha,
    )
    val_loss_student, val_acc_student = validate(
        model=student, dataloader=val_loader, criterion=ce_loss
    )

    epoch_summary = (
        f'Epoch {epoch+1}: '
        f'Train Loss (Student): {train_loss_student:.4f} Acc (Labeled): {train_acc_student:.4f} | '
        f'Val Loss (Student): {val_loss_student:.4f} Acc: {val_acc_student:.4f}'
    )
    print(epoch_summary)



Epoch 1: Train Loss (Student): 5.1842 Acc (Labeled): 0.5381 | Val Loss (Student): 0.6721 Acc: 0.5556
Epoch 2: Train Loss (Student): 5.0698 Acc (Labeled): 0.5452 | Val Loss (Student): 1.4788 Acc: 0.5167
Epoch 3: Train Loss (Student): 4.8110 Acc (Labeled): 0.6000 | Val Loss (Student): 0.6650 Acc: 0.6667
Epoch 4: Train Loss (Student): 4.6098 Acc (Labeled): 0.6238 | Val Loss (Student): 0.8363 Acc: 0.6889
Epoch 5: Train Loss (Student): 4.4442 Acc (Labeled): 0.6738 | Val Loss (Student): 0.9813 Acc: 0.7056
Epoch 6: Train Loss (Student): 4.3574 Acc (Labeled): 0.6619 | Val Loss (Student): 1.0877 Acc: 0.7278
Epoch 7: Train Loss (Student): 4.1749 Acc (Labeled): 0.7167 | Val Loss (Student): 0.8840 Acc: 0.7222
Epoch 8: Train Loss (Student): 4.0479 Acc (Labeled): 0.7071 | Val Loss (Student): 0.6472 Acc: 0.7500
Epoch 9: Train Loss (Student): 4.0704 Acc (Labeled): 0.7095 | Val Loss (Student): 0.6726 Acc: 0.7833
Epoch 10: Train Loss (Student): 3.9455 Acc (Labeled): 0.7452 | Val Loss (Student): 0.8921 A

In [None]:
# Continue distillation for 10 more epochs (reported as epochs 11-20).
for epoch in range(10):
    train_loss_student, train_acc_student = train_distillation_epoch(
        student_model=student,
        teacher_model=teacher_model,
        dataloader=train_loader,
        criterion_ce=ce_loss,
        criterion_kd=kd_loss,
        optimizer=optimizer_student,
        T=T,
        device=device,
        alpha=alpha,
    )
    val_loss_student, val_acc_student = validate(
        model=student, dataloader=val_loader, criterion=ce_loss
    )

    epoch_summary = (
        f'Epoch {epoch+11}: '
        f'Train Loss (Student): {train_loss_student:.4f} Acc (Labeled): {train_acc_student:.4f} | '
        f'Val Loss (Student): {val_loss_student:.4f} Acc: {val_acc_student:.4f}'
    )
    print(epoch_summary)



Epoch 11: Train Loss (Student): 3.8369 Acc (Labeled): 0.7357 | Val Loss (Student): 1.0122 Acc: 0.7278
Epoch 12: Train Loss (Student): 3.7176 Acc (Labeled): 0.7214 | Val Loss (Student): 0.6087 Acc: 0.7944
Epoch 13: Train Loss (Student): 3.7597 Acc (Labeled): 0.7286 | Val Loss (Student): 0.7711 Acc: 0.7444
Epoch 14: Train Loss (Student): 3.6544 Acc (Labeled): 0.7738 | Val Loss (Student): 0.6089 Acc: 0.7889
Epoch 15: Train Loss (Student): 3.4878 Acc (Labeled): 0.7500 | Val Loss (Student): 0.7014 Acc: 0.7667
Epoch 16: Train Loss (Student): 3.6015 Acc (Labeled): 0.7524 | Val Loss (Student): 0.9612 Acc: 0.7889
Epoch 17: Train Loss (Student): 3.4703 Acc (Labeled): 0.7190 | Val Loss (Student): 0.6568 Acc: 0.8111
Epoch 18: Train Loss (Student): 3.3336 Acc (Labeled): 0.7571 | Val Loss (Student): 0.8136 Acc: 0.7833
Epoch 19: Train Loss (Student): 3.2784 Acc (Labeled): 0.7690 | Val Loss (Student): 0.6788 Acc: 0.8500
Epoch 20: Train Loss (Student): 3.1939 Acc (Labeled): 0.7548 | Val Loss (Student):

In [None]:
# Continue distillation for 5 more epochs (reported as epochs 21-25).
for epoch in range(5):
    train_loss_student, train_acc_student = train_distillation_epoch(
        student_model=student,
        teacher_model=teacher_model,
        dataloader=train_loader,
        criterion_ce=ce_loss,
        criterion_kd=kd_loss,
        optimizer=optimizer_student,
        T=T,
        device=device,
        alpha=alpha,
    )
    val_loss_student, val_acc_student = validate(
        model=student, dataloader=val_loader, criterion=ce_loss
    )

    epoch_summary = (
        f'Epoch {epoch+21}: '
        f'Train Loss (Student): {train_loss_student:.4f} Acc (Labeled): {train_acc_student:.4f} | '
        f'Val Loss (Student): {val_loss_student:.4f} Acc: {val_acc_student:.4f}'
    )
    print(epoch_summary)

Epoch 21: Train Loss (Student): 3.1795 Acc (Labeled): 0.7571 | Val Loss (Student): 0.6118 Acc: 0.8389
Epoch 22: Train Loss (Student): 3.1541 Acc (Labeled): 0.8048 | Val Loss (Student): 0.4959 Acc: 0.8444
Epoch 23: Train Loss (Student): 2.9786 Acc (Labeled): 0.7857 | Val Loss (Student): 0.4949 Acc: 0.8444
Epoch 24: Train Loss (Student): 3.0608 Acc (Labeled): 0.8000 | Val Loss (Student): 0.4247 Acc: 0.8667
Epoch 25: Train Loss (Student): 3.0114 Acc (Labeled): 0.8143 | Val Loss (Student): 0.5318 Acc: 0.8778


In [None]:
# Continue distillation for 5 more epochs (reported as epochs 26-30).
for epoch in range(5):
    train_loss_student, train_acc_student = train_distillation_epoch(
        student_model=student,
        teacher_model=teacher_model,
        dataloader=train_loader,
        criterion_ce=ce_loss,
        criterion_kd=kd_loss,
        optimizer=optimizer_student,
        T=T,
        device=device,
        alpha=alpha,
    )
    val_loss_student, val_acc_student = validate(
        model=student, dataloader=val_loader, criterion=ce_loss
    )

    epoch_summary = (
        f'Epoch {epoch+26}: '
        f'Train Loss (Student): {train_loss_student:.4f} Acc (Labeled): {train_acc_student:.4f} | '
        f'Val Loss (Student): {val_loss_student:.4f} Acc: {val_acc_student:.4f}'
    )
    print(epoch_summary)

Epoch 26: Train Loss (Student): 2.8043 Acc (Labeled): 0.7976 | Val Loss (Student): 0.6829 Acc: 0.8389
Epoch 27: Train Loss (Student): 2.7596 Acc (Labeled): 0.8143 | Val Loss (Student): 0.5620 Acc: 0.8556
Epoch 28: Train Loss (Student): 2.6699 Acc (Labeled): 0.8238 | Val Loss (Student): 0.4682 Acc: 0.8833
Epoch 29: Train Loss (Student): 2.5936 Acc (Labeled): 0.8214 | Val Loss (Student): 0.4698 Acc: 0.8667
Epoch 30: Train Loss (Student): 2.5022 Acc (Labeled): 0.8238 | Val Loss (Student): 0.3828 Acc: 0.8667


In [None]:
# Continue distillation for 5 more epochs (reported as epochs 31-35).
for epoch in range(5):
    train_loss_student, train_acc_student = train_distillation_epoch(
        student_model=student,
        teacher_model=teacher_model,
        dataloader=train_loader,
        criterion_ce=ce_loss,
        criterion_kd=kd_loss,
        optimizer=optimizer_student,
        T=T,
        device=device,
        alpha=alpha,
    )
    val_loss_student, val_acc_student = validate(
        model=student, dataloader=val_loader, criterion=ce_loss
    )

    epoch_summary = (
        f'Epoch {epoch+31}: '
        f'Train Loss (Student): {train_loss_student:.4f} Acc (Labeled): {train_acc_student:.4f} | '
        f'Val Loss (Student): {val_loss_student:.4f} Acc: {val_acc_student:.4f}'
    )
    print(epoch_summary)

Epoch 31: Train Loss (Student): 2.4667 Acc (Labeled): 0.8143 | Val Loss (Student): 0.4770 Acc: 0.8722
Epoch 32: Train Loss (Student): 2.4439 Acc (Labeled): 0.8429 | Val Loss (Student): 0.3129 Acc: 0.9111
Epoch 33: Train Loss (Student): 2.3199 Acc (Labeled): 0.8190 | Val Loss (Student): 0.3351 Acc: 0.8889
Epoch 34: Train Loss (Student): 2.3658 Acc (Labeled): 0.8595 | Val Loss (Student): 0.3397 Acc: 0.9000
Epoch 35: Train Loss (Student): 2.2309 Acc (Labeled): 0.8357 | Val Loss (Student): 0.4302 Acc: 0.8889


In [None]:
# Continue distillation for 5 more epochs (reported as epochs 36-40).
for epoch in range(5):
    train_loss_student, train_acc_student = train_distillation_epoch(
        student_model=student,
        teacher_model=teacher_model,
        dataloader=train_loader,
        criterion_ce=ce_loss,
        criterion_kd=kd_loss,
        optimizer=optimizer_student,
        T=T,
        device=device,
        alpha=alpha,
    )
    val_loss_student, val_acc_student = validate(
        model=student, dataloader=val_loader, criterion=ce_loss
    )

    epoch_summary = (
        f'Epoch {epoch+36}: '
        f'Train Loss (Student): {train_loss_student:.4f} Acc (Labeled): {train_acc_student:.4f} | '
        f'Val Loss (Student): {val_loss_student:.4f} Acc: {val_acc_student:.4f}'
    )
    print(epoch_summary)

In [None]:
# Persist the weights of the student distilled on the 6000-image set.
torch.save(obj=student.state_dict(), f='/content/drive/MyDrive/mods/only_distilled_student_6000.pth')

# 10 000 set

## Distillation

In [None]:
import torchvision.models as models
import torch.nn as nn

# Student network: MobileNetV2 with no pretrained weights loaded.
student = models.mobilenet_v2(weights=None)

# Replace classifier[1] with a 2-class linear head; `last_channel` is the
# number of features feeding the classifier.
num_classes = 2
student.classifier[1] = nn.Linear(
    in_features=student.last_channel,
    out_features=num_classes,
)

print("MobileNetV2 student model defined with classification head.")

MobileNetV2 student model defined with classification head.


In [None]:
import torchvision.models as models
import torch.nn as nn
import torch

# Checkpoint holding the fine-tuned teacher weights.
finetuned_checkpoint_path = '/content/drive/MyDrive/mods/resnet_finetune_only.pth'

# Choose the compute device first so the model can be moved once it is ready.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

# Rebuild the saved architecture: a plain ResNet-50 with a 2-way head
# (cat / dog) so the state-dict keys and shapes line up.
num_classes = 2
teacher_model = models.resnet50(weights=None)
num_ftrs = teacher_model.fc.in_features
teacher_model.fc = nn.Linear(in_features=num_ftrs, out_features=num_classes)

# Deserialize onto the CPU, then copy the weights into the fresh model.
teacher_state_dict = torch.load(finetuned_checkpoint_path, map_location='cpu')
teacher_model.load_state_dict(teacher_state_dict)

# The teacher is inference-only during distillation: eval mode, no gradients.
teacher_model.eval()
teacher_model.requires_grad_(False)

teacher_model = teacher_model.to(device)

print("Finetuned teacher model loaded for distillation.")

Finetuned teacher model loaded for distillation.


In [None]:
from torchvision import datasets, transforms
from torch.utils.data import Dataset, DataLoader, ConcatDataset
import torch
import torch.nn as nn
import torch.nn.functional as F

# --- transforms ---
# Training pipeline: random 224x224 crop, random horizontal flip and colour
# jitter, then tensor conversion and normalisation with ImageNet statistics.
train_transform = transforms.Compose([
    transforms.RandomResizedCrop(224),
    transforms.RandomHorizontalFlip(),
    transforms.ColorJitter(0.2, 0.2, 0.2, 0.1),
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.485, 0.456, 0.406],
                         std=[0.229, 0.224, 0.225])
])
# Validation pipeline: deterministic resize + centre crop, same normalisation.
val_transform = transforms.Compose([
    transforms.Resize(224),
    transforms.CenterCrop(224),
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.485, 0.456, 0.406],
                         std=[0.229, 0.224, 0.225])
])

# --- datasets ---
unlabeled = '/content/drive/MyDrive/pets/unlabeled_train'
labeled = '/content/drive/MyDrive/pets/finetune_train'
val = '/content/drive/MyDrive/pets/val'

# Load datasets (both training sets share the augmenting transform)
labeled_dataset = datasets.ImageFolder(labeled, transform=train_transform)
unlabeled_dataset = datasets.ImageFolder(unlabeled, transform=train_transform)


# Replace labels for unlabeled samples with the sentinel -1, which the
# distillation loop uses to tell unlabeled samples from labeled ones.
unlabeled_dataset.samples = [(path, -1) for (path, _) in unlabeled_dataset.samples]
# `targets` and `imgs` were filled from the folder structure when the dataset
# was constructed; refresh them so they agree with the relabelled samples
# instead of still reporting the folder-derived class indices.
unlabeled_dataset.targets = [label for (_, label) in unlabeled_dataset.samples]
unlabeled_dataset.imgs = unlabeled_dataset.samples

BATCH_SIZE = 64

# Combine labeled and unlabeled data into one shuffled training stream
combined_dataset = ConcatDataset([labeled_dataset, unlabeled_dataset])
train_loader = DataLoader(combined_dataset, batch_size=BATCH_SIZE, shuffle=True, num_workers=2)

val_dataset = datasets.ImageFolder(val, transform=val_transform)
val_loader = DataLoader(val_dataset, batch_size=BATCH_SIZE, shuffle=False)

print(f"Number of images in the labeled dataset: {len(labeled_dataset)}")
print(f"Number of images in the unlabeled dataset: {len(unlabeled_dataset)}")
print(f"Number of images in the val dataset: {len(val_dataset)}")

Number of images in the labeled dataset: 420
Number of images in the unlabeled dataset: 10000
Number of images in the val dataset: 180


In [None]:
ce_loss = nn.CrossEntropyLoss()

def kd_loss(student_logits, teacher_logits, T):
    """Temperature-scaled KL divergence between teacher and student outputs.

    Both logit tensors are divided by ``T`` and normalised over ``dim=1``
    (log-softmax for the student, softmax for the teacher). The divergence is
    reduced with ``'batchmean'`` and multiplied by ``T * T``.
    """
    temperature_sq = T * T
    student_log_probs = F.log_softmax(student_logits / T, dim=1)
    teacher_probs = F.softmax(teacher_logits / T, dim=1)
    divergence = F.kl_div(student_log_probs, teacher_probs, reduction='batchmean')
    return divergence * temperature_sq

In [None]:
def train_distillation_epoch(student_model, teacher_model, dataloader, criterion_ce, criterion_kd, optimizer, T, device, alpha):
    """Run one semi-supervised distillation epoch, updating the student in place.

    A batch may mix labeled samples (label != -1) and unlabeled samples
    (label == -1). Labeled samples contribute ``criterion_ce`` on the student
    logits; unlabeled samples contribute ``criterion_kd`` between student and
    teacher logits. The batch loss is ``(1 - alpha) * ce + alpha * kd``.

    Args:
        student_model: Model being trained; switched to train mode.
        teacher_model: Model providing target logits; switched to eval mode
            and only ever run under ``torch.no_grad()``.
        dataloader: Yields ``(inputs, labels)`` batches and exposes ``.dataset``
            (its length normalises the epoch loss).
        criterion_ce: Callable ``(logits, labels) -> loss`` for labeled samples.
        criterion_kd: Callable ``(student_logits, teacher_logits, T) -> loss``
            for unlabeled samples.
        optimizer: Optimizer over the student's parameters.
        T: Temperature forwarded to ``criterion_kd``.
        device: Device the batches are moved to.
        alpha: Weight of the distillation term; the supervised term gets ``1 - alpha``.

    Returns:
        Tuple ``(epoch_loss, epoch_acc)``: the batch losses weighted by batch
        size and divided by ``len(dataloader.dataset)``, and the student's
        accuracy on the labeled samples only (0.0 if none were seen).
    """
    student_model.train()
    # The teacher only supplies targets. Force eval mode so a caller that forgot
    # `.eval()` does not get dropout noise or silently updated BatchNorm running
    # statistics (those update in train mode even under no_grad).
    teacher_model.eval()

    running_loss = 0.0
    correct = 0
    total = 0

    for inputs, labels in dataloader:
        inputs, labels = inputs.to(device), labels.to(device)

        optimizer.zero_grad()

        # Teacher forward pass needs no graph; the student's does
        with torch.no_grad():
            teacher_logits = teacher_model(inputs)
        student_logits = student_model(inputs)

        # Split the batch by the -1 sentinel
        labeled_mask = (labels != -1)
        unlabeled_mask = (labels == -1)
        num_labeled = int(labeled_mask.sum().item())
        num_unlabeled = int(unlabeled_mask.sum().item())

        # Supervised term on labeled samples; skipped (0) when the batch has none.
        # Named differently from the notebook-level `ce_loss` criterion on purpose.
        supervised_loss = (
            criterion_ce(student_logits[labeled_mask], labels[labeled_mask])
            if num_labeled > 0 else 0
        )

        # Distillation term on unlabeled samples; skipped (0) when the batch has none
        distill_loss = (
            criterion_kd(student_logits[unlabeled_mask], teacher_logits[unlabeled_mask], T)
            if num_unlabeled > 0 else 0
        )

        # Combine losses
        loss = (1 - alpha) * supervised_loss + alpha * distill_loss

        # Backpropagate and optimize
        loss.backward()
        optimizer.step()

        # Weight by batch size so the epoch value is a per-sample average
        running_loss += loss.item() * inputs.size(0)
        # Accuracy is only defined for samples that carry a real label
        if num_labeled > 0:
            _, preds = torch.max(student_logits[labeled_mask], 1)
            correct += (preds == labels[labeled_mask]).sum().item()
            total += num_labeled

    epoch_loss = running_loss / len(dataloader.dataset)
    epoch_acc = correct / total if total > 0 else 0.0
    return epoch_loss, epoch_acc

In [None]:
def validate(model, dataloader, criterion, device=None):
    """Evaluate ``model`` on ``dataloader`` and return ``(mean_loss, accuracy)``.

    Args:
        model: Network to evaluate; it is switched to eval mode and left there.
        dataloader: Iterable of ``(inputs, labels)`` batches.
        criterion: Callable ``criterion(outputs, labels)``; its result is
            re-weighted by the batch size, so it is assumed to be a batch mean.
        device: Device the batches are moved to. When omitted, the device of
            the model's first parameter is used (CPU for a parameter-free
            model), so the function does not rely on a notebook-level
            ``device`` variable.

    Returns:
        Tuple ``(val_loss, val_acc)``: per-sample average loss and the fraction
        of samples whose arg-max prediction equals the label.

    Raises:
        ValueError: If the dataloader yields no samples.
    """
    if device is None:
        first_param = next(model.parameters(), None)
        device = first_param.device if first_param is not None else torch.device('cpu')

    model.eval()
    running_loss = 0
    correct = 0
    total = 0

    with torch.no_grad():
        for imgs, labels in dataloader:
            imgs, labels = imgs.to(device), labels.to(device)
            outputs = model(imgs)
            loss = criterion(outputs, labels)

            # Undo the batch mean so batches of different size are weighted fairly
            running_loss += loss.item() * imgs.size(0)
            _, preds = torch.max(outputs, 1)
            correct += (preds == labels).sum().item()
            total += labels.size(0)

    if total == 0:
        # Explicit error instead of an opaque ZeroDivisionError below
        raise ValueError("validate() received a dataloader that yielded no samples")

    val_loss = running_loss / total
    val_acc = correct / total
    return val_loss, val_acc

In [None]:
# Distillation hyper-parameters
T = 5.0      # Temperature for KL divergence
alpha = 0.8  # Weight of the KD term; the cross-entropy term gets (1 - alpha)

# Move the student to the training device *before* building the optimizer, so
# the optimizer is constructed over parameters that already live where they
# will be trained (the order recommended by the torch.optim documentation).
student = student.to(device)
optimizer_student = torch.optim.Adam(student.parameters(), lr=3e-4)

# First distillation run: 15 epochs, validating after each one
for epoch in range(15):
    train_loss_student, train_acc_student = train_distillation_epoch(
        student, teacher_model, train_loader, ce_loss, kd_loss, optimizer_student, T, device, alpha
    )
    val_loss_student, val_acc_student = validate(student, val_loader, ce_loss)

    print(f'Epoch {epoch+1}: '
          f'Train Loss (Student): {train_loss_student:.4f} Acc (Labeled): {train_acc_student:.4f} | '
          f'Val Loss (Student): {val_loss_student:.4f} Acc: {val_acc_student:.4f}')



Epoch 1: Train Loss (Student): 5.6798 Acc (Labeled): 0.5738 | Val Loss (Student): 0.8933 Acc: 0.6111
Epoch 2: Train Loss (Student): 5.3400 Acc (Labeled): 0.6095 | Val Loss (Student): 1.3802 Acc: 0.6000
Epoch 3: Train Loss (Student): 5.0225 Acc (Labeled): 0.6405 | Val Loss (Student): 0.7372 Acc: 0.7333
Epoch 4: Train Loss (Student): 4.8159 Acc (Labeled): 0.6929 | Val Loss (Student): 0.5996 Acc: 0.7111
Epoch 5: Train Loss (Student): 4.6569 Acc (Labeled): 0.6857 | Val Loss (Student): 0.7013 Acc: 0.7444
Epoch 6: Train Loss (Student): 4.4295 Acc (Labeled): 0.6857 | Val Loss (Student): 0.6367 Acc: 0.7333
Epoch 7: Train Loss (Student): 4.3009 Acc (Labeled): 0.7238 | Val Loss (Student): 0.6250 Acc: 0.7833
Epoch 8: Train Loss (Student): 4.1574 Acc (Labeled): 0.7429 | Val Loss (Student): 0.8455 Acc: 0.7667
Epoch 9: Train Loss (Student): 4.0118 Acc (Labeled): 0.7310 | Val Loss (Student): 0.5925 Acc: 0.8222
Epoch 10: Train Loss (Student): 3.9256 Acc (Labeled): 0.7524 | Val Loss (Student): 0.6006 A

## Distillation (2)

In [None]:
import torch
import torchvision.models as models
import torch.nn as nn

# Rebuild the student architecture: MobileNetV2 without pretrained weights,
# with the last classifier layer replaced by a 2-class linear layer
# (`last_channel` is the feature width that layer receives).
num_classes = 2
student = models.mobilenet_v2(weights=None)
student.classifier[1] = nn.Linear(
    in_features=student.last_channel,
    out_features=num_classes,
)

# Restore the weights saved by the previous distillation run; strict=True
# requires the checkpoint keys to match the model exactly.
student_state_dict = torch.load('/content/drive/MyDrive/mods/only_distilled_student_10000.pth', map_location='cpu')
student.load_state_dict(student_state_dict, strict=True)

# Move the model to the GPU when available, otherwise keep it on the CPU
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
student = student.to(device)
print("MobileNetV2 student model defined with classification head.")

MobileNetV2 student model defined with classification head.


In [None]:
import torchvision.models as models
import torch.nn as nn
import torch

# Checkpoint holding the fine-tuned teacher's state dict
finetuned_checkpoint_path = '/content/drive/MyDrive/mods/resnet_finetune_only.pth'

# Use the GPU when one is available, otherwise fall back to the CPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Rebuild the architecture the checkpoint was saved from: a plain ResNet50
# (no ImageNet weights) whose final fully connected layer outputs 2 classes (Cat/Dog)
num_classes = 2
teacher_model = models.resnet50(weights=None)
num_ftrs = teacher_model.fc.in_features
teacher_model.fc = nn.Linear(num_ftrs, num_classes)

# Read the weights onto the CPU first, then copy them into the model.
# load_state_dict runs with its default strict matching, so the rebuilt
# structure has to line up with the saved keys.
teacher_state_dict = torch.load(finetuned_checkpoint_path, map_location='cpu')
teacher_model.load_state_dict(teacher_state_dict)

# The teacher is only used for inference: evaluation mode, no gradients
teacher_model.eval()
for param in teacher_model.parameters():
    param.requires_grad = False

# Place the frozen teacher on the selected device
teacher_model = teacher_model.to(device)

print("Finetuned teacher model loaded for distillation.")

Finetuned teacher model loaded for distillation.


In [None]:
from torchvision import datasets, transforms
from torch.utils.data import Dataset, DataLoader, ConcatDataset
import torch
import torch.nn as nn
import torch.nn.functional as F

# --- transforms ---
# Training pipeline: random 224x224 crop, random horizontal flip and colour
# jitter, then tensor conversion and normalisation with ImageNet statistics.
train_transform = transforms.Compose([
    transforms.RandomResizedCrop(224),
    transforms.RandomHorizontalFlip(),
    transforms.ColorJitter(0.2, 0.2, 0.2, 0.1),
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.485, 0.456, 0.406],
                         std=[0.229, 0.224, 0.225])
])
# Validation pipeline: deterministic resize + centre crop, same normalisation.
val_transform = transforms.Compose([
    transforms.Resize(224),
    transforms.CenterCrop(224),
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.485, 0.456, 0.406],
                         std=[0.229, 0.224, 0.225])
])

# --- datasets ---
unlabeled = '/content/drive/MyDrive/pets/unlabeled_train'
labeled = '/content/drive/MyDrive/pets/finetune_train'
val = '/content/drive/MyDrive/pets/val'

# Load datasets (both training sets share the augmenting transform)
labeled_dataset = datasets.ImageFolder(labeled, transform=train_transform)
unlabeled_dataset = datasets.ImageFolder(unlabeled, transform=train_transform)


# Replace labels for unlabeled samples with the sentinel -1, which the
# distillation loop uses to tell unlabeled samples from labeled ones.
unlabeled_dataset.samples = [(path, -1) for (path, _) in unlabeled_dataset.samples]
# `targets` and `imgs` were filled from the folder structure when the dataset
# was constructed; refresh them so they agree with the relabelled samples
# instead of still reporting the folder-derived class indices.
unlabeled_dataset.targets = [label for (_, label) in unlabeled_dataset.samples]
unlabeled_dataset.imgs = unlabeled_dataset.samples

BATCH_SIZE = 64

# Combine labeled and unlabeled data into one shuffled training stream
combined_dataset = ConcatDataset([labeled_dataset, unlabeled_dataset])
train_loader = DataLoader(combined_dataset, batch_size=BATCH_SIZE, shuffle=True, num_workers=2)

val_dataset = datasets.ImageFolder(val, transform=val_transform)
val_loader = DataLoader(val_dataset, batch_size=BATCH_SIZE, shuffle=False)

print(f"Number of images in the labeled dataset: {len(labeled_dataset)}")
print(f"Number of images in the unlabeled dataset: {len(unlabeled_dataset)}")
print(f"Number of images in the val dataset: {len(val_dataset)}")

Number of images in the labeled dataset: 420
Number of images in the unlabeled dataset: 10000
Number of images in the val dataset: 180


In [None]:
ce_loss = nn.CrossEntropyLoss()

def kd_loss(student_logits, teacher_logits, T):
    """Temperature-scaled KL divergence between teacher and student outputs.

    Both logit tensors are divided by ``T`` and normalised over ``dim=1``
    (log-softmax for the student, softmax for the teacher). The divergence is
    reduced with ``'batchmean'`` and multiplied by ``T * T``.
    """
    temperature_sq = T * T
    student_log_probs = F.log_softmax(student_logits / T, dim=1)
    teacher_probs = F.softmax(teacher_logits / T, dim=1)
    divergence = F.kl_div(student_log_probs, teacher_probs, reduction='batchmean')
    return divergence * temperature_sq

In [None]:
def train_distillation_epoch(student_model, teacher_model, dataloader, criterion_ce, criterion_kd, optimizer, T, device, alpha):
    """Run one semi-supervised distillation epoch, updating the student in place.

    A batch may mix labeled samples (label != -1) and unlabeled samples
    (label == -1). Labeled samples contribute ``criterion_ce`` on the student
    logits; unlabeled samples contribute ``criterion_kd`` between student and
    teacher logits. The batch loss is ``(1 - alpha) * ce + alpha * kd``.

    Args:
        student_model: Model being trained; switched to train mode.
        teacher_model: Model providing target logits; switched to eval mode
            and only ever run under ``torch.no_grad()``.
        dataloader: Yields ``(inputs, labels)`` batches and exposes ``.dataset``
            (its length normalises the epoch loss).
        criterion_ce: Callable ``(logits, labels) -> loss`` for labeled samples.
        criterion_kd: Callable ``(student_logits, teacher_logits, T) -> loss``
            for unlabeled samples.
        optimizer: Optimizer over the student's parameters.
        T: Temperature forwarded to ``criterion_kd``.
        device: Device the batches are moved to.
        alpha: Weight of the distillation term; the supervised term gets ``1 - alpha``.

    Returns:
        Tuple ``(epoch_loss, epoch_acc)``: the batch losses weighted by batch
        size and divided by ``len(dataloader.dataset)``, and the student's
        accuracy on the labeled samples only (0.0 if none were seen).
    """
    student_model.train()
    # The teacher only supplies targets. Force eval mode so a caller that forgot
    # `.eval()` does not get dropout noise or silently updated BatchNorm running
    # statistics (those update in train mode even under no_grad).
    teacher_model.eval()

    running_loss = 0.0
    correct = 0
    total = 0

    for inputs, labels in dataloader:
        inputs, labels = inputs.to(device), labels.to(device)

        optimizer.zero_grad()

        # Teacher forward pass needs no graph; the student's does
        with torch.no_grad():
            teacher_logits = teacher_model(inputs)
        student_logits = student_model(inputs)

        # Split the batch by the -1 sentinel
        labeled_mask = (labels != -1)
        unlabeled_mask = (labels == -1)
        num_labeled = int(labeled_mask.sum().item())
        num_unlabeled = int(unlabeled_mask.sum().item())

        # Supervised term on labeled samples; skipped (0) when the batch has none.
        # Named differently from the notebook-level `ce_loss` criterion on purpose.
        supervised_loss = (
            criterion_ce(student_logits[labeled_mask], labels[labeled_mask])
            if num_labeled > 0 else 0
        )

        # Distillation term on unlabeled samples; skipped (0) when the batch has none
        distill_loss = (
            criterion_kd(student_logits[unlabeled_mask], teacher_logits[unlabeled_mask], T)
            if num_unlabeled > 0 else 0
        )

        # Combine losses
        loss = (1 - alpha) * supervised_loss + alpha * distill_loss

        # Backpropagate and optimize
        loss.backward()
        optimizer.step()

        # Weight by batch size so the epoch value is a per-sample average
        running_loss += loss.item() * inputs.size(0)
        # Accuracy is only defined for samples that carry a real label
        if num_labeled > 0:
            _, preds = torch.max(student_logits[labeled_mask], 1)
            correct += (preds == labels[labeled_mask]).sum().item()
            total += num_labeled

    epoch_loss = running_loss / len(dataloader.dataset)
    epoch_acc = correct / total if total > 0 else 0.0
    return epoch_loss, epoch_acc

In [None]:
def validate(model, dataloader, criterion, device=None):
    """Evaluate ``model`` on ``dataloader`` and return ``(mean_loss, accuracy)``.

    Args:
        model: Network to evaluate; it is switched to eval mode and left there.
        dataloader: Iterable of ``(inputs, labels)`` batches.
        criterion: Callable ``criterion(outputs, labels)``; its result is
            re-weighted by the batch size, so it is assumed to be a batch mean.
        device: Device the batches are moved to. When omitted, the device of
            the model's first parameter is used (CPU for a parameter-free
            model), so the function does not rely on a notebook-level
            ``device`` variable.

    Returns:
        Tuple ``(val_loss, val_acc)``: per-sample average loss and the fraction
        of samples whose arg-max prediction equals the label.

    Raises:
        ValueError: If the dataloader yields no samples.
    """
    if device is None:
        first_param = next(model.parameters(), None)
        device = first_param.device if first_param is not None else torch.device('cpu')

    model.eval()
    running_loss = 0
    correct = 0
    total = 0

    with torch.no_grad():
        for imgs, labels in dataloader:
            imgs, labels = imgs.to(device), labels.to(device)
            outputs = model(imgs)
            loss = criterion(outputs, labels)

            # Undo the batch mean so batches of different size are weighted fairly
            running_loss += loss.item() * imgs.size(0)
            _, preds = torch.max(outputs, 1)
            correct += (preds == labels).sum().item()
            total += labels.size(0)

    if total == 0:
        # Explicit error instead of an opaque ZeroDivisionError below
        raise ValueError("validate() received a dataloader that yielded no samples")

    val_loss = running_loss / total
    val_acc = correct / total
    return val_loss, val_acc

In [None]:
# Continue distillation for 10 more epochs, reported as epochs 16-25.
# `student` was already placed on `device` when its checkpoint was restored.
T = 5.0      # Temperature for KL divergence
alpha = 0.8  # Weight of the KD term; the cross-entropy term gets (1 - alpha)

# A fresh Adam optimizer is built here, so it starts without any state from
# the earlier run.
optimizer_student = torch.optim.Adam(student.parameters(), lr=3e-4)

for epoch in range(10):
    train_loss_student, train_acc_student = train_distillation_epoch(
        student,
        teacher_model,
        train_loader,
        ce_loss,
        kd_loss,
        optimizer_student,
        T,
        device,
        alpha,
    )
    val_loss_student, val_acc_student = validate(student, val_loader, ce_loss)

    # The +16 offset continues the epoch numbering of the first run
    print(f'Epoch {epoch+16}: '
          f'Train Loss (Student): {train_loss_student:.4f} Acc (Labeled): {train_acc_student:.4f} | '
          f'Val Loss (Student): {val_loss_student:.4f} Acc: {val_acc_student:.4f}')



Epoch 16: Train Loss (Student): 3.1788 Acc (Labeled): 0.7762 | Val Loss (Student): 0.4500 Acc: 0.8444
Epoch 17: Train Loss (Student): 3.0052 Acc (Labeled): 0.7952 | Val Loss (Student): 0.4444 Acc: 0.8611
Epoch 18: Train Loss (Student): 2.8918 Acc (Labeled): 0.7976 | Val Loss (Student): 0.4829 Acc: 0.8722
Epoch 19: Train Loss (Student): 2.7401 Acc (Labeled): 0.8238 | Val Loss (Student): 0.3349 Acc: 0.8944
Epoch 20: Train Loss (Student): 2.6777 Acc (Labeled): 0.8143 | Val Loss (Student): 0.3769 Acc: 0.8778
Epoch 21: Train Loss (Student): 2.4268 Acc (Labeled): 0.8357 | Val Loss (Student): 0.3770 Acc: 0.8833
Epoch 22: Train Loss (Student): 2.4466 Acc (Labeled): 0.8357 | Val Loss (Student): 0.5768 Acc: 0.8500
Epoch 23: Train Loss (Student): 2.2970 Acc (Labeled): 0.8571 | Val Loss (Student): 0.4746 Acc: 0.8833
Epoch 24: Train Loss (Student): 2.2736 Acc (Labeled): 0.8381 | Val Loss (Student): 0.3630 Acc: 0.9056
Epoch 25: Train Loss (Student): 2.1969 Acc (Labeled): 0.8500 | Val Loss (Student):

In [None]:
# Checkpoint the student's weights (state dict only) after this continuation run
torch.save(
    student.state_dict(),
    '/content/drive/MyDrive/mods/only_distilled_student_10000_cont.pth',
)

## Distillation (3)

In [None]:
import torch
import torchvision.models as models
import torch.nn as nn

# Rebuild the student architecture: MobileNetV2 without pretrained weights,
# with the last classifier layer replaced by a 2-class linear layer
# (`last_channel` is the feature width that layer receives).
num_classes = 2
student = models.mobilenet_v2(weights=None)
student.classifier[1] = nn.Linear(
    in_features=student.last_channel,
    out_features=num_classes,
)

# Restore the weights saved by the previous continuation run; strict=True
# requires the checkpoint keys to match the model exactly.
student_state_dict = torch.load('/content/drive/MyDrive/mods/only_distilled_student_10000_cont.pth', map_location='cpu')
student.load_state_dict(student_state_dict, strict=True)

# Move the model to the GPU when available, otherwise keep it on the CPU
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
student = student.to(device)
print("MobileNetV2 student model defined with classification head.")

MobileNetV2 student model defined with classification head.


In [None]:
import torchvision.models as models
import torch.nn as nn
import torch

# Checkpoint holding the fine-tuned teacher's state dict
finetuned_checkpoint_path = '/content/drive/MyDrive/mods/resnet_finetune_only.pth'

# Use the GPU when one is available, otherwise fall back to the CPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Rebuild the architecture the checkpoint was saved from: a plain ResNet50
# (no ImageNet weights) whose final fully connected layer outputs 2 classes (Cat/Dog)
num_classes = 2
teacher_model = models.resnet50(weights=None)
num_ftrs = teacher_model.fc.in_features
teacher_model.fc = nn.Linear(num_ftrs, num_classes)

# Read the weights onto the CPU first, then copy them into the model.
# load_state_dict runs with its default strict matching, so the rebuilt
# structure has to line up with the saved keys.
teacher_state_dict = torch.load(finetuned_checkpoint_path, map_location='cpu')
teacher_model.load_state_dict(teacher_state_dict)

# The teacher is only used for inference: evaluation mode, no gradients
teacher_model.eval()
for param in teacher_model.parameters():
    param.requires_grad = False

# Place the frozen teacher on the selected device
teacher_model = teacher_model.to(device)

print("Finetuned teacher model loaded for distillation.")

Finetuned teacher model loaded for distillation.


In [None]:
from torchvision import datasets, transforms
from torch.utils.data import Dataset, DataLoader, ConcatDataset
import torch
import torch.nn as nn
import torch.nn.functional as F

# --- transforms ---
# Training pipeline: random 224x224 crop, random horizontal flip and colour
# jitter, then tensor conversion and normalisation with ImageNet statistics.
train_transform = transforms.Compose([
    transforms.RandomResizedCrop(224),
    transforms.RandomHorizontalFlip(),
    transforms.ColorJitter(0.2, 0.2, 0.2, 0.1),
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.485, 0.456, 0.406],
                         std=[0.229, 0.224, 0.225])
])
# Validation pipeline: deterministic resize + centre crop, same normalisation.
val_transform = transforms.Compose([
    transforms.Resize(224),
    transforms.CenterCrop(224),
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.485, 0.456, 0.406],
                         std=[0.229, 0.224, 0.225])
])

# --- datasets ---
unlabeled = '/content/drive/MyDrive/pets/unlabeled_train'
labeled = '/content/drive/MyDrive/pets/finetune_train'
val = '/content/drive/MyDrive/pets/val'

# Load datasets (both training sets share the augmenting transform)
labeled_dataset = datasets.ImageFolder(labeled, transform=train_transform)
unlabeled_dataset = datasets.ImageFolder(unlabeled, transform=train_transform)


# Replace labels for unlabeled samples with the sentinel -1, which the
# distillation loop uses to tell unlabeled samples from labeled ones.
unlabeled_dataset.samples = [(path, -1) for (path, _) in unlabeled_dataset.samples]
# `targets` and `imgs` were filled from the folder structure when the dataset
# was constructed; refresh them so they agree with the relabelled samples
# instead of still reporting the folder-derived class indices.
unlabeled_dataset.targets = [label for (_, label) in unlabeled_dataset.samples]
unlabeled_dataset.imgs = unlabeled_dataset.samples

BATCH_SIZE = 64

# Combine labeled and unlabeled data into one shuffled training stream
combined_dataset = ConcatDataset([labeled_dataset, unlabeled_dataset])
train_loader = DataLoader(combined_dataset, batch_size=BATCH_SIZE, shuffle=True, num_workers=2)

val_dataset = datasets.ImageFolder(val, transform=val_transform)
val_loader = DataLoader(val_dataset, batch_size=BATCH_SIZE, shuffle=False)

print(f"Number of images in the labeled dataset: {len(labeled_dataset)}")
print(f"Number of images in the unlabeled dataset: {len(unlabeled_dataset)}")
print(f"Number of images in the val dataset: {len(val_dataset)}")

Number of images in the labeled dataset: 420
Number of images in the unlabeled dataset: 10000
Number of images in the val dataset: 180


In [None]:
ce_loss = nn.CrossEntropyLoss()

def kd_loss(student_logits, teacher_logits, T):
    """Temperature-scaled KL divergence between teacher and student outputs.

    Both logit tensors are divided by ``T`` and normalised over ``dim=1``
    (log-softmax for the student, softmax for the teacher). The divergence is
    reduced with ``'batchmean'`` and multiplied by ``T * T``.
    """
    temperature_sq = T * T
    student_log_probs = F.log_softmax(student_logits / T, dim=1)
    teacher_probs = F.softmax(teacher_logits / T, dim=1)
    divergence = F.kl_div(student_log_probs, teacher_probs, reduction='batchmean')
    return divergence * temperature_sq

In [None]:
def train_distillation_epoch(student_model, teacher_model, dataloader, criterion_ce, criterion_kd, optimizer, T, device, alpha):
    """Run one semi-supervised distillation epoch, updating the student in place.

    A batch may mix labeled samples (label != -1) and unlabeled samples
    (label == -1). Labeled samples contribute ``criterion_ce`` on the student
    logits; unlabeled samples contribute ``criterion_kd`` between student and
    teacher logits. The batch loss is ``(1 - alpha) * ce + alpha * kd``.

    Args:
        student_model: Model being trained; switched to train mode.
        teacher_model: Model providing target logits; switched to eval mode
            and only ever run under ``torch.no_grad()``.
        dataloader: Yields ``(inputs, labels)`` batches and exposes ``.dataset``
            (its length normalises the epoch loss).
        criterion_ce: Callable ``(logits, labels) -> loss`` for labeled samples.
        criterion_kd: Callable ``(student_logits, teacher_logits, T) -> loss``
            for unlabeled samples.
        optimizer: Optimizer over the student's parameters.
        T: Temperature forwarded to ``criterion_kd``.
        device: Device the batches are moved to.
        alpha: Weight of the distillation term; the supervised term gets ``1 - alpha``.

    Returns:
        Tuple ``(epoch_loss, epoch_acc)``: the batch losses weighted by batch
        size and divided by ``len(dataloader.dataset)``, and the student's
        accuracy on the labeled samples only (0.0 if none were seen).
    """
    student_model.train()
    # The teacher only supplies targets. Force eval mode so a caller that forgot
    # `.eval()` does not get dropout noise or silently updated BatchNorm running
    # statistics (those update in train mode even under no_grad).
    teacher_model.eval()

    running_loss = 0.0
    correct = 0
    total = 0

    for inputs, labels in dataloader:
        inputs, labels = inputs.to(device), labels.to(device)

        optimizer.zero_grad()

        # Teacher forward pass needs no graph; the student's does
        with torch.no_grad():
            teacher_logits = teacher_model(inputs)
        student_logits = student_model(inputs)

        # Split the batch by the -1 sentinel
        labeled_mask = (labels != -1)
        unlabeled_mask = (labels == -1)
        num_labeled = int(labeled_mask.sum().item())
        num_unlabeled = int(unlabeled_mask.sum().item())

        # Supervised term on labeled samples; skipped (0) when the batch has none.
        # Named differently from the notebook-level `ce_loss` criterion on purpose.
        supervised_loss = (
            criterion_ce(student_logits[labeled_mask], labels[labeled_mask])
            if num_labeled > 0 else 0
        )

        # Distillation term on unlabeled samples; skipped (0) when the batch has none
        distill_loss = (
            criterion_kd(student_logits[unlabeled_mask], teacher_logits[unlabeled_mask], T)
            if num_unlabeled > 0 else 0
        )

        # Combine losses
        loss = (1 - alpha) * supervised_loss + alpha * distill_loss

        # Backpropagate and optimize
        loss.backward()
        optimizer.step()

        # Weight by batch size so the epoch value is a per-sample average
        running_loss += loss.item() * inputs.size(0)
        # Accuracy is only defined for samples that carry a real label
        if num_labeled > 0:
            _, preds = torch.max(student_logits[labeled_mask], 1)
            correct += (preds == labels[labeled_mask]).sum().item()
            total += num_labeled

    epoch_loss = running_loss / len(dataloader.dataset)
    epoch_acc = correct / total if total > 0 else 0.0
    return epoch_loss, epoch_acc

In [None]:
def validate(model, dataloader, criterion, device=None):
    """Evaluate ``model`` on ``dataloader`` and return ``(mean_loss, accuracy)``.

    Args:
        model: Network to evaluate; it is switched to eval mode and left there.
        dataloader: Iterable of ``(inputs, labels)`` batches.
        criterion: Callable ``criterion(outputs, labels)``; its result is
            re-weighted by the batch size, so it is assumed to be a batch mean.
        device: Device the batches are moved to. When omitted, the device of
            the model's first parameter is used (CPU for a parameter-free
            model), so the function does not rely on a notebook-level
            ``device`` variable.

    Returns:
        Tuple ``(val_loss, val_acc)``: per-sample average loss and the fraction
        of samples whose arg-max prediction equals the label.

    Raises:
        ValueError: If the dataloader yields no samples.
    """
    if device is None:
        first_param = next(model.parameters(), None)
        device = first_param.device if first_param is not None else torch.device('cpu')

    model.eval()
    running_loss = 0
    correct = 0
    total = 0

    with torch.no_grad():
        for imgs, labels in dataloader:
            imgs, labels = imgs.to(device), labels.to(device)
            outputs = model(imgs)
            loss = criterion(outputs, labels)

            # Undo the batch mean so batches of different size are weighted fairly
            running_loss += loss.item() * imgs.size(0)
            _, preds = torch.max(outputs, 1)
            correct += (preds == labels).sum().item()
            total += labels.size(0)

    if total == 0:
        # Explicit error instead of an opaque ZeroDivisionError below
        raise ValueError("validate() received a dataloader that yielded no samples")

    val_loss = running_loss / total
    val_acc = correct / total
    return val_loss, val_acc

In [None]:
# Continue distillation for 10 more epochs, reported as epochs 26-35.
# `student` was already placed on `device` when its checkpoint was restored.
T = 5.0      # Temperature for KL divergence
alpha = 0.8  # Weight of the KD term; the cross-entropy term gets (1 - alpha)

# A fresh Adam optimizer is built here, so it starts without any state from
# the earlier runs.
optimizer_student = torch.optim.Adam(student.parameters(), lr=3e-4)

for epoch in range(10):
    train_loss_student, train_acc_student = train_distillation_epoch(
        student,
        teacher_model,
        train_loader,
        ce_loss,
        kd_loss,
        optimizer_student,
        T,
        device,
        alpha,
    )
    val_loss_student, val_acc_student = validate(student, val_loader, ce_loss)

    # The +26 offset continues the epoch numbering of the earlier runs
    print(f'Epoch {epoch+26}: '
          f'Train Loss (Student): {train_loss_student:.4f} Acc (Labeled): {train_acc_student:.4f} | '
          f'Val Loss (Student): {val_loss_student:.4f} Acc: {val_acc_student:.4f}')

    # Rolling checkpoint: written after the 5th and 10th epoch of this run
    if epoch % 5 == 4:
        torch.save(student.state_dict(), '/content/drive/MyDrive/mods/only_distilled_student_10000_epoch.pth')


Epoch 26: Train Loss (Student): 2.1166 Acc (Labeled): 0.8429 | Val Loss (Student): 0.2889 Acc: 0.9389
Epoch 27: Train Loss (Student): 2.0745 Acc (Labeled): 0.8595 | Val Loss (Student): 0.3250 Acc: 0.9111
Epoch 28: Train Loss (Student): 1.9576 Acc (Labeled): 0.8643 | Val Loss (Student): 0.3833 Acc: 0.9000
Epoch 29: Train Loss (Student): 1.8979 Acc (Labeled): 0.8643 | Val Loss (Student): 0.2492 Acc: 0.9167
Epoch 30: Train Loss (Student): 1.8586 Acc (Labeled): 0.8738 | Val Loss (Student): 0.2342 Acc: 0.9333
Epoch 31: Train Loss (Student): 1.7760 Acc (Labeled): 0.8571 | Val Loss (Student): 0.2642 Acc: 0.9333
Epoch 32: Train Loss (Student): 1.7745 Acc (Labeled): 0.8571 | Val Loss (Student): 0.2064 Acc: 0.9444
Epoch 33: Train Loss (Student): 1.7592 Acc (Labeled): 0.8595 | Val Loss (Student): 0.1629 Acc: 0.9444
Epoch 34: Train Loss (Student): 1.7221 Acc (Labeled): 0.8714 | Val Loss (Student): 0.1835 Acc: 0.9444
Epoch 35: Train Loss (Student): 1.6156 Acc (Labeled): 0.8714 | Val Loss (Student):

In [None]:
# Write the student's current weights (state dict only) to the *_cont.pth
# checkpoint; this is the same file this section restored the student from,
# so the earlier contents are replaced.
torch.save(
    student.state_dict(),
    '/content/drive/MyDrive/mods/only_distilled_student_10000_cont.pth',
)

In [None]:
# Eight more epochs (reported as 36-43), reusing the optimizer and
# hyper-parameters that are already defined in the notebook.
for epoch in range(8):
    train_loss_student, train_acc_student = train_distillation_epoch(
        student,
        teacher_model,
        train_loader,
        ce_loss,
        kd_loss,
        optimizer_student,
        T,
        device,
        alpha,
    )
    val_loss_student, val_acc_student = validate(student, val_loader, ce_loss)

    # Rolling checkpoint after every second epoch of this run
    if epoch % 2 == 1:
        torch.save(student.state_dict(), '/content/drive/MyDrive/mods/only_distilled_student_10000_epoch.pth')

    print(f'Epoch {epoch+36}: '
          f'Train Loss (Student): {train_loss_student:.4f} Acc (Labeled): {train_acc_student:.4f} | '
          f'Val Loss (Student): {val_loss_student:.4f} Acc: {val_acc_student:.4f}')



Epoch 36: Train Loss (Student): 1.5906 Acc (Labeled): 0.8976 | Val Loss (Student): 0.1992 Acc: 0.9389
Epoch 37: Train Loss (Student): 1.5353 Acc (Labeled): 0.8690 | Val Loss (Student): 0.3305 Acc: 0.9222
Epoch 38: Train Loss (Student): 1.4961 Acc (Labeled): 0.8881 | Val Loss (Student): 0.2045 Acc: 0.9500
Epoch 39: Train Loss (Student): 1.4350 Acc (Labeled): 0.9000 | Val Loss (Student): 0.1600 Acc: 0.9500
Epoch 40: Train Loss (Student): 1.4505 Acc (Labeled): 0.8952 | Val Loss (Student): 0.1887 Acc: 0.9389
Epoch 41: Train Loss (Student): 1.4318 Acc (Labeled): 0.8929 | Val Loss (Student): 0.1660 Acc: 0.9611
Epoch 42: Train Loss (Student): 1.3724 Acc (Labeled): 0.8738 | Val Loss (Student): 0.2022 Acc: 0.9389
Epoch 43: Train Loss (Student): 1.3654 Acc (Labeled): 0.9167 | Val Loss (Student): 0.2422 Acc: 0.9389


In [None]:
# Best validation loss seen so far. Seed it with the loss of the model as it
# stands right now rather than a hand-picked constant: a fixed 0.94 is far
# above every validation loss this notebook reports, so the first epoch would
# always overwrite the "best" checkpoint even if it were worse than the
# weights training started from.
valloss = validate(student, val_loader, ce_loss)[0]

# Five more epochs, reported as 44-48
for epoch in range(5):
    train_loss_student, train_acc_student = train_distillation_epoch(
        student, teacher_model, train_loader, ce_loss, kd_loss, optimizer_student, T, device, alpha
    )
    val_loss_student, val_acc_student = validate(student, val_loader, ce_loss)

    # Rolling checkpoint after every second epoch of this run
    if (epoch+1)%2 == 0:
        torch.save(student.state_dict(), '/content/drive/MyDrive/mods/only_distilled_student_10000_epoch.pth')
    # Best-so-far checkpoint: only written when validation loss improves
    if val_loss_student < valloss:
        valloss = val_loss_student
        torch.save(student.state_dict(), '/content/drive/MyDrive/mods/only_distilled_student_10000_cont.pth')

    print(f'Epoch {epoch+44}: '
          f'Train Loss (Student): {train_loss_student:.4f} Acc (Labeled): {train_acc_student:.4f} | '
          f'Val Loss (Student): {val_loss_student:.4f} Acc: {val_acc_student:.4f}')



Epoch 44: Train Loss (Student): 1.2921 Acc (Labeled): 0.8881 | Val Loss (Student): 0.2864 Acc: 0.9333
Epoch 45: Train Loss (Student): 1.3278 Acc (Labeled): 0.8952 | Val Loss (Student): 0.1995 Acc: 0.9389
Epoch 46: Train Loss (Student): 1.2949 Acc (Labeled): 0.9095 | Val Loss (Student): 0.1766 Acc: 0.9500
Epoch 47: Train Loss (Student): 1.2747 Acc (Labeled): 0.8905 | Val Loss (Student): 0.1520 Acc: 0.9500
Epoch 48: Train Loss (Student): 1.2877 Acc (Labeled): 0.9048 | Val Loss (Student): 0.2678 Acc: 0.9444
