In [1]:
import time
import copy
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import torch
import torchvision
from torchvision import datasets, transforms
from torchsummary import summary
from torch.optim import lr_scheduler
import torch.nn.functional as F
import torch.nn as nn
import torchvision.models as models
from torch import nn, optim

In [2]:
transform = transforms.Compose([transforms.Resize((224,224)), transforms.ToTensor()])   #transforms.Normalize([0.485,0.456, 0.406], [0.229, 0.224, 0.225])
train_dataset = datasets.CIFAR10('/content/train/', download=True, train=True, transform=transform)
test_dataset = datasets.CIFAR10('/content/val/', download=True, train=False, transform=transform)
train_loader = torch.utils.data.DataLoader(train_dataset, batch_size=64, shuffle=True)
test_loader = torch.utils.data.DataLoader(test_dataset, batch_size=64, shuffle=True)
classes = ('plane', 'car', 'bird', 'cat', 'deer', 'dog', 'frog', 'horse', 'ship', 'truck')
device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')

Downloading https://www.cs.toronto.edu/~kriz/cifar-10-python.tar.gz to /content/train/cifar-10-python.tar.gz


  0%|          | 0/170498071 [00:00<?, ?it/s]

Extracting /content/train/cifar-10-python.tar.gz to /content/train/
Downloading https://www.cs.toronto.edu/~kriz/cifar-10-python.tar.gz to /content/val/cifar-10-python.tar.gz


  0%|          | 0/170498071 [00:00<?, ?it/s]

Extracting /content/val/cifar-10-python.tar.gz to /content/val/


## a

In [3]:
resnet = models.resnet50(weights=models.ResNet50_Weights.IMAGENET1K_V2)
for param in resnet.parameters():
   param.requires_grad = False
num_ftrs = resnet.fc.in_features
resnet.fc = nn.Linear(num_ftrs, 10)
resnet = resnet.to(device)
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(resnet.fc.parameters())

Downloading: "https://download.pytorch.org/models/resnet50-11ad3fa6.pth" to /root/.cache/torch/hub/checkpoints/resnet50-11ad3fa6.pth


  0%|          | 0.00/97.8M [00:00<?, ?B/s]

In [4]:
resnet.eval()

ResNet(
  (conv1): Conv2d(3, 64, kernel_size=(7, 7), stride=(2, 2), padding=(3, 3), bias=False)
  (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (relu): ReLU(inplace=True)
  (maxpool): MaxPool2d(kernel_size=3, stride=2, padding=1, dilation=1, ceil_mode=False)
  (layer1): Sequential(
    (0): Bottleneck(
      (conv1): Conv2d(64, 64, kernel_size=(1, 1), stride=(1, 1), bias=False)
      (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (conv2): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
      (bn2): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (conv3): Conv2d(64, 256, kernel_size=(1, 1), stride=(1, 1), bias=False)
      (bn3): BatchNorm2d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (relu): ReLU(inplace=True)
      (downsample): Sequential(
        (0): Conv2d(64, 256, kernel_size=(1, 1), stride=(1, 

In [5]:
total_params  = sum(p.numel() for p in resnet.parameters() if p.requires_grad)

In [6]:
for p in resnet.parameters():
  if p.requires_grad:
    print(p, p.numel())

Parameter containing:
tensor([[ 0.0139,  0.0218,  0.0190,  ..., -0.0205, -0.0140,  0.0125],
        [-0.0100,  0.0168, -0.0035,  ...,  0.0014, -0.0096, -0.0062],
        [-0.0100,  0.0194, -0.0203,  ...,  0.0020, -0.0113, -0.0056],
        ...,
        [ 0.0044,  0.0097,  0.0120,  ...,  0.0120, -0.0018, -0.0139],
        [ 0.0094, -0.0054,  0.0062,  ...,  0.0144,  0.0085,  0.0079],
        [-0.0075, -0.0014,  0.0126,  ...,  0.0207, -0.0220, -0.0104]],
       device='cuda:0', requires_grad=True) 20480
Parameter containing:
tensor([-0.0046, -0.0137, -0.0130,  0.0210,  0.0127, -0.0115,  0.0124,  0.0136,
        -0.0092,  0.0082], device='cuda:0', requires_grad=True) 10


In [7]:
def train_and_evaluate(model, train_loader, test_loader, criterion, optimizer, len_trainset, len_testset, num_epochs=25):
  model.train()
  best_model_wts = copy.deepcopy(model.state_dict())
  best_acc = 0.0
  for epoch in range(num_epochs):
      model.train()
      print('Epoch {}/{}'.format(epoch, num_epochs-1))
      print('-' * 10)
      running_loss = 0.0
      running_corrects = 0
      for inputs, labels in train_loader:
          inputs = inputs.to(device)
          labels = labels.to(device)
          optimizer.zero_grad()
          outputs = model(inputs)
          loss = criterion(outputs, labels)
          _, preds = torch.max(outputs, 1)
          loss.backward() 
          optimizer.step()  
          running_loss += loss.item() * inputs.size(0)
          running_corrects += torch.sum(preds == labels.data)
      epoch_loss = running_loss / len_trainset
      epoch_acc = running_corrects.double() / len_trainset
      print(' Train Loss: {:.4f} Acc: {:.4f}'.format(epoch_loss,
             epoch_acc)) 
         
      model.eval()
      running_loss_val = 0.0 
      running_corrects_val = 0
      for inputs, labels in test_loader:
          inputs = inputs.to(device)
          labels = labels.to(device)
          outputs = model(inputs) 
          loss = criterion(outputs,labels)
          _, preds = torch.max(outputs, 1)
          running_loss_val += loss.item() * inputs.size(0)
          running_corrects_val += torch.sum(preds == labels.data)
      
      epoch_loss_val = running_loss_val / len_testset
      epoch_acc_val = running_corrects_val.double() / len_testset
      
      if epoch_acc_val > best_acc:
          best_acc = epoch_acc_val
          best_model_wts = copy.deepcopy(model.state_dict())
      
      print(' Val Loss: {:.4f} Acc: {:.4f}'.format(epoch_loss_val,
             epoch_acc_val))
      
      print()
      print('Best val Acc: {:4f}'.format(best_acc))
      model.load_state_dict(best_model_wts)
  return model

In [8]:
resnet_teacher = train_and_evaluate(resnet,train_loader,test_loader,criterion,optimizer, len(train_dataset),len(test_dataset), 10)

Epoch 0/9
----------
 Train Loss: 0.8405 Acc: 0.7362
 Val Loss: 0.6423 Acc: 0.7868

Best val Acc: 0.786800
Epoch 1/9
----------
 Train Loss: 0.6106 Acc: 0.7951
 Val Loss: 0.5804 Acc: 0.8038

Best val Acc: 0.803800
Epoch 2/9
----------
 Train Loss: 0.5534 Acc: 0.8124
 Val Loss: 0.5638 Acc: 0.8086

Best val Acc: 0.808600
Epoch 3/9
----------
 Train Loss: 0.5173 Acc: 0.8250
 Val Loss: 0.5689 Acc: 0.8051

Best val Acc: 0.808600
Epoch 4/9
----------
 Train Loss: 0.5217 Acc: 0.8247
 Val Loss: 0.5498 Acc: 0.8112

Best val Acc: 0.811200
Epoch 5/9
----------
 Train Loss: 0.4955 Acc: 0.8320
 Val Loss: 0.5559 Acc: 0.8110

Best val Acc: 0.811200
Epoch 6/9
----------
 Train Loss: 0.5003 Acc: 0.8306
 Val Loss: 0.5623 Acc: 0.8047

Best val Acc: 0.811200
Epoch 7/9
----------
 Train Loss: 0.4984 Acc: 0.8310
 Val Loss: 0.5574 Acc: 0.8112

Best val Acc: 0.811200
Epoch 8/9
----------
 Train Loss: 0.4958 Acc: 0.8314
 Val Loss: 0.5503 Acc: 0.8103

Best val Acc: 0.811200
Epoch 9/9
----------
 Train Loss: 0.4

## b

In [11]:
resnet18 = models.resnet18(pretrained=False)
num_ftrs = resnet18.fc.in_features
resnet18.fc = nn.Linear(num_ftrs, 10)
resnet18 = resnet18.to(device)

In [12]:
resnet18

ResNet(
  (conv1): Conv2d(3, 64, kernel_size=(7, 7), stride=(2, 2), padding=(3, 3), bias=False)
  (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (relu): ReLU(inplace=True)
  (maxpool): MaxPool2d(kernel_size=3, stride=2, padding=1, dilation=1, ceil_mode=False)
  (layer1): Sequential(
    (0): BasicBlock(
      (conv1): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
      (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (relu): ReLU(inplace=True)
      (conv2): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
      (bn2): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    )
    (1): BasicBlock(
      (conv1): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
      (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (relu): ReLU(inplace=True)
  

In [14]:
def loss_kd(outputs, labels, teacher_outputs, temparature, alpha):
   KD_loss = nn.KLDivLoss()(F.log_softmax(outputs/temparature, dim=1),F.softmax(teacher_outputs/temparature,dim=1)) * \
   (alpha * temparature * temparature) + F.cross_entropy(outputs, labels) * (1. - alpha)
   return KD_loss
def get_outputs(model, dataloader):
   '''
   Used to get the output of the teacher network
   '''
   outputs = []
   for inputs, labels in dataloader:
      inputs_batch, labels_batch = inputs.cuda(), labels.cuda()
      output_batch = model(inputs_batch).data.cpu().numpy()
      outputs.append(output_batch)
   return outputs

In [15]:
def train_kd(model,teacher_out, optimizer, loss_kd, dataloader, temparature, alpha):
   model.train()
   running_loss = 0.0
   running_corrects = 0
   for i,(images, labels) in enumerate(dataloader):
      inputs = images.to(device)
      labels = labels.to(device)
      optimizer.zero_grad()
      outputs = model(inputs)
      outputs_teacher = torch.from_numpy(teacher_out[i]).to(device)
      loss = loss_kd(outputs,labels,outputs_teacher,temparature, 
                     alpha)
      _, preds = torch.max(outputs, 1)
      loss.backward()
      optimizer.step()
      running_loss += loss.item() * inputs.size(0)
      running_corrects += torch.sum(preds == labels.data)
   
   epoch_loss = running_loss / len(train_dataset)
   epoch_acc = running_corrects.double() / len(train_dataset)
   print('Train Loss: {:.4f} Acc: {:.4f}'.format(epoch_loss, epoch_acc))
def eval_kd(model,teacher_out, optimizer, loss_kd, dataloader, temparature, alpha):
   model.eval()
   running_loss = 0.0
   running_corrects = 0
   for i,(images, labels) in enumerate(dataloader):
      inputs = images.to(device)
      labels = labels.to(device)
      outputs = model(inputs)
      outputs_teacher = torch.from_numpy(teacher_out[i]).cuda()
      loss = loss_kd(outputs,labels,outputs_teacher,temparature, 
                     alpha)
      _, preds = torch.max(outputs, 1)
      running_loss += loss.item() * inputs.size(0)
      running_corrects += torch.sum(preds == labels.data)
   epoch_loss = running_loss / len(test_dataset)
   epoch_acc = running_corrects.double() / len(test_dataset)
   print('Val Loss: {:.4f} Acc: {:.4f}'.format(epoch_loss, epoch_acc))
   return epoch_acc
def train_and_evaluate_kd(model, teacher_model, optimizer, loss_kd, trainloader, valloader, temparature, alpha, num_epochs=25):
   teacher_model.eval()
   best_model_wts = copy.deepcopy(model.state_dict())
   outputs_teacher_train = get_outputs(teacher_model, trainloader)
   outputs_teacher_val = get_outputs(teacher_model, valloader)
   print('Teacher’s outputs are computed now starting the training process-')
   best_acc = 0.0
   for epoch in range(num_epochs):
      print('Epoch {}/{}'.format(epoch, num_epochs - 1))
      print('-' * 10)
      
      # Training the student with the soft labes as the outputs from the teacher and using the loss_kd function
      
      train_kd(model, outputs_teacher_train, 
               optim.Adam(resnet18.parameters()),loss_kd,trainloader, 
               temparature, alpha)
     
      # Evaluating the student network
      epoch_acc_val = eval_kd(model, outputs_teacher_val, 
                          optim.Adam(resnet18.parameters()), loss_kd, 
                          valloader, temparature, alpha)
      if epoch_acc_val > best_acc:
         best_acc = epoch_acc_val
         best_model_wts = copy.deepcopy(model.state_dict())
         print('Best val Acc: {:4f}'.format(best_acc))
         model.load_state_dict(best_model_wts)
   return model

In [17]:
student_model=train_and_evaluate_kd(resnet18, resnet_teacher, optim.Adam(resnet18.parameters()), loss_kd, train_loader, test_loader, 10, 0.1, 10)

Teacher’s outputs are computed now starting the training process-
Epoch 0/9
----------




Train Loss: 1.3399 Acc: 0.4872
Val Loss: 1.0725 Acc: 0.6158
Best val Acc: 0.615800
Epoch 1/9
----------
Train Loss: 0.8563 Acc: 0.7030
Val Loss: 0.9507 Acc: 0.6731
Best val Acc: 0.673100
Epoch 2/9
----------
Train Loss: 0.6558 Acc: 0.7857
Val Loss: 0.6844 Acc: 0.7767
Best val Acc: 0.776700
Epoch 3/9
----------
Train Loss: 0.5366 Acc: 0.8347
Val Loss: 0.6241 Acc: 0.8080
Best val Acc: 0.808000
Epoch 4/9
----------
Train Loss: 0.4425 Acc: 0.8714
Val Loss: 0.6146 Acc: 0.8137
Best val Acc: 0.813700
Epoch 5/9
----------
Train Loss: 0.3596 Acc: 0.9068
Val Loss: 0.5384 Acc: 0.8397
Best val Acc: 0.839700
Epoch 6/9
----------
Train Loss: 0.2881 Acc: 0.9357
Val Loss: 0.5790 Acc: 0.8349
Epoch 7/9
----------
Train Loss: 0.2352 Acc: 0.9586
Val Loss: 0.6362 Acc: 0.8215
Epoch 8/9
----------
Train Loss: 0.1970 Acc: 0.9736
Val Loss: 0.5887 Acc: 0.8317
Epoch 9/9
----------
Train Loss: 0.1750 Acc: 0.9818
Val Loss: 0.5142 Acc: 0.8558
Best val Acc: 0.855800


## c

In [21]:
resnet18 = models.resnet18(pretrained=False)
num_ftrs = resnet18.fc.in_features
resnet18.fc = nn.Linear(num_ftrs, 10)
resnet18 = resnet18.to(device)

In [23]:
trian_resnet18 = train_and_evaluate(resnet18,train_loader,test_loader,criterion,optimizer, len(train_dataset),len(test_dataset), 3)
# اجرای این کد بسیار زمان بر است و به همین دلیل به نتایج 3 ایپاک اکتفا کردم(البته اگر از حالت از قبل آموزش دیده ی ای شبکه برای شروع آموزش استفاده کنیم، نتایج بسیار بهتر خواهد شد. اما آموزش از صفر این شبکه قطعا نیاز به تعداد ایپاک بیشتر و اقدامات دیگری برای رسیدن به دقت مطلوب دارد.)
# in this case pretrained is False and accurecy is low.

Epoch 0/2
----------
 Train Loss: 2.3630 Acc: 0.1053
 Val Loss: 2.3661 Acc: 0.1051

Best val Acc: 0.105100
Epoch 1/2
----------
 Train Loss: 2.3635 Acc: 0.1053
 Val Loss: 2.3664 Acc: 0.1055

Best val Acc: 0.105500
Epoch 2/2
----------
 Train Loss: 2.3630 Acc: 0.1051
 Val Loss: 2.3623 Acc: 0.1057

Best val Acc: 0.105700


The accuracy is higher in the knowledge distillation method, because by using the knowledge distillation method, we managed to train a smaller network and benefit from the knowledge in the larger network (teacher) in this process. Knowledge distillation refers to the process of transferring knowledge from a large model or set of models to a smaller model that can be practically applied under real-world constraints. Distillation of knowledge is one of these techniques to transfer knowledge before the big one. Train models like ResNet, VGG, etc. to smaller networks. An "obvious" method, as mentioned in the paper Knowledge Distillation in Neural Networks by Geoffrey Hinton to transfer knowledge from a teacher-to-student model. The model uses "soft targets" for the learning process of the student model.

## d

In [25]:
resnet50_finetuning = models.resnet50(weights=models.ResNet50_Weights.IMAGENET1K_V2)
num_ftrs = resnet50_finetuning.fc.in_features
resnet50_finetuning.fc = nn.Linear(num_ftrs, 10)
resnet50_finetuning = resnet50_finetuning.to(device)
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(resnet50_finetuning.fc.parameters())

In [26]:
train_resnet50_finetuning = train_and_evaluate(resnet50_finetuning,train_loader,test_loader,criterion,optimizer, len(train_dataset),len(test_dataset), 3)
# اجرای این کد بسیار زمان بر است و به همین دلیل به نتایج 3 ایپاک اکتفا کردم

Epoch 0/2
----------
 Train Loss: 0.8400 Acc: 0.7376
 Val Loss: 0.6409 Acc: 0.7891

Best val Acc: 0.789100
Epoch 1/2
----------
 Train Loss: 0.6057 Acc: 0.7975
 Val Loss: 0.5894 Acc: 0.8012

Best val Acc: 0.801200
Epoch 2/2
----------
 Train Loss: 0.5524 Acc: 0.8135
 Val Loss: 0.5625 Acc: 0.8108

Best val Acc: 0.810800


قطعا آموزش تمام مدل نتیجه(دقت) بهتری خواهد داشت(با شروع از حالت از قبل آموزش دیده) در مقایسه با حالتی که تمام شبکه به جز لایه آخر غیرقابل تغییر است و فقط یک لایه آموزش میبیند. البته میتوان گفت دقت در حالتی که تمام شبکه به جز لایه آخر غیرقابل آموزش هستند(حالت فریزکردن)، نیز مناسب و قابل قبول است و مزیت این روش نسبت به آموزش تمام شبکه صرفه جویی در محاسبات و کاهش زمان آموزش است.بنابراین در مدل های بزرگ که از چند مدل تشکیل شده است، میتوان از مدل های از قبل آموزش دیده و روش فریزکردن آن برای مقاصد مورد نظر بهره برد.