In [1]:
import torch.nn as nn
import torchvision.transforms as transforms
import torchvision
import torch
from tqdm import tqdm 

In [61]:
BATCH_SIZE = 16
EPOCH_NUM = 10
LEARNING_RATE = 0.001

In [3]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [4]:
device = torch.device('cpu')
if torch.cuda.is_available():
    device = torch.device('cuda')

In [None]:
from torchvision.models import resnet50, ResNet50_Weights

resnet_50 = resnet50(weights=ResNet50_Weights.IMAGENET1K_V2)
resnet_50.fc = nn.Sequential(
                              nn.Linear(2048, 256),
                              nn.ReLU(),
                              nn.Linear(256, 32),
                              nn.ReLU(),
                              nn.Linear(32, 10),
                              )
resnet_50.to(device)

ResNet(
  (conv1): Conv2d(3, 64, kernel_size=(7, 7), stride=(2, 2), padding=(3, 3), bias=False)
  (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (relu): ReLU(inplace=True)
  (maxpool): MaxPool2d(kernel_size=3, stride=2, padding=1, dilation=1, ceil_mode=False)
  (layer1): Sequential(
    (0): Bottleneck(
      (conv1): Conv2d(64, 64, kernel_size=(1, 1), stride=(1, 1), bias=False)
      (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (conv2): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
      (bn2): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (conv3): Conv2d(64, 256, kernel_size=(1, 1), stride=(1, 1), bias=False)
      (bn3): BatchNorm2d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (relu): ReLU(inplace=True)
      (downsample): Sequential(
        (0): Conv2d(64, 256, kernel_size=(1, 1), stride=(1, 

In [None]:
for name, param in resnet_50.named_parameters():
  if not name.startswith('fc'):
    param.requires_grad = False

In [7]:
normalize = transforms.Normalize(
   mean=[0.485, 0.456, 0.406],
   std=[0.229, 0.224, 0.225]
)
trans = transforms.Compose([transforms.Resize(224), transforms.CenterCrop(224), transforms.ToTensor(),normalize])

In [21]:
big_dataset = torchvision.datasets.CIFAR10(root='.', train=True, download=True, transform=trans)
train_size = int(0.8 * len(big_dataset))
eval_size = len(big_dataset) - train_size
train_dataset, eval_dataset = torch.utils.data.random_split(big_dataset, [train_size, eval_size])
test_dataset = torchvision.datasets.CIFAR10(root='.', train=False, download=True, transform=trans)

train_dataloader = torch.utils.data.DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)
eval_dataloader = torch.utils.data.DataLoader(eval_dataset, batch_size=BATCH_SIZE, shuffle=True)
test_dataloader = torch.utils.data.DataLoader(test_dataset, batch_size=BATCH_SIZE, shuffle=True)

Files already downloaded and verified
Files already downloaded and verified


In [9]:
def calculate_accuracy(logits, labels):
  predictions = torch.argmax(logits, dim=1)
  true = torch.sum(predictions==labels)
  return true

def evaluate(dataloader, model):
  loss_function = nn.CrossEntropyLoss()
  eval_loss = 0
  total_true = 0
  for x,y in dataloader:
    with torch.no_grad():
      x = x.to(device)
      y = y.to(device)
      logits = model(x)
      loss = loss_function(logits, y)
      eval_loss+= loss.item()
      total_true+= calculate_accuracy(logits, y)
  eval_acuuracy = total_true/len(dataloader.dataset)
  return eval_loss, eval_acuuracy

In [10]:
def train_loop(model, train_dataloader, eval_dataloader):
  optimizer = torch.optim.SGD(model.parameters(), lr=LEARNING_RATE)
  loss_function = nn.CrossEntropyLoss(reduction='mean')
  for epoch in range(0,EPOCH_NUM):
    train_loss =0
    for x,y in tqdm(train_dataloader):
      x = x.to(device)
      y = y.to(device)
      logits = model(x)
      loss = loss_function(logits, y)
      train_loss+= loss.item()
      optimizer.zero_grad()
      loss.backward()
      torch.nn.utils.clip_grad_value_(model.parameters(), 6)
      optimizer.step()
      train_loss +=loss.item()
    eval_loss, eval_acuuracy = evaluate(eval_dataloader, model)
    print('train loss ', train_loss)
    print('eval loss ', eval_loss)
    print('accuracy ', eval_acuuracy)
  return model

In [11]:
from sklearn.metrics import confusion_matrix
from sklearn.metrics import precision_recall_fscore_support
def test_model(model, test_dataloader):
  y_true = torch.tensor([])
  y_pred = torch.tensor([])
  total_true = 0
  for x,y in tqdm(test_dataloader):
    x = x.to(device)
    y = y.to(device)
    logits = model(x)
    total_true+= calculate_accuracy(logits, y)
    predictions = torch.argmax(logits, dim=1)
    y_pred = torch.cat((y_pred, torch.ravel(predictions).cpu()))
    y_true = torch.cat((y_true, torch.ravel(y).cpu()))
  test_acuuracy = total_true/len(test_dataloader.dataset)
  metrics = precision_recall_fscore_support(y_true, y_pred, average='weighted')
  print()
  print('Precision/test', metrics[0])
  print('Recall/test', metrics[1])
  print('F1Score/test', metrics[2])
  print('Number of parameters',sum(p.numel() for p in model.parameters()))
  print(confusion_matrix(y_true, y_pred))

In [None]:
fine_tuned_resnet_50 = train_loop(resnet_50, train_dataloader, eval_dataloader)

100%|██████████| 5000/5000 [03:19<00:00, 25.06it/s]


train loss  22502.283234119415
eval loss  2667.394275665283
accuracy  tensor(0.3346, device='cuda:0')


100%|██████████| 5000/5000 [03:22<00:00, 24.66it/s]


train loss  18612.557124853134
eval loss  1981.3908643722534
accuracy  tensor(0.5331, device='cuda:0')


100%|██████████| 5000/5000 [03:24<00:00, 24.51it/s]


train loss  13907.44877231121
eval loss  1540.2291423082352
accuracy  tensor(0.5926, device='cuda:0')


100%|██████████| 5000/5000 [03:28<00:00, 24.00it/s]


train loss  11428.561421275139
eval loss  1342.0577595829964
accuracy  tensor(0.6329, device='cuda:0')


100%|██████████| 5000/5000 [03:22<00:00, 24.75it/s]


train loss  10342.039060235023
eval loss  1206.8215301781893
accuracy  tensor(0.6698, device='cuda:0')


100%|██████████| 5000/5000 [03:21<00:00, 24.78it/s]


train loss  9534.739519625902
eval loss  1133.3978618234396
accuracy  tensor(0.6920, device='cuda:0')


100%|██████████| 5000/5000 [03:26<00:00, 24.22it/s]


train loss  8891.77366182208
eval loss  1081.8541451245546
accuracy  tensor(0.7025, device='cuda:0')


100%|██████████| 5000/5000 [03:31<00:00, 23.63it/s]


train loss  8535.232638508081
eval loss  1032.1145451515913
accuracy  tensor(0.7184, device='cuda:0')


100%|██████████| 5000/5000 [03:21<00:00, 24.76it/s]


train loss  8231.782306239009
eval loss  1001.1988038867712
accuracy  tensor(0.7239, device='cuda:0')


100%|██████████| 5000/5000 [03:20<00:00, 24.88it/s]


train loss  7952.466310620308
eval loss  983.7344431951642
accuracy  tensor(0.7326, device='cuda:0')


In [43]:
%cd drive/MyDrive/DL/HW3

/content/drive/MyDrive/DL/HW3


In [None]:
torch.save(fine_tuned_resnet_50,'fine_tuned_resnet_50.pt')
test_model(fine_tuned_resnet_50, test_dataloader)

100%|██████████| 1250/1250 [00:50<00:00, 24.75it/s]


Precision/test 0.7330903990835577
Recall/test 0.7324
F1Score/test 0.7314547697061752
Number of parameters 24041130
[[736  33  43  15  19   2   7  13 105  27]
 [ 21 817  12   7   1   5  16   5  30  86]
 [ 64   6 692  46  46  33  86   9  18   0]
 [ 13  11  52 573  49 128 101  19  29  25]
 [ 30   6  72  37 658  21  99  59  13   5]
 [  8   6  37 122  19 717  34  43  11   3]
 [  7   8  33  45  39  22 829   3   8   6]
 [ 29   2  30  46  84  56  16 707  13  17]
 [ 92  33  15  16   9   4  11   4 781  35]
 [ 22  90   3  13   5   4  15  11  23 814]]





##Knowledge Distillation

In [44]:
fine_tuned_resnet_50 = torch.load('fine_tuned_resnet_50.pt')

In [59]:
import torch.nn.functional as F
ce_loss_fn = nn.CrossEntropyLoss(reduction='mean')
def kd_loss(s_model, t_model, input, label, a, t):
  y_s = s_model(input)
  y_t = t_model(input)
  # print(y_s)
  # print(label)
  loss_cri = ce_loss_fn(y_s, label)

  p_s = F.log_softmax(y_s/t, dim=1)
  p_t = F.softmax(y_t/t, dim=1)
  loss_kd = F.kl_div(p_s, p_t) * (t**2)

  loss = ((1-a)*loss_cri) + (a*loss_kd)
  return loss

In [63]:
from torchvision.models import resnet18, ResNet18_Weights

resnet_18 = resnet18(weights=None)
resnet_18.fc = nn.Sequential(
                              nn.Linear(512, 128),
                              nn.ReLU(),
                              nn.Linear(128, 32),
                              nn.ReLU(),
                              nn.Linear(32, 10),
                              )
resnet_18.to(device)

ResNet(
  (conv1): Conv2d(3, 64, kernel_size=(7, 7), stride=(2, 2), padding=(3, 3), bias=False)
  (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (relu): ReLU(inplace=True)
  (maxpool): MaxPool2d(kernel_size=3, stride=2, padding=1, dilation=1, ceil_mode=False)
  (layer1): Sequential(
    (0): BasicBlock(
      (conv1): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
      (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (relu): ReLU(inplace=True)
      (conv2): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
      (bn2): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    )
    (1): BasicBlock(
      (conv1): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
      (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (relu): ReLU(inplace=True)
  

In [65]:
def kd_train_loop(s_model, t_model, train_dataloader, eval_dataloader):
  a = 0.9
  t = 5
  optimizer = torch.optim.SGD(s_model.parameters(), lr=LEARNING_RATE)
  for epoch in range(0,EPOCH_NUM):
    train_loss =0
    for x,y in tqdm(train_dataloader):
      x = x.to(device)
      y = y.to(device)
      loss = kd_loss(s_model, t_model, x, y, a,t)
      train_loss+= loss.item()
      optimizer.zero_grad()
      loss.backward()
      torch.nn.utils.clip_grad_value_(s_model.parameters(), 6)
      optimizer.step()
      train_loss +=loss.item()
    eval_acuuracy = evaluate(eval_dataloader, s_model)
    print('train loss ', train_loss)
    print('accuracy ', eval_acuuracy)
  return s_model

In [66]:
kd_resnet = kd_train_loop(resnet_18, fine_tuned_resnet_50, train_dataloader, eval_dataloader)
torch.save(kd_resnet,'kd_resnet.pt')
test_model(kd_resnet, test_dataloader)

100%|██████████| 2500/2500 [05:15<00:00,  7.92it/s]


train loss  1686.45721668005
accuracy  (903.0800580382347, tensor(0.4512, device='cuda:0'))


100%|██████████| 2500/2500 [05:18<00:00,  7.85it/s]


train loss  1651.2186085879803
accuracy  (894.5373405218124, tensor(0.4592, device='cuda:0'))


100%|██████████| 2500/2500 [05:17<00:00,  7.88it/s]


train loss  1620.7571497559547
accuracy  (883.0321865677834, tensor(0.4661, device='cuda:0'))


100%|██████████| 2500/2500 [05:15<00:00,  7.93it/s]


train loss  1595.8544882237911
accuracy  (870.0824590325356, tensor(0.4729, device='cuda:0'))


100%|██████████| 2500/2500 [05:13<00:00,  7.97it/s]


train loss  1562.1003461778164
accuracy  (863.5035519003868, tensor(0.4814, device='cuda:0'))


100%|██████████| 2500/2500 [05:14<00:00,  7.95it/s]


train loss  1527.2185311317444
accuracy  (854.5254962444305, tensor(0.4861, device='cuda:0'))


100%|██████████| 2500/2500 [05:14<00:00,  7.95it/s]


train loss  1492.6202908158302
accuracy  (832.9651935100555, tensor(0.5012, device='cuda:0'))


100%|██████████| 2500/2500 [05:16<00:00,  7.90it/s]


train loss  1456.3119283616543
accuracy  (820.9043077230453, tensor(0.5149, device='cuda:0'))


100%|██████████| 2500/2500 [05:14<00:00,  7.94it/s]


train loss  1415.5517560243607
accuracy  (811.4798835515976, tensor(0.5223, device='cuda:0'))


100%|██████████| 2500/2500 [05:19<00:00,  7.83it/s]


train loss  1387.0457858145237
accuracy  (802.3046686053276, tensor(0.5287, device='cuda:0'))


100%|██████████| 625/625 [00:26<00:00, 23.69it/s]


Precision/test 0.5226834292031292
Recall/test 0.5371
F1Score/test 0.5231430183251146
Number of parameters 11246634
[[574  40  68  19  13   4   9  19 204  50]
 [ 28 704   3  10   3   4  13  12  39 184]
 [119  11 313  64 139 111 182  30  26   5]
 [ 37  22 106 152  43 358 157  59  37  29]
 [ 48  12  94  32 359  83 219 132  14   7]
 [ 15   5  70  95  71 612  39  67  18   8]
 [ 13  18  91  52  58  43 694  17  11   3]
 [ 27  11  28  37  98 117  42 593   6  41]
 [123  61  24  13   4  11   9   9 721  25]
 [ 34 196   5   9   1   2   5  59  40 649]]





##Resnet18 Training

In [15]:
from torchvision.models import resnet18, ResNet18_Weights

resnet_18 = resnet18(weights=None)
resnet_18.fc = nn.Sequential(
                              nn.Linear(512, 128),
                              nn.ReLU(),
                              nn.Linear(128, 32),
                              nn.ReLU(),
                              nn.Linear(32, 10),
                              )
resnet_18.to(device)

ResNet(
  (conv1): Conv2d(3, 64, kernel_size=(7, 7), stride=(2, 2), padding=(3, 3), bias=False)
  (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (relu): ReLU(inplace=True)
  (maxpool): MaxPool2d(kernel_size=3, stride=2, padding=1, dilation=1, ceil_mode=False)
  (layer1): Sequential(
    (0): BasicBlock(
      (conv1): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
      (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (relu): ReLU(inplace=True)
      (conv2): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
      (bn2): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    )
    (1): BasicBlock(
      (conv1): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
      (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (relu): ReLU(inplace=True)
  

In [16]:
trained_resnet_18 = train_loop(resnet_18, train_dataloader, eval_dataloader)
torch.save(trained_resnet_18,'trained_resnet_18.pt')
test_model(trained_resnet_18, test_dataloader)

100%|██████████| 5000/5000 [03:44<00:00, 22.29it/s]


train loss  21120.777870178223
eval loss  2383.0880279541016
accuracy  tensor(0.2658, device='cuda:0')


100%|██████████| 5000/5000 [03:43<00:00, 22.39it/s]


train loss  18013.45772600174
eval loss  2204.1189110279083
accuracy  tensor(0.3184, device='cuda:0')


100%|██████████| 5000/5000 [03:39<00:00, 22.80it/s]


train loss  16928.778424263
eval loss  2062.765794336796
accuracy  tensor(0.3685, device='cuda:0')


100%|██████████| 5000/5000 [03:33<00:00, 23.37it/s]


train loss  15758.037504196167
eval loss  1902.732839167118
accuracy  tensor(0.4257, device='cuda:0')


100%|██████████| 5000/5000 [03:35<00:00, 23.21it/s]


train loss  14643.094744443893
eval loss  1770.9907153844833
accuracy  tensor(0.4792, device='cuda:0')


100%|██████████| 5000/5000 [03:36<00:00, 23.04it/s]


train loss  13565.447589993477
eval loss  1665.5815932750702
accuracy  tensor(0.5144, device='cuda:0')


100%|██████████| 5000/5000 [03:36<00:00, 23.06it/s]


train loss  12674.753957748413
eval loss  1570.9828049242496
accuracy  tensor(0.5402, device='cuda:0')


100%|██████████| 5000/5000 [03:35<00:00, 23.18it/s]


train loss  11847.021350175142
eval loss  1502.4725103080273
accuracy  tensor(0.5683, device='cuda:0')


100%|██████████| 5000/5000 [03:35<00:00, 23.18it/s]


train loss  11041.400986522436
eval loss  1432.9547441601753
accuracy  tensor(0.5941, device='cuda:0')


100%|██████████| 5000/5000 [03:34<00:00, 23.31it/s]


train loss  10322.641784459352
eval loss  1336.234493970871
accuracy  tensor(0.6137, device='cuda:0')


100%|██████████| 1250/1250 [00:27<00:00, 45.49it/s]


Precision/test 0.622217934353782
Recall/test 0.613
F1Score/test 0.6127106167571205
Number of parameters 11246634
[[630  24  91  21   8  14  17  27 104  64]
 [ 18 699   4  16   4   8  17   6  27 201]
 [ 81   7 492  91  68  96  96  43  21   5]
 [ 21   7  99 370  30 301 109  25  17  21]
 [ 28   5 173  64 414  91  98 109  14   4]
 [  8   3  66 198  24 605  38  41   8   9]
 [  7   4  91 100  37  23 719   7   8   4]
 [ 22   4  37  32  54 160  11 656   1  23]
 [ 78  32  41  15   6   9   6   3 755  55]
 [ 24  67   4  29   7  18  13  24  24 790]]





I expected the student model to perform better than this trained resnet18, since it has more information, but the results did not support this idea



##Training Resnet50

In [24]:
from torchvision.models import resnet50, ResNet50_Weights

resnet_50 = resnet50(weights=ResNet50_Weights.IMAGENET1K_V2)
resnet_50.fc = nn.Sequential(
                              nn.Linear(2048, 256),
                              nn.ReLU(),
                              nn.Linear(256, 32),
                              nn.ReLU(),
                              nn.Linear(32, 10),
                              )
resnet_50.to(device)
trained_resnet_50 = train_loop(resnet_50, train_dataloader, eval_dataloader)
torch.save(trained_resnet_50,'trained_resnet_50.pt')
test_model(trained_resnet_50, test_dataloader)

100%|██████████| 2500/2500 [07:55<00:00,  5.26it/s]


train loss  11435.230113983154
eval loss  1408.302878856659
accuracy  tensor(0.2607, device='cuda:0')


100%|██████████| 2500/2500 [07:49<00:00,  5.32it/s]


train loss  10591.749562501907
eval loss  1143.2495769262314
accuracy  tensor(0.4778, device='cuda:0')


100%|██████████| 2500/2500 [07:48<00:00,  5.33it/s]


train loss  6472.559462487698
eval loss  513.0970579981804
accuracy  tensor(0.7718, device='cuda:0')


100%|██████████| 2500/2500 [07:48<00:00,  5.33it/s]


train loss  2917.847610577941
eval loss  280.3869911581278
accuracy  tensor(0.8591, device='cuda:0')


100%|██████████| 2500/2500 [07:48<00:00,  5.33it/s]


train loss  1862.5017370432615
eval loss  215.57726021111012
accuracy  tensor(0.8870, device='cuda:0')


100%|██████████| 625/625 [00:45<00:00, 13.72it/s]


Precision/test 0.8865874115355791
Recall/test 0.8855
F1Score/test 0.8857192994413617
Number of parameters 24041130
[[884   1  14   2  12   0   6  13  51  17]
 [ 10 905   1   3   0   0   6   1  18  56]
 [ 37   0 849  33  39  12  22   8   0   0]
 [ 10   3  11 836  25  67  28  10   6   4]
 [  7   0  16  32 886   4  16  37   1   1]
 [  1   0  12  99  13 843   2  26   2   2]
 [  8   1  15  30  11   4 928   0   3   0]
 [ 11   0  11  19  43  24   1 888   2   1]
 [ 47   9   4   2   0   0   4   0 920  14]
 [ 16  39   1   6   1   0   2   4  15 916]]





Training the whole model takes more time but it has more flexibility since there are more active nodes. With the help of the large dataset the results improved on this case.