In [2]:
import os
import sys
import torch
import torch.nn as nn
import torch.nn.functional as F
from models import model_dict
from dataset.cifar100 import *
import torchinfo
from models.resnet import ResNet
import torch.optim as optim
import time
import argparse
import matplotlib.pyplot as plt
from munch import Munch
from torchsummary import summary

In [101]:
# Experiment options. Munch gives attribute-style access, which later cells
# rely on (e.g. `opt.balance`, `opt.model_t`, `opt.path_t`).
opt = Munch(
    model_t='vgg13',
    model_s='vgg8',
    path_t='./save/models/vgg13_best.pth',
    cuda_visible_devices=0,
    trial=0,
    balance=.2,
    temp=3.0,
)

# Default value of SepConv's `max_channel` argument.
MAX_CHANNEL = 640

In [102]:
# --- Data loading / optimisation hyper-parameters --------------------------
train_batch_size = 128
test_batch_size = 100
n_cls = 100
num_workers = 1
total_epoch = 240
learning_rate = 0.05
lr_decay_epoch = [150, 180, 210]
lr_decay_rate = 0.1
weight_decay = 5e-4
momentum = 0.9
print_freq = 100

# --- Teacher / student networks ---------------------------------------------
balance = opt.balance
model_t = model_dict[opt.model_t](num_classes=n_cls)
model_s = model_dict[opt.model_s](num_classes=n_cls)
path_t = opt.path_t
# Deserialize onto the CPU first: a checkpoint saved from a GPU run would
# otherwise fail to load on a CPU-only host. The models are moved to the GPU
# below when one is available, so GPU runs end up in the same state as before.
model_t.load_state_dict(torch.load(path_t, map_location='cpu')['model'])
# The teacher is only used for inference here, so keep it in eval mode.
model_t.eval()

# --- Misc. experiment knobs ---------------------------------------------------
# NOTE(review): r/a/b/p/d are not used in the visible part of the notebook;
# their meaning cannot be told from here -- TODO document.
trial = 0
r = 1
a = 0
b = 1
p = 1
d = 1
kd_T = 4          # presumably the distillation temperature -- TODO confirm
fm_beta = 1e-2
fit_alpha = 0.2

if torch.cuda.is_available():
    model_s.cuda()
    model_t.cuda()
    
class SepConv(nn.Module):
    """Two stacked depthwise-separable convolutions with an expanded middle width.

    Layer order:
        depthwise conv (``stride``) -> 1x1 conv to ``inter_channel`` -> BN -> ReLU
        -> depthwise conv (stride 1) -> 1x1 conv to ``channel_out`` -> BN -> ReLU

    Args:
        channel_in: number of input channels.
        channel_out: number of output channels.
        kernel_size: kernel size of both depthwise convolutions.
        stride: stride of the first depthwise convolution (the second uses 1).
        padding: padding of both depthwise convolutions.
        affine: forwarded to both ``BatchNorm2d`` layers.
        max_channel: together with ``channel_in`` determines the intermediate
            width, ``channel_in * (max_channel / channel_in)``.
    """

    def __init__(self, channel_in, channel_out, kernel_size=3, stride=2, padding=1, affine=True, max_channel=MAX_CHANNEL):
        super(SepConv, self).__init__()
        # Attribute name (including its spelling) is kept unchanged because
        # code outside this class may read it.
        self.cadinality = channel_in * (max_channel / channel_in)
        # The product above is computed in floating point; a bare int() would
        # truncate a result such as 639.9999999999999 down to 639. Round to the
        # nearest integer instead so the intermediate width is never off by one.
        inter_channel = int(round(self.cadinality))
        self.op = nn.Sequential(
            nn.Conv2d(channel_in, channel_in, kernel_size=kernel_size, stride=stride, padding=padding, groups=channel_in, bias=False),
            nn.Conv2d(channel_in, inter_channel, kernel_size=1, padding=0, bias=False),
            nn.BatchNorm2d(inter_channel, affine=affine),
            nn.ReLU(inplace=False),
            nn.Conv2d(inter_channel, inter_channel, kernel_size=kernel_size, stride=1, padding=padding, groups=inter_channel, bias=False),
            nn.Conv2d(inter_channel, channel_out, kernel_size=1, padding=0, bias=False),
            nn.BatchNorm2d(channel_out, affine=affine),
            nn.ReLU(inplace=False),
        )

    def forward(self, x):
        """Apply the convolution stack to ``x`` and return the result."""
        return self.op(x)


def accuracy(output, target, topk=(1,)):
    """Return top-k accuracies (in percent) for every k listed in ``topk``.

    ``output`` holds one row of scores per sample and ``target`` the matching
    class indices. The result is a list with one single-element tensor per
    requested k, in the same order as ``topk``.
    """
    with torch.no_grad():
        largest_k = max(topk)
        n_samples = target.size(0)

        # Indices of the `largest_k` highest scores per sample, best first,
        # transposed so that row r holds every sample's rank-r prediction.
        ranked = output.topk(largest_k, dim=1, largest=True, sorted=True)[1].t()
        hits = ranked.eq(target.view(1, -1).expand_as(ranked))

        # A sample counts for top-k if its target appears in the first k rows.
        return [
            hits[:k].reshape(-1).float().sum(0, keepdim=True).mul_(100.0 / n_samples)
            for k in topk
        ]


class TempDistiller(nn.Module):
    """Auxiliary head: SepConv stages -> 4x4 average pool -> linear classifier.

    Args:
        in_channels: channel count of the incoming feature map.
        in_resolution: spatial size of the incoming feature map.
        final_dim: channel count produced by the last SepConv stage and the
            input width of the linear classifier.
        num_classes: number of classifier outputs.

    NOTE(review): the classifier expects exactly ``final_dim`` features after
    pooling, so this presumably assumes ``in_resolution`` is 4 or a
    power-of-two multiple of 4 -- TODO confirm.
    """

    def __init__(self, in_channels, in_resolution, final_dim=64, num_classes=100):
        super().__init__()
        stages = []
        resolution = in_resolution
        width = in_channels

        # Already at (or below) the target resolution: one stride-1 stage that
        # only maps the channels to `final_dim`.
        if resolution <= 4:
            stages.append(SepConv(channel_in=width, channel_out=final_dim, stride=1))

        # Otherwise add one stage (SepConv's default stride) per halving of the
        # resolution, doubling the channels each time; the stage entered at
        # resolution 8 emits `final_dim` channels instead.
        while resolution > 4:
            stage_out = final_dim if resolution == 8 else width * 2
            stages.append(SepConv(channel_in=width, channel_out=stage_out))
            width = width * 2
            resolution = resolution / 2

        stages.append(nn.AvgPool2d((4, 4), 1))
        self.module_list = nn.Sequential(*stages)
        self.fc = nn.Linear(final_dim, num_classes)

    def forward(self, in_feat):
        """Return ``(logits, feature)`` where ``feature`` is the flattened pooled map."""
        pooled = self.module_list(in_feat)
        feature = pooled.view(pooled.size(0), -1)
        return self.fc(feature), feature


## added in exp1_8
class FinalClassifier(nn.Module):
    """Single linear classifier applied directly to an input feature vector.

    ``final_dim`` is accepted but not used by this class.
    """

    def __init__(self, in_channels, final_dim=64, num_classes=100):
        super().__init__()
        self.fc2 = nn.Linear(in_channels, num_classes)

    def forward(self, x):
        """Return ``(logits, x)``; the input is passed through untouched as the feature."""
        return self.fc2(x), x


class SimpleMLP(nn.Module):
    """Small MLP: Linear(input_size, 128) -> BatchNorm1d -> ReLU6 -> Linear(128, num_classes)."""

    def __init__(self, input_size, num_classes=100):
        super().__init__()
        hidden = 128
        layers = [
            nn.Linear(input_size, hidden),
            nn.BatchNorm1d(hidden),
            nn.ReLU6(),
            nn.Linear(hidden, num_classes),
        ]
        self.simple = nn.Sequential(*layers)

    def forward(self, input):
        """Run ``input`` through the layer stack and return the result."""
        return self.simple(input)
    
# Run everything on whatever device the teacher already lives on, so this
# cell works both with and without CUDA (the teacher/student are only moved
# to the GPU when one is available).
device = next(model_t.parameters()).device

# Probe the teacher once with a dummy 32x32 RGB input to discover the shape
# of each intermediate feature; no gradients are needed for that.
with torch.no_grad():
    rand_ = torch.rand((1, 3, 32, 32), dtype=torch.float32).to(device)
    out_feat, out_x = model_t(rand_, is_feat=True, preact=False)

# print("Batch size = {}".format(2))
for i, o in enumerate(out_feat):
    print("{} feature: {}".format(i+1, o.shape))
print("final feature: {}".format(out_x.shape))
final_dim = out_feat[-1].size(1)

# One TempDistiller per intermediate feature, plus a linear FinalClassifier
# for the last feature.
conv_distiller = nn.ModuleList([
    TempDistiller(in_channels=f.size(1), in_resolution=f.size(2), final_dim=final_dim)
    for f in out_feat[:-1]
])
conv_distiller.append(FinalClassifier(out_feat[-1].size(1), num_classes=n_cls))
# Move the whole list at once: previously the FinalClassifier was left on the
# CPU while the other heads were on the GPU.
conv_distiller.to(device)
MLP = SimpleMLP(out_feat[-1].size(1), n_cls).to(device)

# Container whose only job is to expose the parameters of every trainable
# module to a single optimizer.
trainable_models = nn.ModuleList([])
trainable_models.model_s = model_s
trainable_models.MLP = MLP
trainable_models.conv_distiller = conv_distiller
optimizer = optim.SGD(trainable_models.parameters(), lr=learning_rate, momentum=momentum, weight_decay=weight_decay)

best_accuracy = -1
best_ensemble_acc = -1

1 feature: torch.Size([1, 64, 32, 32])
2 feature: torch.Size([1, 128, 16, 16])
3 feature: torch.Size([1, 256, 8, 8])
4 feature: torch.Size([1, 512, 4, 4])
5 feature: torch.Size([1, 512, 4, 4])
6 feature: torch.Size([1, 512])
final feature: torch.Size([1, 100])


In [103]:
print("===============================model_t=====================================")
torchinfo.summary(model_t)
print("================================model_s===================================")
torchinfo.summary(model_s)
print("=============================conv_distiller================================")
torchinfo.summary(conv_distiller)
# Total number of parameter elements across all distiller heads.
# A generator expression is used on purpose: the previous `for p in ...` loop
# overwrote the module-level hyper-parameter `p`, and `np` was only available
# through a star import. `numel()` yields a plain Python int.
conv_param = sum(weight.numel() for weight in conv_distiller.parameters())

Layer (type:depth-idx)                   Param #
├─Sequential: 1-1                        --
|    └─Conv2d: 2-1                       1,792
|    └─BatchNorm2d: 2-2                  128
|    └─ReLU: 2-3                         --
|    └─Conv2d: 2-4                       36,928
|    └─BatchNorm2d: 2-5                  128
├─Sequential: 1-2                        --
|    └─Conv2d: 2-6                       73,856
|    └─BatchNorm2d: 2-7                  256
|    └─ReLU: 2-8                         --
|    └─Conv2d: 2-9                       147,584
|    └─BatchNorm2d: 2-10                 256
├─Sequential: 1-3                        --
|    └─Conv2d: 2-11                      295,168
|    └─BatchNorm2d: 2-12                 512
|    └─ReLU: 2-13                        --
|    └─Conv2d: 2-14                      590,080
|    └─BatchNorm2d: 2-15                 512
├─Sequential: 1-4                        --
|    └─Conv2d: 2-16                      1,180,160
|    └─BatchNorm2d: 2-17        

In [104]:
# Parameter count of each distiller head, in the order they appear in
# `conv_distiller`; a summary is requested for each head as well.
params = []
for conv in conv_distiller:
    # Generator expression instead of `for p in ...`: the loop variable used to
    # overwrite the module-level hyper-parameter `p`, and `np` was only in scope
    # via a star import. `numel()` yields a plain Python int.
    param_num = sum(weight.numel() for weight in conv.parameters())
    params.append(param_num)
    torchinfo.summary(conv)

Layer (type:depth-idx)                   Param #
├─Sequential: 1-1                        --
|    └─SepConv: 2-1                      --
|    |    └─Sequential: 3-1              130,752
|    └─SepConv: 2-2                      --
|    |    └─Sequential: 3-2              254,464
|    └─SepConv: 2-3                      --
|    |    └─Sequential: 3-3              501,888
|    └─AvgPool2d: 2-4                    --
├─Linear: 1-2                            51,300
Total params: 938,404
Trainable params: 938,404
Non-trainable params: 0
Layer (type:depth-idx)                   Param #
├─Sequential: 1-1                        --
|    └─SepConv: 2-1                      --
|    |    └─Sequential: 3-1              254,464
|    └─SepConv: 2-2                      --
|    |    └─Sequential: 3-2              501,888
|    └─AvgPool2d: 2-3                    --
├─Linear: 1-2                            51,300
Total params: 807,652
Trainable params: 807,652
Non-trainable params: 0
Layer (type:depth-idx

In [105]:
# Parameter-count report for the pair named in the banner.
# Model totals and per-stage layer counts are hand-entered literals
# (presumably copied from model summaries -- TODO confirm). Distiller sizes
# come from `params` / `conv_param`, i.e. from the `conv_distiller` currently
# alive in the kernel.
# NOTE(review): confirm `conv_distiller` was built for this teacher.
print("ResNet32x4 - ResNet8x4")
model_t_params = 7433860
model_s_params = 1233540
conv_distiller_params = conv_param # including final layer
conv_distiller1, conv_distiller2, conv_distiller3, conv_distiller4, conv_distiller5 = [
    params[idx] for idx in range(5)
]
# Each ensemble is cumulative: it contains the previous ensemble.
model_s_ensemble1 = sum((864, 64)) + conv_distiller1
model_s_ensemble2 = sum((18432, 128, 36864, 128, 2176)) + conv_distiller2 + model_s_ensemble1
model_s_ensemble3 = sum((73728, 256, 147456, 256, 8448)) + conv_distiller3 + model_s_ensemble2
model_s_ensemble4 = sum((294912, 512, 589824, 512, 33280)) + model_s_ensemble3
final_ensemble = model_s_params + conv_distiller_params

# One line per model: absolute count and size relative to the teacher.
print("\n".join(
    template.format(count, (count / model_t_params) * 100)
    for template, count in (
        ("Teacher model: {} ({:.3f}%)", model_t_params),
        ("Student model: {} ({:.3f}%)", model_s_params),
        ("Ensemble 1: {} ({:.3f}%)", model_s_ensemble1),
        ("Ensemble 2: {} ({:.3f}%)", model_s_ensemble2),
        ("Ensemble 3: {} ({:.3f}%)", model_s_ensemble3),
        ("Ensemble 4: {} ({:.3f}%)", model_s_ensemble4),
        ("Final Ensemble: {} ({:.3f}%)", final_ensemble),
    )
))

ResNet32x4 - ResNet8x4
Teacher model: 7433860 (100.000%)
Student model: 1233540 (16.594%)
Ensemble 1: 939332 (12.636%)
Ensemble 2: 1804712 (24.277%)
Ensemble 3: 2588044 (34.814%)
Ensemble 4: 3507084 (47.177%)
Final Ensemble: 5022748 (67.566%)


In [106]:
# Parameter-count report for the pair named in the banner.
# Model totals and per-stage layer counts are hand-entered literals
# (presumably copied from model summaries -- TODO confirm). Distiller sizes
# come from `params` / `conv_param`, i.e. from the `conv_distiller` currently
# alive in the kernel.
# NOTE(review): confirm `conv_distiller` was built for this teacher.
print("ResNet110 - ResNet32")
model_t_params = 1736564
model_s_params = 472756
conv_distiller_params = conv_param # including final layer
conv_distiller1, conv_distiller2, conv_distiller3, conv_distiller4, conv_distiller5 = [
    params[idx] for idx in range(5)
]
# Each ensemble is cumulative: it contains the previous ensemble.
# Repeated (count + count) pairs in the literal sums are written as multiples.
model_s_ensemble1 = sum((432, 32)) + conv_distiller1
model_s_ensemble2 = 10 * (2304 + 32) + conv_distiller2 + model_s_ensemble1
model_s_ensemble3 = sum((4608, 64, 9216, 64, 576)) + 7 * (9216 + 64) + conv_distiller3 + model_s_ensemble2
model_s_ensemble4 = sum((18432, 128, 36864, 128, 2176)) + 5 * (36864 + 128) + model_s_ensemble3
final_ensemble = model_s_params + conv_distiller_params

# One line per model: absolute count and size relative to the teacher.
print("\n".join(
    template.format(count, (count / model_t_params) * 100)
    for template, count in (
        ("Teacher model: {} ({:.3f}%)", model_t_params),
        ("Student model: {} ({:.3f}%)", model_s_params),
        ("Ensemble 1: {} ({:.3f}%)", model_s_ensemble1),
        ("Ensemble 2: {} ({:.3f}%)", model_s_ensemble2),
        ("Ensemble 3: {} ({:.3f}%)", model_s_ensemble3),
        ("Ensemble 4: {} ({:.3f}%)", model_s_ensemble4),
        ("Final Ensemble: {} ({:.3f}%)", final_ensemble),
    )
))

ResNet110 - ResNet32
Teacher model: 1736564 (100.000%)
Student model: 472756 (27.224%)
Ensemble 1: 938868 (54.065%)
Ensemble 2: 1769880 (101.919%)
Ensemble 3: 2402556 (138.351%)
Ensemble 4: 2645244 (152.326%)
Final Ensemble: 4261964 (245.425%)


In [107]:
# Parameter-count report for the pair named in the banner.
# Model totals and per-stage layer counts are hand-entered literals
# (presumably copied from model summaries -- TODO confirm). Distiller sizes
# come from `params` / `conv_param`, i.e. from the `conv_distiller` currently
# alive in the kernel.
# NOTE(review): confirm `conv_distiller` was built for this teacher.
print("ResNet110 - ResNet20")
model_t_params = 1736564
model_s_params = 278324
conv_distiller_params = conv_param # including final layer
conv_distiller1, conv_distiller2, conv_distiller3, conv_distiller4, conv_distiller5 = [
    params[idx] for idx in range(5)
]
# Each ensemble is cumulative: it contains the previous ensemble.
# Repeated (count + count) pairs in the literal sums are written as multiples.
model_s_ensemble1 = sum((432, 32)) + conv_distiller1
model_s_ensemble2 = 6 * (2304 + 32) + conv_distiller2 + model_s_ensemble1
model_s_ensemble3 = sum((4608, 64, 9216, 64, 576)) + 3 * (9216 + 64) + conv_distiller3 + model_s_ensemble2
model_s_ensemble4 = sum((18432, 128, 36864, 128, 2176)) + 3 * (36864 + 128) + model_s_ensemble3
final_ensemble = model_s_params + conv_distiller_params

# One line per model: absolute count and size relative to the teacher.
print("\n".join(
    template.format(count, (count / model_t_params) * 100)
    for template, count in (
        ("Teacher model: {} ({:.3f}%)", model_t_params),
        ("Student model: {} ({:.3f}%)", model_s_params),
        ("Ensemble 1: {} ({:.3f}%)", model_s_ensemble1),
        ("Ensemble 2: {} ({:.3f}%)", model_s_ensemble2),
        ("Ensemble 3: {} ({:.3f}%)", model_s_ensemble3),
        ("Ensemble 4: {} ({:.3f}%)", model_s_ensemble4),
        ("Final Ensemble: {} ({:.3f}%)", final_ensemble),
    )
))

ResNet110 - ResNet20
Teacher model: 1736564 (100.000%)
Student model: 278324 (16.027%)
Ensemble 1: 938868 (54.065%)
Ensemble 2: 1760536 (101.380%)
Ensemble 3: 2356092 (135.676%)
Ensemble 4: 2524796 (145.390%)
Final Ensemble: 4067532 (234.229%)


In [108]:
# Parameter-count report for the pair named in the banner.
# Model totals and per-stage layer counts are hand-entered literals
# (presumably copied from model summaries -- TODO confirm). Distiller sizes
# come from `params` / `conv_param`, i.e. from the `conv_distiller` currently
# alive in the kernel.
# NOTE(review): confirm `conv_distiller` was built for this teacher.
print("ResNet56 - ResNet20")
model_t_params = 861620
model_s_params = 278324
conv_distiller_params = conv_param # including final layer
conv_distiller1, conv_distiller2, conv_distiller3, conv_distiller4, conv_distiller5 = [
    params[idx] for idx in range(5)
]
# Each ensemble is cumulative: it contains the previous ensemble.
# Repeated (count + count) pairs in the literal sums are written as multiples.
model_s_ensemble1 = sum((432, 32)) + conv_distiller1
model_s_ensemble2 = 6 * (2304 + 32) + conv_distiller2 + model_s_ensemble1
model_s_ensemble3 = sum((4608, 64, 9216, 64, 576)) + 3 * (9216 + 64) + conv_distiller3 + model_s_ensemble2
model_s_ensemble4 = sum((18432, 128, 36864, 128, 2176)) + 3 * (36864 + 128) + model_s_ensemble3
final_ensemble = model_s_params + conv_distiller_params

# One line per model: absolute count and size relative to the teacher.
print("\n".join(
    template.format(count, (count / model_t_params) * 100)
    for template, count in (
        ("Teacher model: {} ({:.3f}%)", model_t_params),
        ("Student model: {} ({:.3f}%)", model_s_params),
        ("Ensemble 1: {} ({:.3f}%)", model_s_ensemble1),
        ("Ensemble 2: {} ({:.3f}%)", model_s_ensemble2),
        ("Ensemble 3: {} ({:.3f}%)", model_s_ensemble3),
        ("Ensemble 4: {} ({:.3f}%)", model_s_ensemble4),
        ("Final Ensemble: {} ({:.3f}%)", final_ensemble),
    )
))

ResNet56 - ResNet20
Teacher model: 861620 (100.000%)
Student model: 278324 (32.302%)
Ensemble 1: 938868 (108.965%)
Ensemble 2: 1760536 (204.329%)
Ensemble 3: 2356092 (273.449%)
Ensemble 4: 2524796 (293.029%)
Final Ensemble: 4067532 (472.080%)


In [109]:
# Parameter-count report for the pair named in the banner.
# Model totals and per-stage layer counts are hand-entered literals
# (presumably copied from model summaries -- TODO confirm). Distiller sizes
# come from `params` / `conv_param`, i.e. from the `conv_distiller` currently
# alive in the kernel.
# NOTE(review): confirm `conv_distiller` was built for this teacher.
print("WRN40-2 - WRN16-2")
model_t_params = 2255156
model_s_params = 703284
conv_distiller_params = conv_param # including final layer
conv_distiller1, conv_distiller2, conv_distiller3, conv_distiller4, conv_distiller5 = [
    params[idx] for idx in range(5)
]
# Each ensemble is cumulative: it contains the previous ensemble.
model_s_ensemble1 = 432 + conv_distiller1
model_s_ensemble2 = sum((14432, 18560)) + conv_distiller2 + model_s_ensemble1
model_s_ensemble3 = sum((57536, 73984)) + conv_distiller3 + model_s_ensemble2
model_s_ensemble4 = sum((229760, 295424, 256)) + model_s_ensemble3
final_ensemble = model_s_params + conv_distiller_params

# One line per model: absolute count and size relative to the teacher.
print("\n".join(
    template.format(count, (count / model_t_params) * 100)
    for template, count in (
        ("Teacher model: {} ({:.3f}%)", model_t_params),
        ("Student model: {} ({:.3f}%)", model_s_params),
        ("Ensemble 1: {} ({:.3f}%)", model_s_ensemble1),
        ("Ensemble 2: {} ({:.3f}%)", model_s_ensemble2),
        ("Ensemble 3: {} ({:.3f}%)", model_s_ensemble3),
        ("Ensemble 4: {} ({:.3f}%)", model_s_ensemble4),
        ("Final Ensemble: {} ({:.3f}%)", final_ensemble),
    )
))

WRN40-2 - WRN16-2
Teacher model: 2255156 (100.000%)
Student model: 703284 (31.186%)
Ensemble 1: 938836 (41.631%)
Ensemble 2: 1779480 (78.907%)
Ensemble 3: 2464188 (109.269%)
Ensemble 4: 2989628 (132.569%)
Final Ensemble: 4492492 (199.210%)


In [110]:
# Parameter-count report for the pair named in the banner.
# Model totals and per-stage layer counts are hand-entered literals
# (presumably copied from model summaries -- TODO confirm). Distiller sizes
# come from `params` / `conv_param`, i.e. from the `conv_distiller` currently
# alive in the kernel.
# NOTE(review): confirm `conv_distiller` was built for this teacher.
print("WRN40-4 - WRN16-4")
model_t_params = 8972340
model_s_params = 2772020
conv_distiller_params = conv_param # including final layer
conv_distiller1, conv_distiller2, conv_distiller3, conv_distiller4, conv_distiller5 = [
    params[idx] for idx in range(5)
]
# Each ensemble is cumulative: it contains the previous ensemble.
model_s_ensemble1 = 432 + conv_distiller1
model_s_ensemble2 = sum((47264, 73984)) + conv_distiller2 + model_s_ensemble1
model_s_ensemble3 = sum((229760, 295424)) + conv_distiller3 + model_s_ensemble2
model_s_ensemble4 = sum((918272, 1180672, 512)) + model_s_ensemble3
final_ensemble = model_s_params + conv_distiller_params

# One line per model: absolute count and size relative to the teacher.
print("\n".join(
    template.format(count, (count / model_t_params) * 100)
    for template, count in (
        ("Teacher model: {} ({:.3f}%)", model_t_params),
        ("Student model: {} ({:.3f}%)", model_s_params),
        ("Ensemble 1: {} ({:.3f}%)", model_s_ensemble1),
        ("Ensemble 2: {} ({:.3f}%)", model_s_ensemble2),
        ("Ensemble 3: {} ({:.3f}%)", model_s_ensemble3),
        ("Ensemble 4: {} ({:.3f}%)", model_s_ensemble4),
        ("Final Ensemble: {} ({:.3f}%)", final_ensemble),
    )
))

WRN40-4 - WRN16-4
Teacher model: 8972340 (100.000%)
Student model: 2772020 (30.895%)
Ensemble 1: 938836 (10.464%)
Ensemble 2: 1867736 (20.817%)
Ensemble 3: 2946108 (32.835%)
Ensemble 4: 5045564 (56.235%)
Final Ensemble: 6561228 (73.127%)


In [111]:
# Parameter-count report for the pair named in the banner.
# Model totals and per-stage layer counts are hand-entered literals
# (presumably copied from model summaries -- TODO confirm). Distiller sizes
# come from `params` / `conv_param`, i.e. from the `conv_distiller` currently
# alive in the kernel.
# NOTE(review): confirm `conv_distiller` was built for this teacher.
print("WRN40-6 - WRN16-6")
model_t_params = 20152116
model_s_params = 6206772
conv_distiller_params = conv_param # including final layer
conv_distiller1, conv_distiller2, conv_distiller3, conv_distiller4, conv_distiller5 = [
    params[idx] for idx in range(5)
]
# Each ensemble is cumulative: it contains the previous ensemble.
model_s_ensemble1 = 432 + conv_distiller1
model_s_ensemble2 = sum((98528, 166272)) + conv_distiller2 + model_s_ensemble1
model_s_ensemble3 = sum((516672, 664320)) + conv_distiller3 + model_s_ensemble2
model_s_ensemble4 = sum((2065536, 2655744, 768)) + model_s_ensemble3
final_ensemble = model_s_params + conv_distiller_params

# One line per model: absolute count and size relative to the teacher.
print("\n".join(
    template.format(count, (count / model_t_params) * 100)
    for template, count in (
        ("Teacher model: {} ({:.3f}%)", model_t_params),
        ("Student model: {} ({:.3f}%)", model_s_params),
        ("Ensemble 1: {} ({:.3f}%)", model_s_ensemble1),
        ("Ensemble 2: {} ({:.3f}%)", model_s_ensemble2),
        ("Ensemble 3: {} ({:.3f}%)", model_s_ensemble3),
        ("Ensemble 4: {} ({:.3f}%)", model_s_ensemble4),
        ("Final Ensemble: {} ({:.3f}%)", final_ensemble),
    )
))

WRN40-6 - WRN16-6
Teacher model: 20152116 (100.000%)
Student model: 6206772 (30.800%)
Ensemble 1: 938836 (4.659%)
Ensemble 2: 2011288 (9.981%)
Ensemble 3: 3745468 (18.586%)
Ensemble 4: 8467516 (42.018%)
Final Ensemble: 9995980 (49.603%)


In [112]:
# Parameter-count report for the pair named in the banner.
# Model totals and per-stage layer counts are hand-entered literals
# (presumably copied from model summaries -- TODO confirm). Distiller sizes
# come from `params` / `conv_param`, i.e. from the `conv_distiller` currently
# alive in the kernel.
# NOTE(review): confirm `conv_distiller` was built for this teacher.
print("vgg19 - vgg11")
model_t_params = 20086692
model_s_params = 9277284
conv_distiller_params = conv_param # including final layer
(conv_distiller1, conv_distiller2, conv_distiller3,
 conv_distiller4, conv_distiller5, conv_distiller6) = [params[idx] for idx in range(6)]
# Each ensemble is cumulative: it contains the previous ensemble.
model_s_ensemble1 = sum((1792, 128)) + conv_distiller1
model_s_ensemble2 = sum((73856, 256)) + conv_distiller2 + model_s_ensemble1
model_s_ensemble3 = sum((295168, 512, 590080, 512)) + conv_distiller3 + model_s_ensemble2
model_s_ensemble4 = sum((1180160, 1024, 2359808, 1024)) + model_s_ensemble3
model_s_ensemble5 = sum((2359808, 1024, 2359808, 1024)) + model_s_ensemble4
final_ensemble = model_s_params + conv_distiller_params

# One line per model: absolute count and size relative to the teacher.
print("\n".join(
    template.format(count, (count / model_t_params) * 100)
    for template, count in (
        ("Teacher model: {} ({:.3f}%)", model_t_params),
        ("Student model: {} ({:.3f}%)", model_s_params),
        ("Ensemble 1: {} ({:.3f}%)", model_s_ensemble1),
        ("Ensemble 2: {} ({:.3f}%)", model_s_ensemble2),
        ("Ensemble 3: {} ({:.3f}%)", model_s_ensemble3),
        ("Ensemble 4: {} ({:.3f}%)", model_s_ensemble4),
        ("Ensemble 5: {} ({:.3f}%)", model_s_ensemble5),
        ("Final Ensemble: {} ({:.3f}%)", final_ensemble),
    )
))

vgg19 - vgg11
Teacher model: 20086692 (100.000%)
Student model: 9277284 (46.186%)
Ensemble 1: 940324 (4.681%)
Ensemble 2: 1822088 (9.071%)
Ensemble 3: 3261548 (16.237%)
Ensemble 4: 6803564 (33.871%)
Ensemble 5: 11525228 (57.377%)
Final Ensemble: 13066492 (65.050%)


In [113]:
# Parameter-count report for the pair named in the banner.
# Model totals and per-stage layer counts are hand-entered literals
# (presumably copied from model summaries -- TODO confirm). Distiller sizes
# come from `params` / `conv_param`, i.e. from the `conv_distiller` currently
# alive in the kernel.
# NOTE(review): confirm `conv_distiller` was built for this teacher.
print("vgg19 - vgg8")
model_t_params = 20086692
model_s_params = 3965028
conv_distiller_params = conv_param # including final layer
(conv_distiller1, conv_distiller2, conv_distiller3,
 conv_distiller4, conv_distiller5, conv_distiller6) = [params[idx] for idx in range(6)]
# Each ensemble is cumulative: it contains the previous ensemble.
model_s_ensemble1 = sum((1792, 128)) + conv_distiller1
model_s_ensemble2 = sum((73856, 256)) + conv_distiller2 + model_s_ensemble1
model_s_ensemble3 = sum((295168, 512)) + conv_distiller3 + model_s_ensemble2
model_s_ensemble4 = sum((1180160, 1024)) + model_s_ensemble3
model_s_ensemble5 = sum((2359808, 1024)) + model_s_ensemble4
final_ensemble = model_s_params + conv_distiller_params

# One line per model: absolute count and size relative to the teacher.
print("\n".join(
    template.format(count, (count / model_t_params) * 100)
    for template, count in (
        ("Teacher model: {} ({:.3f}%)", model_t_params),
        ("Student model: {} ({:.3f}%)", model_s_params),
        ("Ensemble 1: {} ({:.3f}%)", model_s_ensemble1),
        ("Ensemble 2: {} ({:.3f}%)", model_s_ensemble2),
        ("Ensemble 3: {} ({:.3f}%)", model_s_ensemble3),
        ("Ensemble 4: {} ({:.3f}%)", model_s_ensemble4),
        ("Ensemble 5: {} ({:.3f}%)", model_s_ensemble5),
        ("Final Ensemble: {} ({:.3f}%)", final_ensemble),
    )
))

vgg19 - vgg8
Teacher model: 20086692 (100.000%)
Student model: 3965028 (19.740%)
Ensemble 1: 940324 (4.681%)
Ensemble 2: 1822088 (9.071%)
Ensemble 3: 2670956 (13.297%)
Ensemble 4: 3852140 (19.178%)
Ensemble 5: 6212972 (30.931%)
Final Ensemble: 7754236 (38.604%)


In [114]:
# Parameter-count report for the pair named in the banner.
# Model totals and per-stage layer counts are hand-entered literals
# (presumably copied from model summaries -- TODO confirm). Distiller sizes
# come from `params` / `conv_param`, i.e. from the `conv_distiller` currently
# alive in the kernel.
# NOTE(review): confirm `conv_distiller` was built for this teacher.
print("vgg13 - vgg8")
model_t_params = 9462180
model_s_params = 3965028
conv_distiller_params = conv_param # including final layer
(conv_distiller1, conv_distiller2, conv_distiller3,
 conv_distiller4, conv_distiller5, conv_distiller6) = [params[idx] for idx in range(6)]
# Each ensemble is cumulative: it contains the previous ensemble.
model_s_ensemble1 = sum((1792, 128)) + conv_distiller1
model_s_ensemble2 = sum((73856, 256)) + conv_distiller2 + model_s_ensemble1
model_s_ensemble3 = sum((295168, 512)) + conv_distiller3 + model_s_ensemble2
model_s_ensemble4 = sum((1180160, 1024)) + model_s_ensemble3
model_s_ensemble5 = sum((2359808, 1024)) + model_s_ensemble4
final_ensemble = model_s_params + conv_distiller_params

# One line per model: absolute count and size relative to the teacher.
print("\n".join(
    template.format(count, (count / model_t_params) * 100)
    for template, count in (
        ("Teacher model: {} ({:.3f}%)", model_t_params),
        ("Student model: {} ({:.3f}%)", model_s_params),
        ("Ensemble 1: {} ({:.3f}%)", model_s_ensemble1),
        ("Ensemble 2: {} ({:.3f}%)", model_s_ensemble2),
        ("Ensemble 3: {} ({:.3f}%)", model_s_ensemble3),
        ("Ensemble 4: {} ({:.3f}%)", model_s_ensemble4),
        ("Ensemble 5: {} ({:.3f}%)", model_s_ensemble5),
        ("Final Ensemble: {} ({:.3f}%)", final_ensemble),
    )
))

vgg13 - vgg8
Teacher model: 9462180 (100.000%)
Student model: 3965028 (41.904%)
Ensemble 1: 940324 (9.938%)
Ensemble 2: 1822088 (19.257%)
Ensemble 3: 2670956 (28.228%)
Ensemble 4: 3852140 (40.711%)
Ensemble 5: 6212972 (65.661%)
Final Ensemble: 7754236 (81.950%)


In [None]:
# Parameter-count report for the pair named in the banner.
# Unlike the other report cells, every number here (including the distiller
# sizes) is a hand-entered literal, so the cell does not depend on kernel state.
print("wrn40_1 - wrn16_1")
model_t_params = 569780
model_s_params = 180916
conv_distiller_params = 546420 # including final layer
conv_distiller1, conv_distiller2, conv_distiller3, conv_distiller4, conv_distiller5 = (
    163988, 163988, 133572, 78372, 6500
)
# Each ensemble is cumulative: it contains the previous ensemble.
model_s_ensemble1 = 432 + conv_distiller1
model_s_ensemble2 = sum((4672, 4672)) + conv_distiller2 + model_s_ensemble1
model_s_ensemble3 = sum((14432, 18560)) + conv_distiller3 + model_s_ensemble2
model_s_ensemble4 = sum((57536, 73984)) + model_s_ensemble3
final_ensemble = model_s_params + conv_distiller_params

# One line per model: absolute count and size relative to the teacher.
print("\n".join(
    template.format(count, (count / model_t_params) * 100)
    for template, count in (
        ("Teacher model: {} ({:.3f}%)", model_t_params),
        ("Student model: {} ({:.3f}%)", model_s_params),
        ("Ensemble 1: {} ({:.3f}%)", model_s_ensemble1),
        ("Ensemble 2: {} ({:.3f}%)", model_s_ensemble2),
        ("Ensemble 3: {} ({:.3f}%)", model_s_ensemble3),
        ("Ensemble 4: {} ({:.3f}%)", model_s_ensemble4),
        ("Final Ensemble: {} ({:.3f}%)", final_ensemble),
    )
))