<a href="https://colab.research.google.com/github/liuyao12/imagenette_experiments/blob/master/ResNet_twist.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# ResNet with a Twist

> with depthwise (x4) + Ranger + Mish + SA + MaxBlurPool + ResTrick

See blog https://liuyao12.github.io/blog/research/2020/03/07/Conv-Twist.html

See summary at https://forums.fast.ai/t/imagenette-imagewoof-leaderboards/45822/47?u=liuyao 

## Imagewoof Leaderboard

Imagewoof is a small subset of ImageNet, consisting of 10 dog breeds, courtesy of fast.ai and Jeremy Howard.

(Imagewoof2, with a 70/30 train/test ratio)

| Size (px) | Epochs | SoTA| x2 | x4 | x4 twist | x6 | x4 double | runs |
|--|--|--| --|--| --|--|--|--|
|128|5|73.37|75.19|76.27||76.61| **82.12**|5, mean
|128|20|85.52|85.18|86.22||86.27| **88.93**|5, mean
|128|80|87.20|87.70|87.83||87.65| **90.15**|1
|128|200|87.20|
|192|5|77.87|79.86|81.15|80.73|| **82.69**|5, mean
|192|20|87.85|88.12|88.37|88.28|
|192|80|89.21|90.30|90.25|89.38|90.37| **92.08** |
|192|200|89.54
|256|5|
|256|20|
|256|80|
|256|200|


# setup and imports

In [None]:
pip install git+https://github.com/ayasyrev/model_constructor

In [None]:
pip install git+https://github.com/ayasyrev/imagenette_experiments

In [4]:
from fastai.basic_train import *
from fastai.vision import *
# from fastai.script import *

In [5]:
# pip install kornia
from kornia.contrib import MaxBlurPool2d

In [6]:
from imagenette_experiments.train_utils import *
from model_constructor.net import Net, act_fn
from model_constructor.layers import SimpleSelfAttention, ConvLayer

# ResBlock

In [7]:
class MnM(nn.Module): # Mix and Multiply
    """Mix-and-Multiply ("twist") layer.

    The first four channels of every ``group_size``-channel group of the
    input are fed through a grouped 1x1 convolution that emits two outputs
    per selected channel. Those outputs are multiplied by normalised x / y
    coordinate grids, summed pairwise, and the result is concatenated to
    the untouched input along the channel dimension.

    Args:
        channels: together with ``group_size`` sizes the 1x1 convolution,
            which has ``channels // group_size * 4`` input channels.
        group_size: number of consecutive input channels forming one group.
    """
    def __init__(self, channels, group_size):
        super().__init__()
        self.channels = channels
        self.gs = group_size
        n = channels//group_size*4
        # One group per selected channel, two outputs each (one pairs with
        # the x grid, the other with the y grid).
        self.conv = nn.Conv2d(n, n*2, 1, groups=n, bias=True)
        # Lazily built coordinate grid; see _coord_grid.
        self.XY = None

    def _coord_grid(self, x):
        """Return the (2*g, H, W) coordinate grid matching ``x``.

        The grid is cached on ``self.XY`` and rebuilt whenever the spatial
        size, device or dtype of ``x`` differs from the cached grid, so the
        layer keeps working if the input resolution or precision changes
        between calls.
        """
        H, W = x.shape[-2:]
        XY = self.XY
        if (XY is None or XY.shape[-2:] != (H, W)
                or XY.device != x.device or XY.dtype != x.dtype):
            # Coordinates i*2/size - 1, i.e. values in [-1, 1), computed in
            # float64 and then cast to the input dtype.
            xs = torch.arange(W, dtype=torch.float64) * 2 / W - 1
            ys = torch.arange(H, dtype=torch.float64) * 2 / H - 1
            XX = xs.view(1, 1, W).expand(1, H, W)   # varies along width
            YY = ys.view(1, H, 1).expand(1, H, W)   # varies along height
            g = self.channels//self.gs*4
            # Interleaved as XX, YY, XX, YY, ... to line up with the two
            # outputs of every conv group.
            XY = torch.cat([XX, YY]*g, dim=0).to(x.device).type(x.dtype)
            self.XY = XY
        return XY

    def forward(self, x):
        """Return ``x`` with the twist channels appended on dim 1."""
        N,C,H,W = x.size()
        # Keep the first 4 channels of each group of ``self.gs`` channels.
        x2 = x.view(N,-1,self.gs,H,W)[:,:,:4].reshape(N,-1,H,W)
        twist = self.conv(x2) * self._coord_grid(x)
        # Regroup channels as (..., 2, 4, ...) and sum over the size-2 axis,
        # halving the channel count.
        twist = torch.sum(twist.view(N,-1,2,4,H,W), dim=2).reshape(N,-1,H,W)
        return torch.cat([x, twist], dim=1)

In [8]:
class NewLayer(nn.Sequential):
    """Convolution followed by optional BatchNorm and activation.

    Args:
        ni: number of input channels.
        nf: number of output channels.
        ks: kernel size of the convolution.
        stride: stride of the (first) convolution.
        act: append ``act_fn`` when true.
        act_fn: activation module to append.
        bn_layer: append a ``BatchNorm2d(nf)`` when true.
        bn_1st: when true BatchNorm comes before the activation, otherwise
            after it.
        zero_bn: initialise the BatchNorm weight to 0 instead of 1.
        padding: convolution padding; defaults to ``ks // 2``.
        bias: whether the convolutions have a bias term.
        groups: ``groups`` argument of the convolution.
        **kwargs: accepted and ignored.

    When ``ks == 3`` and ``groups == 1`` the single convolution is replaced
    by a depthwise 3x3 (``ni -> ni*dm``) followed by a 1x1 (``ni*dm -> nf``).
    That branch reads the notebook-level global ``dm`` at construction time,
    so ``dm`` must be defined before such a layer is built. It also uses a
    fixed ``padding=1`` and does not consult the ``padding`` argument.
    """
    def __init__(self, ni, nf, ks=3, stride=1,
            act=True,  act_fn=nn.ReLU(inplace=True),
            bn_layer=True, bn_1st=True, zero_bn=False,
            padding=None, bias=False, groups=1, **kwargs):

        if padding is None: padding = ks//2
        if ks==3 and groups==1:  # to be used for the "stem" of ResNet
            # if ni==3: stride = 2
            layers = [('Conv3x3', nn.Conv2d(ni, ni*dm, 3, stride=stride, padding=1, bias=bias, groups=ni)),
                      ('Conv1x1', nn.Conv2d(ni*dm, nf, 1, bias=bias, groups=1))]
        else:
            layers = [('Conv{}x{}'.format(ks,ks),
                       nn.Conv2d(ni, nf, ks, stride=stride, padding=padding, bias=bias, groups=groups))]

        # Built as [act, bn] and reversed for bn_1st, giving conv -> bn -> act.
        act_bn = [('act_fn', act_fn)] if act else []
        if bn_layer:
            bn = nn.BatchNorm2d(nf)
            nn.init.constant_(bn.weight, 0. if zero_bn else 1.)
            act_bn += [('bn', bn)]
        if bn_1st: act_bn.reverse()
        layers += act_bn
        super().__init__(OrderedDict(layers))

In [9]:
class NewResBlock(Module):
    """Residual block built from ``NewLayer`` convolutions and an ``MnM`` twist.

    Args:
        expansion: channel multiplier. ``1`` builds two 3x3 layers; any
            other value builds 1x1 -> grouped 3x3 -> ``MnM`` -> 1x1.
        ni: input channels before multiplication by ``expansion``.
        nh: hidden channels; the block outputs ``nh * expansion`` channels.
        stride: when not 1, ``pool`` is applied to the input before both the
            convolution path and the identity path. The convolutions
            themselves are built without a stride argument.
        conv_layer: accepted for interface compatibility but overridden by
            ``NewLayer``.
        act_fn: activation used inside the layers and after the residual sum.
        zero_bn: zero-initialise the BatchNorm weight of the last conv layer.
        bn_1st: forwarded to the conv layers.
        pool: module used to downsample when ``stride != 1``.
        sa: append ``SimpleSelfAttention`` to the convolution path.
        sym: forwarded to ``SimpleSelfAttention``.
        groups: unused in this block.

    The bottleneck branch reads the notebook-level global ``dm`` when the
    block is constructed.

    NOTE(review): no explicit ``super().__init__()`` call here; presumably the
    imported ``Module`` base takes care of it -- confirm.
    """
    def __init__(self, expansion, ni, nh, stride=1,
                 conv_layer=ConvLayer, act_fn=act_fn, zero_bn=True, bn_1st=True,
                 pool=nn.AvgPool2d(2, ceil_mode=True), sa=False, sym=False, groups=1):
        nf,ni = nh*expansion,ni*expansion
        conv_layer = NewLayer
        self.reduce = noop if stride==1 else pool
        layers  = [("conv_0", conv_layer(ni, nh, 3, act_fn=act_fn, bn_1st=bn_1st)),
                   # conv_1 consumes conv_0's output, so its input width is nh
                   # (it used to be ni, which breaks whenever ni != nh).
                   ("conv_1", conv_layer(nh, nf, 3, zero_bn=zero_bn, act=False, bn_layer=True))
        ] if expansion == 1 else [
                   ("conv_0", conv_layer(ni, nh, 1, act_fn=act_fn, bn_1st=bn_1st)),
                   # ("conv_1", conv_layer(nh, nh, 3, act_fn=act_fn, bn_1st=bn_1st)),
                   ("conv_1", conv_layer(nh, nh*dm, 3, groups=nh, act_fn=act_fn, bn_1st=bn_1st)),
                   # ("conv_1", conv_layer(nh, nh*dm, 3, groups=nh, act=False, bn_layer=False)),
                   ("MnM", MnM(nf, dm)),
                   # conv_2 is sized for nh*(dm+4) input channels, i.e. the
                   # nh*dm channels of conv_1 plus what MnM appends.
                   ("conv_2", conv_layer(nh*(dm+4), nf, 1, zero_bn=zero_bn, act=False, bn_1st=bn_1st))
        ]
        if sa: layers.append(('sa', SimpleSelfAttention(nf,ks=1,sym=sym)))
        self.convs = nn.Sequential(OrderedDict(layers))
        # Project the shortcut with a 1x1 layer only when channel counts differ.
        self.idconv = noop if ni==nf else conv_layer(ni, nf, 1, act=False, bn_1st=bn_1st)
        self.merge = act_fn

    def forward(self, x):
        """Return ``merge(convs(o) + idconv(o))`` with ``o = reduce(x)``."""
        o = self.reduce(x)
        return self.merge(self.convs(o) + self.idconv(o))

# Model Constructor

In [10]:
# Configure the model_constructor Net for a 10-output classifier and swap in
# the custom pieces defined in this notebook.
# NOTE(review): layers=[3,6,8,3] is presumably the number of blocks per stage
# and expansion=4 the bottleneck expansion -- confirm against model_constructor.
model = Net(c_out=10, layers=[3,6,8,3], expansion=4)
model.block = NewResBlock
model.conv_layer = NewLayer # for the stem
# NOTE(review): positional args are presumably (kernel_size, ceil_mode) -- confirm
# against the installed kornia version.
pool = MaxBlurPool2d(3, True)
model.pool = pool
model.stem_sizes = [3,32,64,64]
model.act_fn = Mish()
model.sa = True

## Experiment

In [11]:
# Depth multiplier read (as a global) by NewLayer / NewResBlock when they are built.
dm = 4
# Maps an epoch budget to the list of final accuracies collected for it.
res = {}
for ep in [80]:  # original author note: "*5 + [20] + [80]"
    # Mixup is only switched on for runs longer than 20 epochs.
    mixup = 0.2 if ep > 20 else 0
    learn = get_learn(model=model, size=192, bs=16, mixup=mixup)
    learn.fit_fc(ep, lr=4e-3, moms=(0.95,0.95), start_pct=0.72)
    # First metric of the last recorded epoch.
    acc = learn.recorder.metrics[-1][0].item()
    res[ep] = res.get(ep, []) + [acc]
    print(f'{ep} epochs: {sum(res[ep])/len(res[ep])} ({len(res[ep])} runs)')
print(f'depth multiplier={dm}', {k: sum(v)/len(v) for k, v in res.items()})

Downloading https://s3.amazonaws.com/fast-ai-imageclas/imagewoof2.tgz


data path   /root/.fastai/data/imagewoof2




Learn path /root/.fastai/data/imagewoof2


epoch,train_loss,valid_loss,accuracy,top_k_accuracy,time
0,2.007803,1.790987,0.4141,0.878086,03:34
1,1.794687,1.500919,0.561212,0.937389,03:33
2,1.620749,1.284334,0.673199,0.956986,03:33
3,1.526108,1.198381,0.71265,0.966404,03:33
4,1.462588,1.147985,0.740646,0.972512,03:34
5,1.375338,1.057145,0.782387,0.972258,03:34
6,1.358792,1.044155,0.780606,0.976584,03:33
7,1.303343,0.977458,0.812166,0.977093,03:34
8,1.248779,0.964249,0.821838,0.975821,03:33
9,1.20088,0.952638,0.819292,0.978875,03:34


	addcmul_(Number value, Tensor tensor1, Tensor tensor2)
Consider using one of the following signatures instead:
	addcmul_(Tensor tensor1, Tensor tensor2, *, Number value) (Triggered internally at  /pytorch/torch/csrc/utils/python_arg_parser.cpp:766.)
  exp_avg_sq.mul_(beta2).addcmul_(1 - beta2, grad, grad)


80 epochs: 0.8964112997055054 (1 runs)
depth multiplier=4 {80: 0.8964112997055054}


In [1]:
# Mean final accuracy per epoch budget, followed by the per-epoch trace
# (validation loss and first metric) of the most recent learner.
print('depth multiplier={}'.format(dm), {ep: sum(res[ep])/len(res[ep]) for ep in res})
# Walk whatever the recorder actually holds rather than a hard-coded 80
# epochs, so shorter runs do not raise IndexError.
for i, (val_loss, epoch_metrics) in enumerate(zip(learn.recorder.val_losses, learn.recorder.metrics)):
    print(i, val_loss.item(), epoch_metrics[0].item())

NameError: ignored

## Experiment with changing model during training

In [30]:
# Keep the trained weights of `learn` so they can be loaded into a fresh learner.
# NOTE(review): no copy is made here; if the returned tensors alias the live
# parameters, further training of `learn` would also change `state` -- confirm.
state = learn.model.state_dict()

In [31]:
# Build a second learner, load the saved weights into it and print its
# validation results as a baseline before any further training.
epochs = 10
# Same rule as the experiment loop: mixup only for budgets above 20 epochs.
mixup = 0 if epochs<=20 else 0.2
learn2 = get_learn(model=model, size=192, bs=16, mixup=mixup)
learn2.model.load_state_dict(state)
print(learn2.validate())

data path   /root/.fastai/data/imagewoof2




Learn path /root/.fastai/data/imagewoof2




[0.7742255, tensor(0.8972), tensor(0.9842)]


In [32]:
# Continue training the reloaded model for 10 epochs at lr=4e-6, then print
# the first metric of the last recorded epoch.
learn2.fit_fc(10, lr=4e-6, moms=(0.95,0.95), start_pct=0.72)
print(learn2.recorder.metrics[-1][0].item())

epoch,train_loss,valid_loss,accuracy,top_k_accuracy,time
0,0.535328,0.776316,0.895902,0.983456,02:06
1,0.534004,0.773021,0.898702,0.984983,02:06
2,0.531789,0.771603,0.89692,0.983965,02:06
3,0.527236,0.77158,0.898193,0.983965,02:05
4,0.5321,0.770064,0.899211,0.984474,02:05
5,0.528425,0.768916,0.898956,0.984983,02:05
6,0.531735,0.769262,0.897938,0.985747,02:06
7,0.532737,0.76767,0.898193,0.985747,02:06
8,0.531163,0.768917,0.898193,0.984729,02:06
9,0.533564,0.768265,0.897429,0.984729,02:05




0.897429347038269


In [33]:
# Widen selected convolution weights stored in `state` (entries are reassigned,
# the dict's keys are unchanged):
#   * a 'Conv3x3' weight of shape (a,b,c,d) gets each output filter repeated
#     twice in a row, giving (2a,b,c,d);
#   * the next 'Conv1x1' weight whose input-channel count equals that `a` gets
#     each input channel repeated twice in a row, giving (a',2b,c,d).
# Both results are divided by 1.414.
# NOTE(review): 1.414 is presumably ~sqrt(2), so that the doubled pair keeps
# the same combined scale -- confirm the intent.
# NOTE(review): only entries whose names contain 'Conv3x3' / 'Conv1x1' are
# touched; anything sitting between such a pair (e.g. BatchNorm entries) keeps
# its original shape -- verify the target model matches.
channels = 0  # out-channel count of the Conv3x3 awaiting its Conv1x1; 0 = none pending
for name in state:
  if 'Conv3x3' in name and channels==0:
    a,b,c,d = state[name].size()
    # Insert a size-2 axis after dim 0 and fold it into dim 0.
    double = state[name].unsqueeze(1).expand(a,2,b,c,d).reshape(a*2,b,c,d)
    state[name] = double / 1.414
    channels = a
  if 'Conv1x1' in name and state[name].size()[1]==channels:
    a,b,c,d = state[name].size()
    # Insert a size-2 axis after dim 1 and fold it into dim 1.
    double = state[name].unsqueeze(2).expand(a,b,2,c,d).reshape(a,b*2,c,d)
    state[name] = double / 1.414
    channels = 0
print('done doubling')

done doubling


In [38]:
# Rebuild the learner, load the (modified) `state` into it and print its
# validation results.
# `dm` is the global depth multiplier read when NewLayer / NewResBlock are built.
dm = 4
# Relies on `epochs` set in an earlier cell.
mixup = 0 if epochs<=20 else 0.2
learn2 = get_learn(model=model, size=192, bs=16, mixup=mixup)
learn2.model.load_state_dict(state)
print(learn2.validate())

data path   /root/.fastai/data/imagewoof2




Learn path /root/.fastai/data/imagewoof2


[0.7742505, tensor(0.8972), tensor(0.9842)]


In [39]:
# Train `learn2` for 10 epochs at lr=4e-4, then print the first metric of the
# last recorded epoch.
learn2.fit_fc(10, lr=4e-4, moms=(0.95,0.95), start_pct=0.72)
print(learn2.recorder.metrics[-1][0].item())

epoch,train_loss,valid_loss,accuracy,top_k_accuracy,time
0,0.531288,0.768768,0.896157,0.984983,02:27
1,0.526129,0.769331,0.894375,0.983202,02:27
2,0.528508,0.773641,0.894884,0.980148,02:27
3,0.527456,0.772259,0.894121,0.981675,02:27
4,0.524152,0.772953,0.894884,0.982947,02:27
5,0.521762,0.776454,0.891575,0.980911,02:27
6,0.525762,0.774231,0.896157,0.981166,02:27
7,0.522501,0.77075,0.897938,0.981675,02:26
8,0.522169,0.769787,0.894884,0.982947,02:27
9,0.518775,0.765697,0.895648,0.982438,02:26




0.8956477642059326


In [37]:
# Train `learn2` for 10 epochs at lr=4e-5, then print the first metric of the
# last recorded epoch.
learn2.fit_fc(10, lr=4e-5, moms=(0.95,0.95), start_pct=0.72)
print(learn2.recorder.metrics[-1][0].item())

epoch,train_loss,valid_loss,accuracy,top_k_accuracy,time
0,0.532618,0.770861,0.897684,0.983965,02:27
1,0.532992,0.766575,0.898447,0.985238,02:27
2,0.530577,0.767157,0.897429,0.984729,02:27
3,0.524347,0.767554,0.898447,0.985238,02:27
4,0.530132,0.766334,0.897684,0.984474,02:27
5,0.527259,0.765086,0.896157,0.984729,02:27
6,0.524162,0.766871,0.89692,0.984474,02:27
7,0.529729,0.76505,0.898702,0.986002,02:27
8,0.533118,0.765301,0.898956,0.983711,02:27
9,0.530292,0.76303,0.899211,0.984983,02:27




0.8992109894752502


In [35]:
# Train `learn2` for 10 epochs at lr=4e-6, then print the first metric of the
# last recorded epoch.
learn2.fit_fc(10, lr=4e-6, moms=(0.95,0.95), start_pct=0.72)
print(learn2.recorder.metrics[-1][0].item())

epoch,train_loss,valid_loss,accuracy,top_k_accuracy,time
0,0.533471,0.772939,0.898447,0.98422,02:28
1,0.533423,0.770602,0.898702,0.985238,02:28
2,0.534358,0.771039,0.899211,0.983202,02:27
3,0.538733,0.769044,0.898702,0.985238,02:28
4,0.5355,0.769227,0.898447,0.984983,02:28
5,0.524419,0.770779,0.898193,0.983965,02:28
6,0.534346,0.769639,0.898193,0.983965,02:28
7,0.533792,0.768531,0.899975,0.984983,02:28
8,0.532404,0.770487,0.897175,0.98422,02:28
9,0.529191,0.768891,0.899211,0.984729,02:28




0.8992109894752502


In [None]:
# Repeat the experiment with a depth multiplier of 6.
# NOTE(review): `res` is not reset here, so accuracies are appended to
# whatever earlier cells already stored under the same epoch key.
dm = 6
for ep in [80]:  # original author note: "*5 + [20] + [80]"
    # Mixup is only switched on for runs longer than 20 epochs.
    mixup = 0.2 if ep > 20 else 0
    learn = get_learn(model=model, size=192, bs=16, mixup=mixup)
    learn.fit_fc(ep, lr=4e-3, moms=(0.95,0.95), start_pct=0.72)
    # First metric of the last recorded epoch.
    acc = learn.recorder.metrics[-1][0].item()
    res[ep] = res.get(ep, []) + [acc]
    print(f'{ep} epochs: {sum(res[ep])/len(res[ep])} ({len(res[ep])} runs)')
print(f'depth multiplier={dm}', {k: sum(v)/len(v) for k, v in res.items()})

data path   /root/.fastai/data/imagewoof2




Learn path /root/.fastai/data/imagewoof2


epoch,train_loss,valid_loss,accuracy,top_k_accuracy,time
0,2.020703,1.802997,0.427335,0.867651,03:09
1,1.779585,1.50151,0.564775,0.925681,03:09
2,1.65409,1.303978,0.665564,0.956732,03:09
3,1.513084,1.190918,0.718503,0.963858,03:09
4,1.462979,1.105817,0.762026,0.970476,03:09
5,1.379337,1.06335,0.775515,0.97353,03:09
6,1.32355,1.003739,0.801731,0.972767,03:09
7,1.273628,0.989483,0.81013,0.980657,03:09
8,1.264744,1.045115,0.777806,0.977348,03:08
9,1.220185,0.947007,0.821583,0.978621,03:09




80 epochs: 0.9008229374885559 (3 runs)
depth multiplier=6 {80: 0.9008229374885559}


In [None]:
# Display the accuracy lists collected so far, keyed by epoch budget.
res

{80: [0.902519702911377, 0.8961567878723145]}

In [None]:
# Per-epoch trace (validation loss and first metric) of the latest learner.
# Walk what the recorder actually holds rather than a hard-coded 80 epochs,
# so shorter runs do not raise IndexError.
for i, (val_loss, epoch_metrics) in enumerate(zip(learn.recorder.val_losses, learn.recorder.metrics)):
    print(i, val_loss.item(), epoch_metrics[0].item())

0 1.8019211292266846 0.40519216656684875
1 1.5199214220046997 0.5612115263938904
2 1.334391713142395 0.6543650031089783
3 1.2439961433410645 0.6948332786560059
4 1.1527239084243774 0.7291931509971619
5 1.1206214427947998 0.7495545744895935
6 1.052816390991211 0.7869687080383301
7 1.0666406154632568 0.7770425081253052
8 0.9904943704605103 0.8096207976341248
9 1.0099091529846191 0.7938406467437744
10 0.9543590545654297 0.824128270149231
11 0.956939160823822 0.8218376040458679
12 0.9327539205551147 0.8266734480857849
13 0.9148764610290527 0.8327818512916565
14 0.8986201882362366 0.8434716463088989
15 0.9222990274429321 0.8376176953315735
16 0.9147737622261047 0.8355816006660461
17 0.9077925682067871 0.8429625630378723
18 0.9136068820953369 0.8317638039588928
19 0.8858543038368225 0.8600152730941772
20 0.8733745813369751 0.8539068698883057
21 0.8703872561454773 0.8551794290542603
22 0.9229952692985535 0.8401628732681274
23 0.8630534410476685 0.8577246069908142
24 0.8858904838562012 0.85670