In [None]:
#default_exp resnet_08

In [None]:
#export
from ModernArchitecturesFromScratch.basic_operations_01 import *
from ModernArchitecturesFromScratch.fully_connected_network_02 import *
from ModernArchitecturesFromScratch.model_training_03 import *
from ModernArchitecturesFromScratch.convolutions_pooling_04 import *
from ModernArchitecturesFromScratch.callbacks_05 import *
from ModernArchitecturesFromScratch.batchnorm_06 import *
from ModernArchitecturesFromScratch.optimizers_07 import *

# ResNet 
> Fully implemented ResNet architecture from scratch: https://arxiv.org/pdf/1512.03385.pdf

## Helper

In [None]:
#export
def get_runner(model=None, layers=None, lf=None, callbacks=None, opt=None, db=None):
    """Helper function to get a quick runner

    Every argument is optional; anything left as `None` is filled in:

    - `model`: `SequentialModel(*layers)` when `layers` is given, otherwise the
      first item returned by `get_linear_model(0.1)`.
    - `lf`: `CrossEntropy()`.
    - `callbacks`: `[Stats([accuracy]), ProgressCallback(), HyperRecorder(['lr'])]`.
    - `opt`: `adam_opt()`.
    - `db`: `get_mnist_databunch()`.

    Returns a `Runner` wrapping `Learner(model, lf, opt, db)`.
    """
    if model is None:
        model = SequentialModel(*layers) if layers is not None else get_linear_model(0.1)[0]
    if lf is None: lf = CrossEntropy()
    if callbacks is None:
        # Built per call: a default written in the signature is created once at
        # definition time, so every runner would share the same callback objects.
        callbacks = [Stats([accuracy]), ProgressCallback(), HyperRecorder(['lr'])]
    if db is None: db = get_mnist_databunch()
    if opt is None: opt = adam_opt()
    learn = Learner(model, lf, opt, db)
    return Runner(learn, callbacks)

## Nested Modules

We first need new classes that support architectures that aren't a single straight forward pass through a fixed sequence of layers. PyTorch normally handles this in the forward pass through autograd; here we need to be a bit more clever, because every module has to define its own gradients.

In [None]:
#export
class NestedModel(Module):
    "Base class for a module that delegates its work to an inner model stored in `self.layers`"
    def __init__(self):
        super().__init__()

    def forward(self, xb):
        "Run `xb` through the wrapped model and return its result"
        inner = self.layers
        return inner(xb)

    def bwd(self, out, inp):
        "Hand the backward pass to the wrapped model; `out` and `inp` are not used here"
        self.layers.backward()

    def parameters(self):
        "Yield every parameter of the wrapped model"
        yield from self.layers.parameters()

    def __repr__(self):
        return '\nSubModel( \n' + f'{self.layers}' + '\n)'

In [None]:
#export
class TestMixingGrads(NestedModel):
    "Small nested model (Linear -> ReLU -> Linear) used to check that nested SequentialModels work"
    def __init__(self):
        super().__init__()
        # Same three layers, built in the same order, then wrapped as the inner model
        stack = (
            Linear(784, 50, True),
            ReLU(),
            Linear(50, 25, False),
        )
        self.layers = SequentialModel(*stack)

Testing the gradients and the outputs:

In [None]:
# Put the nested test module inside an outer SequentialModel, followed by a final Linear layer
m = SequentialModel(TestMixingGrads(), Linear(25,10, False))
db = get_mnist_databunch()
# NOTE(review): `lf` and `optimizer` are not referenced by the next cell, which
# builds its own loss and passes `Optimizer` — confirm whether they are still needed
lf = CrossEntropy()
optimizer = adam_opt()
# Last expression: display the model's repr
m

(Layer1): 
SubModel( 
(Layer1): Linear(784, 50)
(Layer2): ReLU()
(Layer3): Linear(50, 25)
)
(Layer2): Linear(25, 10)

In [None]:
# NOTE(review): this passes the name `Optimizer`, not the `optimizer` variable
# created in the previous cell — confirm which one is intended
learn = Learner(m, CrossEntropy(), Optimizer, db)
# CheckGrad is the only callback attached to this runner
run = Runner(learn, [CheckGrad()])

In [None]:
# Presumably one epoch at learning rate 0.1 — confirm against `Runner.fit`
run.fit(1,0.1)

good
good
good
good
good
good


## Refactored Conv Layers

Before we can start making ResNets, we first define a few helper modules that abstract some of the layers:

In [None]:
#export
class AutoConv(Conv):
    "`Conv` whose padding is derived from the kernel size (`kernel_size // 2`) instead of being passed in"
    def __init__(self, n_in, n_out, kernel_size=3, stride=1):
        # Half the kernel width (rounded down) on each side
        half_kernel = kernel_size // 2
        super().__init__(n_in, n_out, kernel_size, stride, padding=Padding(half_kernel))

In [None]:
#export
class ConvBatch(NestedModel):
    """Performs conv then batchnorm

    Wraps `AutoConv(n_in, n_out, kernel_size, stride)` followed by
    `Batchnorm(n_out)` in a `SequentialModel`. Extra keyword arguments are
    accepted and ignored.
    """
    def __init__(self, n_in, n_out, kernel_size=3, stride=1, **kwargs):
        # Run the base-class initialisation like the other NestedModel subclasses do
        super().__init__()
        self.layers = SequentialModel(AutoConv(n_in, n_out, kernel_size, stride), 
                       Batchnorm(n_out))
    
    def __repr__(self): return f'{self.layers.layers[0]}, {self.layers.layers[1]}'

In [None]:
#export
class Identity(Module):
    "Pass-through module used for the skip connection: the input is returned untouched"
    def forward(self, xb):
        return xb

    def bwd(self, out, inp):
        # Adds onto the gradient `inp` already carries instead of replacing it.
        # NOTE(review): `forward` returns `xb` itself, so if `out` is that same
        # object this adds `inp.g` to itself — confirm against `Module.backward`.
        inp.g += out.g

    def __repr__(self):
        return 'Identity Connection'

## ResBlocks

The fully built-up ResNet blocks, which implement the skip connections characteristic of a ResNet.

In [None]:
#export
class BasicRes(Module):
    """Basic block to implement the two different ResBlocks presented in the paper

    Computes `shortcut(xb) + res_blocks(xb)`. Subclasses must assign
    `self.res_blocks` (the residual path) after calling `super().__init__`; it
    has to produce `n_out * expansion` channels, using `self.stride`.

    The shortcut (`self.identity`) is an `Identity` when the input already has
    the output shape, otherwise a 1x1 `AutoConv` projection that uses the same
    stride as the block. Extra positional/keyword arguments are accepted and
    ignored.
    """
    def __init__(self, n_in, n_out, expansion=1, stride=1, Activation=ReLU, *args, **kwargs):
        super().__init__()
        self.n_in, self.n_out, self.expansion, self.stride, self.Activation = n_in, n_out, expansion, stride, Activation
        
        # The projection follows the block's own stride (it used to be fixed at 2,
        # which only matches a residual path that also strides by 2).
        self.identity = Identity() if self.do_identity else AutoConv(self.n_in, self.get_expansion, kernel_size=1, stride=self.stride)
    
    def forward(self, xb): 
        "Return the sum of the shortcut output and the residual-path output"
        self.id_out = self.identity(xb)
        self.res_out = self.res_blocks(xb)
        self.out = self.id_out + self.res_out
        return self.out
    
    def bwd(self, out, inp):
        "Send `out.g` down both branches and store the sum of their input gradients in `inp.g`"
        grad = out.g
        # Residual branch first; keep the gradient it leaves on the shared input.
        # NOTE(review): assumes each sub-module's `bwd` assigns `inp.g` (rather
        # than accumulating), as the `+=` in `Identity.bwd` suggests — confirm.
        self.res_out.g = grad
        self.res_blocks.backward()
        res_grad = inp.g
        if isinstance(self.identity, Identity):
            # Identity returns the input object itself, so its gradient is just `grad`
            inp.g = res_grad + grad
        else:
            # The projection's backward pass replaces `inp.g`; add the residual part back
            self.id_out.g = grad
            self.identity.backward()
            inp.g = inp.g + res_grad
    
    @property
    def get_expansion(self):
        "Number of output channels of the block: `n_out * expansion`"
        return self.n_out * self.expansion
    
    @property
    def do_identity(self):
        "True when the shortcut needs no projection: channel count unchanged and stride 1"
        return self.n_in == self.get_expansion and self.stride == 1
    
    def parameters(self): 
        "Yield the residual path's parameters, then the shortcut's"
        for m in (self.res_blocks, self.identity):
            yield from m.parameters()

In [None]:
#export
class BasicResBlock(BasicRes):
    "Basic ResBlock layer, 2 `ConvBatch` layers with no expansion"
    # Class-level factor, read by callers as `block.expansion`
    expansion=1
    def __init__(self, n_in, n_out, *args, **kwargs):
        super().__init__(n_in, n_out, *args, **kwargs)
        
        # The first conv carries the block's stride. The last conv uses the same
        # width the shortcut is built with (`get_expansion`) so both branches agree.
        self.res_blocks = SequentialModel(
            ConvBatch(n_in, n_out, stride=self.stride),
            self.Activation(),
            ConvBatch(n_out, self.get_expansion)
        )
        

In [None]:
#export
class BottleneckBlock(BasicRes):
    "Bottleneck layer, 3 `ConvBatch` layers with an expansion factor of 4"
    # Class-level factor, read by callers as `block.expansion`
    expansion=4
    def __init__(self, n_in, n_out, *args, **kwargs):
        # `BasicRes` defaults `expansion` to 1; fall back to this class's factor
        # unless the caller supplied one (positionally through *args or by keyword).
        if not args: kwargs.setdefault('expansion', type(self).expansion)
        super().__init__(n_in, n_out, *args, **kwargs)
        
        # 1x1 reduce -> 3x3 -> 1x1 expand. The block's stride is applied on the
        # 3x3 conv, and the final conv widens to `n_out * expansion` channels so
        # the result can be added to the shortcut.
        self.res_blocks = SequentialModel(
            ConvBatch(n_in, n_out, kernel_size=1, stride=1),
            self.Activation(),
            ConvBatch(n_out, n_out, stride=self.stride),
            self.Activation(),
            ConvBatch(n_out, self.get_expansion, kernel_size=1)
        )

In [None]:
#export
class ResBlock(NestedModel):
    "Residual block (`block`) followed by the activation applied after the skip-connection sum"
    def __init__(self, n_in, n_out, block=BasicResBlock, stride=1, kernel_size=3, Activation=ReLU, **kwargs):
        super().__init__()
        # Kept for `__repr__`
        self.n_in = n_in
        self.n_out = n_out
        self.exp = block.expansion
        self.ks = kernel_size
        self.stride = stride

        residual = block(n_in=n_in, n_out=n_out, expansion=block.expansion, kernel_size=kernel_size,
                         stride=stride, Activation=Activation, **kwargs)
        self.layers = SequentialModel(residual, Activation())

    def __repr__(self):
        width = self.n_out * self.exp
        return f'ResBlock({self.n_in}, {width}, kernel_size={self.ks}, stride={self.stride})'
    

In [None]:
#export
class ResLayer(NestedModel):
    """Sequential ResBlock layers as outlined in the paper

    Builds `n` `ResBlock`s of type `block`. The first maps `n_in` to `n_out`
    and uses stride 2 when `n_in != n_out` (stride 1 otherwise); the remaining
    `n - 1` blocks take `n_out * block.expansion` channels in and use stride 1.

    An `Activation` keyword argument, when given, is passed on to every
    `ResBlock`. Other extra arguments are accepted and ignored.
    """
    def __init__(self, block, n, n_in, n_out, *args, **kwargs):
        super().__init__()
        self.block, self.n, self.n_in, self.n_out = block, n, n_in, n_out
        
        # Pass a caller-chosen activation down to the blocks; previously it was
        # swallowed here and the blocks always fell back to their own default.
        block_kwargs = {'Activation': kwargs['Activation']} if 'Activation' in kwargs else {}
        
        downsampling = 2 if n_in != n_out else 1

        first = ResBlock(n_in, n_out, block, stride=downsampling, **block_kwargs)
        rest = [ResBlock(n_out * block.expansion, n_out, block, stride=1, **block_kwargs) for _ in range(n-1)]
        
        self.layers = SequentialModel(first, *rest)
    
    def __repr__(self): return f'ResLayer(\n{self.layers}\n)'

```python
class ResLayer(NestedModel):
    "Sequential res layers"
    def __init__(self, block, n, n_in, n_out, *args, **kwargs):
        layers = []
        self.block, self.n, self.n_in, self.n_out = block, n, n_in, n_out
        
        downsampling = 2 if n_in != n_out else 1

        layers = [ResBlock(n_in, n_out, block, stride=downsampling),
        *[ResBlock(n_out * block.expansion, n_out, block, stride=1) for i in range(n-1)]]
        
        self.layers = SequentialModel(*layers)
    
    def __repr__(self): return f'ResLayer(\n{self.layers}\n)'
```

# ResNet

In [None]:
#export
class ResNet(NestedModel):
    """Class to create ResNet architectures of dynamic sizing

    - `block`: residual block class; its `expansion` attribute scales channel counts.
    - `layer_sizes`: base channel count of each stage.
    - `depths`: number of blocks in each stage.
    - `c_in` / `c_out`: input channels / size of the final `Linear` output.
    - `im_size`: (height, width) handed to the initial `Reshape`.
    - `activation`: activation class, instantiated with no arguments.

    Extra positional/keyword arguments are passed on to every `ResLayer`.
    """
    def __init__(self, block, layer_sizes=[64, 128, 256, 512], depths=[2,2,2,2], c_in=3, 
               c_out=1000, im_size=(28,28), activation=ReLU, *args, **kwargs):
        # Run the base-class initialisation like the other NestedModel subclasses do
        super().__init__()
        
        # Copy so the instance never aliases the shared default list
        self.layer_sizes = list(layer_sizes)
        
        gate = [
            Reshape(c_in, im_size[0], im_size[1]),
            ConvBatch(c_in, self.layer_sizes[0], stride=2, kernel_size=7),
            activation(),
            Pool(max_pool, ks=3, stride=2, padding=Padding(1))
        ]
        
        # (n_in, n_out) base widths for every stage after the first
        self.conv_sizes = list(zip(self.layer_sizes, self.layer_sizes[1:]))
        body = [
            ResLayer(block, depths[0], self.layer_sizes[0], self.layer_sizes[0], *args, Activation=activation, **kwargs),
            *[ResLayer(block, n, n_in * block.expansion, n_out, *args, Activation=activation, **kwargs)
              for (n_in,n_out), n in zip(self.conv_sizes, depths[1:])]
        ]
        
        # NOTE(review): the average pool uses ks=1 and the Linear layer's input
        # size is just the channel count, so this presumably relies on the
        # feature map already being 1x1 here — confirm for other `im_size` values.
        tail = [
            Pool(avg_pool, ks=1, stride=1, padding=None),
            Flatten(),
            Linear(self.layer_sizes[-1]*block.expansion, c_out, relu_after=False)
        ]

        self.layers = SequentialModel(*gate, *body, *tail)
    
    def __repr__(self): return f'ResNet: \n{self.layers}'

```python
class ResNet(NestedModel):
    "Class to create ResNet architectures of dynamic sizing"
    def __init__(self, block, layer_sizes=[64, 128, 256, 512], depths=[2,2,2,2], c_in=3, 
               c_out=1000, im_size=(28,28), activation=ReLU, *args, **kwargs):
        
        self.layer_sizes = layer_sizes
        
        gate = [
            Reshape(c_in, im_size[0], im_size[1]),
            ConvBatch(c_in, self.layer_sizes[0], stride=2, kernel_size=7),
            activation(),
            Pool(max_pool, ks=3, stride=2, padding=Padding(1))
        ]
        
        self.conv_sizes = list(zip(self.layer_sizes, self.layer_sizes[1:]))
        body = [
            ResLayer(block, depths[0], self.layer_sizes[0], self.layer_sizes[0], Activation=activation, *args, **kwargs),
            *[ResLayer(block, n, n_in * block.expansion, n_out, Activation=activation)
             for (n_in,n_out), n in zip(self.conv_sizes, depths[1:])]
        ]
        
        tail = [
            Pool(avg_pool, ks=1, stride=1, padding=None),
            Flatten(),
            Linear(self.layer_sizes[-1]*block.expansion, c_out, relu_after=False)
        ]

        self.layers = SequentialModel(
            *[layer for layer in gate],
            *[layer for layer in body],
            *[layer for layer in tail]
        )
    
    def __repr__(self): return f'ResNet: \n{self.layers}'
```

In [None]:
# Build a ResNet from BasicResBlock with every other argument left at its default
res = ResNet(BasicResBlock)
# Last expression: display the model's repr
res

ResNet: 
(Layer1): Reshape(3, 28, 28)
(Layer2): Conv(3, 64, ks = 7, stride = 2), Batchnorm
(Layer3): ReLU()
(Layer4): MaxPool(ks: 3, stride: 2)
(Layer5): ResLayer(
(Layer1): ResBlock(64, 64, kernel_size=3, stride=1)
(Layer2): ResBlock(64, 64, kernel_size=3, stride=1)
)
(Layer6): ResLayer(
(Layer1): ResBlock(64, 128, kernel_size=3, stride=2)
(Layer2): ResBlock(128, 128, kernel_size=3, stride=1)
)
(Layer7): ResLayer(
(Layer1): ResBlock(128, 256, kernel_size=3, stride=2)
(Layer2): ResBlock(256, 256, kernel_size=3, stride=1)
)
(Layer8): ResLayer(
(Layer1): ResBlock(256, 512, kernel_size=3, stride=2)
(Layer2): ResBlock(512, 512, kernel_size=3, stride=1)
)
(Layer9): AveragePool(ks: 1, stride: 1)
(Layer10): Flatten()
(Layer11): Linear(512, 1000)

In [None]:
#export
def GetResnet(size, c_in=3, c_out=10, *args, **kwargs):
    """Helper function to get ResNet architectures of different sizes

    - `size`: one of 18, 34, 50, 101 or 152. 150 is still accepted as a legacy
      alias for 101 (the key this helper originally used for those depths).
    - `c_in` / `c_out`: forwarded to `ResNet`.
    - `**kwargs`: forwarded to `ResNet`; extra positional arguments are ignored.

    Sizes 18 and 34 use `BasicResBlock`; the larger ones use `BottleneckBlock`.
    Raises `ValueError` for any other `size` instead of silently returning `None`.
    """
    depths_by_size = {
        18: [2, 2, 2, 2],
        34: [3, 4, 6, 3],
        50: [3, 4, 6, 3],
        101: [3, 4, 23, 3],
        150: [3, 4, 23, 3],  # legacy alias for 101
        152: [3, 8, 36, 3],
    }
    if size not in depths_by_size:
        raise ValueError(f'Unsupported ResNet size {size!r}; expected one of {sorted(depths_by_size)}')
    block = BasicResBlock if size in (18, 34) else BottleneckBlock
    return ResNet(c_in=c_in, c_out=c_out, block=block, depths=depths_by_size[size], size=size, **kwargs)

Testing out the ResNet Architectures:

In [None]:
# Size 18 with one input channel and ten outputs; the cell displays the model's repr
GetResnet(18, c_in=1, c_out=10)

ResNet: 
(Layer1): Reshape(1, 28, 28)
(Layer2): Conv(1, 64, ks = 7, stride = 2), Batchnorm
(Layer3): ReLU()
(Layer4): MaxPool(ks: 3, stride: 2)
(Layer5): ResLayer(
(Layer1): ResBlock(64, 64, kernel_size=3, stride=1)
(Layer2): ResBlock(64, 64, kernel_size=3, stride=1)
)
(Layer6): ResLayer(
(Layer1): ResBlock(64, 128, kernel_size=3, stride=2)
(Layer2): ResBlock(128, 128, kernel_size=3, stride=1)
)
(Layer7): ResLayer(
(Layer1): ResBlock(128, 256, kernel_size=3, stride=2)
(Layer2): ResBlock(256, 256, kernel_size=3, stride=1)
)
(Layer8): ResLayer(
(Layer1): ResBlock(256, 512, kernel_size=3, stride=2)
(Layer2): ResBlock(512, 512, kernel_size=3, stride=1)
)
(Layer9): AveragePool(ks: 1, stride: 1)
(Layer10): Flatten()
(Layer11): Linear(512, 10)

In [None]:
# Size 34 with one input channel and ten outputs; the cell displays the model's repr
GetResnet(34, c_in=1, c_out=10)

ResNet: 
(Layer1): Reshape(1, 28, 28)
(Layer2): Conv(1, 64, ks = 7, stride = 2), Batchnorm
(Layer3): ReLU()
(Layer4): MaxPool(ks: 3, stride: 2)
(Layer5): ResLayer(
(Layer1): ResBlock(64, 64, kernel_size=3, stride=1)
(Layer2): ResBlock(64, 64, kernel_size=3, stride=1)
(Layer3): ResBlock(64, 64, kernel_size=3, stride=1)
)
(Layer6): ResLayer(
(Layer1): ResBlock(64, 128, kernel_size=3, stride=2)
(Layer2): ResBlock(128, 128, kernel_size=3, stride=1)
(Layer3): ResBlock(128, 128, kernel_size=3, stride=1)
(Layer4): ResBlock(128, 128, kernel_size=3, stride=1)
)
(Layer7): ResLayer(
(Layer1): ResBlock(128, 256, kernel_size=3, stride=2)
(Layer2): ResBlock(256, 256, kernel_size=3, stride=1)
(Layer3): ResBlock(256, 256, kernel_size=3, stride=1)
(Layer4): ResBlock(256, 256, kernel_size=3, stride=1)
(Layer5): ResBlock(256, 256, kernel_size=3, stride=1)
(Layer6): ResBlock(256, 256, kernel_size=3, stride=1)
)
(Layer8): ResLayer(
(Layer1): ResBlock(256, 512, kernel_size=3, stride=2)
(Layer2): ResBlock(51

In [None]:
# Size 50 (first bottleneck variant) with one input channel and ten outputs; the cell displays the model's repr
GetResnet(50, c_in=1, c_out=10)

ResNet: 
(Layer1): Reshape(1, 28, 28)
(Layer2): Conv(1, 64, ks = 7, stride = 2), Batchnorm
(Layer3): ReLU()
(Layer4): MaxPool(ks: 3, stride: 2)
(Layer5): ResLayer(
(Layer1): ResBlock(64, 256, kernel_size=3, stride=1)
(Layer2): ResBlock(256, 256, kernel_size=3, stride=1)
(Layer3): ResBlock(256, 256, kernel_size=3, stride=1)
)
(Layer6): ResLayer(
(Layer1): ResBlock(256, 512, kernel_size=3, stride=2)
(Layer2): ResBlock(512, 512, kernel_size=3, stride=1)
(Layer3): ResBlock(512, 512, kernel_size=3, stride=1)
(Layer4): ResBlock(512, 512, kernel_size=3, stride=1)
)
(Layer7): ResLayer(
(Layer1): ResBlock(512, 1024, kernel_size=3, stride=2)
(Layer2): ResBlock(1024, 1024, kernel_size=3, stride=1)
(Layer3): ResBlock(1024, 1024, kernel_size=3, stride=1)
(Layer4): ResBlock(1024, 1024, kernel_size=3, stride=1)
(Layer5): ResBlock(1024, 1024, kernel_size=3, stride=1)
(Layer6): ResBlock(1024, 1024, kernel_size=3, stride=1)
)
(Layer8): ResLayer(
(Layer1): ResBlock(1024, 2048, kernel_size=3, stride=2)
(L

In [None]:
# NOTE(review): `GetResnet` maps size 150 to depths [3, 4, 23, 3], which are the
# ResNet-101 depths in the paper — confirm whether 150 is the intended key
GetResnet(150, c_in=1, c_out=10)

ResNet: 
(Layer1): Reshape(1, 28, 28)
(Layer2): Conv(1, 64, ks = 7, stride = 2), Batchnorm
(Layer3): ReLU()
(Layer4): MaxPool(ks: 3, stride: 2)
(Layer5): ResLayer(
(Layer1): ResBlock(64, 256, kernel_size=3, stride=1)
(Layer2): ResBlock(256, 256, kernel_size=3, stride=1)
(Layer3): ResBlock(256, 256, kernel_size=3, stride=1)
)
(Layer6): ResLayer(
(Layer1): ResBlock(256, 512, kernel_size=3, stride=2)
(Layer2): ResBlock(512, 512, kernel_size=3, stride=1)
(Layer3): ResBlock(512, 512, kernel_size=3, stride=1)
(Layer4): ResBlock(512, 512, kernel_size=3, stride=1)
)
(Layer7): ResLayer(
(Layer1): ResBlock(512, 1024, kernel_size=3, stride=2)
(Layer2): ResBlock(1024, 1024, kernel_size=3, stride=1)
(Layer3): ResBlock(1024, 1024, kernel_size=3, stride=1)
(Layer4): ResBlock(1024, 1024, kernel_size=3, stride=1)
(Layer5): ResBlock(1024, 1024, kernel_size=3, stride=1)
(Layer6): ResBlock(1024, 1024, kernel_size=3, stride=1)
(Layer7): ResBlock(1024, 1024, kernel_size=3, stride=1)
(Layer8): ResBlock(1024,

In [None]:
# Size 152 with one input channel and ten outputs; the cell displays the model's repr
GetResnet(152, c_in=1, c_out=10)

ResNet: 
(Layer1): Reshape(1, 28, 28)
(Layer2): Conv(1, 64, ks = 7, stride = 2), Batchnorm
(Layer3): ReLU()
(Layer4): MaxPool(ks: 3, stride: 2)
(Layer5): ResLayer(
(Layer1): ResBlock(64, 256, kernel_size=3, stride=1)
(Layer2): ResBlock(256, 256, kernel_size=3, stride=1)
(Layer3): ResBlock(256, 256, kernel_size=3, stride=1)
)
(Layer6): ResLayer(
(Layer1): ResBlock(256, 512, kernel_size=3, stride=2)
(Layer2): ResBlock(512, 512, kernel_size=3, stride=1)
(Layer3): ResBlock(512, 512, kernel_size=3, stride=1)
(Layer4): ResBlock(512, 512, kernel_size=3, stride=1)
(Layer5): ResBlock(512, 512, kernel_size=3, stride=1)
(Layer6): ResBlock(512, 512, kernel_size=3, stride=1)
(Layer7): ResBlock(512, 512, kernel_size=3, stride=1)
(Layer8): ResBlock(512, 512, kernel_size=3, stride=1)
)
(Layer7): ResLayer(
(Layer1): ResBlock(512, 1024, kernel_size=3, stride=2)
(Layer2): ResBlock(1024, 1024, kernel_size=3, stride=1)
(Layer3): ResBlock(1024, 1024, kernel_size=3, stride=1)
(Layer4): ResBlock(1024, 1024, k

In [None]:
# Wrap a size-18 ResNet in a runner; the loss, optimizer, data and callbacks are left to `get_runner`'s defaults
run = get_runner(model=GetResnet(18,c_in=1, c_out=10))