In [2]:
import random
import numpy as np
from PIL import Image

import torch
import torch.optim as optim
import torch.nn as nn
import torch.nn.functional as F

from torch.utils.data import DataLoader, Dataset
from torchvision.transforms import Compose, Normalize

from data_generation.image_classification import generate_dataset
from helpers import index_splitter, make_balanced_sampler
from stepbystep.v1 import StepByStep


2024-02-19 21:30:02.725739: I external/local_tsl/tsl/cuda/cudart_stub.cc:31] Could not find cuda drivers on your machine, GPU will not be used.
2024-02-19 21:30:02.767461: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-02-19 21:30:02.767535: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-02-19 21:30:02.768558: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2024-02-19 21:30:02.774088: I external/local_tsl/tsl/cuda/cudart_stub.cc:31] Could not find cuda drivers on your machine, GPU will not be used.
2024-02-19 21:30:02.774554: I tensorflow/core/platform/cpu_feature_guard.cc:1

## Convolution

In [3]:
single=np.array([[[[5,0,8,7,8,1],
                   [1,9,5,0,7,7],
                   [6,0,2,4,6,6],
                   [9,7,6,6,8,4],
                   [8,3,8,5,1,3],
                   [7,2,7,0,1,0]]]])

single.shape

(1, 1, 6, 6)

In [29]:
identity=np.array([[0,0,0],
                  [0,1,0],
                  [0,0,0]])
identity=identity.reshape(1,1,3,3)
identity.shape

(1, 1, 3, 3)

### Convolving (Applying filters)

In [33]:
region=single[:,:,:3,:3] # NCHW shape

filtered_region=region*identity
total=filtered_region.sum()
total

9

The size of the movement in pixels called `stride`

In [37]:
new_region=single[:,:,:3,1:4]
new_filtered_region=new_region*identity
new_total=new_filtered_region.sum()
new_total

5

The larger the filter, smaller the resulting image  

$(h_i,w_i)*(h_f,w_f)=(h_i-(h_f-1),w_i-(w_f-1))$
$\\(h_i,w_i)*f=(h_i-f+1,w_i-f+1)$


## Convolving in Pytorch

In [39]:
image=torch.as_tensor(single).float()
kernel_identity=torch.as_tensor(identity).float()

Functional convolution

In [40]:
convolved=F.conv2d(image,kernel_identity,stride=1)
convolved

tensor([[[[9., 5., 0., 7.],
          [0., 2., 4., 6.],
          [7., 6., 6., 8.],
          [3., 8., 5., 1.]]]])

Convolutional module: Learn kernel/filter on its own

In [41]:
conv=nn.Conv2d(in_channels=1,out_channels=1,kernel_size=3,stride=1)
conv(image)

tensor([[[[-3.9563, -4.4234, -4.0027, -4.8673],
          [-2.3975, -2.7985, -3.7731, -3.8462],
          [-1.2606, -2.0356, -1.1159, -3.8124],
          [-2.8716, -4.5506, -2.9141, -2.4178]]]],
       grad_fn=<ConvolutionBackward0>)

Learn multiple filters at once


In [43]:
conv_multiple=nn.Conv2d(in_channels=1,out_channels=2,kernel_size=3,stride=1)
conv_multiple.weight

Parameter containing:
tensor([[[[-0.0174,  0.1963, -0.1496],
          [ 0.0023, -0.0006,  0.0282],
          [ 0.0517,  0.1776, -0.2224]]],


        [[[-0.1472,  0.2946,  0.1234],
          [ 0.1434, -0.1047,  0.1713],
          [ 0.0069,  0.3052,  0.0946]]]], requires_grad=True)

Use convolutional module to use particular weights

In [44]:
with torch.no_grad():
    conv.weight[0]=kernel_identity
    conv.bias[0]=0

conv(image)

tensor([[[[9., 5., 0., 7.],
          [0., 2., 4., 6.],
          [7., 6., 6., 8.],
          [3., 8., 5., 1.]]]], grad_fn=<ConvolutionBackward0>)

$(h_i,w_i)*f=(\frac{h_i-f+1}{s},\frac{w_i-f+1}{s})$

In [45]:
convolution_stride2=F.conv2d(image,kernel_identity,stride=2)
convolution_stride2

tensor([[[[9., 0.],
          [7., 6.]]]])

## Padding; to preserve original size of the image after convolution
Expand the input image: Add zero rows and columns around the image

In [51]:
# symmetric padding
constant_padder=nn.ConstantPad2d(padding=1,value=0.0) # padding: num of columns and rows to be stuffed, value: value that filling the new cols and rows
constant_padder(image)

tensor([[[[0., 0., 0., 0., 0., 0., 0., 0.],
          [0., 5., 0., 8., 7., 8., 1., 0.],
          [0., 1., 9., 5., 0., 7., 7., 0.],
          [0., 6., 0., 2., 4., 6., 6., 0.],
          [0., 9., 7., 6., 6., 8., 4., 0.],
          [0., 8., 3., 8., 5., 1., 3., 0.],
          [0., 7., 2., 7., 0., 1., 0., 0.],
          [0., 0., 0., 0., 0., 0., 0., 0.]]]])

In [50]:
constant_padder(image).shape

torch.Size([1, 1, 8, 8])

In [52]:
# asymetric padding(change pad)
asy_padded=F.pad(image,pad=(1,1,1,1),mode='constant',value=0) # pad=(left,right, top, bottom)
asy_padded

tensor([[[[0., 0., 0., 0., 0., 0., 0., 0.],
          [0., 5., 0., 8., 7., 8., 1., 0.],
          [0., 1., 9., 5., 0., 7., 7., 0.],
          [0., 6., 0., 2., 4., 6., 6., 0.],
          [0., 9., 7., 6., 6., 8., 4., 0.],
          [0., 8., 3., 8., 5., 1., 3., 0.],
          [0., 7., 2., 7., 0., 1., 0., 0.],
          [0., 0., 0., 0., 0., 0., 0., 0.]]]])

Other padding modes: replicate, reflect, circular

In [54]:
replication_padder=nn.ReplicationPad2d(padding=1)
replication_padder(image)

tensor([[[[5., 5., 0., 8., 7., 8., 1., 1.],
          [5., 5., 0., 8., 7., 8., 1., 1.],
          [1., 1., 9., 5., 0., 7., 7., 7.],
          [6., 6., 0., 2., 4., 6., 6., 6.],
          [9., 9., 7., 6., 6., 8., 4., 4.],
          [8., 8., 3., 8., 5., 1., 3., 3.],
          [7., 7., 2., 7., 0., 1., 0., 0.],
          [7., 7., 2., 7., 0., 1., 0., 0.]]]])

In [55]:
reflection_padder=nn.ReflectionPad2d(padding=1)
reflection_padder(image)

tensor([[[[9., 1., 9., 5., 0., 7., 7., 7.],
          [0., 5., 0., 8., 7., 8., 1., 8.],
          [9., 1., 9., 5., 0., 7., 7., 7.],
          [0., 6., 0., 2., 4., 6., 6., 6.],
          [7., 9., 7., 6., 6., 8., 4., 8.],
          [3., 8., 3., 8., 5., 1., 3., 1.],
          [2., 7., 2., 7., 0., 1., 0., 1.],
          [3., 8., 3., 8., 5., 1., 3., 1.]]]])

In [56]:
circular_padding=nn.CircularPad2d(padding=1)
circular_padding(image)

tensor([[[[0., 7., 2., 7., 0., 1., 0., 7.],
          [1., 5., 0., 8., 7., 8., 1., 5.],
          [7., 1., 9., 5., 0., 7., 7., 1.],
          [6., 6., 0., 2., 4., 6., 6., 6.],
          [4., 9., 7., 6., 6., 8., 4., 9.],
          [3., 8., 3., 8., 5., 1., 3., 8.],
          [0., 7., 2., 7., 0., 1., 0., 7.],
          [1., 5., 0., 8., 7., 8., 1., 5.]]]])

$(h_i,w_i)*f=(\frac{h_i+2p-f+1}{s},\frac{w_i+2p-f+1}{s})$

In [57]:
edge=np.array([[[[0,1,0],
                 [1,-4,1],
                 [0,1,0]]]])

kernel_edge=torch.as_tensor(edge).float()
kernel_edge.shape

torch.Size([1, 1, 3, 3])

In [58]:
padded_image=F.pad(image,pad=(1,1,1,1),mode='constant',value=0.0)
conv_padded=F.conv2d(input=padded_image,weight=kernel_edge,stride=1)
conv_padded

tensor([[[[-19.,  22., -20., -12., -17.,  11.],
          [ 16., -30.,  -1.,  23.,  -7., -14.],
          [-14.,  24.,   7.,  -2.,   1.,  -7.],
          [-15., -10.,  -1.,  -1., -15.,   1.],
          [-13.,  13., -11.,  -5.,  13.,  -7.],
          [-18.,   9., -18.,  13.,  -3.,   4.]]]])

## Pooling: Shrinking images

In [59]:
pooled=F.max_pool2d(input=conv_padded,kernel_size=2)
pooled

tensor([[[[22., 23., 11.],
          [24.,  7.,  1.],
          [13., 13., 13.]]]])

In [60]:
# 4x4 pooling
maxpool4=nn.MaxPool2d(kernel_size=4)
maxpool4(conv_padded)

tensor([[[[24.]]]])

In [61]:
F.max_pool2d(input=conv_padded,kernel_size=3,stride=1)

tensor([[[[24., 24., 23., 23.],
          [24., 24., 23., 23.],
          [24., 24., 13., 13.],
          [13., 13., 13., 13.]]]])

## Flattening

In [64]:
flattened=nn.Flatten(1,-1)(pooled)
flattened

tensor([[22., 23., 11., 24.,  7.,  1., 13., 13., 13.]])

In [66]:
pooled.view(1,-1)

tensor([[22., 23., 11., 24.,  7.,  1., 13., 13., 13.]])

## Typical Architecture  

**Typical Convolutional block**: Preprocessing images and coverting them into features
1. Convolution
2. Activation function
3. Pooling

In [73]:
# LeNet-5
lenet=nn.Sequential()

# Featurizer
# block 1: 1@28x28-->6@28x28-->6@14x14
lenet.add_module('conv2d1',nn.Conv2d(in_channels=1,out_channels=6,kernel_size=5,padding=2))
lenet.add_module('activation1',nn.ReLU())
lenet.add_module('maxpool2d1',nn.MaxPool2d(kernel_size=2))

# block 2: 6@14x14-->16@10x10-->16@5x5
lenet.add_module('conv2d2',nn.Conv2d(in_channels=6,out_channels=16,kernel_size=5))
lenet.add_module('activation2',nn.ReLU())
lenet.add_module('maxpool2d2',nn.MaxPool2d(kernel_size=2))

# block 3: 16@5x5-->120@1x1
lenet.add_module('conv2d3',nn.Conv2d(in_channels=16,out_channels=120,kernel_size=5))
lenet.add_module('activation3',nn.ReLU())
lenet.add_module('flatten',nn.Flatten())

# Classification
# Hidden layer
lenet.add_module('linear1',nn.Linear(in_features=120,out_features=84))
# output layer
lenet.add_module('linear2',nn.Linear(in_features=84,out_features=10))

In [74]:
lenet

Sequential(
  (conv2d1): Conv2d(1, 6, kernel_size=(5, 5), stride=(1, 1), padding=(2, 2))
  (activation1): ReLU()
  (maxpool2d1): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
  (conv2d2): Conv2d(6, 16, kernel_size=(5, 5), stride=(1, 1))
  (activation2): ReLU()
  (maxpool2d2): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
  (conv2d3): Conv2d(16, 120, kernel_size=(5, 5), stride=(1, 1))
  (activation3): ReLU()
  (flatten): Flatten(start_dim=1, end_dim=-1)
  (linear1): Linear(in_features=120, out_features=84, bias=True)
  (linear2): Linear(in_features=84, out_features=10, bias=True)
)

## Multiclass Classification