In [3]:
import torch 
import torch.nn as nn
from simple_conv_net_func import diff_mse
from simple_conv_net_func import conv2d_scalar, pool2d_scalar, relu_scalar, reshape_scalar, fc_layer_scalar, conv2d_vector, pool2d_vector, relu_vector, reshape_vector, fc_layer_vector
import torch.nn.functional as F

In [54]:
# Seed the global RNG so the layer initialisation here and every torch.randn
# sample drawn later are reproducible under "Restart Kernel -> Run All".
torch.manual_seed(0)

# All benchmarks in this notebook run on the CPU.
device = torch.device("cpu")

# Reference PyTorch convolution. Its weight and bias are handed to the custom
# conv2d_* functions so both code paths compute with identical parameters.
# Any input passed to this layer must have exactly `conv_layer.in_channels`
# channels, otherwise nn.Conv2d raises a RuntimeError.
conv_layer = nn.Conv2d(in_channels=5,
                        out_channels=20,
                        kernel_size=5,
                        stride=1,
                        padding=0,
                        dilation=1,
                        groups=1,
                        bias=True)

# Scalar benchmarks

## 1. Convolution

In [5]:
# Take the channel count from conv_layer so the input always matches the layer;
# nn.Conv2d raises a RuntimeError when the channel dimension differs from in_channels.
# NOTE(review): conv2d_scalar's multi-channel support and runtime are not visible here — confirm.
sample = torch.randn((1, conv_layer.in_channels, 28, 28))

In [6]:
%%time
# Reference result: forward pass through PyTorch's nn.Conv2d layer, timed by %%time.
t_out = conv_layer(sample)

CPU times: user 1.48 ms, sys: 1.15 ms, total: 2.63 ms
Wall time: 1.27 ms


In [7]:
%%time
# Custom scalar convolution from simple_conv_net_func, fed the same input and the
# reference layer's weight and bias so its output is comparable to t_out.
c_out = conv2d_scalar(sample, conv_layer.weight, conv_layer.bias, device)

CPU times: user 14.1 s, sys: 914 ms, total: 15 s
Wall time: 15.9 s


In [8]:
# Agreement check between the nn.Conv2d reference (t_out) and conv2d_scalar (c_out).
# diff_mse comes from simple_conv_net_func; presumably a mean-squared difference,
# so a value near 0 means the two outputs match — confirm against its definition.
print(f"Difference {diff_mse(t_out, c_out)}")

Difference 4.313246774448245e-15


## 2. Maxpooling

In [9]:
# Single-image, single-channel 28x28 random input for the pooling benchmark.
sample = torch.randn(1, 1, 28, 28)

In [10]:
%%time
t_out = F.max_pool2d(sample, 2, 2)

CPU times: user 336 µs, sys: 198 µs, total: 534 µs
Wall time: 295 µs


In [11]:
%%time
# Custom scalar pooling from simple_conv_net_func on the same input.
# Presumably the loop-based counterpart of the 2x2 / stride-2 max pool used as
# the reference — its window parameters are not visible from this call.
c_out = pool2d_scalar(sample, device)

CPU times: user 1.83 s, sys: 1.82 s, total: 3.65 s
Wall time: 6.68 s


In [12]:
# Agreement check between F.max_pool2d (t_out) and pool2d_scalar (c_out);
# presumably 0 means the outputs are identical (diff_mse is defined elsewhere).
print(f"Difference {diff_mse(t_out, c_out)}")

Difference 0.0


## 3. ReLU

In [13]:
# One row of 2800 random values as the ReLU benchmark input.
sample = torch.randn(1, 2800)

In [14]:
%%time
# Reference result: PyTorch's functional ReLU applied to the whole sample.
t_out = F.relu(sample)

CPU times: user 107 µs, sys: 91 µs, total: 198 µs
Wall time: 266 µs


In [15]:
%%time
# Custom scalar ReLU from simple_conv_net_func on the same input.
c_out = relu_scalar(sample, device)

CPU times: user 135 ms, sys: 7.01 ms, total: 142 ms
Wall time: 346 ms


In [16]:
# Agreement check between F.relu (t_out) and relu_scalar (c_out);
# presumably 0 means the outputs are identical (diff_mse is defined elsewhere).
print(f"Difference {diff_mse(t_out, c_out)}")

Difference 0.0


## 4. Reshape (Flatten)

In [17]:
# Single-image, single-channel 28x28 random input for the flatten benchmark.
sample = torch.randn(1, 1, 28, 28)

In [18]:
%%time
# Reference flatten: view(1, -1) exposes the sample as a single row, with the
# second dimension inferred from the element count.
t_out = sample.view(1, -1)

CPU times: user 245 µs, sys: 27 µs, total: 272 µs
Wall time: 286 µs


In [19]:
%%time
# Custom scalar reshape from simple_conv_net_func on the same input.
c_out = reshape_scalar(sample, device)

CPU times: user 29.2 ms, sys: 2.54 ms, total: 31.7 ms
Wall time: 64.2 ms


In [20]:
# Agreement check between sample.view (t_out) and reshape_scalar (c_out);
# presumably 0 means the outputs are identical (diff_mse is defined elsewhere).
print(f"Difference {diff_mse(t_out, c_out)}")

Difference 0.0


## 5. Linear (Fully connected)

In [21]:
# Reference fully connected layer: 500 input features -> 100 output features, with bias.
fc_layer = nn.Linear(500, 100, bias=True)

In [22]:
# One row of 500 random features, matching the layer's in_features.
sample = torch.randn(1, 500)

In [23]:
%%time
# Reference result: forward pass through PyTorch's nn.Linear layer.
t_out = fc_layer(sample)

CPU times: user 1.68 ms, sys: 4.73 ms, total: 6.41 ms
Wall time: 21 ms


In [24]:
%%time
# Custom scalar fully connected layer from simple_conv_net_func, fed the same
# input and the reference layer's weight and bias so its output is comparable to t_out.
c_out = fc_layer_scalar(sample, fc_layer.weight, fc_layer.bias, device)

CPU times: user 3.82 s, sys: 369 ms, total: 4.19 s
Wall time: 4.98 s


In [25]:
# Agreement check between nn.Linear (t_out) and fc_layer_scalar (c_out);
# presumably a value near 0 means the outputs match (diff_mse is defined elsewhere).
print(f"Difference {diff_mse(t_out, c_out)}")

Difference 5.971200774503374e-14


# Vectorized benchmarks

## 1. Convolution

In [55]:
# Take the channel count from conv_layer instead of repeating it as a literal, so the
# input keeps matching the layer if in_channels is changed in the config cell.
sample = torch.randn((1, conv_layer.in_channels, 28, 28))

In [56]:
%%time
# Reference result: forward pass through PyTorch's nn.Conv2d layer.
t_out = conv_layer(sample)

CPU times: user 2.07 ms, sys: 9.11 ms, total: 11.2 ms
Wall time: 20.6 ms


In [57]:
%%time
# Custom vectorized convolution from simple_conv_net_func, fed the reference layer's
# weight and bias.
# NOTE(review): the recorded run of this cell raised RuntimeError (tensor sizes 100 vs 20
# at dimension 0). For conv_layer, 100 == out_channels * in_channels and 20 == out_channels,
# so conv2d_vector presumably does not handle in_channels > 1 — confirm in simple_conv_net_func.
c_out = conv2d_vector(sample, conv_layer.weight, conv_layer.bias, device)

RuntimeError: The size of tensor a (100) must match the size of tensor b (20) at non-singleton dimension 0

In [29]:
# Agreement check between nn.Conv2d (t_out) and conv2d_vector (c_out).
# NOTE(review): the conv2d_vector call is recorded as raising RuntimeError; when it fails,
# c_out is not reassigned and this compares t_out against a stale c_out from an earlier cell.
# The output recorded under this cell has an older execution count (29) than the failing
# call (57), so it does not reflect the current state — re-run after conv2d_vector is fixed.
print(f"Difference {diff_mse(t_out, c_out)}")

Difference 5.039436919249563e-15


## 2. Maxpooling

In [30]:
# Single-image, single-channel 28x28 random input for the pooling benchmark.
sample = torch.randn(1, 1, 28, 28)

In [31]:
%%time
t_out = F.max_pool2d(sample, 2, 2)

CPU times: user 624 µs, sys: 1.19 ms, total: 1.81 ms
Wall time: 2.26 ms


In [32]:
%%time
# Custom vectorized pooling from simple_conv_net_func on the same input.
# Presumably the counterpart of the 2x2 / stride-2 max pool used as the
# reference — its window parameters are not visible from this call.
c_out = pool2d_vector(sample, device)

CPU times: user 16.8 ms, sys: 2.67 ms, total: 19.4 ms
Wall time: 39.1 ms


In [33]:
# Agreement check between F.max_pool2d (t_out) and pool2d_vector (c_out);
# presumably 0 means the outputs are identical (diff_mse is defined elsewhere).
print(f"Difference {diff_mse(t_out, c_out)}")

Difference 0.0


## 3. ReLU

In [34]:
# One row of 2800 random values as the ReLU benchmark input.
sample = torch.randn(1, 2800)

In [35]:
%%time
# Reference result: PyTorch's functional ReLU applied to the whole sample.
t_out = F.relu(sample)

CPU times: user 141 µs, sys: 27 µs, total: 168 µs
Wall time: 113 µs


In [36]:
%%time
# Custom vectorized ReLU from simple_conv_net_func on the same input.
c_out = relu_vector(sample, device)

CPU times: user 3.59 ms, sys: 7.38 ms, total: 11 ms
Wall time: 16.4 ms


In [37]:
# Agreement check between F.relu (t_out) and relu_vector (c_out);
# presumably 0 means the outputs are identical (diff_mse is defined elsewhere).
print(f"Difference {diff_mse(t_out, c_out)}")

Difference 0.0


## 4. Reshape (Flatten)

In [38]:
# Single-image, single-channel 28x28 random input for the flatten benchmark.
sample = torch.randn(1, 1, 28, 28)

In [39]:
%%time
# Reference flatten: view(1, -1) exposes the sample as a single row, with the
# second dimension inferred from the element count.
t_out = sample.view(1, -1)

CPU times: user 159 µs, sys: 177 µs, total: 336 µs
Wall time: 206 µs


In [40]:
%%time
# Custom vectorized reshape from simple_conv_net_func on the same input.
c_out = reshape_vector(sample, device)

CPU times: user 570 µs, sys: 373 µs, total: 943 µs
Wall time: 628 µs


In [41]:
# Agreement check between sample.view (t_out) and reshape_vector (c_out);
# presumably 0 means the outputs are identical (diff_mse is defined elsewhere).
print(f"Difference {diff_mse(t_out, c_out)}")

Difference 0.0


## 5. Linear (Fully connected)

In [42]:
# Fresh reference fully connected layer for the vectorized benchmark:
# 500 input features -> 100 output features, with bias (newly initialised weights).
fc_layer = nn.Linear(500, 100, bias=True)

In [43]:
# One row of 500 random features, matching the layer's in_features.
sample = torch.randn(1, 500)

In [49]:
%%time
# Reference result: forward pass through PyTorch's nn.Linear layer.
t_out = fc_layer(sample)

CPU times: user 366 µs, sys: 211 µs, total: 577 µs
Wall time: 288 µs


In [50]:
%%time
# Custom vectorized fully connected layer from simple_conv_net_func, fed the same
# input and the reference layer's weight and bias so its output is comparable to t_out.
c_out = fc_layer_vector(sample, fc_layer.weight, fc_layer.bias, device)

CPU times: user 822 µs, sys: 508 µs, total: 1.33 ms
Wall time: 800 µs


In [51]:
# Agreement check between nn.Linear (t_out) and fc_layer_vector (c_out);
# presumably 0 means the outputs are identical (diff_mse is defined elsewhere).
print(f"Difference {diff_mse(t_out, c_out)}")

Difference 0.0
