In [58]:
import torch 
import torch.nn as nn
from simple_conv_net_func import diff_mse
from simple_conv_net_func import conv2d_scalar, pool2d_scalar, relu_scalar, reshape_scalar, fc_layer_scalar, conv2d_vector, pool2d_vector, relu_vector, reshape_vector, fc_layer_vector
import torch.nn.functional as F

In [59]:
# Fix the global RNG seed so the randomly initialised layer weights and every
# torch.randn sample drawn afterwards are identical on each Restart & Run All.
SEED = 0
torch.manual_seed(SEED)

# Device handle that is passed to every custom implementation benchmarked below.
device = torch.device("cpu")

# Reference convolution layer (1 input channel -> 20 output channels, 5x5 kernel).
# Its weight and bias are also fed to the custom convolutions so that both
# sides compute with the same parameters.
conv_layer = nn.Conv2d(in_channels=1,
                        out_channels=20,
                        kernel_size=5,
                        stride=1,
                        padding=0,
                        dilation=1,
                        groups=1,
                        bias=True)

# Scalar benchmarks

## 1. Convolution

In [60]:
# Random 4-D input (batch=1, channels=1, 28x28), matching conv_layer's in_channels=1.
sample = torch.randn(1, 1, 28, 28)

In [61]:
%%time
# Reference output from PyTorch's nn.Conv2d; compared with the custom result via diff_mse below.
# %%time measures a single run, so this figure can include one-time start-up cost.
t_out = conv_layer(sample)

CPU times: user 1.95 ms, sys: 6.62 ms, total: 8.57 ms
Wall time: 11.2 ms


In [62]:
%%time
# Scalar implementation under test: reuses conv_layer's weight and bias so that
# its output is directly comparable to t_out.
c_out = conv2d_scalar(sample, conv_layer.weight, conv_layer.bias, device)

CPU times: user 13.7 s, sys: 890 ms, total: 14.5 s
Wall time: 15.7 s


In [63]:
# Discrepancy between the PyTorch reference (t_out) and the custom output (c_out).
# NOTE(review): diff_mse lives in simple_conv_net_func; presumably a mean-squared error — confirm there.
print(f"Difference {diff_mse(t_out, c_out)}")

Difference 4.2098744500488564e-15


## 2. Maxpooling

In [64]:
# Fresh random 4-D input (1, 1, 28, 28) for the pooling comparison.
sample = torch.randn(1, 1, 28, 28)

In [65]:
%%time
# Reference: PyTorch max pooling; the two positional 2s are kernel_size and stride.
t_out = F.max_pool2d(sample, 2, 2)

CPU times: user 523 µs, sys: 4.14 ms, total: 4.66 ms
Wall time: 26.2 ms


In [66]:
%%time
# Scalar implementation under test.
# NOTE(review): no window/stride is passed, so the 2x2, stride-2 configuration is
# presumably fixed inside pool2d_scalar — confirm in simple_conv_net_func.
c_out = pool2d_scalar(sample, device)

CPU times: user 1.28 s, sys: 340 ms, total: 1.62 s
Wall time: 1.71 s


In [67]:
# Discrepancy between the PyTorch reference (t_out) and the custom output (c_out).
# NOTE(review): diff_mse lives in simple_conv_net_func; presumably a mean-squared error — confirm there.
print(f"Difference {diff_mse(t_out, c_out)}")

Difference 0.0


## 3. ReLU

In [68]:
# Random 2-D input of shape (1, 2800); roughly half the values are negative,
# so the ReLU clamp is actually exercised.
sample = torch.randn(1, 2800)

In [69]:
%%time
# Reference: PyTorch's functional ReLU applied to the whole tensor.
t_out = F.relu(sample)

CPU times: user 224 µs, sys: 461 µs, total: 685 µs
Wall time: 1.01 ms


In [70]:
%%time
# Scalar implementation under test; its output is compared with t_out in the next cell.
c_out = relu_scalar(sample, device)

CPU times: user 110 ms, sys: 3.79 ms, total: 113 ms
Wall time: 123 ms


In [71]:
# Discrepancy between the PyTorch reference (t_out) and the custom output (c_out).
# NOTE(review): diff_mse lives in simple_conv_net_func; presumably a mean-squared error — confirm there.
print(f"Difference {diff_mse(t_out, c_out)}")

Difference 0.0


## 4. Reshape (Flatten)

In [72]:
# Random 4-D tensor (1, 1, 28, 28) to be flattened.
sample = torch.randn(1, 1, 28, 28)

In [73]:
%%time
# Reference flatten: view as a single row and let -1 infer the remaining
# dimension (1 * 28 * 28 = 784 for this sample).
t_out = sample.view(1, -1)

CPU times: user 107 µs, sys: 2 µs, total: 109 µs
Wall time: 123 µs


In [74]:
%%time
# Scalar implementation under test.
# NOTE(review): no target shape is passed; presumably reshape_scalar flattens to
# (batch, -1) like the reference — confirm in simple_conv_net_func.
c_out = reshape_scalar(sample, device)

CPU times: user 33.1 ms, sys: 4.25 ms, total: 37.3 ms
Wall time: 42.4 ms


In [75]:
# Discrepancy between the PyTorch reference (t_out) and the custom output (c_out).
# NOTE(review): diff_mse lives in simple_conv_net_func; presumably a mean-squared error — confirm there.
print(f"Difference {diff_mse(t_out, c_out)}")

Difference 0.0


## 5. Linear (Fully connected)

In [76]:
# Reference fully connected layer (500 -> 100); its weight and bias are also
# handed to the custom implementation. Defaults are spelled out like conv_layer's.
fc_layer = nn.Linear(in_features=500,
                     out_features=100,
                     bias=True)

In [77]:
# One random row of 500 features, matching fc_layer's in_features=500.
sample = torch.randn(1, 500)

In [78]:
%%time
# Reference output from PyTorch's nn.Linear.
t_out = fc_layer(sample)

CPU times: user 2.99 ms, sys: 7.16 ms, total: 10.2 ms
Wall time: 11 ms


In [79]:
%%time
# Scalar implementation under test: reuses fc_layer's weight and bias so that
# its output is directly comparable to t_out.
c_out = fc_layer_scalar(sample, fc_layer.weight, fc_layer.bias, device)

CPU times: user 3.15 s, sys: 224 ms, total: 3.37 s
Wall time: 3.59 s


In [80]:
# Discrepancy between the PyTorch reference (t_out) and the custom output (c_out).
# NOTE(review): diff_mse lives in simple_conv_net_func; presumably a mean-squared error — confirm there.
print(f"Difference {diff_mse(t_out, c_out)}")

Difference 3.754768306170331e-14


# Vectorized benchmarks

## 1. Convolution

In [81]:
# New random input (batch=1, channels=1, 28x28) for the vectorized convolution run.
sample = torch.randn(1, 1, 28, 28)

In [82]:
%%time
# Reference output from PyTorch's nn.Conv2d, recomputed for the new sample.
t_out = conv_layer(sample)

CPU times: user 1.22 ms, sys: 1.14 ms, total: 2.35 ms
Wall time: 2.61 ms


In [83]:
%%time
# Vectorized implementation under test: same weight and bias as conv_layer, so
# the result is directly comparable to t_out.
c_out = conv2d_vector(sample, conv_layer.weight, conv_layer.bias, device)

CPU times: user 594 ms, sys: 239 ms, total: 832 ms
Wall time: 899 ms


In [84]:
# Discrepancy between the PyTorch reference (t_out) and the vectorized output (c_out).
# NOTE(review): diff_mse lives in simple_conv_net_func; presumably a mean-squared error — confirm there.
print(f"Difference {diff_mse(t_out, c_out)}")

Difference 4.22367388730905e-15


## 2. Maxpooling

In [85]:
# New random 4-D input (1, 1, 28, 28) for the vectorized pooling run.
sample = torch.randn(1, 1, 28, 28)

In [86]:
%%time
# Reference: PyTorch max pooling; the two positional 2s are kernel_size and stride.
t_out = F.max_pool2d(sample, 2, 2)

CPU times: user 324 µs, sys: 40 µs, total: 364 µs
Wall time: 570 µs


In [87]:
%%time
# Vectorized implementation under test.
# NOTE(review): no window/stride is passed, so the 2x2, stride-2 configuration is
# presumably fixed inside pool2d_vector — confirm in simple_conv_net_func.
c_out = pool2d_vector(sample, device)

CPU times: user 19.2 ms, sys: 4.5 ms, total: 23.7 ms
Wall time: 38.5 ms


In [88]:
# Discrepancy between the PyTorch reference (t_out) and the vectorized output (c_out).
# NOTE(review): diff_mse lives in simple_conv_net_func; presumably a mean-squared error — confirm there.
print(f"Difference {diff_mse(t_out, c_out)}")

Difference 0.0


## 3. ReLU

In [89]:
# New random 2-D input of shape (1, 2800) for the vectorized ReLU run.
sample = torch.randn(1, 2800)

In [90]:
%%time
# Reference: PyTorch's functional ReLU applied to the whole tensor.
t_out = F.relu(sample)

CPU times: user 91 µs, sys: 9 µs, total: 100 µs
Wall time: 111 µs


In [91]:
%%time
# Vectorized implementation under test; its output is compared with t_out in the next cell.
c_out = relu_vector(sample, device)

CPU times: user 276 µs, sys: 88 µs, total: 364 µs
Wall time: 514 µs


In [92]:
# Discrepancy between the PyTorch reference (t_out) and the vectorized output (c_out).
# NOTE(review): diff_mse lives in simple_conv_net_func; presumably a mean-squared error — confirm there.
print(f"Difference {diff_mse(t_out, c_out)}")

Difference 0.0


## 4. Reshape (Flatten)

In [93]:
# New random 4-D tensor (1, 1, 28, 28) to be flattened by the vectorized reshape.
sample = torch.randn(1, 1, 28, 28)

In [94]:
%%time
# Reference flatten: view as a single row and let -1 infer the remaining
# dimension (1 * 28 * 28 = 784 for this sample).
t_out = sample.view(1, -1)

CPU times: user 60 µs, sys: 4 µs, total: 64 µs
Wall time: 91.8 µs


In [95]:
%%time
# Vectorized implementation under test.
# NOTE(review): no target shape is passed; presumably reshape_vector flattens to
# (batch, -1) like the reference — confirm in simple_conv_net_func.
c_out = reshape_vector(sample, device)

CPU times: user 95 µs, sys: 18 µs, total: 113 µs
Wall time: 122 µs


In [96]:
# Discrepancy between the PyTorch reference (t_out) and the vectorized output (c_out).
# NOTE(review): diff_mse lives in simple_conv_net_func; presumably a mean-squared error — confirm there.
print(f"Difference {diff_mse(t_out, c_out)}")

Difference 0.0


## 5. Linear (Fully connected)

In [97]:
# Re-create the reference fully connected layer (500 -> 100) for the vectorized
# run; constructing it again draws a new set of random weights.
fc_layer = nn.Linear(in_features=500,
                     out_features=100,
                     bias=True)

In [98]:
# One random row of 500 features, matching fc_layer's in_features=500.
sample = torch.randn(1, 500)

In [99]:
%%time
# Reference output from PyTorch's nn.Linear.
t_out = fc_layer(sample)

CPU times: user 293 µs, sys: 137 µs, total: 430 µs
Wall time: 341 µs


In [100]:
%%time
# Vectorized implementation under test: reuses fc_layer's weight and bias so
# that its output is directly comparable to t_out.
c_out = fc_layer_vector(sample, fc_layer.weight, fc_layer.bias, device)

CPU times: user 1.67 ms, sys: 2.48 ms, total: 4.15 ms
Wall time: 4.25 ms


In [101]:
# Discrepancy between the PyTorch reference (t_out) and the vectorized output (c_out).
# NOTE(review): diff_mse lives in simple_conv_net_func; presumably a mean-squared error — confirm there.
print(f"Difference {diff_mse(t_out, c_out)}")

Difference 0.0


In [None]:
# Intentionally left without code: the stray, undefined name that used to be
# here raised NameError and broke "Restart Kernel -> Run All".