In [0]:
from __future__ import print_function
import argparse
import torch
import torch.nn.functional as F
import torch.optim as optim
from torchvision import datasets, transforms
import time
import torch.nn as nn


import numpy as np


In [0]:
class CNNScalarForm(nn.Module):

    def __init__(self, check_with_pytorch=False):
        super(CNNScalarForm, self).__init__()
        self.conv1 = nn.Conv2d(1, 20, 5, 1)
        self.fc1 = nn.Linear(12 * 12 * 20, 500)
        self.fc2 = nn.Linear(500, 10)

        self.check_with_pytorch = check_with_pytorch

    def forward(self, x):
        if self.check_with_pytorch:
            return self._forward_with_cmp(x)

        return self._forward(x)


    def _forward(self, x):
        z_conv = self._conv(x, self.conv1.weight, self.conv1.bias)
        z_pool = self._max_pool_kernel2(z_conv)

        z_reshape = self._reshape(z_pool)

        z_fc1 = self._fc(z_reshape, self.fc1.weight, self.fc1.bias)

        z_relu = self._relu(z_fc1)

        z_fc2 = self._fc(z_relu, self.fc2.weight, self.fc2.bias)

        z_softmax = z_fc2.softmax(dim=1)

        return z_softmax

    def _forward_with_cmp(self, x):

        z_conv = self._conv(x, self.conv1.weight, self.conv1.bias)
        z_conv_target = self.conv1(x)
        mse = F.mse_loss(z_conv, z_conv_target)
        print(f"Check conv. MSE: {mse}")

        z_pool = self._max_pool_kernel2(z_conv)
        z_pool_target = F.max_pool2d(z_conv_target, 2, 2)
        mse = F.mse_loss(z_pool, z_pool_target)
        print(f"Check max pool. MSE: {mse}")

        z_reshape = self._reshape(z_pool)
        z_reshape_target = z_pool_target.view(-1, 12 * 12 * 20)
        mse = F.mse_loss(z_reshape, z_reshape_target)
        print(f"Check reshape. MSE: {mse}")

        z_fc1 = self._fc(z_reshape, self.fc1.weight, self.fc1.bias)
        z_fc1_target = self.fc1(z_reshape_target)
        mse = F.mse_loss(z_fc1, z_fc1_target)
        print(f"Check fc1. MSE: {mse}")

        z_relu = self._relu(z_fc1)
        z_relu_target = F.relu(z_fc1_target)
        mse = F.mse_loss(z_relu, z_relu_target)
        print(f"Check relu. MSE: {mse}")


        z_fc2 = self._fc(z_relu, self.fc2.weight, self.fc2.bias)
        z_fc2_target = self.fc2(z_relu_target)
        mse = F.mse_loss(z_fc2, z_fc2_target)
        print(f"Check fc2. MSE: {mse}")


        z_softmax = z_fc2.softmax(dim=1)
        z_softmax_target = z_fc2_target.softmax(dim=1)
        mse = F.mse_loss(z_softmax, z_softmax_target)
        print(f"Check softmax. MSE: {mse}")

        return z_softmax


    def _conv(self, A, W, bias):
        n_batch = A.shape[0]
        s_in = A.shape[2]

        c_out = W.shape[0]
        k = W.shape[2]

        s_out = s_in - k + 1

        z = np.zeros((n_batch, c_out, s_out, s_out))

        for n in range(0, n_batch):
            for c in range(0, c_out):
                for m in range(0, s_out):
                    for l in range(0, s_out):
                        z[n, c, m, l] = self._conv_helper(A[n], W[c], m, l) + bias[c]


        Z = torch.as_tensor(z, dtype=torch.float32)
        Z.requires_grad_()

        return Z.cuda()

    def _max_pool_kernel2(self, x):
        n_batch = x.shape[0]
        b = x.shape[1]
        s_conv = x.shape[2]

        s_pool = s_conv // 2

        z = np.zeros((n_batch, b, s_pool, s_pool))

        for n in range(0, n_batch):
            for c in range(0, b):
                for m in range(0, s_pool):
                    for l in range(0, s_pool):
                        z[n, c, m, l] = torch.max(torch.tensor([x[n][c][2 * m][2 * l], x[n][c][2 * m][2 * l + 1],
                                            x[n][c][2 * m + 1][2 * l], x[n][c][2 * m + 1][2 * l + 1]]))

        Z = torch.as_tensor(z, dtype=torch.float32)
        Z.requires_grad_()
        return Z.cuda()

    def _reshape(self, x):
        n_batch = x.shape[0]
        b = x.shape[1]
        s_pool = x.shape[2]

        n_reshape = b * s_pool * s_pool

        z = np.zeros((n_batch, n_reshape))

        for n in range(0, n_batch):
            for c in range(0, b):
                for m in range(0, s_pool):
                    for l in range(0, s_pool):
                        j = c * s_pool * s_pool + m * s_pool + l

                        z[n][j] = x[n][c][m][l]

        Z = torch.as_tensor(z, dtype=torch.float32)
        Z.requires_grad_()

        return Z.cuda()

    def _fc(self, x, weight, bias):
        n_batch = x.shape[0]
        d = x.shape[1]

        p = weight.shape[0]

        z = np.zeros((n_batch, p))

        for n in range(0, n_batch):
            for j in range(0, p):
                sum = 0
                for i in range(0, d):
                    sum += weight[j][i] * x[n][i]

                z[n][j] = sum + bias[j]

        Z = torch.as_tensor(z, dtype=torch.float32)
        Z.requires_grad_()

        return Z.cuda()

    def _relu(self, x):
        n_batch = x.shape[0]
        p = x.shape[1]

        z = np.zeros((n_batch, p))
        for n in range(0, n_batch):
            for i in range(0, p):
                z[n, i] = max(0, x[n, i])

        Z = torch.as_tensor(z, dtype=torch.float32)
        Z.requires_grad_()
        return Z.cuda()

    def _conv_helper(self, x_n, w_c_out, m, l):
        c_in = x_n.shape[0]
        k = w_c_out.shape[1]

        value = 0
        for c in range(0, c_in):
            for i in range(0, k):
                for j in range(0, k):
                    value += x_n[c, m + i, l + j] * w_c_out[c, i, j]
        return value


In [0]:
class CNNVectorForm(nn.Module):

    def __init__(self, check_with_pytorch):
        super(CNNVectorForm, self).__init__()
        self.conv1 = nn.Conv2d(1, 20, 5, 1)
        self.fc1 = nn.Linear(12 * 12 * 20, 500)
        self.fc2 = nn.Linear(500, 10)

        self.check_with_pytorch = check_with_pytorch


    def forward(self, x):
        if self.check_with_pytorch:
            return self._forward_with_cmp(x)

        return self._forward(x)


    def _forward(self, x):
        z_conv = self._conv2d_vector_bonus(x, self.conv1.weight, self.conv1.bias)
        z_pool = self._max_pool(z_conv, 2, 2)

        z_reshape = z_pool.view(z_pool.shape[0], z_pool.shape[1] * z_pool.shape[2] * z_pool.shape[3])

        z_fc1 = self._fc(z_reshape, self.fc1.weight, self.fc1.bias)

        z_relu = z_fc1.relu()

        z_fc2 = self._fc(z_relu, self.fc2.weight, self.fc2.bias)

        z_softmax = z_fc2.softmax(dim=1)

        return z_softmax


    def _forward_with_cmp(self, x):
        z_conv = self._conv2d_vector_bonus(x, self.conv1.weight, self.conv1.bias)
        z_conv_target =  self.conv1(x)
        mse = F.mse_loss(z_conv, z_conv_target)
        print(f"Check conv. MSE: {mse}")

        z_pool = self._max_pool(z_conv, 2, 2)
        z_pool_target = F.max_pool2d(z_conv_target, 2, 2)
        mse = F.mse_loss(z_pool, z_pool_target)
        print(f"Check max pool. MSE: {mse}")

        z_reshape = z_pool.view(z_pool.shape[0], z_pool.shape[1] * z_pool.shape[2] * z_pool.shape[3])
        z_reshape_target = z_pool_target.view(-1, 12 * 12 * 20)
        mse = F.mse_loss(z_reshape, z_reshape_target)
        print(f"Check reshape. MSE: {mse}")

        z_fc1 = self._fc(z_reshape, self.fc1.weight, self.fc1.bias)
        z_fc1_target = self.fc1(z_reshape_target)
        mse = F.mse_loss(z_fc1, z_fc1_target)
        print(f"Check fc1. MSE: {mse}")

        z_relu = z_fc1.relu()
        z_relu_target = F.relu(z_fc1_target)
        mse = F.mse_loss(z_relu, z_relu_target)
        print(f"Check relu. MSE: {mse}")


        z_fc2 = self._fc(z_relu, self.fc2.weight, self.fc2.bias)
        z_fc2_target = self.fc2(z_relu_target)
        mse = F.mse_loss(z_fc2, z_fc2_target)
        print(f"Check fc2. MSE: {mse}")


        z_softmax = z_fc2.softmax(dim=1)
        z_softmax_target = z_fc2_target.softmax(dim=1)
        mse = F.mse_loss(z_softmax, z_softmax_target)
        print(f"Check softmax. MSE: {mse}")

        return z_softmax



    def _im2col_KK(self, x, k, s):
        c_in = x.shape[0]
        s_in = x.shape[1]

        s_out = (s_in - k) // s + 1

        h_col = k * k
        w_col = c_in * s_out * s_out

        X_col = torch.zeros(h_col, w_col, requires_grad=True).cuda()
        i = 0
        for c in range(0, c_in):
            for m in range(0, s_in - k + 1, s):
                for l in range(0, s_in - k + 1, s):
                    x_col_i = self._mat2row(x[c, m:m + k, l:l + k])
                    X_col[:, i] = x_col_i
                    i += 1

        return X_col

    def _weight2row_kk(self, wc_conv):
        c_out = wc_conv.shape[0]
        k = wc_conv.shape[1]

        wc_row_conv = torch.zeros(c_out, k * k, requires_grad=True).cuda()
        for j in range(0, c_out):
            wc_row_j = self._mat2row(wc_conv[j, :, :])
            wc_row_conv[j, :] = wc_row_j

        return wc_row_conv

    def _max_pool(self, A, k, s):
        n_batch = A.shape[0]
        c_in = A.shape[1]
        s_in = A.shape[2]

        s_out = (s_in - k) // s + 1

        Z = torch.zeros(n_batch, c_in * s_out * s_out, requires_grad=True).cuda()
        for n in range(0, n_batch):
            A_col_n = self._im2col_KK(A[n], k, s)

            Z[n] = A_col_n.t().max(1).values

        Z_pool = Z.view(n_batch, c_in, s_out, s_out)

        return Z_pool

    # bonus fun

    def _conv2d_vector_bonus(self, A, W, bias, s=1):
        n_btach = A.shape[0]
        c_in = A.shape[1]
        s_in = A.shape[2]

        c_out = W.shape[0]
        k = W.shape[2]

        s_out = s_out = (s_in - k) // s + 1

        B = torch.zeros(c_out, s_out, s_out, requires_grad=True).cuda()
        for i in range(0, c_out):
            B[i] = bias[i]
          

        Z = torch.zeros(n_btach, c_out, s_out, s_out, requires_grad=True).cuda()

        for n in range(0, n_btach):
            W_row_n = self._weight2row_bonus(W)
            A_col_n = self._im2col_bonus(A[n], k, s)

            O_mat_n = torch.mm(W_row_n, A_col_n).cuda()

            O_n = O_mat_n.view((c_out, s_out, s_out))
            Z_n = O_n + B

            Z[n] = Z_n

        return Z

    def _weight2row_bonus(self, W):
        c_out = W.shape[0]  # num of filters
        c_in = W.shape[1]
        k = W.shape[2]

        W_row = torch.zeros(c_out, c_in * k * k, requires_grad=True).cuda()
        for j in range(0, c_out):
            W_row[j, :] = self._tensor2row(W[j, :, :, :])

        return W_row

    def _im2col_bonus(self, X, k, s):
        c_in = X.shape[0]
        s_in = X.shape[1]

        s_out = (s_in - k) // s + 1

        h_col = c_in * k * k
        w_col = s_out * s_out

        X_col = torch.zeros(h_col, w_col, requires_grad=True).cuda()

        i = 0
        for m in range(0, s_in - k + 1, s):
            for l in range(0, s_in - k + 1, s):
                X_col[:, i] = self._tensor2row(X[:, m:m + k, l:l + k])
                i += 1

        return X_col

    def _tensor2row(self, R):
        r = self._tensor2col(R).t().cuda()
        return r

    def _tensor2col(self, C):
        N = C.shape[0]
        M = C.shape[1]
        L = C.shape[2]

        c = torch.zeros(N * M * L, 1, requires_grad=True).cuda()

        for j in range(0, N):
            for i in range(0, M):
                for k in range(0, L):
                    t = j * M * L + i * L + k
                    c[t] = C[j, i, k]

        return c

    def _mat2row(self, C):

        N = C.shape[0]
        M = C.shape[1]

        # TODO: think about it
        # c = C.view(N * M, 1)

        c = torch.zeros(M * N, requires_grad=True).cuda()

        for j in range(0, N):
            for i in range(0, M):
                t = j * M + i

                c[t] = C[j][i]

        return c

    def _fc(self, A, W, bias):
        n_batch = A.shape[0]

        ones = torch.ones(n_batch, requires_grad=True).view(n_batch).tolist()

        A_temp = A.t().tolist()
        A_temp.append(ones)
        A_new = torch.tensor(A_temp, requires_grad=True).t().cuda()

        W_temp = W.t().tolist()
        W_temp.append(bias.tolist())
        W_new = torch.tensor(W_temp, requires_grad=True).cuda()

        Z = torch.mm(A_new, W_new)

        return Z

In [0]:
def train(log_interval, model, device, train_loader, optimizer, epoch, num_iter=None):
    model.train()
    i = 0
    for batch_idx, (data, target) in enumerate(train_loader):
        data, target = data.to(device), target.to(device)
        optimizer.zero_grad()
        output = model(data)
        loss = F.nll_loss(output, target)
        loss.backward()
        optimizer.step()
        if batch_idx % log_interval == 0:
            print('Train Epoch: {} [{}/{} ({:.0f}%)]\tLoss: {:.6f}'.format(
                epoch, batch_idx * len(data), len(train_loader.dataset),
                       100. * batch_idx / len(train_loader), loss.item()))
            
        i += 1
        if num_iter is not None and i >= num_iter:
            break


def test(model, device, test_loader):
    model.eval()
    test_loss = 0
    correct = 0
    with torch.no_grad():
        for data, target in test_loader:
            data, target = data.to(device), target.to(device)
            output = model(data)
            test_loss += F.nll_loss(output, target, reduction='sum').item()  # sum up batch loss
            pred = output.argmax(dim=1, keepdim=True)  # get the index of the max log-probability
            correct += pred.eq(target.view_as(pred)).sum().item()

    test_loss /= len(test_loader.dataset)

    print('\nTest set: Average loss: {:.4f}, Accuracy: {}/{} ({:.0f}%)\n'.format(
        test_loss, correct, len(test_loader.dataset),
        100. * correct / len(test_loader.dataset)))


#### Parameters

In [0]:
use_cuda = True
device = torch.device("cuda" if use_cuda else "cpu")
torch.manual_seed(1)
kwargs = {'num_workers': 1, 'pin_memory': True} if use_cuda else {}
batch_size = 64

#### Load dataset

In [0]:
def get_train_loader(batch_size):
    train_loader = torch.utils.data.DataLoader(
    datasets.MNIST('../data', train=True, download=True,
                       transform=transforms.Compose([
                           transforms.ToTensor(),
                           transforms.Normalize((0.1307,), (0.3081,))
                       ])), batch_size=batch_size, shuffle=True, **kwargs)
    
    return train_loader



In [0]:
def get_test_loader(batch_size):
    test_loader = torch.utils.data.DataLoader(
    datasets.MNIST('../data', train=False, transform=transforms.Compose([
            transforms.ToTensor(),
            transforms.Normalize((0.1307,), (0.3081,))
        ])),
        batch_size=batch_size, shuffle=True, **kwargs)
    
    return test_loader


### CNN scalar form model

Let's check the results after each step in forwards pass for cnn implemented by scalar form. The results are compared with pytorch. To avoid weasting time, there is only 1 epoch and batch_size = 1

In [0]:
model = CNNScalarForm(True).to(device)
optimizer = optim.SGD(model.parameters(), lr=0.01, momentum=0.5)

In [9]:
train_loader = get_train_loader(1)
test_loader = get_test_loader(1)

0it [00:00, ?it/s]

Downloading http://yann.lecun.com/exdb/mnist/train-images-idx3-ubyte.gz to ../data/MNIST/raw/train-images-idx3-ubyte.gz


9920512it [00:01, 8474318.92it/s]                            


Extracting ../data/MNIST/raw/train-images-idx3-ubyte.gz


  0%|          | 0/28881 [00:00<?, ?it/s]

Downloading http://yann.lecun.com/exdb/mnist/train-labels-idx1-ubyte.gz to ../data/MNIST/raw/train-labels-idx1-ubyte.gz


32768it [00:00, 127575.34it/s]           
  0%|          | 0/1648877 [00:00<?, ?it/s]

Extracting ../data/MNIST/raw/train-labels-idx1-ubyte.gz
Downloading http://yann.lecun.com/exdb/mnist/t10k-images-idx3-ubyte.gz to ../data/MNIST/raw/t10k-images-idx3-ubyte.gz


1654784it [00:00, 2128446.17it/s]                           
0it [00:00, ?it/s]

Extracting ../data/MNIST/raw/t10k-images-idx3-ubyte.gz
Downloading http://yann.lecun.com/exdb/mnist/t10k-labels-idx1-ubyte.gz to ../data/MNIST/raw/t10k-labels-idx1-ubyte.gz


8192it [00:00, 49247.65it/s]            


Extracting ../data/MNIST/raw/t10k-labels-idx1-ubyte.gz
Processing...
Done!


In [10]:
start = time.time()
train(1, model, device, train_loader, optimizer, 1, 1)
end = time.time()
print(f"Time executing: {end - start} s")

Check conv. MSE: 4.935583056619661e-15
Check max pool. MSE: 5.617816426623217e-15
Check reshape. MSE: 5.617816426623217e-15
Check fc1. MSE: 2.2707126435227154e-13
Check relu. MSE: 1.0454752162733158e-13
Check fc2. MSE: 5.562772668172254e-14
Check softmax. MSE: 5.162537170386096e-16
Time executing: 58.06030225753784 s


### CNN vector form model

In [0]:
model = CNNVectorForm(True).to(device)
optimizer = optim.SGD(model.parameters(), lr=0.01, momentum=0.5)

Compare the result after each step in forward with pytorch

In [12]:
start = time.time()
train(1, model, device, train_loader, optimizer, 1, 1)
end = time.time()
print(f"Time executing: {end - start} s")

Check conv. MSE: 0.0
Check max pool. MSE: 0.0
Check reshape. MSE: 0.0
Check fc1. MSE: 3.217560105747393e-15
Check relu. MSE: 1.8387514038773704e-15
Check fc2. MSE: 7.618905718248374e-16
Check softmax. MSE: 8.88178432935015e-17
Time executing: 1.162224292755127 s


As we can see vector form implementation works quickly significant than scalar form (1.2 s <58.8 s) (which was expetced) for batch_size=1. 

### Train CNN based on vector form on 20 epoch

In [0]:
model = CNNVectorForm(False).to(device)
optimizer = optim.SGD(model.parameters(), lr=0.01, momentum=0.5)

In [0]:
batch_size = 64
train_loader = get_train_loader(batch_size)
test_loader = get_test_loader(batch_size)

In [15]:
start = time.time()

for epoch in range(1, 21):
    train(10, model, device, train_loader, optimizer, epoch)
    test(model, device, test_loader)
end = time.time()
print(f"Time executing: {end - start} s")



KeyboardInterrupt: ignored