In [1]:
import numpy as np
from sklearn.datasets import load_digits
from sklearn.preprocessing import StandardScaler

In [2]:
# Load the digits dataset as a flat feature matrix X and integer class labels y.
X, y = load_digits(return_X_y=True)
# One-hot encode the labels by picking rows of a 10x10 identity matrix (10 classes).
y = np.eye(10)[y].astype(np.float32)
# Standardise every feature column.
X = StandardScaler().fit_transform(X)
# Reshape to (n_samples, channels=1, 8, 8); -1 lets NumPy infer the sample count
# instead of hard-coding the dataset size.
X = X.reshape([-1, 1, 8, 8])

In [3]:
# Targets: copy the one-hot labels and insert a singleton middle axis,
# giving (n_samples, 1, n_classes).
t_n = y.copy()[:, np.newaxis, :]
# Inputs: an independent copy of the image tensor.
x_n = X.copy()

In [4]:
class Conv:
    """Stride-1, unpadded 2-D convolution (cross-correlation) built on a shared ``Linear`` layer.

    Every ``k x k`` window of the input (across all input channels) is flattened
    into a row of length ``channels_in * k**2``; all windows are pushed through one
    ``Linear`` layer whose columns act as the ``channels_out`` filters.

    Input and output use the ``(batch, channels, height, width)`` layout; the
    spatial dimensions must be square.
    """

    def __init__(self, channels_in, channels_out, k):
        """Create the layer.

        Args:
            channels_in: number of input channels.
            channels_out: number of filters / output channels.
            k: side length of the square window.
        """
        self.linear = Linear(channels_in * k ** 2, channels_out)
        self.k = k
        # Only enters the output-size formula; the input itself is never padded.
        self.padding = 0
        self.channels_out = channels_out

        # Cached by forward() for use in backward().
        self.x = None
        self.out_shape = None
        self.channels_in = channels_in

    def forward(self, x):
        """Apply the convolution.

        Args:
            x: array of shape ``(batch, channels_in, size, size)``.

        Returns:
            Array of shape ``(batch, channels_out, d_out, d_out)`` with
            ``d_out = size + 2 * padding - (k - 1)``.
        """
        assert len(x.shape) == 4
        assert x.shape[1] == self.channels_in
        assert x.shape[-2] == x.shape[-1]
        assert x.shape[-1] >= self.k, (x.shape, self.k)

        self.x = x
        s = x.shape
        k = self.k
        n_pos = s[-1] - k + 1  # window positions along each spatial axis

        # One flattened patch per window position, enumerated row-major (i outer, j inner).
        patches = [
            np.reshape(x[..., i: i + k, j: j + k], [s[0], -1])
            for i in range(n_pos)
            for j in range(n_pos)
        ]
        stacked = np.stack(patches, axis=1)  # (batch, n_pos**2, channels_in * k**2)
        a = self.linear.forward(stacked)     # (batch, n_pos**2, channels_out)

        d_out = s[-1] + 2 * self.padding - (k - 1)
        # Bring the channel axis in front of the position axis BEFORE unfolding the
        # positions into (d_out, d_out); reshaping directly would interleave channel
        # and spatial entries.
        a = np.transpose(a, (0, 2, 1))
        a = np.reshape(a, [s[0], self.channels_out, d_out, d_out])
        self.out_shape = a.shape
        return a

    def backward(self, d):
        """Back-propagate through the convolution.

        Args:
            d: gradient w.r.t. the output, same shape as the last ``forward`` result.

        Returns:
            Gradient w.r.t. the input, same shape as the last ``forward`` input.
            The parameter gradients are stored inside ``self.linear``.
        """
        assert d.shape == self.out_shape
        k = self.k

        # Undo the forward layout change: (batch, C_out, d, d) -> (batch, positions, C_out).
        batch, c_out = d.shape[0], d.shape[1]
        d = d.reshape(batch, c_out, -1).transpose(0, 2, 1)
        d = self.linear.backward(d)  # (batch, positions, channels_in * k**2)

        d_dx = np.zeros(self.x.shape)
        n_pos = d_dx.shape[-1] - k + 1

        # Scatter every patch gradient back onto the window it was cut from;
        # overlapping windows accumulate.
        for i in range(n_pos):
            for j in range(n_pos):
                col_ix = i * n_pos + j
                d_dx[:, :, i: i + k, j: j + k] += np.reshape(
                    d[:, col_ix], [d_dx.shape[0], d_dx.shape[1], k, k]
                )

        return d_dx

    def step(self, lr):
        """Apply one gradient-descent update with learning rate ``lr``."""
        self.linear.step(lr)
        
        

In [5]:
class Linear:
    """Affine layer ``y = x @ w + b`` applied to the last axis of a 3-D input."""

    def __init__(self, n_in, n_out):
        """Initialise weights from a scaled standard normal and biases to zero.

        Args:
            n_in: size of the last axis of the input.
            n_out: size of the last axis of the output.
        """
        self.w = np.random.randn(n_in, n_out) * 0.1
        self.b = np.zeros([n_out])

        # Parameter gradients, filled in by backward().
        self.dw = None
        self.db = None

        # Input and output shape cached by forward() for backward().
        self.x = None
        self.out_shape = None

    def forward(self, x):
        """Compute ``x @ w + b``.

        Args:
            x: array of shape ``(batch, rows, n_in)``.

        Returns:
            Array of shape ``(batch, rows, n_out)``.
        """
        assert len(x.shape) == 3
        assert len(self.w.shape) == 2
        assert len(self.b.shape) == 1

        self.x = x
        y = x @ self.w + self.b
        self.out_shape = y.shape
        return y

    def backward(self, d):
        """Back-propagate ``d`` and store the parameter gradients.

        Args:
            d: gradient w.r.t. the output, same shape as the last ``forward`` result.

        Returns:
            Gradient w.r.t. the input, same shape as the last ``forward`` input.
        """
        assert d.shape == self.out_shape, (d.shape, self.out_shape)

        # The bias is broadcast over batch and rows, so its gradient sums over both.
        self.db = np.sum(d, axis=(0, 1))
        assert self.db.shape == self.b.shape

        # dw[i, o] = sum over batch and rows of x[n, r, i] * d[n, r, o].
        # Contracting the two leading axes directly avoids building the full
        # (batch, rows, n_out, n_in, n_out) Jacobian.
        self.dw = np.tensordot(self.x, d, axes=([0, 1], [0, 1]))
        assert self.dw.shape == self.w.shape

        # dx[n, r, i] = sum over o of d[n, r, o] * w[i, o].
        d_dx = d @ self.w.T
        assert d_dx.shape == self.x.shape
        return d_dx

    def step(self, lr):
        """Gradient-descent update using the gradients from the last ``backward``."""
        self.w = self.w - lr * self.dw
        self.b = self.b - lr * self.db

In [6]:
class ReLU:
    """Element-wise rectifier ``max(x, 0)``."""

    def __init__(self):
        # Output of the latest forward pass; backward() derives its mask from it.
        self.a = None

    def forward(self, x):
        """Clamp negative entries of ``x`` to zero, cache the result and return it."""
        rectified = np.maximum(x, 0)
        self.a = rectified
        return rectified

    def backward(self, d):
        """Pass ``d`` through wherever the cached activation is non-zero; zero it elsewhere."""
        gate = (self.a != 0).astype(np.float32)
        return d * gate
        

In [7]:
class Softmax:
    """Softmax over the last axis of a 3-D input."""

    def __init__(self):
        # Probabilities from the latest forward pass, needed by backward().
        self.a = None

    def forward(self, x):
        """Return ``exp(x) / sum(exp(x))`` along the last axis.

        Args:
            x: array of shape ``(batch, rows, classes)``.

        Returns:
            Array of the same shape whose last axis sums to 1.
        """
        assert len(x.shape) == 3
        # Subtracting the row maximum leaves the result unchanged but keeps exp() from overflowing.
        shifted = x - np.max(x, axis=-1, keepdims=True)
        exps = np.exp(shifted)
        self.a = exps / np.sum(exps, keepdims=True, axis=-1)
        return self.a

    def backward(self, d):
        """Back-propagate ``d`` through the softmax.

        For each row the Jacobian is ``diag(a) - a a^T``, so the vector-Jacobian
        product is ``a * (d - <d, a>)``. Evaluating it in this closed form handles
        every row of the middle axis and avoids building a ``classes x classes``
        matrix per sample.

        Args:
            d: gradient w.r.t. the output, same shape as the last ``forward`` result.

        Returns:
            Gradient w.r.t. the input, same shape as ``d``.
        """
        a = self.a
        assert d.shape == a.shape, (d.shape, a.shape)
        weighted = np.sum(d * a, axis=-1, keepdims=True)
        return a * (d - weighted)

In [8]:
class MeanSquaredError:
    """Half sum-of-squares loss divided by the length of the first axis."""

    def __init__(self):
        # Prediction from the latest forward pass.
        self.y_ = None

    def forward(self, y_, y):
        """Cache prediction ``y_`` and target ``y`` and return the scalar loss."""
        assert y_.shape == y.shape, (y_.shape, y.shape)
        self.y_, self.y = y_, y
        half_sse = 0.5 * np.sum(np.square(y - y_))
        return half_sse / len(y_)

    def backward(self):
        """Return the gradient of the loss w.r.t. the cached prediction."""
        residual = self.y - self.y_
        return -residual / len(self.y_)

In [9]:
class CrossEntropy:
    """Categorical cross-entropy ``-sum(y * log(y_))`` divided by the length of the first axis."""

    def __init__(self, eps=1e-12):
        """Create the loss.

        Args:
            eps: lower bound applied to the predicted probabilities before taking
                the logarithm or dividing, so that a probability of exactly 0
                cannot produce ``inf``/``nan`` in the loss or its gradient.
        """
        self.eps = eps
        # Prediction and target cached by forward() for backward().
        self.y_ = None
        self.y = None

    def forward(self, y_, y):
        """Cache the inputs and compute the loss.

        Args:
            y_: predicted probabilities.
            y: target distribution, same shape as ``y_``.

        Returns:
            Tuple ``(y_, loss)``: the predictions unchanged, and the scalar loss.
        """
        self.y_ = y_
        self.y = y

        assert y_.shape == y.shape

        # Clamp from below: log(0) is -inf and 0 * -inf is nan.
        l = - np.sum(y * np.log(np.maximum(y_, self.eps)))
        l /= len(y)
        return y_, l

    def backward(self):
        """Return the gradient of the loss w.r.t. the cached prediction."""
        y_ = self.y_
        y = self.y

        assert y_.shape == y.shape
        # Same clamp as in forward(): avoids 0/0 and division by zero.
        d = (- y / np.maximum(y_, self.eps)) / len(y_)
        return d

In [10]:
class CNN:
    """Two Conv+ReLU stages followed by a Linear layer, softmax and cross-entropy loss."""

    def __init__(self):
        # Feature extractor: 1 -> 4 channels with 3x3 windows, then 4 -> 8 channels with 5x5 windows.
        self.conv1 = Conv(1, 4, 3)
        self.relu1 = ReLU()
        self.conv2 = Conv(4, 8, 5)
        self.relu2 = ReLU()
        # Classifier head: 32 flattened features -> 10 class scores.
        self.lin1 = Linear(32, 10)
        self.softmax = Softmax()
        self.ce = CrossEntropy()

    def forward(self, x, t):
        """Run the network on inputs ``x`` and score the result against targets ``t``.

        Returns:
            Tuple ``(y, loss)``: the softmax output and the cross-entropy loss.
        """
        hidden1 = self.relu1.forward(self.conv1.forward(x))
        hidden2 = self.relu2.forward(self.conv2.forward(hidden1))

        # Flatten each sample to a single row of 32 features for the Linear layer.
        flat = hidden2.reshape(hidden2.shape[0], 1, 32)

        probs = self.softmax.forward(self.lin1.forward(flat))
        probs, loss = self.ce.forward(probs, t)
        return probs, loss

    def backward(self):
        """Back-propagate the loss from the latest ``forward`` through every layer.

        Each layer stores its own parameter gradients; nothing is returned.
        """
        grad = self.ce.backward()
        grad = self.softmax.backward(grad)
        grad = self.lin1.backward(grad)

        # Undo the flattening done in forward(): 32 features -> (8, 2, 2).
        grad = grad.reshape(grad.shape[0], 8, 2, 2)
        grad = self.relu2.backward(grad)
        grad = self.conv2.backward(grad)

        grad = self.relu1.backward(grad)
        # The input gradient is not needed, but the call fills in conv1's parameter gradients.
        self.conv1.backward(grad)

    def step(self, lr):
        """Apply one gradient-descent update with learning rate ``lr`` to every trainable layer."""
        for layer in (self.conv1, self.conv2, self.lin1):
            layer.step(lr)

        

In [11]:
# Number of consecutive samples taken per mini-batch by the training loop.
batch_size = 128

In [12]:
# Build a fresh network. Weights come from np.random.randn and no seed is set in
# the visible cells, so results differ from run to run.
cnn = CNN()

In [13]:
# Mini-batch gradient descent: 16 passes over the data, batches taken in fixed order.
for i in range(16):
    for j in range(0, len(x_n), batch_size):
        batch = slice(j, j + batch_size)
        x_batch, t_batch = x_n[batch], t_n[batch]

        y_batch, loss = cnn.forward(x_batch, t_batch)
        cnn.backward()
        cnn.step(lr=0.1)

        # Report only on the first batch of each pass.
        if j == 0:
            hits = np.argmax(y_batch, axis=-1) == np.argmax(t_batch, axis=-1)
            print('Loss', loss)
            print('Accuracy:', hits.sum() / len(y_batch))

Loss 2.2912227009921136
Accuracy: 0.09375
Loss 2.242544418623601
Accuracy: 0.2734375
Loss 2.076377864154841
Accuracy: 0.4375
Loss 1.524691886054848
Accuracy: 0.640625
Loss 0.8617892385286434
Accuracy: 0.7265625
Loss 0.5106373478791786
Accuracy: 0.875
Loss 0.33728525980108137
Accuracy: 0.9296875
Loss 0.27180674821441697
Accuracy: 0.9453125
Loss 0.2478048446905387
Accuracy: 0.9453125
Loss 0.23448731966680078
Accuracy: 0.9296875
Loss 0.22317860564383313
Accuracy: 0.921875
Loss 0.21297551286721356
Accuracy: 0.921875
Loss 0.20182121189628613
Accuracy: 0.9140625
Loss 0.19109531990835396
Accuracy: 0.921875
Loss 0.18068391817910906
Accuracy: 0.9140625
Loss 0.17129105213208934
Accuracy: 0.921875
