In [1]:
import numpy as np
from sklearn.datasets import load_digits
from sklearn.preprocessing import StandardScaler
import torch
import os
from sklearn.utils import shuffle

In [2]:
from sklearn.datasets import fetch_openml

if os.path.exists('X.npy') and os.path.exists('y.npy'):
    X = np.load('X.npy', allow_pickle=True)
    y = np.load('y.npy', allow_pickle=True)
else:
    X, y = fetch_openml('mnist_784', version=1, return_X_y=True)
    np.save('X.npy', X)
    np.save('y.npy', y)

In [3]:
# X, y = load_digits(return_X_y=True)

y = np.eye(10)[y.astype(np.int32)].astype(np.float32)
X = StandardScaler().fit_transform(X)

In [4]:
X.shape, y.shape

((70000, 784), (70000, 10))

In [5]:
num_features = 784

In [6]:
X, y = shuffle(X, y)

In [7]:
w1 = torch.repeat_interleave(torch.linspace(-0.1, 0.1, 32)[None, ...], num_features, dim=0)
w2 = torch.repeat_interleave(torch.linspace(-0.1, 0.1, 10)[None, ...], 32, dim=0)

# w1 = torch.randn(73, 32) * 0.01
# w2 = torch.randn(32, 10) * 0.01

b1 = torch.zeros(32)
b2 = torch.zeros(10)

X_t = torch.from_numpy(X).float()
y_t = torch.from_numpy(y).float()

In [8]:
batch_size = 1024

In [9]:
X.shape, y.shape

((70000, 784), (70000, 10))

In [10]:
lr = 0.01

In [11]:
for i in range(10):
    for j in range(0, len(X), batch_size):
        
        X_batch = X_t[j: j + batch_size]
        y_batch = y_t[j: j + batch_size]


        w1.requires_grad = True
        b1.requires_grad = True
        w2.requires_grad = True
        b2.requires_grad = True


        z1 = X_batch @ w1 + b1
    #     print(torch.norm(z1))
        a1 = torch.max(torch.zeros_like(z1), z1)
    #     a1 = torch.nn.functional.relu(z1)
    #     print(torch.norm(a1))

        z2 = a1 @ w2 + b2
        sm_scale = z2 - torch.max(z2, dim=-1, keepdims=True)[0]
        y__t = torch.exp(sm_scale) / torch.sum(torch.exp(sm_scale), keepdims=True, axis=-1)
        assert y__t.shape == y_batch.shape
    #     print(torch.norm(y__t))
        logy = torch.log(y__t)
    #     logy.requires_grad = True

    #     print(torch.norm(logy))
        ce = - y_batch * logy
        l = torch.sum(ce) / len(y_batch)
    #     l = torch.sum(ce)
        l.backward()
#         print(torch.norm(w1.grad))
#         print(torch.norm(w2.grad))

        if j % 8 * batch_size == 0:
            print(l)
            print((torch.max(y__t, dim=-1)[1] == torch.max(y_batch, dim=-1)[1]).sum().float() / len(y_batch))

        w1 = w1.data - lr * w1.grad
        w2 = w2.data - lr * w2.grad

        b1 = b1.data - lr * b1.grad
        b2 = b2.data - lr * b2.grad


    
    

tensor(8.4004, grad_fn=<DivBackward0>)
tensor(0.1104)
tensor(7.0642, grad_fn=<DivBackward0>)
tensor(0.1172)
tensor(6.3599, grad_fn=<DivBackward0>)
tensor(0.1299)
tensor(5.7995, grad_fn=<DivBackward0>)
tensor(0.1357)
tensor(5.1563, grad_fn=<DivBackward0>)
tensor(0.1250)
tensor(4.5555, grad_fn=<DivBackward0>)
tensor(0.1260)
tensor(4.3211, grad_fn=<DivBackward0>)
tensor(0.1367)
tensor(3.9158, grad_fn=<DivBackward0>)
tensor(0.1504)
tensor(3.5749, grad_fn=<DivBackward0>)
tensor(0.1240)
tensor(3.3205, grad_fn=<DivBackward0>)
tensor(0.1201)
tensor(2.9268, grad_fn=<DivBackward0>)
tensor(0.1162)
tensor(2.6954, grad_fn=<DivBackward0>)
tensor(0.1836)
tensor(2.5745, grad_fn=<DivBackward0>)
tensor(0.1719)
tensor(2.5182, grad_fn=<DivBackward0>)
tensor(0.1787)
tensor(2.3609, grad_fn=<DivBackward0>)
tensor(0.1787)
tensor(2.3237, grad_fn=<DivBackward0>)
tensor(0.1943)
tensor(2.2600, grad_fn=<DivBackward0>)
tensor(0.2197)
tensor(2.1891, grad_fn=<DivBackward0>)
tensor(0.1982)
tensor(2.1719, grad_fn=<DivB

In [13]:
class Linear:
    def __init__(self, in_dim, out_dim):
        self.w = np.zeros([in_dim, out_dim])
        self.w = np.repeat(np.linspace(-0.1, 0.1, out_dim)[np.newaxis, ...], in_dim, axis=0)

#         self.w = np.random.randn(in_dim, out_dim)
        self.b = np.zeros([1, out_dim])
        self.dw = None
        self.db = None
        self.in_dim = self.w.shape[0]
        self.out_dim = self.w.shape[1]

        
    def forward(self, x):
        self.x = x
        return np.matmul(x, self.w) + self.b
    
    def backward(self, d):
        self.db = np.mean(d, axis=0)
        assert self.db.shape == self.b.shape, (d.shape, self.db.shape, self.b.shape)
        
        J = np.zeros([self.x.shape[0], self.out_dim, np.prod(self.w.shape)])
        j = 0
        for i in range(self.out_dim):
            J[:, i: i + 1, j: j + self.in_dim] = self.x[:, np.newaxis, :]
            j += self.in_dim
        
        dw = d @ J
        
        dw = np.reshape(np.mean(dw, axis=0), self.w.shape, order='F')
        
        self.dw = dw
        
        d = d @ np.repeat(self.w.T[np.newaxis, ...], d.shape[0], axis=0)
        
        return d
        
    def step(self, lr):
        self.w = self.w - lr * self.dw
        self.b = self.b - lr * self.db

In [14]:
class ReLU:
    def __init__(self):
        self.a = None
        
    def forward(self, x):
        self.a = np.maximum(x, 0)
        return self.a
    
    def backward(self, d):
        return d * (self.a != 0)[:, np.newaxis, :].astype(np.float32)
        

In [15]:
class Softmax:
    def __init__(self):
        self.a = None
        
    def forward(self, x):
        assert len(x.shape) == 2
        x = x - np.max(x, axis=-1, keepdims=True)
        self.a = np.exp(x) / np.sum(np.exp(x), keepdims=True, axis=-1)
        return self.a
    def backward(self, d):
        
        diag = np.stack([np.diag(self.a[i]) for i in range(len(self.a))])
        op = np.stack([np.outer(self.a[i], self.a[i]) for i in range(len(self.a))])
        J = diag - op
        
        return d[:, np.newaxis, ...] @ J

In [16]:
class CrossEntropy:
    def forward(self, y_, y):
        
        l = - np.sum(y * np.log(y_))
        l /= len(y)
        return y_, l
    
    def backward(self, y_, y):
        assert y_.shape == y.shape
        d =  - y / y_
        return d

In [17]:
class MLP:
    def __init__(self):
        self.linear1 = Linear(784, 32)
        self.relu1 = ReLU()
        self.linear2 = Linear(32, 10)
#         self.relu2 = ReLU()
#         self.linear3 = Linear(32, 10)
        self.softmax = Softmax()
        self.loss = CrossEntropy()
        
    def forward(self, x, y):
        x = self.linear1.forward(x)
        x = self.relu1.forward(x)
        x = self.linear2.forward(x)
#         x = self.relu2.forward(x)
#         x = self.linear3.forward(x)
        x = self.softmax.forward(x)
        loss = self.loss.forward(x, y)
        return loss
    
    def backward(self, y_, y):
        d = self.loss.backward(y_, y)
        d = self.softmax.backward(d)
#         d = self.linear3.backward(d)
#         d = self.relu2.backward(d)
        d = self.linear2.backward(d)
        d = self.relu1.backward(d)
        d = self.linear1.backward(d)

    
    def step(self, lr):
#         print(np.linalg.norm(self.linear1.dw))
        self.linear1.step(lr)
#         print(np.linalg.norm(self.linear2.dw))
        self.linear2.step(lr)
#         self.linear3.step(lr)

        

In [18]:
mlp = MLP()

In [20]:
for i in range(10):
    for j in range(0, len(X), batch_size):
#     for j in range(0, 5 * batch_size, batch_size):

        X_batch = X[j: j + batch_size]
        y_batch = y[j: j + batch_size]
        y_, loss = mlp.forward(X_batch, y_batch)
        mlp.backward(y_, y_batch)
        mlp.step(lr=0.01)
        if j % batch_size * 8 == 0:
#             print(j)
            print(loss, 'loss')
            print((np.argmax(y_, axis=-1) == np.argmax(y_batch, axis=-1)).sum() / len(y_batch))

2.050843424115447 loss
0.2275390625
2.0244736062109494 loss
0.2373046875
2.1053338333531864 loss
0.228515625
2.02092516573021 loss
0.2470703125
2.0224373295976354 loss
0.2451171875
2.0677909596961075 loss
0.2236328125
2.058034708956967 loss
0.2412109375
2.040425398074608 loss
0.2353515625
2.0685329166410606 loss
0.2255859375
2.0138914015778093 loss
0.2548828125
2.0402885936706388 loss
0.2353515625
2.053788074151969 loss
0.236328125
2.014137313971744 loss
0.2578125
2.044875742114674 loss
0.2607421875
2.0332065063579243 loss
0.23828125
2.0482528123948907 loss
0.244140625
2.0318649332337495 loss
0.2666015625
1.9916577757922862 loss
0.267578125
2.0268667820872572 loss
0.240234375
1.9878172008691142 loss
0.267578125
2.0171780746230494 loss
0.2607421875
2.0438002106453106 loss
0.251953125
2.0181849706575283 loss
0.2744140625
1.9880703013141579 loss
0.251953125
2.0029565742292172 loss
0.2685546875
2.025949639183146 loss
0.2451171875
2.0143684647519624 loss
0.2578125
2.0188957316625613 loss
0.