In [1]:
import numpy as np
from layers import *
from collections import OrderedDict
from sklearn.model_selection import train_test_split
from sklearn.datasets import fetch_openml

In [2]:
def make_one(x, num_classes=None):
    """One-hot encode a collection of non-negative integer class labels.

    Parameters
    ----------
    x : array-like
        Class labels. Any shape is accepted (ndarray, pandas Series/DataFrame,
        list); the input is flattened before encoding.
    num_classes : int, optional
        Number of columns in the result. Defaults to ``max(label) + 1`` so the
        encoding stays valid even when some class never occurs in ``x``.

    Returns
    -------
    numpy.ndarray
        Float array of shape ``(n_labels, num_classes)`` with a single 1 per row.

    Raises
    ------
    ValueError
        If a label is negative or does not fit into ``num_classes`` columns.
    """
    # np.asarray works for pandas objects and plain arrays alike, so there is
    # no need to special-case `.values`.
    labels = np.asarray(x).ravel()
    if labels.size == 0:
        return np.zeros((0, num_classes or 0))

    labels = labels.astype(np.intp)
    if labels.min() < 0:
        raise ValueError("class labels must be non-negative integers")

    if num_classes is None:
        # Counting distinct values under-sizes the matrix whenever a class is
        # absent from x; the largest label is what determines the width.
        num_classes = int(labels.max()) + 1
    elif labels.max() >= num_classes:
        raise ValueError("label value exceeds num_classes - 1")

    t = np.zeros((labels.size, num_classes))
    t[np.arange(labels.size), labels] = 1
    return t

In [18]:
# from sklearn.datasets import load_iris
# X = load_iris().data
# y = load_iris().target
# y = make_one(y)
# X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.2)

In [3]:
# Pin the dataset version and the container type so this cell returns the same
# pandas-backed Bunch regardless of the installed scikit-learn defaults.
mnist = fetch_openml('mnist_784', version=1, as_frame=True)

In [4]:
# Fixed seed so the train/val/test partition is identical on every re-run.
SPLIT_SEED = 42

X = mnist.data
y = mnist.target

# np.asarray accepts both pandas containers and plain arrays.
X = np.asarray(X, dtype=np.float32)/255. # minmax scale
y = np.asarray(y).astype(np.int32)
y = make_one(y)

# 60% train, then the remaining 40% is halved into validation and test (20% each).
X_train, X_hold, y_train, y_hold = train_test_split(X,y,test_size=.4,random_state=SPLIT_SEED)
X_val, X_test, y_val, y_test = train_test_split(X_hold,y_hold,test_size=.5,random_state=SPLIT_SEED)

In [17]:
# Network dimensions: one input per feature column of X, three hidden layers,
# and one output per column of the one-hot encoded y.
input_size = X.shape[1]
hidden_size = [100,200,100]
output_size = y.shape[1]
# NOTE(review): MultiLayer is defined in a later cell of this notebook (it may
# also come from `from layers import *` — confirm); that definition must be in
# scope before this cell runs on a fresh kernel.
model = MultiLayer(input_size, hidden_size, output_size)

In [18]:
class SGD:
    """Plain gradient-descent optimizer: ``w <- w - lr * grad``."""

    def __init__(self,lr=1e-3):
        # Step size applied to every gradient.
        self.lr = lr

    def minimize(self,w,grads):
        """Update every entry of ``w`` using the matching entry of ``grads``.

        ``w`` and ``grads`` are mappings sharing the same keys; ``w`` is
        modified through augmented assignment and nothing is returned.
        """
        step_size = self.lr
        for name in w:
            w[name] -= step_size*grads[name]

In [33]:
class Momentum:
    """Gradient descent with classical momentum.

    For every parameter the update is::

        v <- momentum * v - lr * grad
        w <- w + v

    Parameters
    ----------
    lr : float
        Learning rate (step size).
    m : float
        Momentum coefficient, i.e. the fraction of the previous velocity kept.
    """

    def __init__(self,lr=1e-3,m=0.9):
        self.lr = lr
        self.momentum = m
        # Per-parameter velocity; created lazily once the parameter shapes are known.
        self.v = None

    def minimize(self,w,grads):
        """Apply one momentum step to the parameters in ``w``.

        ``w`` and ``grads`` are mappings with the same keys. ``w`` is updated
        through augmented assignment; nothing is returned.
        """
        if self.v is None:
            self.v = {key: np.zeros_like(param) for key, param in w.items()}

        # The update runs on the very first call too: with zero velocity it is
        # an ordinary gradient step, so no gradient is thrown away.
        for key in w.keys():
            # The velocity decays multiplicatively by the momentum coefficient.
            self.v[key] = self.momentum*self.v[key] - self.lr*grads[key]
            w[key] += self.v[key]

In [37]:
# Learning rate of 1e-3 (a value of 1e3, i.e. 1000, makes every step overshoot
# by orders of magnitude and the loss diverges).
optimizer = Momentum(lr=1e-3)

In [38]:
# Full-batch training driven by the external optimizer; validation metrics are
# reported and recorded every 20 epochs.
epochs = 100
for epoch in range(epochs):
    grads = model.gradient(X_train,y_train)
    optimizer.minimize(model.W,grads)
    if epoch % 20 == 0:
        # Evaluate once and reuse: each metric is a full forward pass over the
        # validation set.
        val_loss = model.loss(X_val,y_val)
        val_acc = model.accuracy(X_val,y_val)
        print(f"epoch {epoch}:val_loss==========={np.round(val_loss,4)}, val_acc:========{np.round(val_acc,4)*100}%")
        model.loss_val.append(val_loss)
        model.acc_val.append(val_acc)



In [23]:
# Monitor training on the validation split; the test split stays untouched so
# it can still give an unbiased final estimate.
model.fit_gd(10000,1e-2,X_train,y_train,X_val,y_val)



KeyboardInterrupt: 

In [8]:
# Debug cell: display the gradient dictionary computed on the full training set.
model.gradient(X_train,y_train)

{'W1': array([[0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        ...,
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.]]),
 'b1': array([-1.42835245e+00,  1.36740035e-01,  1.70282395e+00,  3.50953407e+00,
         1.81299462e-01,  6.51175365e-02, -6.11642803e+00, -1.93439950e-01,
        -1.71150946e+00, -2.89232339e+00,  8.56510931e-01, -2.26897971e+00,
         1.25784241e+00,  2.14098370e+00, -1.24029932e+00,  4.67611601e+00,
         1.37681080e-01, -1.85400597e-01,  8.34815612e-02, -1.13305024e+00,
         4.27380933e+00,  1.75256475e+00,  1.52973334e+00,  8.86892366e-01,
        -3.63150361e-01,  1.89947023e+00, -1.42878932e+00, -1.30433995e-01,
        -2.38226052e-01, -2.28852423e+00,  8.02219827e-01, -3.06298161e+00,
         2.73229749e+00, -8.85626823e-02,  2.49608235e+00,  4.20837628e+00,
         4.17846716e-01,  1.14668581e-01, -3.73793784e-

In [39]:
class MultiLayer:
    """Fully connected network: (Affine -> Relu) blocks, a final Affine layer,
    and a SoftmaxWithLoss output layer.

    Parameters
    ----------
    input_size : int
        Number of input features.
    hidden_size : sequence of int
        Width of each hidden layer, in order. The sequence is copied, never
        modified.
    output_size : int
        Number of outputs (classes).

    Attributes
    ----------
    W : dict
        Parameters keyed ``'W1', 'b1', 'W2', 'b2', ...``.
    layers : OrderedDict
        Layers keyed ``'Affine_<i>'`` / ``'Relu_<i>'`` in forward order.
    Lastlayer : SoftmaxWithLoss
        Loss layer applied on top of the final Affine output.
    layer_sizes : list of int
        ``[input_size, *hidden_size, output_size]``.
    loss_val, acc_val : list
        Validation loss / accuracy recorded while fitting.
    """

    def __init__(self,input_size,hidden_size,output_size):
        self.input_size = input_size
        self.output_size = output_size
        # Work on a copy so the caller's list is left intact and the model can
        # be rebuilt from the same arguments.
        self.hidden_size = list(hidden_size)
        self.layer_sizes = [input_size, *self.hidden_size, output_size]

        # One weight matrix and one bias vector per consecutive pair of sizes.
        self.W = {}
        n_affine = len(self.layer_sizes)-1
        for i in range(n_affine):
            fan_in, fan_out = self.layer_sizes[i], self.layer_sizes[i+1]
            self.W['W'+str(i+1)] = np.random.randn(fan_in,fan_out)
            self.W['b'+str(i+1)] = np.random.randn(fan_out)

        # Every Affine layer except the last one is followed by a Relu.
        # NOTE(review): training updates self.W in place and assumes the Affine
        # layers keep references to these same arrays — confirm in `layers`.
        self.layers = OrderedDict()
        for i in range(1,n_affine+1):
            idx = str(i)
            self.layers['Affine_'+idx] = Affine(self.W['W'+idx],self.W['b'+idx])
            if i < n_affine:
                self.layers['Relu_'+idx] = Relu()

        self.Lastlayer = SoftmaxWithLoss()
        self.loss_val = []
        self.acc_val = []

    def predict(self,x):
        """Run ``x`` through every layer in order and return the final Affine output."""
        for layer in self.layers.values():
            x = layer.forward(x)
        return x

    def loss(self,x,t):
        """Return the SoftmaxWithLoss value of the network output for ``x`` against targets ``t``."""
        y = self.predict(x)
        return self.Lastlayer.forward(y,t)

    def gradient(self,x,t):
        """Back-propagate the loss on ``(x, t)``.

        Returns a dict keyed like ``self.W`` (``'W<i>'``, ``'b<i>'``) holding
        the ``dW`` / ``db`` attributes of each Affine layer.
        """
        # Forward pass first so every layer caches what its backward pass needs.
        self.loss(x,t)

        dout = self.Lastlayer.backward(1)
        for layer in reversed(list(self.layers.values())):
            dout = layer.backward(dout)

        # Collect gradients straight from the registered layer names so the
        # keys here can never drift from the ones used in __init__.
        prefix = 'Affine_'
        grads = {}
        for name, layer in self.layers.items():
            if name.startswith(prefix):
                idx = name[len(prefix):]
                grads['W'+idx] = layer.dW
                grads['b'+idx] = layer.db
        return grads

    def accuracy(self,x,t):
        """Fraction of rows whose arg-max prediction equals the arg-max of ``t``."""
        y = np.argmax(self.predict(x),axis=1)
        t = np.argmax(t, axis=1)
        acc = np.sum(y==t)/y.size
        return acc

    def _apply_gradients(self,grads,lr):
        """Take one gradient-descent step of size ``lr`` on the parameters in ``self.W``."""
        for key, grad in grads.items():
            self.W[key] -= lr*grad

    def _log_validation(self,epoch,x_val,t_val,log_every=20):
        """Print and record validation loss/accuracy when ``epoch`` is a multiple of ``log_every``."""
        if not log_every or epoch % log_every != 0:
            return
        # Each metric is a full forward pass; compute once and reuse.
        val_loss = self.loss(x_val,t_val)
        val_acc = self.accuracy(x_val,t_val)
        print(f"epoch {epoch}:val_loss==========={np.round(val_loss,4)}, val_acc:========{np.round(val_acc,4)*100}%")
        self.loss_val.append(val_loss)
        self.acc_val.append(val_acc)

    def fit(self,epochs,batch_size,lr,x,t,x_val,t_val,shuffle=True,log_every=20):
        """Train with mini-batch gradient descent.

        Every epoch visits all samples once in batches of ``batch_size`` (the
        last batch may be smaller) and updates the parameters after each batch.

        Parameters
        ----------
        epochs : int
            Number of passes over the training data.
        batch_size : int
            Samples per mini-batch; must be positive.
        lr : float
            Learning rate.
        x, t : numpy.ndarray
            Training inputs and targets, indexed along axis 0.
        x_val, t_val : numpy.ndarray
            Validation inputs and targets used for logging.
        shuffle : bool, optional
            Draw a new random sample order each epoch (default True).
        log_every : int, optional
            Log validation metrics every this many epochs (default 20).

        Raises
        ------
        ValueError
            If ``batch_size`` is not positive.
        """
        if batch_size <= 0:
            raise ValueError("batch_size must be a positive integer")

        n_samples = x.shape[0]
        for epoch in range(epochs):
            order = np.random.permutation(n_samples) if shuffle else None
            for start in range(0,n_samples,batch_size):
                stop = start+batch_size
                if order is None:
                    x_batch, t_batch = x[start:stop], t[start:stop]
                else:
                    picked = order[start:stop]
                    x_batch, t_batch = x[picked], t[picked]
                grads = self.gradient(x_batch,t_batch)
                self._apply_gradients(grads,lr)
            self._log_validation(epoch,x_val,t_val,log_every)

    def fit_gd(self,epochs,lr,x,t,x_val,t_val,log_every=20):
        """Train with full-batch gradient descent: one update per epoch on all of ``(x, t)``.

        Validation metrics on ``(x_val, t_val)`` are printed and recorded every
        ``log_every`` epochs (default 20).
        """
        for epoch in range(epochs):
            grads = self.gradient(x,t)
            self._apply_gradients(grads,lr)
            self._log_validation(epoch,x_val,t_val,log_every)