# 1. Load Data

In [72]:
import numpy as np
import copy
from time import time

X_train = np.load('train_data.npy')
X_test = np.load('test_data.npy')
y_train = np.load('train_label.npy')
y_test = np.load('test_label.npy')

In [73]:
X_train.shape, X_test.shape, y_train.shape, y_test.shape

((50000, 128), (10000, 128), (50000, 1), (10000, 1))

# 2. Preprocessing

In [74]:
from sklearn.preprocessing import MinMaxScaler

scaler = MinMaxScaler()

train_data_scaled = scaler.fit_transform(X_train)
test_data_scaled = scaler.transform(X_test)

train_data_min, train_data_max = train_data_scaled.min(), train_data_scaled.max()
test_data_min, test_data_max = test_data_scaled.min(), test_data_scaled.max()

train_data_min, train_data_max, test_data_min, test_data_max


(0.0, 1.0000000000000002, -0.2341860850703863, 1.1540823219473646)

# 3. Modules

## 3.1 Activation Function

In [75]:
class Activation(object):
    def __relu(self, x):
        return np.where(x >= 0, x, 0)

    def __relu_derivative(self, a):
        return np.where(a >= 0, 1, 0)

    def __softmax(self, x):
        x_exponent = np.exp(x - np.max(x, axis = -1, keepdims = True))
        return x_exponent / np.sum(x_exponent, axis = -1, keepdims = True)

    def __init__(self, activation = 'relu'):

        if activation == 'relu':
            self.function = self.__relu
            self.function_derivative = self.__relu_derivative
        elif activation == 'softmax':
            self.function = self.__softmax

## 3.2 Layer

In [76]:
class Layer(object):

    def __init__(self, n_in, n_out, optimizer, activation = 'relu'):
        self.input = None
        self.output_before_activation = None
        self.output = None

        self.activation = Activation(activation).function
        if activation == 'softmax':
            self.activation_derivative = None
        else:
            self.activation_derivative = Activation(activation).function_derivative

        self.Weight = np.random.uniform(
            low = -np.sqrt(6 / (n_in + n_out)),
            high = np.sqrt(6 / (n_in + n_out)),
            size = (n_in, n_out)
        )
        self.bias = np.zeros(n_out,)

        self.optimizer_weight = copy.copy(optimizer)
        self.optimizer_bias = copy.copy(optimizer)

    def forward(self, input, train = True):

        self.input = input
        self.output_before_activation = np.dot(input, self.Weight) + self.bias
        self.output = self.activation(self.output_before_activation)

        return self.output

    def backward(self, delta):

        if self.activation_derivative:
            delta = delta * self.activation_derivative(self.output_before_activation)

        grad_weight = np.dot(self.input.T, delta)
        grad_bias = np.sum(delta, axis=0, keepdims=True)

        self.Weight = self.optimizer_weight.update(self.Weight, grad_weight)
        self.bias = self.optimizer_bias.update(self.bias, grad_bias)

        delta = np.dot(delta, self.Weight.T)

        return delta

## 3.3 Dropout

In [77]:
class DropoutLayer(object):

    def __init__(self, drop_prob: float = 0.5):
        self.drop_prob = drop_prob
        self.mask = None

    def forward(self, X: np.ndarray, train: bool = True) -> np.ndarray:
        if train:
            self.mask = np.random.rand(*X.shape) >= self.drop_prob
            return X * self.mask
        else:
            return X * (1 - self.drop_prob)

    def backward(self, delta: np.ndarray) -> np.ndarray:
        return delta * self.mask if self.mask is not None else delta

## 3.4 BatchNormalization

In [97]:
class BatchNormalization(object):

    def __init__(self, gamma, beta, optimizer, momentum = 0.9):

        self.gamma = gamma
        self.beta = beta
        self.momentum = momentum
        self.mean = 0
        self.var = 1
        self.gamma_optimizer = copy.copy(optimizer)
        self.beta_optimizer = copy.copy(optimizer)

    def forward(self, X, train = True):

        if self.mean is None:
            self.mean = np.mean(X, axis = 0)
            self.var = np.var(X, axis = 0)

        if train:
            mean = np.mean(X, axis = 0)
            self.mean = self.momentum * self.mean + (1 - self.momentum) * mean
            var = np.var(X, axis = 0)
            self.var = self.momentum * self.var + (1 - self.momentum) * var
        else:
            mean = self.mean
            var = self.var

        self.X_minus_mean = X - mean
        self.std = np.sqrt(var + 1e-6)
        self.X_norm = self.X_minus_mean / self.std
        output = self.gamma * self.X_norm + self.beta

        return output

    def backward(self, delta):

        gamma_old = self.gamma

        gamma_grad = np.sum(delta * self.X_norm, axis = 0)
        beta_grad = np.sum(delta, axis = 0)

        self.gamma = self.gamma_optimizer.update(self.gamma, gamma_grad)
        self.beta = self.beta_optimizer.update(self.beta, beta_grad)

        dX_norm = delta * gamma_old
        dvar = np.sum(dX_norm * self.X_minus_mean, axis = 0) * (-0.5) * (self.var + 1e-6)**(-3/2)
        dmean = np.sum(dX_norm * (1/self.std), axis = 0) + dvar * (1/delta.shape[0]) * np.sum(-2 * self.X_minus_mean, axis = 0)
        delta = (dX_norm * (1/self.std)) + (dmean / delta.shape[0]) + (dvar * 2 / delta.shape[0] * self.X_minus_mean)

        return delta

## 3.5 Optimizer

In [90]:
class Optimizer(object):

    def __init__(self, lr = 0.001, momentum = 0.9, weight_decay: float = 1e-2):

        self.lr = lr
        self.momentum = momentum
        self.weight_decay = weight_decay
        self.grad = None

    def update(self, weight, delta):

        if self.grad is None:
            self.grad = np.zeros(weight.shape)

        self.grad = self.momentum * self.grad + (1 - self.momentum) * delta
        weight = weight * (1 - self.weight_decay) - self.lr * self.grad

        return weight

## 3.6 Modules

In [91]:
class MLP(object):

    def __init__(self, n_in, n_out, layers, optimizer, activation, BN=False, Dropout=False, dropout_prob=None):

        self.layers = []
        self.activation = activation
        self.optimizer = optimizer
        self.lr = self.optimizer.lr
        self.n_out = n_out

        self.layers.append(Layer(n_in, layer[0], optimizer, activation[0]))
        if Dropout:
            self.layers.append(DropoutLayer(dropout_prob[0]))
        if BN:
            self.layers.append(BatchNormalization(np.ones((1, layer[0])), np.zeros((1, layer[0])), optimizer))

        for i in range(1, len(layer)):
            self.layers.append(Layer(layer[i-1], layer[i], optimizer, activation[i]))
            if Dropout:
                self.layers.append(DropoutLayer(dropout_prob[i]))
            if BN:
                self.layers.append(BatchNormalization(np.ones((1, layer[i])), np.zeros((1, layer[i])), optimizer))

        self.layers.append(Layer(layer[-1], n_out, optimizer, activation[-1]))

    def CE_loss(self, y, predict_y):

        y_onehot = np.eye(self.n_out)[y].reshape(-1, self.n_out)
        predict_y = np.clip(predict_y, 1e-15, 1 - 1e-15)
        loss = -np.sum(np.multiply(y_onehot, np.log(predict_y)))
        delta = predict_y - y_onehot
        return loss, delta

    def forward(self, input, train = True):

        output = input
        for layer in self.layers:
            output = layer.forward(output, train)
        return output

    def backward(self, delta):

        for layer in reversed(self.layers):
            delta = layer.backward(delta)

    def fit(self, X, y, epochs = 100, batch_size = 100, print_per = 50):

        loss_list = []
        accuracy_list = []

        for epoch in range(epochs):

            if epoch == int(epochs*1/3):
                self.lr = self.lr / 5
                self.optimizer = Optimizer(lr = self.lr)
            elif epoch == int(epochs*2/3):
                self.lr = self.lr / 5
                self.optimizer = Optimizer(lr = self.lr)

            loss_temp = 0
            predict_y_all_batch = []

            start = time()

            for index in np.arange(0, X.shape[0], batch_size):
                X_batch = X[index: min(index+batch_size, X.shape[0])]
                y_batch = y[index: min(index+batch_size, X.shape[0])]

                predict_y = self.forward(X_batch)

                loss, delta = self.CE_loss(y_batch, predict_y)

                self.backward(delta)

                loss_temp += loss
                predict_y_all_batch.extend(predict_y)

            predict_y_all_batch = np.array(predict_y_all_batch)

            loss_list.append(loss_temp / X.shape[0])
            predict_y = np.argmax(predict_y_all_batch, axis = 1).reshape(-1,1)
            accuracy = np.sum(predict_y == y, axis = 0) / X.shape[0]
            accuracy_list.append(accuracy)

            if (epoch + 1) % print_per == 0:
                print("Epoch: %d\tTime: %.2fs\tLoss: %.5f\tAccuracy: %.2f%%" % (epoch+1, time()-start, loss_list[-1], accuracy_list[-1]*100))

        return loss_list, accuracy_list

    def predict(self, X, y):

        predict_y = self.forward(X, train = False)

        loss, _ = self.CE_loss(y, predict_y)

        accuracy = np.sum(np.argmax(predict_y, axis = 1).reshape(-1,1) == y, axis = 0) / X.shape[0]

        print("Loss: %.5f\tAccuracy:%.2f%%" % (loss, accuracy*100))

# 4. Adjust Parameters

In [98]:
optimizer = Optimizer(lr = 0.001, momentum = 0.8, weight_decay = 1e-3)

n_in = X_train.shape[1]
n_out = len(np.unique(y_train))

layer = [256, 512]

activation = ['relu', 'relu', 'softmax']
model = MLP(n_in, n_out, layer, optimizer, activation, BN = True, Dropout = True, dropout_prob = [0.2, 0.2])

loss, accuracy = model.fit(X_train, y_train, epochs = 30, batch_size = 500, print_per = 3)

print()

model.predict(X_test, y_test)

  print("Epoch: %d\tTime: %.2fs\tLoss: %.5f\tAccuracy: %.2f%%" % (epoch+1, time()-start, loss_list[-1], accuracy_list[-1]*100))


Epoch: 3	Time: 6.44s	Loss: 1.49417	Accuracy: 46.65%
Epoch: 6	Time: 5.05s	Loss: 1.36256	Accuracy: 51.05%
Epoch: 9	Time: 5.12s	Loss: 1.29693	Accuracy: 53.67%
Epoch: 12	Time: 6.39s	Loss: 1.25332	Accuracy: 55.06%
Epoch: 15	Time: 4.98s	Loss: 1.23128	Accuracy: 55.83%
Epoch: 18	Time: 4.95s	Loss: 1.21056	Accuracy: 56.75%
Epoch: 21	Time: 6.34s	Loss: 1.19408	Accuracy: 57.20%
Epoch: 24	Time: 6.28s	Loss: 1.18446	Accuracy: 57.88%
Epoch: 27	Time: 4.98s	Loss: 1.16928	Accuracy: 58.31%
Epoch: 30	Time: 5.02s	Loss: 1.16276	Accuracy: 58.68%

Loss: 12927.07597	Accuracy:53.76%


  print("Loss: %.5f\tAccuracy:%.2f%%" % (loss, accuracy*100))


In [99]:
optimizer = Optimizer(lr = 0.001, momentum = 0.8, weight_decay = 1e-3)

n_in = X_train.shape[1]
n_out = len(np.unique(y_train))

layer = [256, 512]

activation = ['relu', 'relu', 'softmax']
model = MLP(n_in, n_out, layer, optimizer, activation, BN = True, Dropout = True, dropout_prob = [0.2, 0.2])

loss, accuracy = model.fit(X_train, y_train, epochs = 50, batch_size = 1000, print_per = 3)

print()

model.predict(X_test, y_test)

  print("Epoch: %d\tTime: %.2fs\tLoss: %.5f\tAccuracy: %.2f%%" % (epoch+1, time()-start, loss_list[-1], accuracy_list[-1]*100))


Epoch: 3	Time: 6.11s	Loss: 1.53511	Accuracy: 45.06%
Epoch: 6	Time: 5.52s	Loss: 1.39775	Accuracy: 50.10%
Epoch: 9	Time: 4.89s	Loss: 1.31259	Accuracy: 53.09%
Epoch: 12	Time: 5.00s	Loss: 1.25346	Accuracy: 54.94%
Epoch: 15	Time: 6.15s	Loss: 1.21068	Accuracy: 56.55%
Epoch: 18	Time: 5.03s	Loss: 1.17362	Accuracy: 57.97%
Epoch: 21	Time: 4.80s	Loss: 1.14920	Accuracy: 58.92%
Epoch: 24	Time: 5.72s	Loss: 1.13008	Accuracy: 59.74%
Epoch: 27	Time: 6.06s	Loss: 1.10346	Accuracy: 60.75%
Epoch: 30	Time: 4.76s	Loss: 1.08555	Accuracy: 61.23%
Epoch: 33	Time: 4.82s	Loss: 1.06747	Accuracy: 61.99%
Epoch: 36	Time: 4.84s	Loss: 1.06553	Accuracy: 62.02%
Epoch: 39	Time: 6.23s	Loss: 1.05769	Accuracy: 62.33%
Epoch: 42	Time: 5.70s	Loss: 1.03491	Accuracy: 63.13%
Epoch: 45	Time: 4.74s	Loss: 1.02680	Accuracy: 63.34%
Epoch: 48	Time: 4.70s	Loss: 1.01816	Accuracy: 63.88%

Loss: 12587.93686	Accuracy:55.47%


  print("Loss: %.5f\tAccuracy:%.2f%%" % (loss, accuracy*100))


# 5. 预测(测试集)