In [21]:
import numpy as np
import pandas as pd
from sklearn.utils import shuffle
from mlxtend.data import loadlocal_mnist
from sklearn.preprocessing import StandardScaler
import matplotlib.pyplot as plt


In [22]:
train_images,train_labels=loadlocal_mnist(images_path='dataset/train-images.idx3-ubyte', labels_path='dataset/train-labels.idx1-ubyte')
test_images,test_labels=loadlocal_mnist(images_path='dataset/t10k-images.idx3-ubyte', labels_path='dataset/t10k-labels.idx1-ubyte')


In [23]:
# train_images,train_labels=loadlocal_mnist(images_path='D:/Downloads/MNIST/raw/train-images-idx3-ubyte', labels_path='D:/Downloads/MNIST/raw/train-labels-idx1-ubyte')
# test_images,test_labels=loadlocal_mnist(images_path='D:/Downloads/MNIST/raw/t10k-images-idx3-ubyte', labels_path='D:/Downloads/MNIST/raw/t10k-labels-idx1-ubyte')

# np.save('train_images',train_images)
# np.save('train_labels',train_labels)
# np.save('test_images',test_images)
# np.save('test_labels',test_labels)

# train_df=train_df.to_numpy()
# test_df=test_df.to_numpy()
# train_images = train_df[:, 1:]/255
# test_images = test_df[:, 1:]/255
# train_labels = train_df[:, 0]
# test_labels = test_df[:, 0]


In [24]:

#pre processing
standardscalar = StandardScaler()

train_images = standardscalar.fit_transform(train_images)
test_images = standardscalar.transform(test_images)

data=[]
labels=[]
for i in range(len(train_images)):
    data.append(train_images[i])
    labels.append(train_labels[i])
for i in range(len(test_images)):
    data.append(test_images[i])
    labels.append(test_labels[i])

data=np.array(data)
labels=np.array(labels)

#random shuffling
data,labels=shuffle(data,labels)

#dividing by max value to convert every pixel to 0-1
data=data/255


In [25]:
data.shape

(70000, 784)

In [28]:
def train_val_test_split(data, labels, train, val):
    """ a function that will get dataset and training dataset fraction as input and return x_train, x_test, y_train, y_test """
    
    print("Total length is "+str(len(data)))
    
    train_set = len(data)*train//(train+val)
    val_set = len(data)*val//(train+val)
    
    X_train = data[:train_set]
    Y_train = labels[:train_set]
    
    X_val = data[train_set+1:train_set+val_set+1]
    Y_val = labels[train_set+1:train_set+val_set+1]
    
    return X_train, Y_train, X_val, Y_val

In [29]:
x_train, y_train, x_val, y_val = train_val_test_split(data, labels, 8, 2)

Total length is 70000


In [47]:
class MyNeuralNetwork():
  activations = ['relu','leaky_relu', 'sigmoid', 'linear', 'tanh', 'softmax']
  weight_initializations = ['zero', 'random', 'normal']
  
  def __init__(self, n_layers, layer_sizes, activation, weight_initialization, batch_size, iter=50, lr=0.01):
      self.n_layers = n_layers
      self.layer_sizes = layer_sizes
      self.activation = activation
      self.lr = lr
      self.weight_initialization = weight_initialization
      self.batch_size = batch_size
      self.iter = iter
      self.W, self.B = self.initialize_weights()  #Weight and Bias dictionary respectively
      self.A={}   #Activation values
      self.Z={}   #Pre activation values
      self.W_grad, self.B_grad = self.initialize_weights()
      self.A_grad={}
      self.Z_grad={}
      self.train_loss=[]
      self.val_loss=[]
      
  def activate(self, arr, name, back_prop=False):
    if name=='relu':
      arr[arr<0]=0
      if back_prop:
        arr[arr>0]=1
            
    elif name=='sigmoid':
      arr = 1 / ( 1 + np.exp(-arr))
      if back_prop :
        arr = arr * (1 - arr)

    elif name=='leaky_relu':
      if not back_prop:
        arr = [i if i>=0 else i*0.01 for i in arr]
      else:
        arr[arr<0]=0.01

    elif name=='linear':
      if back_prop:
        arr = np.ones(arr.shape)

    elif name=='tanh':
      arr = np.tanh(arr)
      if back_prop:
        arr = 1 - arr**2
    else:
      y = np.exp(arr)
      arr = y / np.sum(y, axis=1, keepdims=True)

    arr[arr<10**(-20)]=0
    arr[arr>1**(20)]=1**(20)
    return arr

  def w_initialize(self, name, shape):
    x = np.zeros(shape)
    if name == 'normal init':
      x = np.random.normal(shape) * 0.01
    elif name == 'random init':
      x = np.random.rand(shape[0], shape[1]) * 0.01

    return x

  def initialize_weights(self):

    weight = [0]
    bias = [0]

    for i in range(self.n_layers-1):
      weight.append(self.w_initialize(self.weight_initialization, (self.layer_sizes[i], self.layer_sizes[i+1])))
      bias.append(self.w_initialize(self.weight_initialization, (1, self.layer_sizes[i+1])))   
    
    return weight, bias

  def fit(self, x_train, y_train, x_val, y_val):

    validation_labels = np.zeros((len(x_val), max(y_val) + 1))
    for i in range(len(x_val)):
        validation_labels[i, y_val[i]] = 1
            
    for e in range(self.iter):

      loss, current_batch_index, times, validation_loss=0, 0, len(x_train) // self.batch_size, 0
          
      for current_batch_index in range(0, len(x_train), self.batch_size):
          
        curr_batch = x_train[current_batch_index: min(len(x_train), current_batch_index + self.batch_size)]
        curr_labels = y_train[current_batch_index: min(len(y_train), current_batch_index + self.batch_size)]
        
        #forward prop
        self.A[0] = curr_batch
        self.forward_propagation()
        
        #loss calc
        labels = np.zeros((len(curr_labels), max(y_train) + 1))
        for i in range(len(curr_labels)):
            labels[i][curr_labels[i]] = 1
        
        loss += self.cross_entropy_loss(self.A[self.n_layers - 1], labels)
        
        output = self.A[self.n_layers - 1]
        temp = output * (1 - output)
        self.A_grad[self.n_layers - 1] = (output - labels) / temp
        
        #backward pass for all layers
        self.backward_propagation()
            
        #weight update for all layers
        for l in range(1, self.n_layers):
          self.W[l] = self.W[l] - self.lr * self.W_grad[l]
          self.B[l] = self.B[l] - self.lr * self.B_grad[l]
            
            
        Y_pred = self.predict_proba(x_val)
        validation_loss = self.cross_entropy_loss(Y_pred, validation_labels)

      print("Training Loss of epoch " + str(e) + " is " + str(loss / times))
      print("Validation Loss of epoch " + str(e)+" is " + str(validation_loss))

      self.train_loss.append(loss / times)
      self.val_loss.append(validation_loss)
    
    self.plot_training_val()
      
  def plot_training_val(self):
    plt.plot(self.train_loss, 'b', label='training')
    plt.plot(self.val_loss, 'r', label='validation')
    plt.title("Loss vs epochs for " + self.activation + " for learning rate " + str(self.lr))
    plt.xlabel('Epochs')
    plt.ylabel('Loss')
    plt.legend()
    plt.show()
      
  def forward_propagation(self):

    for i in range(1, self.n_layers):
      self.Z[i] = np.dot(self.A[i-1], self.W[i])
      if i != 1:
          self.Z[i] += self.B[i]

      self.A[i] = self.activate(self.Z[i], self.activation)

    self.A[self.n_layers - 1] = self.activate(self.A[i], 'softmax')
      
  def backward_propagation(self):

    for i in range(self.n_layers-1, 0, -1):
      self.Z_grad[i] = self.activate(self.Z[i], self.activation, back_prop=True) * self.A_grad[i]
      self.W_grad[i] = np.dot(self.A[i-1].T, self.Z_grad[i]) / len(self.A_grad[i])
      self.B_grad[i] = np.sum(self.Z_grad[i], axis=0) / len(self.A_grad[i])
      self.B_grad[i] = self.B_grad[i].reshape(self.B[i].shape)
      self.A_grad[i-1] = np.dot(self.Z_grad[i], self.W[i].T)
      
  def predict_proba(self, X):
    preact={}
    act={}
    act[0]=X
    
    for l in range(1, self.n_layers):
        preact[l]=np.dot(act[l-1],self.W[l])
        if l!=1:
            preact[l]+=self.B[l]
            
        act[l]=self.activate(preact[l],self.activation)
        
    ans = self.activate(act[self.n_layers-1],'softmax')
    return ans
  
  def predict(self, X):
    y=self.predict_proba(X)

    # return the numpy array y which contains the predicted values
    return y.argmax(axis=1)
  
  def cross_entropy_loss(self, A, y):
    #check for problems

    n = len(y)
    logp = - np.log(A[np.arange(n), y.argmax(axis=1)]+1e-10)
    loss = np.sum(logp+1e-10)/n
    return loss
  
  def score(self, X , y_true):
    y_pred=self.predict(X)
    
    return np.sum(y_true==y_pred)/len(y_true)

In [49]:
net=MyNeuralNetwork(4, [256, 128, 64, 32], 'relu', 'normal', 256)
net.fit(x_train, y_train, x_val, y_val)

ValueError: shapes (256,784) and (256,128) not aligned: 784 (dim 1) != 256 (dim 0)

In [None]:
net.fit(train_images, train_labels, test_images, test_labels)

In [None]:
net.score(test_images,test_labels)

pred


0.1135

In [None]:
from sklearn.neural_network import MLPClassifier
from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split

In [None]:
clf = MLPClassifier(hidden_layer_sizes=(256,128,64),random_state=1, max_iter=5, verbose=True).fit(train_images, train_labels)

In [None]:
clf.out_activation_

'softmax'

In [None]:
clf.score(test_images,test_labels)

0.9786

In [None]:
clf.get_params()

{'activation': 'relu',
 'alpha': 0.0001,
 'batch_size': 'auto',
 'beta_1': 0.9,
 'beta_2': 0.999,
 'early_stopping': False,
 'epsilon': 1e-08,
 'hidden_layer_sizes': (256, 128, 64),
 'learning_rate': 'constant',
 'learning_rate_init': 0.001,
 'max_fun': 15000,
 'max_iter': 5,
 'momentum': 0.9,
 'n_iter_no_change': 10,
 'nesterovs_momentum': True,
 'power_t': 0.5,
 'random_state': 1,
 'shuffle': True,
 'solver': 'adam',
 'tol': 0.0001,
 'validation_fraction': 0.1,
 'verbose': True,
 'warm_start': False}