## Importing the Libraries

In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split

## Importing the dataset

In [2]:
#Reading the MNIST training CSV; the first column is used as the digit label
#and every remaining column as a pixel value
df = pd.read_csv("mnist_train.csv")

#Keeping only the first 2500 rows and shaping each row of pixels into a
#single-channel 28x28 image: (samples, 1, 28, 28)
X = df.iloc[:2500, 1:].values
X = X.reshape(X.shape[0], 1, 28, 28)

#Labels stay as a column vector so they broadcast against the 10 class ids
Y = df.iloc[:2500, :1].values

#One-hot encoding: entry (i, j) is 1 exactly when sample i has label j
Y = (Y == np.arange(10)).astype(int)

#Holding out 25% of the samples as the test set (fixed seed for a repeatable split)
X_train, X_test, y_train, y_test = train_test_split(
    X,
    Y,
    test_size=0.25,
    random_state=0,
)

## Defining the convolutional layer class

In [3]:
class Layer:
    """Convolution layer with 3x3 filters, zero padding of 1 and stride 1.

    The layer maps a batch of shape (samples, no_input_channels, H, W) to
    no_output_channels feature maps of the same height and width, and returns
    them flattened to one row per sample. Filters are drawn from a standard
    normal distribution and are updated by plain gradient descent in
    ``backward``.
    """

    #Side length of every (square) filter
    KERNEL_SIZE=3

    #Zero padding added on each side of the image before convolving
    PADDING=1

    def __init__(self,no_input_channels,no_output_channels,learning_rate=0.1):
        """Create the layer and randomly initialise its filters.

        Parameters
        ----------
        no_input_channels : int
            Number of channels in every input sample.
        no_output_channels : int
            Number of feature maps (filters per input channel) to produce.
        learning_rate : float, default 0.1
            Step size used by ``backward`` when updating the filters.
        """

        #Step size
        self.learning_rate=learning_rate

        #Storing the number of input Channels
        self.no_input_channels=no_input_channels

        #Storing the number of output channels
        self.no_output_channels=no_output_channels

        #One 3x3 filter per (output channel, input channel) pair:
        #weights[k][j] is applied to input channel j when building output channel k
        self.weights=np.random.randn(self.no_output_channels,self.no_input_channels,self.KERNEL_SIZE,self.KERNEL_SIZE)

    #Flattening Function
    def flatten(self,x):
        """Return ``x`` as a 1-D array."""
        return x.ravel()

    def _pad(self,X):
        """Zero-pad only the two spatial axes of a (samples, channels, H, W) batch."""
        p=self.PADDING
        return np.pad(X,((0,0),(0,0),(p,p),(p,p)),mode='constant')

    #Forward Function
    def forward(self,X):
        """Convolve a batch with the layer's filters.

        Parameters
        ----------
        X : array of shape (samples, no_input_channels, H, W)

        Returns
        -------
        ndarray of shape (samples, no_output_channels*H*W)
            The feature maps of every sample, flattened into one row.

        Raises
        ------
        ValueError
            If ``X`` is not 4-D or its channel axis does not match
            ``no_input_channels``.

        The batch is remembered on the instance because ``backward`` needs it
        to compute the filter gradient.
        """
        X=np.asarray(X)
        if X.ndim!=4 or X.shape[1]!=self.no_input_channels:
            raise ValueError(
                "expected input of shape (samples, %d, H, W), got %s"
                % (self.no_input_channels, X.shape)
            )

        #Extracting the number of samples
        self.no_samples=X.shape[0]
        self.X_train=X

        height,width=X.shape[2],X.shape[3]
        padded=self._pad(X)

        #Initializing the output matrix
        out=np.zeros((self.no_samples,self.no_output_channels,height,width))

        #For each of the 9 filter positions, the contribution to every output
        #pixel is the correspondingly shifted image times that filter entry,
        #summed over input channels. Doing this per filter offset replaces the
        #per-sample/per-pixel Python loops with 9 vectorised products.
        for a in range(self.weights.shape[2]):
            for b in range(self.weights.shape[3]):
                window=padded[:,:,a:a+height,b:b+width]
                out+=np.einsum('nchw,kc->nkhw',window,self.weights[:,:,a,b])

        #Returning one flattened row per sample
        return out.reshape(self.no_samples,self.no_output_channels*height*width)


    def backward(self,derivative_loss_wrt_output):
        """Update the filters from the gradient of the loss w.r.t. this layer's output.

        Parameters
        ----------
        derivative_loss_wrt_output : array
            Gradient with as many entries as the array returned by the most
            recent ``forward`` call; it is reshaped to
            (samples, no_output_channels, H, W).

        Returns
        -------
        None
            The gradient with respect to the layer input is not computed, so
            this layer cannot pass gradients to a layer placed before it.

        ``forward`` must have been called first, since the cached input batch
        is used here. The gradient is summed (not averaged) over the batch.
        """
        height,width=self.X_train.shape[2],self.X_train.shape[3]

        #Reshaping the next input gradient
        derivative_loss_wrt_output=np.asarray(derivative_loss_wrt_output).reshape(self.no_samples,self.no_output_channels,height,width)

        padded=self._pad(self.X_train)

        #Calculating the gradient of Loss with respect to weights:
        #entry (k, j, a, b) correlates output-gradient map k with input channel j
        #shifted by the filter offset (a, b), summed over samples and pixels
        derivative_loss_wrt_weights=np.zeros(self.weights.shape)
        for a in range(self.weights.shape[2]):
            for b in range(self.weights.shape[3]):
                window=padded[:,:,a:a+height,b:b+width]
                derivative_loss_wrt_weights[:,:,a,b]=np.einsum('nkhw,nchw->kc',derivative_loss_wrt_output,window)

        #Updating the weights
        self.weights-=self.learning_rate*derivative_loss_wrt_weights
        return


## Creating the model for MNIST data

In [4]:
class mnist_CNN:
    """Small classifier: one convolution layer -> tanh -> fully connected -> softmax.

    The convolution layer has 1 input channel and 16 output channels; its
    flattened, tanh-activated output (16*784 values per sample) feeds a
    bias-free fully-connected layer with 10 outputs. Training uses mini-batch
    gradient descent on the softmax cross-entropy loss.
    """

    def __init__(self,iterations=15,batch_size=10,fc_learning_rate=0.001):
        """Build the untrained model.

        Parameters
        ----------
        iterations : int, default 15
            Number of full passes over the training data made by ``fit``.
        batch_size : int, default 10
            Number of samples per gradient step.
        fc_learning_rate : float, default 0.001
            Step size for the fully-connected weights (the convolution layer
            keeps its own learning rate).
        """

        #First Layer with 16 output channels
        self.layer1=Layer(1,16)

        #Initializing the weights for the last layer(Fully-connected Layer)
        self.weights=np.zeros((16*784,10))

        #Number of iterations
        self.iterations=iterations

        #Batch-size
        self.batch_size=batch_size

        #Step size of the fully connected layer
        self.fc_learning_rate=fc_learning_rate

    @staticmethod
    def _softmax(a):
        """Row-wise softmax of a 2-D array.

        The row maximum is subtracted before exponentiating; this leaves the
        result mathematically unchanged but keeps exp() from overflowing to
        inf (and the ratio from becoming nan) when the logits are large.
        """
        shifted=a-np.max(a,axis=1,keepdims=True)
        exps=np.exp(shifted)
        return exps/np.sum(exps,axis=1,keepdims=True)

    def _forward(self,X):
        """Run the whole network; return (tanh-activated conv output, class probabilities)."""
        out1=np.tanh(self.layer1.forward(X))
        out2=self._softmax(np.dot(out1,self.weights))
        return out1,out2

    #Function which takes the data and update the weights
    def fit(self,X_train,y_train):
        """Train both layers and print the training cost after every iteration.

        Parameters
        ----------
        X_train : array of shape (samples, 1, H, W)
            Training images, passed straight to the convolution layer.
        y_train : array of shape (samples, 10)
            One-hot encoded targets.
        """

        #Running a loop over the number of iterations
        for iteration in range(self.iterations):

            #Running a loop over the data, one mini-batch at a time
            for start in range(0,len(X_train),self.batch_size):
                stop=start+self.batch_size

                #Forward pass: conv -> tanh -> fully connected -> softmax
                out1,out2=self._forward(X_train[start:stop])

                #Gradient of the softmax cross-entropy loss wrt the
                #fully connected layer's pre-softmax output
                derivative_loss_wrt_a2=out2-y_train[start:stop]

                #Gradient wrt the fully connected weights, summed over the batch
                derivative_loss_wrt_weights=np.dot(out1.T,derivative_loss_wrt_a2)

                #Gradient wrt the tanh activated output from layer-1
                #(uses the weights from before this step's update)
                derivative_loss_wrt_out1=np.dot(derivative_loss_wrt_a2,self.weights.T)

                #Gradient wrt the output from layer-1: tanh'(a) = 1 - tanh(a)^2
                derivative_loss_wrt_a1=derivative_loss_wrt_out1*(1-out1**2)

                #Performing the back propagation on layer-1
                self.layer1.backward(derivative_loss_wrt_a1)

                #Updating the weights of fully connected layer
                self.weights-=self.fc_learning_rate*derivative_loss_wrt_weights

            #Calculating the Softmax cross entropy loss on the full training set.
            #Probabilities are clipped away from 0 so that log() cannot return
            #-inf and turn the sum into nan (0 * -inf).
            _,out2=self._forward(X_train)
            log_probabilities=np.log(np.clip(out2,np.finfo(float).tiny,None))
            print("Cost after iteration",iteration,':',np.sum(-y_train*log_probabilities))

    #Function that predicts the output for input X
    def predict(self,X):
        """Return one-hot predictions of shape (samples, 10) for the images in ``X``."""
        _,out2=self._forward(X)
        final=self.encode(out2)
        return final

    #Function that makes the maximum value as 1 and the rest entries as 0
    def encode(self,a):
        """Return an array shaped like ``a`` with a 1 at each row's argmax and 0 elsewhere."""
        final_output=np.zeros_like(a)
        final_output[np.arange(len(a)), a.argmax(1)] = 1
        return final_output

#Seeding NumPy's global random generator: the convolution filters are drawn
#with np.random.randn, so without a seed every run starts from different
#weights and the printed costs cannot be reproduced
np.random.seed(0)

#Creating the model for MNIST data
model=mnist_CNN()

#Training on the training split; the cost is printed after every iteration
model.fit(X_train,y_train)

Cost after iteration 0 : 331.507227394308
Cost after iteration 1 : 264.5911748085799
Cost after iteration 2 : 160.21177606497127
Cost after iteration 3 : 84.56189227225194
Cost after iteration 4 : 63.01580348901906
Cost after iteration 5 : 43.41801521075486
Cost after iteration 6 : 34.200279789502225
Cost after iteration 7 : 28.158445886634894
Cost after iteration 8 : 24.714660597736973
Cost after iteration 9 : 21.38938560824486
Cost after iteration 10 : 19.157166732022038
Cost after iteration 11 : 17.016685085150616
Cost after iteration 12 : 15.644549901932331
Cost after iteration 13 : 14.572888805051956
Cost after iteration 14 : 13.54080533376064


## Using the trained model to get the output for test data

In [5]:
#Predicting the output for test data: predict returns one row per test sample
#with a 1 in the column of the predicted class and 0 elsewhere
y_pred=model.predict(X_test)
#Bare expression so the notebook displays the prediction array
y_pred

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 1., 0., 0.],
       [1., 0., 0., ..., 0., 0., 0.],
       ...,
       [1., 0., 0., ..., 0., 0., 0.],
       [0., 1., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 1., 0.]])

## Accuracy

In [6]:
#Calculating the Accuracy: a prediction is correct when the position of the 1
#in its one-hot row matches the position of the 1 in the true label row
count=int(np.sum(np.argmax(y_test,axis=1)==np.argmax(y_pred,axis=1)))

#Reporting the share of correct predictions as a percentage
print("ACCURACY:",count*100/len(X_test))

ACCURACY: 91.52
