## IMPORTING LIBRARIES

In [45]:
# IMPORTING LIBRARIES
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

## READING THE TRAINING DATASET

In [46]:
# READING THE TRAINING DATA AND SAMPLES FROM CSV

columns=[]
for i in range (0,784):
    columns.append(i)
    
train_data=pd.read_csv("/home/amith/Desktop/ECE657_Ass/train_data.csv",names=columns)

labels_data=pd.read_csv("/home/amith/Desktop/ECE657_Ass/train_labels.csv",names=["class1","class 2","class 3","class 4"])

In [47]:
# CONCATINATING FOR SHUFFLING THE DATASET

df= [train_data,labels_data]
con_df=pd.concat(df,axis=1)


# SHUFFLING THE DATASET

shuffled_data=con_df.sample(frac=1).reset_index(drop=True)


# TRAIN AND TEST SET SPLIT

X_train=shuffled_data.iloc[0:20000,0:784].values
Y_train=shuffled_data.iloc[0:20000,784:].values

X_val=shuffled_data.iloc[20000:,0:784].values
Y_val=shuffled_data.iloc[20000:,784:].values



## TRAINING THE DATASET WITHOUT LIBRARIES

In [48]:
# Random Weights Initializer Function

def rand_weights():
    tot_layers=[784,112,4]
    
    # Random First layer weights and bias (Dimensions of weights would be (tot_layers[0],tot_layers[1]))
    W1= np.random.randn(tot_layers[0],tot_layers[1])
    b1= np.random.randn(tot_layers[1])
    
    # Random last layer weights and bias (Dimensions of weights would be (tot_layers[1],tot_layers[2]))
    W2= np.random.randn(tot_layers[1],tot_layers[2])
    b2= np.random.randn(tot_layers[2])
    
    return W1,b1,W2,b2
    

In [49]:
# ACTIVATION FUNCTIONS

# FIRST LAYER (HIDDEN LAYER ACTIVATION FUNCTION)
def sigmoid(data):
    return (1/(1+np.exp(-data)))

# OUTPUT LAYER ACTIVATION FUNCTION
def softmax(data):
    exponential = np.exp(data)
    return exponential / exponential.sum(axis=1, keepdims=True)

In [1]:
# LOSS CALCULATION

# cross-entropy loss function
''' Since the output was one hot encoded and since it was a multi class classification problem cross-entropy loss
loss function is used'''

def cal_loss(y,y_hat):
    n=len(y)
    return 1/n * np.sum(-y * np.log(y_hat))


In [18]:
# predict, and round the predict to get one hot encoded values and Accuracy function
def round_y_pred(y_pred):
    ''' 
    Taking the index of the highest probablity and changing that index to 1 and rest to zero. This ensure that
    y_predict is one hot encoded values
    '''
    zeros_ypred=np.zeros((len(y_pred),4)) 
    ypred_index=np.argmax(y_pred,axis=1)
    
    for i in range (len(y_pred)):
        zeros_ypred[i][ypred_index[i]]=1
    
    return zeros_ypred
    

def predict(X_test,W1,b1,W2,b2):
    '''
    Predicts on a test data
    '''
    Z1 = np.dot(X_test, W1) + b1
    A1 = sigmoid(Z1)
    Z2 = np.dot(A1, W2) + b2
    pred = softmax(Z2)
    
    y_pred= round_y_pred(pred)
    return y_pred


def accuracy(y_true, y_pred_1):
    ''' 
    Accuracy function as given by instructors
    '''
    y_pred=round_y_pred(y_pred_1)
    if not (len(y_true) == len(y_pred)):
        print('Size of predicted and true labels not equal.')
        return 0.0

    corr = 0
    for i in range(0,len(y_true)):
        corr += 1 if (y_true[i] == y_pred[i]).all() else 0

    return corr/len(y_true)

In [19]:
# TRAINING FUNCTION

def train(lr,epoch,x_train,y_train,W1,b1,W2,b2):
    
    learning_rate= lr
    error_loss=[]
    
    for i in range (epoch):
        
        # Forward_propogation

        Z1=np.dot(x_train,W1)+b1
        A1=sigmoid(Z1)
        Z2=np.dot(A1,W2)+b2
        y_hat= softmax(Z2)

        # LOSS CALCULATION AFTER EACH FORWARD PASS AND CALCULATES TRAINING ACCURACY WITH ONE HOT ENCODED VALUES
        
        loss= cal_loss(y_train,y_hat)
        train_acc= accuracy(y_train,y_hat)
        print("EPOCH : ",(i+1)," LOSS_VALUE : ",loss, " TRAINING ACCURACY: ",train_acc)
        error_loss.append(loss)
        
         # BACKWARD PROPOGATION

        ''' LAST LAYER UPDATE EXPLINATION
        
         dl/dw2 = dl/yhat * dyhat/dz2 *dz2/dw2  ......1
         dl/dz2 =  dl/yhat * dyhat/dz2 ......... 2

         where
         dl/yhat = derivation of loss function
         dyhat/dz2 = derivation of activation with input as y_hat (a2)

         Z2 = A1 * W2
         dz2/dw2 = A1 .......... 3

         Substituting 2 and 3 in 1
         dl/dw2= dl/dz2 * A1  (dimension= 112 *4)

         Bias B2
         dl/db2=dl/dyhat * dyhat/dZ2 * dz2/dB2 ...........4
         Z2= A1.W2 +B2
         dZ2/dB2= 1  ..........5

         Substituting 5 and 2 in eq4
         dl/db2=dl/dz2

         '''
        # Weight W2

        dl_dz2 =  y_hat - y_train        #DL/DZ2= dl_y_hat * dy_hat_sigmoid
        dz2_dw2 = A1
        dl_dw2=  np.dot (dz2_dw2.T,dl_dz2)
        
         # Bias b2
            
        dl_db2= dl_dz2


        ''' 
         FIRST LAYER UPDATE EXPLINATION
         dl/dw1 = dl/yhat * dyhat/dz2 *dz2/da1 * da1/dz1 * dz1/dw1  ......1
         dl/dz2 =  dl/yhat * dyhat/dz2 ......... 2
         Z2 = A1 * W2
         dz2/da1= W2 .......3

         Z1=X_train * W1
         dz1/dw1= X_train .......... 4

         da1/dz1= derivation of activation with input as z1 .... 5


         Substituting 2 , 3 , 4 in 1
         dl/dw1= dl/dz2 * W2 *  da1/dz1 * X_train (dimension= 784 * 112)

         BIAS B1
         dl/db1 = dl/yhat * dyhat/dz2 * dz2/da1 * da1/dz1 * dz1/db1 ...7
         dz1/db1 =1.... 6

         substituting 5,2,3,6 in 7

         dl/db1= dl/z2 * W2 *  da1/dz1


         '''
        # Weight W1
        
        dz2_da1 = W2
        dl_a1= np.dot(dl_dz2 , dz2_da1.T)  # dimension= ((20000 , 4)*( 4 *112)) = 20000,112
        da1_dz1= sigmoid(Z1) * (1-sigmoid(Z1)) # derivation of activation with input as z1 dim= (20000,112)
        dl_dw1=np.dot(X_train.T,da1_dz1* dl_a1 ) # change in weights wrt W1 dim_xtrain= 20000,784 fin dim=784*112

        #Bias b1

        dl_db1= dl_a1 * da1_dz1



         # WEIGHTS UPDATE AFTER EACH EPOCH

        W2= W2 - learning_rate* dl_dw2
        b2= b2 - learning_rate* dl_db2.sum(axis=0)
        W1= W1 - learning_rate* dl_dw1
        b1= b1 - learning_rate* dl_db1.sum(axis=0)


    
    return error_loss,W1,b1,W2,b2
    
    
    
    


In [50]:
# To train run the following
W1,b1,W2,b2= rand_weights()
loss_list,Weights1,bias1,Weights2,bias2= train(0.001,500,X_train,Y_train,W1,b1,W2,b2)


EPOCH :  1  LOSS_VALUE :  7.3225894253460195  TRAINING ACCURACY:  0.34775
EPOCH :  2  LOSS_VALUE :  164.62112665903607  TRAINING ACCURACY:  0.40095
EPOCH :  3  LOSS_VALUE :  59.121902563628325  TRAINING ACCURACY:  0.2398
EPOCH :  4  LOSS_VALUE :  58.95705926085355  TRAINING ACCURACY:  0.2488
EPOCH :  5  LOSS_VALUE :  59.25805764396397  TRAINING ACCURACY:  0.2391
EPOCH :  6  LOSS_VALUE :  54.14273286132177  TRAINING ACCURACY:  0.2723
EPOCH :  7  LOSS_VALUE :  13.089579149570044  TRAINING ACCURACY:  0.23985
EPOCH :  8  LOSS_VALUE :  22.937658197316583  TRAINING ACCURACY:  0.32815
EPOCH :  9  LOSS_VALUE :  23.13705803311059  TRAINING ACCURACY:  0.50215
EPOCH :  10  LOSS_VALUE :  13.422152499090263  TRAINING ACCURACY:  0.23915
EPOCH :  11  LOSS_VALUE :  24.545493003678537  TRAINING ACCURACY:  0.2413
EPOCH :  12  LOSS_VALUE :  31.894778246949244  TRAINING ACCURACY:  0.31125
EPOCH :  13  LOSS_VALUE :  21.04952479358622  TRAINING ACCURACY:  0.2488
EPOCH :  14  LOSS_VALUE :  26.561293783042714

EPOCH :  112  LOSS_VALUE :  0.24181462280370752  TRAINING ACCURACY:  0.9425
EPOCH :  113  LOSS_VALUE :  0.24941714532876955  TRAINING ACCURACY:  0.95175
EPOCH :  114  LOSS_VALUE :  0.24000323290489134  TRAINING ACCURACY:  0.94275
EPOCH :  115  LOSS_VALUE :  0.24754796845195465  TRAINING ACCURACY:  0.95205
EPOCH :  116  LOSS_VALUE :  0.23825104549672232  TRAINING ACCURACY:  0.94325
EPOCH :  117  LOSS_VALUE :  0.2454535486890405  TRAINING ACCURACY:  0.95235
EPOCH :  118  LOSS_VALUE :  0.2360926004030019  TRAINING ACCURACY:  0.94375
EPOCH :  119  LOSS_VALUE :  0.24232040050664383  TRAINING ACCURACY:  0.95295
EPOCH :  120  LOSS_VALUE :  0.23298661369166163  TRAINING ACCURACY:  0.9444
EPOCH :  121  LOSS_VALUE :  0.23815860256737234  TRAINING ACCURACY:  0.9531
EPOCH :  122  LOSS_VALUE :  0.2292956551364871  TRAINING ACCURACY:  0.945
EPOCH :  123  LOSS_VALUE :  0.23385154655635698  TRAINING ACCURACY:  0.9535
EPOCH :  124  LOSS_VALUE :  0.22572752414859748  TRAINING ACCURACY:  0.9455
EPOCH :  

EPOCH :  220  LOSS_VALUE :  0.15182595456970552  TRAINING ACCURACY:  0.95965
EPOCH :  221  LOSS_VALUE :  0.1510807715301424  TRAINING ACCURACY:  0.96105
EPOCH :  222  LOSS_VALUE :  0.15082253124536785  TRAINING ACCURACY:  0.9598
EPOCH :  223  LOSS_VALUE :  0.14996392053199817  TRAINING ACCURACY:  0.96115
EPOCH :  224  LOSS_VALUE :  0.14978389803577355  TRAINING ACCURACY:  0.95995
EPOCH :  225  LOSS_VALUE :  0.14909237952943094  TRAINING ACCURACY:  0.96095
EPOCH :  226  LOSS_VALUE :  0.1503043024718031  TRAINING ACCURACY:  0.9598
EPOCH :  227  LOSS_VALUE :  0.15203320379200513  TRAINING ACCURACY:  0.9601
EPOCH :  228  LOSS_VALUE :  0.1484508889358087  TRAINING ACCURACY:  0.96015
EPOCH :  229  LOSS_VALUE :  0.146763141105444  TRAINING ACCURACY:  0.9617
EPOCH :  230  LOSS_VALUE :  0.14649765592835934  TRAINING ACCURACY:  0.96045
EPOCH :  231  LOSS_VALUE :  0.1454214450502132  TRAINING ACCURACY:  0.96195
EPOCH :  232  LOSS_VALUE :  0.1451424042478135  TRAINING ACCURACY:  0.9607
EPOCH :  23

EPOCH :  328  LOSS_VALUE :  0.10750222542718475  TRAINING ACCURACY:  0.9682
EPOCH :  329  LOSS_VALUE :  0.10713512109259864  TRAINING ACCURACY:  0.96875
EPOCH :  330  LOSS_VALUE :  0.10693793835386312  TRAINING ACCURACY:  0.96825
EPOCH :  331  LOSS_VALUE :  0.10657560034973185  TRAINING ACCURACY:  0.9687
EPOCH :  332  LOSS_VALUE :  0.10637895231442945  TRAINING ACCURACY:  0.9684
EPOCH :  333  LOSS_VALUE :  0.10601996912799369  TRAINING ACCURACY:  0.9687
EPOCH :  334  LOSS_VALUE :  0.1058214661708786  TRAINING ACCURACY:  0.96855
EPOCH :  335  LOSS_VALUE :  0.10546429079357822  TRAINING ACCURACY:  0.96875
EPOCH :  336  LOSS_VALUE :  0.1052615720375894  TRAINING ACCURACY:  0.96855
EPOCH :  337  LOSS_VALUE :  0.10490415862719045  TRAINING ACCURACY:  0.9689
EPOCH :  338  LOSS_VALUE :  0.10469444453402846  TRAINING ACCURACY:  0.96855
EPOCH :  339  LOSS_VALUE :  0.10433388054727132  TRAINING ACCURACY:  0.96905
EPOCH :  340  LOSS_VALUE :  0.10411382116548544  TRAINING ACCURACY:  0.9687
EPOCH :

EPOCH :  436  LOSS_VALUE :  0.08275118715439012  TRAINING ACCURACY:  0.9734
EPOCH :  437  LOSS_VALUE :  0.08260909563090528  TRAINING ACCURACY:  0.97325
EPOCH :  438  LOSS_VALUE :  0.08247008442726574  TRAINING ACCURACY:  0.97385
EPOCH :  439  LOSS_VALUE :  0.08233285462734131  TRAINING ACCURACY:  0.97325
EPOCH :  440  LOSS_VALUE :  0.08219748617137282  TRAINING ACCURACY:  0.974
EPOCH :  441  LOSS_VALUE :  0.08206333875871147  TRAINING ACCURACY:  0.9734
EPOCH :  442  LOSS_VALUE :  0.08193068580189843  TRAINING ACCURACY:  0.97405
EPOCH :  443  LOSS_VALUE :  0.08179922696365541  TRAINING ACCURACY:  0.9736
EPOCH :  444  LOSS_VALUE :  0.08166925812241814  TRAINING ACCURACY:  0.97425
EPOCH :  445  LOSS_VALUE :  0.08154059039709538  TRAINING ACCURACY:  0.97365
EPOCH :  446  LOSS_VALUE :  0.0814134352332523  TRAINING ACCURACY:  0.9743
EPOCH :  447  LOSS_VALUE :  0.08128761981024075  TRAINING ACCURACY:  0.97365
EPOCH :  448  LOSS_VALUE :  0.08116325815493619  TRAINING ACCURACY:  0.97455
EPOCH 

In [59]:
# Testing On validation set

y_pred=predict(X_val,Weights1,bias1,Weights2,bias2)
acc=accuracy(Y_val,y_pred)

print(acc)

0.9728649558266723


## STORING THE WEIGHTS IN PICKLE FORMAT

In [55]:
import pickle as pkl

pkl.dump({"W1": Weights1, "b1": bias1, "W2": Weights2, "b2": bias2}, open("model_weights.pkl", "wb"))
print("saved data to ")

saved data to 


## READING THE STORED WEIGHTS

In [56]:
pickle_dict= pkl.load(open("model_weights.pkl", "rb"))

## REPORT

1) Softmax is used as final layer activation function. 

softmax or sigmoid can be used as final layer activation function for multiclass classifier but softmax is commonly used when the probabilities produced for each class is dependant and the sum of probablities corresponds to 1. (i.e) more prefered when there is only one ouput from multiclass which is similar to one hot encoded output.Hence softmax is used for the final layer activation function

2) Sigmoid used as activation function in the middle layer.

Sigmoid may not work properly as it might lead to vanishing gradient when the number of hidden layers are more but since here there is just one hidden layer sigmoid activation is chosen.

3) Categorical cross Loss function is used as cost function.

 Since it is a multiclass classification problem espicially with outputs as categorical (i.e) one hot encoded, categorical cross entropy is used.
 
4) Hidden Neurons selection

Since the dataset is fairly small with less samples and the features being easily learnt when trained, I have added hidden neuron as 112 , which can itself learn the essential feature and classify the output.

5) Learning rate and train_test_split

Since samples in the dataset was comparitively less and more samples to training would help the model learn, I have choosen 20000 samples for training and 4754 samples for validation.I varied the learning rate, and found the learning rate with 0.001 was optimal and produced a validation accuracy of over 97%.


