In [1]:
import torch
import torchvision
from torchvision.datasets import MNIST,EMNIST
from torchvision.transforms import ToTensor
import matplotlib.pyplot as plt



In [2]:
def softmax(Z):
    return (torch.exp(Z)/torch.sum(torch.exp(Z),dim=1,keepdim=True))

def categorical_cross_entropy(Y_enc_train,Y_hat):
    return torch.sum((-Y_enc_train*torch.log(Y_hat)),dim=1,keepdim=True).mean()
 

In [3]:
train_edata=EMNIST(root='emnist_dataset/train/',split='balanced',train=True,download=True,transform=ToTensor())
test_edata=EMNIST(root='emnist_dataset/test/',split='balanced',train=False,download=True,transform=ToTensor())
train_edata,test_edata

(Dataset EMNIST
     Number of datapoints: 112800
     Root location: emnist_dataset/train/
     Split: Train
     StandardTransform
 Transform: ToTensor(),
 Dataset EMNIST
     Number of datapoints: 18800
     Root location: emnist_dataset/test/
     Split: Test
     StandardTransform
 Transform: ToTensor())

In [4]:
#train set
X_etrain=torch.flatten(train_edata.data,1,2).to(torch.float32)
Y_etrain=train_edata.targets.reshape(X_etrain.shape[0],1).to(torch.int8)

X_etrain.shape,Y_etrain.shape

# test set
X_etest=torch.flatten(test_edata.data,1,2).to(torch.float32)
Y_etest=test_edata.targets.reshape(X_etest.shape[0],1).to(torch.int8)

X_etrain.shape,Y_etrain.shape,X_etest.shape,Y_etest.shape

(torch.Size([112800, 784]),
 torch.Size([112800, 1]),
 torch.Size([18800, 784]),
 torch.Size([18800, 1]))

In [5]:
# scaling train dataset
for i in range(X_etrain.shape[0]):
    X_etrain[i]=(X_etrain[i]-X_etrain[i].min())/(X_etrain[i].max()-X_etrain[i].min())


# scaling test dataset
for i in range(X_etest.shape[0]):
    X_etest[i]=(X_etest[i]-X_etest[i].min())/(X_etest[i].max()-X_etest[i].min())

In [6]:
## One hot encoding For the labels

Y_enc_etrain=torch.zeros((Y_etrain.shape[0],len(train_edata.classes))).to(torch.int8)

for i in range(Y_etrain.shape[0]):
    Y_enc_etrain[i][Y_etrain[i].item()]=1


Y_enc_etest=torch.zeros((Y_etest.shape[0],len(test_edata.classes))).to(torch.int8)

for i in range(Y_etest.shape[0]):
    Y_enc_etest[i][Y_etest[i].item()]=1


Y_enc_etrain.shape,Y_enc_etest.shape

(torch.Size([112800, 47]), torch.Size([18800, 47]))

In [7]:
num_input=X_etrain.shape[1]
num_hidden=128


# kaiming weight initialization
W1=torch.randn((num_input,num_hidden),dtype=torch.float32,requires_grad=True)*torch.sqrt(torch.tensor(2/num_input))
b1=torch.randn((1,num_hidden),dtype=torch.float32,requires_grad=True)*torch.sqrt(torch.tensor(2/num_input))

W2=torch.randn((num_hidden,Y_enc_etrain.shape[1]),dtype=torch.float32,requires_grad=True)*torch.sqrt(torch.tensor(2/num_hidden))
b2=torch.randn((1,Y_enc_etrain.shape[1]),dtype=torch.float32,requires_grad=True)*torch.sqrt(torch.tensor(2/num_hidden))


print(f"shape- W1: {W1.shape}\
       \nshape- b1: {b1.shape}\
      \nshape- W2: {W2.shape}\
      \nshape- b2: {b2.shape}")

shape- W1: torch.Size([784, 128])       
shape- b1: torch.Size([1, 128])      
shape- W2: torch.Size([128, 47])      
shape- b2: torch.Size([1, 47])


In [8]:
#utitlity function to check manually calculated gradients with that of pytorch autograd
def cmp(s, dt, t):
  ex = torch.all(dt == t.grad).item()
  app = torch.allclose(dt, t.grad)
  maxdiff = (dt - t.grad).abs().max().item()
  print(f'{s:15s} | exact: {str(ex):5s} | approximate: {str(app):5s} | maxdiff: {maxdiff}')

In [9]:

batch_size=32

# a single batch
X_btrain=X_etrain[:batch_size,:]
Y_btrain=Y_enc_etrain[:batch_size,:]

gamma=torch.randn((1,num_hidden),requires_grad=True)
beta=torch.randn((1,num_hidden),requires_grad=True)

X_btrain.shape,Y_btrain.shape



(torch.Size([32, 784]), torch.Size([32, 47]))

In [10]:
loss_list_train=[]

In [11]:

# forward pass
Z1=X_btrain@W1 + b1
A1=torch.relu(Z1)

A1_mean=torch.sum(A1,dim=0,keepdim=True)/A1.shape[0]
A1_std=(torch.sqrt(torch.sum(((A1-A1_mean)**2),dim=0,keepdim=True)/A1.shape[0]))+1e-10 # adding small term with std to eliminate dividing by 0 at A1_norm
A1_norm=(A1-A1_mean+1e-10)/(A1_std)   

# (1/(A1.shape[0]**(1/2)))*(0.5/((torch.sum(((A1-A1_mean)**2),dim=0)))**(1/2))*-2*(A1-A1_mean)

# batch normalization
A2=A1_norm*gamma+beta

Z2=A2@W2 + b2
Y_ehat=softmax(Z2)

Loss=categorical_cross_entropy(Y_btrain,Y_ehat)
# Loss=torch.sum((-Y_enc_etrain*torch.log(Y_ehat)),dim=1,keepdim=True).mean()

loss_list_train.append(Loss.item())

# # backpropgation
dL_dZ2=(Y_ehat-Y_btrain)/X_btrain.shape[0]
dZ2_dW2=A2

dL_dW2=dZ2_dW2.T@dL_dZ2
dL_db2=torch.sum(dL_dZ2,dim=0,keepdim=True)


dZ2_dA2=W2
dL_dA2= dL_dZ2 @ dZ2_dA2.T

dA2_dgamma=A1_norm
dL_dgamma=torch.sum(dL_dA2*dA2_dgamma,dim=0,keepdim=True)
dL_dbeta=torch.sum(dL_dA2,dim=0,keepdim=True)

dA2_dA1_norm=gamma  #---> shape (1,128)
dL_dA1_norm=dL_dA2*dA2_dA1_norm    # ----> (32,128)
dA1_norm_dA1_std= -(A1-A1_mean)*(A1_std**-2)
dA1_norm_dA1_mean=-1/A1_std
dL_dA1_std=torch.sum(dL_dA1_norm*dA1_norm_dA1_std,dim=0,keepdim=True)           
dL_dA1_mean=torch.sum(dL_dA1_norm*dA1_norm_dA1_mean,dim=0,keepdim=True) 

dA1_std_dA1_mean=(1/(A1.shape[0]**(1/2)))*(0.5/((torch.sum(((A1-A1_mean)**2),dim=0,keepdim=True)))**(1/2))*-2*(A1-A1_mean)
dL_dA1_mean+=torch.sum(dL_dA1_std*dA1_std_dA1_mean,dim=0,keepdim=True)

# dA1_std_dA1=(1/(A1.shape[0]**(1/2)))*(0.5/((torch.sum(((A1-A1_mean+1e-16)**2),dim=0,keepdim=True)))**(1/2))*2*(A1-A1_mean+1e-16)
# dL_dA1=dL_dA1_std*dA1_std_dA1

# dA1_mean_dA1=torch.ones_like(A1)/A1.shape[0]
# dL_dA1+=dL_dA1_mean*dA1_mean_dA1

# dA1_dZ1=torch.where((Z1<0),0, 1)

# dL_dZ1=(dA1_dZ1 * dL_dA1)

# dZ1_dW1=X_etrain

# dL_dW1=dZ1_dW1.T @ dL_dZ1
# dL_db1=torch.sum(dL_dZ1,dim=0,keepdim=True)

# # weights and biases optimization
# W2-=0.9*dL_dW2
# b2-=0.9*dL_db2
# W1-=0.9*dL_dW1
# b1-=0.9*dL_db1



In [12]:
for each in [Z2,A2,A1_norm,gamma,beta,A1_std,A1_mean,W1,W2,b1,b2,A1,Z1]:
    each.retain_grad()

In [13]:
Loss.backward()

In [14]:
A1_mean.grad

tensor([[-2.1046e-01, -9.1469e-03, -8.4152e-02, -2.0692e-02,  4.9204e-01,
         -8.8017e-02,  9.2528e-02,  1.1139e-01,  1.4220e-03, -9.3558e-02,
         -1.6063e-01, -9.4405e-03,  3.3910e-01, -8.2059e-02,  9.0849e-05,
          1.5636e+00, -4.5657e-02,  6.2388e-02, -9.3670e-02, -7.2527e-05,
         -1.4325e-02, -1.4967e-02, -3.2541e-01,  2.3389e-02,  1.5310e-01,
          4.3715e-01, -2.3531e-02, -6.8908e-02,  3.0556e-02, -4.1656e-02,
          9.4008e-02, -1.1572e-01,  1.4871e-01, -2.1040e-01, -1.0840e-02,
          3.3909e-01,  1.8660e-01, -1.0177e-01,  1.4038e-01, -8.2930e-03,
          6.0822e-02, -3.6075e-02,  3.6233e-01, -7.3473e-04, -1.1976e-01,
          3.1110e-01, -2.0340e-01,  8.4916e-03,         nan,  2.5956e-01,
                 nan, -1.8628e-01, -1.5665e-02, -2.5812e-02, -1.4362e-01,
         -6.5050e-02, -2.4336e-02,  4.1878e-02, -5.0967e-01,  1.2634e-02,
          4.8483e-04, -8.2025e-03,  3.3736e+00,  4.5322e-02, -2.7738e-01,
          2.6272e-01, -4.8051e-03, -1.

In [15]:
cmp("dZ2", dL_dZ2, Z2)
cmp("dW2", dL_dW2, W2)
cmp("db2", dL_db2, b2)
cmp("dA2", dL_dA2, A2)
cmp("dgamma", dL_dgamma, gamma)
cmp("dbeta", dL_dbeta, beta)
cmp("dA1_norm", dL_dA1_norm, A1_norm)
cmp("dA1_std", dL_dA1_std, A1_std)
cmp("dA1_mean", dL_dA1_mean, A1_mean)
# cmp("dA1", dL_dA1, A1)



dZ2             | exact: False | approximate: True  | maxdiff: 3.725290298461914e-09
dW2             | exact: False | approximate: True  | maxdiff: 5.960464477539063e-08
db2             | exact: False | approximate: True  | maxdiff: 1.4901161193847656e-08
dA2             | exact: False | approximate: True  | maxdiff: 1.862645149230957e-09
dgamma          | exact: False | approximate: True  | maxdiff: 1.4901161193847656e-08
dbeta           | exact: False | approximate: True  | maxdiff: 1.4901161193847656e-08
dA1_norm        | exact: False | approximate: True  | maxdiff: 2.7939677238464355e-09
dA1_std         | exact: False | approximate: False | maxdiff: 334385888.0
dA1_mean        | exact: False | approximate: False | maxdiff: nan
