In [1]:
import torch

In [2]:
from torch.utils.data import Dataset, DataLoader
import torchvision.transforms as transforms
import torchvision
import os

In [3]:
# Preprocessing applied to every image before it reaches the network.
transform = transforms.Compose([#transforms.Resize((256,256)),
                                transforms.Grayscale(),  # collapse RGB to a single channel: [3, H, W] -> [1, H, W]
                                transforms.ToTensor(),])


def _images_as_columns(loader):
    """Gather every batch of `loader` into one design matrix and one label vector.

    Returns:
        (features, labels) where `features` has shape (pixels_per_image, num_images),
        i.e. column j holds the flattened pixels of image j, and `labels` has
        shape (num_images,).
    """
    flat_batches = []
    label_batches = []
    for images, labels in loader:
        # Flatten each image into one ROW first. `view`/`reshape` never move data,
        # so asking directly for a (pixels, num_images) view of an
        # [N, C, H, W] batch would interleave pixels of different images in each
        # column and break the column <-> label correspondence.
        flat_batches.append(images.reshape(images.shape[0], -1))
        label_batches.append(labels)
    # Concatenating all batches keeps the result correct even when the dataset
    # is larger than one batch; the transpose puts one image per COLUMN.
    return torch.cat(flat_batches, dim=0).T, torch.cat(label_batches, dim=0)


#train_data_path = 'relative path of training data set'
train_data_path = '../../horse-or-human/train'
trainset = torchvision.datasets.ImageFolder(root=train_data_path, transform=transform)
# change the values of batch_size, num_workers for your program
# if shuffle=True, the data is reshuffled at every epoch
trainloader = torch.utils.data.DataLoader(trainset, batch_size=1500, shuffle=False, num_workers=1)


validation_data_path = '../../horse-or-human/validation'
valset = torchvision.datasets.ImageFolder(root=validation_data_path, transform=transform)
# change the values of batch_size, num_workers for your program
valloader = torch.utils.data.DataLoader(valset, batch_size=1500, shuffle=False, num_workers=1)


# Full-batch training: load each split completely, one image per column.
train_inputs, train_labels = _images_as_columns(trainloader)
test_inputs, test_labels = _images_as_columns(valloader)

# A trailing row of ones lets the bias be stored as the last column of each weight matrix.
Intercept_test = torch.ones(1,test_inputs.shape[1])
Intercept_train = torch.ones(1,train_inputs.shape[1])

test_inputs = torch.cat([test_inputs,Intercept_test],dim = 0)
train_inputs = torch.cat([train_inputs,Intercept_train],dim = 0)

print(train_inputs.shape)


torch.Size([10001, 1027])


# Network architecture

## First layer

$$Z^{[1]} = W^{[1]}X + b^{[1]}$$

$$A^{[1]} = g(Z^{[1]})$$

where $X$ denotes the input data (images; size $10000 \times$ number of images) and the size of $W^{[1]}$ is $(464, 10000)$.


## second layer

$$Z^{[2]} = W^{[2]}A^{[1]} + b^{[2]}$$

$$A^{[2]} = g(Z^{[2]})$$

where the size of $W^{[2]}$ is $(21, 464)$.

## Third layer

$$Z^{[3]} = W^{[3]}A^{[2]} + b^{[3]}$$

$$A^{[3]} = g(Z^{[3]})$$

where the size of $W^{[3]}$ is $(1, 21)$.

## Activation 

g is the sigmoid function.

## Loss

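Let $n$ denote the number of images, and let $f_i = -y_i \log \hat{y}_i - (1 - y_i) \log (1 - \hat{y}_i)$ be the cross-entropy of the $i$-th image, where $\hat{y}_i = A^{[3]}_i$.
$$ Loss = {{1} \over {n}} \sum_{i=1}^{n}{f_i} + {{\lambda} \over {2}} F $$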

where $F$ denotes the sum of the Frobenius norms of $W^{[i]}$, $i=1,2,3$, based on the $L^2_2$ norm.


In [4]:
import torch.nn.functional as F


class network_updater:
    """Three-layer fully connected sigmoid network trained by hand-written backpropagation.

    Every weight matrix stores its bias as the LAST column, so the input of each
    layer carries a trailing row of ones (callers append it to the images; the
    hidden layers get it inside `y_hat`). All data matrices are laid out as
    (features, num_images), i.e. one image per column.

    The loss is the mean binary cross-entropy plus
    (regular_lambda / 2) * (||W1|| + ||W2|| + ||W3||), where ||.|| is the
    (un-squared) Frobenius norm of a weight matrix without its bias column.
    """

    def __init__(self, num_node_1, num_node_2, regular_lambda, num_node_0=10000):
        """Create randomly initialised weights.

        Args:
            num_node_1: number of units in the first hidden layer.
            num_node_2: number of units in the second hidden layer.
            regular_lambda: regularisation strength (lambda in the loss).
            num_node_0: number of input features per image, NOT counting the
                intercept row (default 10000).
        """
        num_node_3 = 1  # single sigmoid output unit
        # torch.rand draws from [0, 1); the scaling keeps the initial weights in [0, 0.01).
        # The "+ 1" column of each matrix is the bias.
        self.weight_1 = torch.rand(num_node_1, num_node_0 + 1) * 0.01
        self.weight_2 = torch.rand(num_node_2, num_node_1 + 1) * 0.01
        self.weight_3 = torch.rand(num_node_3, num_node_2 + 1) * 0.01

        self.regular_lambda = regular_lambda
        # Norms of the three weight matrices, refreshed by every loss_function call.
        self.N1 = 0
        self.N2 = 0
        self.N3 = 0

    def norm(self, weight_cat):
        """Return the Frobenius norm of `weight_cat` with its last (bias) column dropped."""
        weight = weight_cat[:, 0:-1]
        return weight.norm()

    def y_hat(self, images):
        """Run the forward pass.

        Args:
            images: (num_node_0 + 1, num_images) tensor whose last row is the intercept.

        Returns:
            (y_hat, hidden1, hidden2): the (1, num_images) output activations and
            the two hidden activations, each with an intercept row of ones
            appended as its last row.
        """
        num_images = images.shape[1]
        intercept_row = torch.ones(1, num_images)

        hidden1 = torch.sigmoid(torch.mm(self.weight_1, images))
        hidden1 = torch.cat([hidden1, intercept_row], dim=0)

        hidden2 = torch.sigmoid(torch.mm(self.weight_2, hidden1))
        hidden2 = torch.cat([hidden2, intercept_row], dim=0)

        output = torch.sigmoid(torch.mm(self.weight_3, hidden2))
        return output, hidden1, hidden2

    def loss_function(self, images, label):
        """Compute the regularised loss and the quantities backpropagation needs.

        Args:
            images: (num_node_0 + 1, num_images) tensor including the intercept row.
            label: tensor with num_images entries, each 0 or 1.

        Returns:
            (loss, D): `loss` as a Python float and D = [y_hat - y, hidden1, hidden2],
            which is the first argument expected by `gradient_function`.
        """
        Num = images.shape[1]
        y_hat, hidden1, hidden2 = self.y_hat(images)
        y = label.view(-1, 1).float()

        # A saturated sigmoid yields exactly 0 or 1, so log() returns -inf and
        # (-inf * 0) turns the whole loss into NaN. Clamping the logs at -100
        # (the same bound torch.nn.BCELoss uses) keeps the loss finite and
        # leaves every non-saturated value untouched.
        log_pos = torch.log(y_hat).clamp(min=-100)
        log_neg = torch.log(1 - y_hat).clamp(min=-100)
        # (1 x Num) @ (Num x 1): sums the per-image cross-entropy terms.
        N_CrossEntropy = -torch.mm(log_pos, y) - torch.mm(log_neg, 1 - y)

        # The bias column is excluded from the penalty inside self.norm().
        self.N1 = self.norm(self.weight_1)
        self.N2 = self.norm(self.weight_2)
        self.N3 = self.norm(self.weight_3)
        Loss = N_CrossEntropy / Num + (self.N1 + self.N2 + self.N3) * (self.regular_lambda / 2)

        Dif = y_hat - y.T
        D = [Dif, hidden1, hidden2]
        return float(Loss), D

    def _regularization_gradient(self, weight_cat):
        """Gradient of (regular_lambda / 2) * ||W|| w.r.t. `weight_cat`; zero for the bias column."""
        # The norm is computed from the CURRENT weights instead of the cached
        # self.N1..N3, so this works even if loss_function has not been called
        # yet (the cached value is then the int 0 -> ZeroDivisionError) or the
        # weights changed since the last loss evaluation.
        norm = self.norm(weight_cat)
        if float(norm) == 0.0:
            # The norm is not differentiable at 0; use the zero subgradient rather than 0/0 = NaN.
            return torch.zeros_like(weight_cat)
        penalty_grad = (self.regular_lambda / (2 * norm)) * weight_cat
        penalty_grad[:, -1] = 0  # the bias column is not regularised
        return penalty_grad

    def gradient_function(self, D, images):
        """Backpropagate and return the gradients of the regularised loss.

        Notation: z_i is the pre-activation and a_i = sigmoid(z_i) the activation
        of layer i (a_3 = y_hat); D_i is the gradient of the summed cross-entropy
        with respect to z_i. For sigmoid + cross-entropy, D_3 = y_hat - y. The
        1 / Num factor of the mean is applied once, on each weight gradient.

        Args:
            D: [y_hat - y, hidden1, hidden2] as returned by `loss_function`.
            images: the (num_node_0 + 1, num_images) input that produced `D`.

        Returns:
            [gradient_of_weight_3, gradient_of_weight_2, gradient_of_weight_1],
            each with the shape of the corresponding weight matrix.
        """
        Num = images.shape[1]
        D_3, hidden1, hidden2 = D

        # Output layer: (1 x Num) @ (Num x (L2 + 1)).
        gradient_of_weight_3 = torch.mm(D_3, hidden2.T) * (1 / Num)

        # Second hidden layer. The intercept row of hidden2 is a constant, so no
        # gradient flows through it: drop the bias column of W3 and the last row
        # of the sigmoid derivative g'(z2) = a2 * (1 - a2).
        W3 = self.weight_3[:, 0:-1]
        back_sig_2 = (hidden2 * (1 - hidden2))[0:-1, :]
        D_2 = torch.mm(W3.T, D_3) * back_sig_2          # (L2 x Num)
        gradient_of_weight_2 = torch.mm(D_2, hidden1.T) * (1 / Num)

        # First hidden layer, same pattern.
        W2 = self.weight_2[:, 0:-1]
        back_sig_1 = (hidden1 * (1 - hidden1))[0:-1, :]
        D_1 = torch.mm(W2.T, D_2) * back_sig_1          # (L1 x Num)
        gradient_of_weight_1 = torch.mm(D_1, images.T) * (1 / Num)

        gradient_of_weight_1 = gradient_of_weight_1 + self._regularization_gradient(self.weight_1)
        gradient_of_weight_2 = gradient_of_weight_2 + self._regularization_gradient(self.weight_2)
        gradient_of_weight_3 = gradient_of_weight_3 + self._regularization_gradient(self.weight_3)

        return [gradient_of_weight_3,
                gradient_of_weight_2,
                gradient_of_weight_1]

    def accuracy(self, D_3):
        """Return the fraction of correct predictions at a 0.5 decision threshold.

        Args:
            D_3: (1, num_images) tensor holding y_hat - y with y in {0, 1}.
        """
        # y = 0 and y_hat >= 0.5  ->  y_hat - y >= 0.5   (false positive)
        false_positive = torch.sum(D_3 >= 0.5)
        # y = 1 and y_hat <  0.5  ->  y_hat - y < -0.5   (false negative)
        false_negative = torch.sum(D_3 < -0.5)
        total = D_3.shape[1]
        hit = total - false_positive - false_negative
        return float(hit.float() / total)

    def networkupdater(self, G, rate):
        """Apply one gradient-descent step in place.

        Args:
            G: gradients in the order returned by `gradient_function`
               (weight_3, weight_2, weight_1).
            rate: learning rate.
        """
        gradient_of_weight_3, gradient_of_weight_2, gradient_of_weight_1 = G
        self.weight_1 -= gradient_of_weight_1 * rate
        self.weight_2 -= gradient_of_weight_2 * rate
        self.weight_3 -= gradient_of_weight_3 * rate

       

In [None]:
# --- Hyper-parameters -------------------------------------------------------
learning_rate = 0.01
max_iterations = 500
threshold = 0.001  # defined for completeness; the loop below does not read it

# Index of each regularisation setting inside the lists below.
L, A, S = 0, 1, 2

# One network per lambda: large (10), intermediate (0.87), small (0.01).
# All three share the architecture num_node_1 = 464, num_node_2 = 21.
gr_weightL = network_updater(464, 21, 10)
gr_weightA = network_updater(464, 21, 0.87)
gr_weightS = network_updater(464, 21, 0.01)
gr_weight = [gr_weightL, gr_weightA, gr_weightS]

# Per-network histories; entry k belongs to gr_weight[k], one value per iteration.
train_loss_recoders = [[], [], []]
train_accuracy_recoders = [[], [], []]
test_loss_recoders = [[], [], []]
test_accuracy_recoders = [[], [], []]

for iteration in range(max_iterations):
    for model_idx, model in enumerate(gr_weight):
        # Evaluate BEFORE updating, so the histories start with the untrained network.
        train_loss, train_D = model.loss_function(train_inputs, train_labels)
        test_loss, test_D = model.loss_function(test_inputs, test_labels)

        train_accuracy = model.accuracy(train_D[0])
        test_accuracy = model.accuracy(test_D[0])

        train_loss_recoders[model_idx].append(train_loss)
        train_accuracy_recoders[model_idx].append(train_accuracy)
        test_loss_recoders[model_idx].append(test_loss)
        test_accuracy_recoders[model_idx].append(test_accuracy)

        # One full-batch gradient-descent step on the training data.
        gradient = model.gradient_function(train_D, train_inputs)
        model.networkupdater(gradient, learning_rate)

        # Progress trace: training loss of the intermediate-lambda network every 10th iteration.
        if model_idx == 1 and iteration % 10 == 0:
            print("{} ".format(train_loss), end="")

6.361259937286377 6.3127007484436035 6.273599624633789 6.237100601196289 

# Large lambda case

 ## test accuracy

In [None]:
import matplotlib.pyplot as plt

# Test accuracy per training iteration for the large-lambda network (index L).
fig, ax = plt.subplots()
ax.plot(range(len(test_accuracy_recoders[L])), test_accuracy_recoders[L])
ax.set(title="Test accuracy (large lambda)", xlabel="Iteration", ylabel="Accuracy")
plt.show()

## training accuracy

In [None]:
# Training accuracy per training iteration for the large-lambda network (index L).
fig, ax = plt.subplots()
ax.plot(range(len(train_accuracy_recoders[L])), train_accuracy_recoders[L])
ax.set(title="Training accuracy (large lambda)", xlabel="Iteration", ylabel="Accuracy")
plt.show()

## test loss

In [None]:
# Test loss per training iteration for the large-lambda network (index L).
fig, ax = plt.subplots()
ax.plot(range(len(test_loss_recoders[L])), test_loss_recoders[L])
ax.set(title="Test loss (large lambda)", xlabel="Iteration", ylabel="Loss")
plt.show()

## training loss

In [None]:
# Training loss per training iteration for the large-lambda network (index L).
fig, ax = plt.subplots()
ax.plot(range(len(train_loss_recoders[L])), train_loss_recoders[L])
ax.set(title="Training loss (large lambda)", xlabel="Iteration", ylabel="Loss")
plt.show()

# Small lambda case

 ## test accuracy

In [None]:
import matplotlib.pyplot as plt

# Test accuracy per training iteration for the small-lambda network (index S).
fig, ax = plt.subplots()
ax.plot(range(len(test_accuracy_recoders[S])), test_accuracy_recoders[S])
ax.set(title="Test accuracy (small lambda)", xlabel="Iteration", ylabel="Accuracy")
plt.show()

## training accuracy

In [None]:
# Training accuracy per training iteration for the small-lambda network (index S).
fig, ax = plt.subplots()
ax.plot(range(len(train_accuracy_recoders[S])), train_accuracy_recoders[S])
ax.set(title="Training accuracy (small lambda)", xlabel="Iteration", ylabel="Accuracy")
plt.show()

## test loss

In [None]:
# Test loss per training iteration for the small-lambda network (index S).
fig, ax = plt.subplots()
ax.plot(range(len(test_loss_recoders[S])), test_loss_recoders[S])
ax.set(title="Test loss (small lambda)", xlabel="Iteration", ylabel="Loss")
plt.show()

## training loss

In [None]:
# Training loss per training iteration for the small-lambda network (index S).
fig, ax = plt.subplots()
ax.plot(range(len(train_loss_recoders[S])), train_loss_recoders[S])
ax.set(title="Training loss (small lambda)", xlabel="Iteration", ylabel="Loss")
plt.show()

# Appropriate case

 ## test accuracy

In [None]:
import matplotlib.pyplot as plt

# Test accuracy per training iteration for the intermediate-lambda network (index A).
fig, ax = plt.subplots()
ax.plot(range(len(test_accuracy_recoders[A])), test_accuracy_recoders[A])
ax.set(title="Test accuracy (appropriate lambda)", xlabel="Iteration", ylabel="Accuracy")
plt.show()

## training accuracy

In [None]:
# Training accuracy per training iteration for the intermediate-lambda network (index A).
fig, ax = plt.subplots()
ax.plot(range(len(train_accuracy_recoders[A])), train_accuracy_recoders[A])
ax.set(title="Training accuracy (appropriate lambda)", xlabel="Iteration", ylabel="Accuracy")
plt.show()

## test loss

In [None]:
# Test loss per training iteration for the intermediate-lambda network (index A).
fig, ax = plt.subplots()
ax.plot(range(len(test_loss_recoders[A])), test_loss_recoders[A])
ax.set(title="Test loss (appropriate lambda)", xlabel="Iteration", ylabel="Loss")
plt.show()

## training loss

In [None]:
# Training loss per training iteration for the intermediate-lambda network (index A).
fig, ax = plt.subplots()
ax.plot(range(len(train_loss_recoders[A])), train_loss_recoders[A])
ax.set(title="Training loss (appropriate lambda)", xlabel="Iteration", ylabel="Loss")
plt.show()