# Hyperparameters

In [None]:
import torch
import torch.nn as nn

# ---- Training configuration ----
NUM_EPOCHS = 1
# Number of epochs between calculating validation set metrics
VALID_FREQ = 1
# Supported initialization schemes: 'default', 'normalized', 'orthogonal', 'xavier'
INIT_TYPE_LIST = [
    'default',
    'normalized',
    'orthogonal',
    'xavier',
]
# Supported activation functions:
# 'relu', 'leaky_relu', 'sigmoid', 'tanh', 'softplus', 'softsign', 'maxout'
ACTIVATION_TYPE_LIST = [
    'relu',
    'leaky_relu',
    'sigmoid',
    'tanh',
    'softplus',
    'softsign',
    'maxout',
]
# Loss function for network training
criterion = nn.CrossEntropyLoss()
LR = 0.001

# ---- Dataset configuration ----
batch_size = 4
# Seeded generator; can be passed to random operations for reproducibility
generator = torch.Generator()
generator.manual_seed(42)
dataset_name = 'tiny-imagenet'

In [None]:
import torch
import torchvision
import torchvision.transforms as transforms
from sklearn.model_selection import train_test_split

Using Cuda Device

In [None]:
# Pick the compute device: use the GPU when PyTorch reports CUDA as available,
# otherwise fall back to the CPU.
if torch.cuda.is_available():
  device = "cuda"
else:
  device = "cpu"
print("Using {} device".format(device))

# Load Dataset


In [None]:
transform = transforms.Compose(
    [transforms.ToTensor(),
     transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5))])

if dataset_name == 'CIFAR-10':
  NUM_CLASSES=10

  trainset = torchvision.datasets.CIFAR10(root='./data', train=True,
                                        download=True, transform=transform)

  testset = torchvision.datasets.CIFAR10(root='./data', train=False,
                                        download=True, transform=transform)
  

  trainset, validset = torch.utils.data.random_split(trainset, 
                                                    [int(len(trainset)*0.8),len(trainset)- 
                                                     int(len(trainset)*0.8)], generator=generator)
  
elif dataset_name == 'tiny-imagenet':
  NUM_CLASSES=200
  
  !wget http://cs231n.stanford.edu/tiny-imagenet-200.zip
  !unzip -qq 'tiny-imagenet-200.zip'
  
  
  totalset = torchvision.datasets.ImageFolder('tiny-imagenet-200/train', 
                                                   transform=transform)
  
  train_counts = [0] * 200
  valid_counts = [0] * 200
  trainset = []
  validset = []
  testset = []
  for item in totalset:
    if train_counts[item[1]] < 350:
      trainset.append(item)
      train_counts[item[1]] += 1
    elif valid_counts[item[1]] < 75:
      validset.append(item)
      valid_counts[item[1]] += 1
    else:
      testset.append(item)

elif dataset_name == 'Caltech101':
  NUM_CLASSES=101
  !gdown https://drive.google.com/uc?id=1DX_XeKHn3yXtZ18DD7qc1wf-Jy5lnhD5
  !unzip -qq '101_ObjectCategories.zip' 

  PATH = '101_ObjectCategories/'

  transform = transforms.Compose(
    [transforms.CenterCrop(256),
     transforms.Resize((64,64)),
     transforms.ToTensor(),
     transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5))])
  
  totalset = torchvision.datasets.ImageFolder(PATH, transform=transform)

  X, y = zip(*totalset)

  X_train, X_val, y_train, y_val = train_test_split(X, y, test_size = 0.3, 
                                                    stratify=y)
  X_val, X_test, y_val, y_test = train_test_split(X_val, y_val, 
                                                  test_size = 0.5, 
                                                  stratify=y_val)

  trainset, validset, testset = list(zip(X_train, y_train)), list(zip(X_val, y_val)), list(zip(X_test, y_test))




trainloader = torch.utils.data.DataLoader(trainset, batch_size=batch_size,
                                          shuffle=True, num_workers=2)
validloader = torch.utils.data.DataLoader(validset, batch_size=batch_size,
                                          shuffle=False,num_workers=2)
testloader = torch.utils.data.DataLoader(testset, batch_size=batch_size,
                                         shuffle=False, num_workers=2)



# Network Architecture
#### On CIFAR-10, we use the 'Net' class, which is the base structure for our experiments.
#### For tiny-imagenet and Caltech101, we use 'Net64', so that we can process 64x64 images.
#### The main difference lies in the numbers of nodes in the fully connected layers; Net64 contains more neurons in the FC layers.

In [None]:
import torch.nn as nn
import torch.nn.functional as F


class Net(nn.Module):
    """Small CNN for 3x32x32 images with a selectable activation function.

    Layout: four "same"-padded convolutions (3->8->8->16->16 channels) with a
    2x2 max-pool after the second and the fourth, then two hidden fully
    connected layers (480 and 320 units, each followed by dropout) and a
    linear output layer producing ``num_classes`` logits.

    Every hidden layer is stored as an ``nn.ModuleList`` of parallel layers.
    For ordinary activations the list holds one layer; for ``'maxout'`` it
    holds two and the element-wise maximum of their outputs is used.

    Args:
        num_classes: Number of output logits.
        activation_type: One of ``SUPPORTED_ACTIVATIONS``.

    Raises:
        ValueError: If ``activation_type`` is not supported.
    """

    # Valid values for ``activation_type``; each one is also the name of the
    # helper method below that implements it.
    SUPPORTED_ACTIVATIONS = ('relu', 'leaky_relu', 'sigmoid', 'tanh',
                             'softsign', 'softplus', 'maxout')

    # Helper functions so that the other activation functions
    # have the same format as maxout: they receive the input and a list of
    # parallel layers, and apply the activation to the first layer's output.
    def relu(self, x, layers):
        return F.relu(layers[0](x))

    def leaky_relu(self, x, layers):
        return F.leaky_relu(layers[0](x))

    def sigmoid(self, x, layers):
        # torch.sigmoid replaces the deprecated F.sigmoid alias.
        return torch.sigmoid(layers[0](x))

    def tanh(self, x, layers):
        # torch.tanh replaces the deprecated F.tanh alias.
        return torch.tanh(layers[0](x))

    def softsign(self, x, layers):
        return F.softsign(layers[0](x))

    def softplus(self, x, layers):
        return F.softplus(layers[0](x))

    def maxout(self, x, layers):
        """Return the element-wise maximum over the outputs of all `layers`."""
        max_output = layers[0](x)
        for layer in layers[1:]:
            max_output = torch.maximum(max_output, layer(x))
        return max_output

    def __init__(self, num_classes=10, activation_type='relu'):
        super().__init__()
        # Reject unknown names here; otherwise `self.activation` would stay
        # unset and the error would only appear on the first forward pass.
        if activation_type not in self.SUPPORTED_ACTIVATIONS:
            raise ValueError(
                "Unsupported activation_type %r; expected one of: %s"
                % (activation_type, ', '.join(self.SUPPORTED_ACTIVATIONS)))

        self.activation_type = activation_type
        # The helper methods are named after the activation they implement.
        self.activation = getattr(self, activation_type)
        # Maxout needs two parallel layers per stage; everything else needs one.
        maxout_num_units = 2 if activation_type == 'maxout' else 1

        self.convs1 = nn.ModuleList()
        self.convs2 = nn.ModuleList()
        self.convs3 = nn.ModuleList()
        self.convs4 = nn.ModuleList()

        self.fcs1 = nn.ModuleList()
        self.fcs2 = nn.ModuleList()

        for _ in range(maxout_num_units):
            self.convs1.append(nn.Conv2d(3, 8, 5, padding="same"))
            self.convs2.append(nn.Conv2d(8, 8, 5, padding="same"))
            self.convs3.append(nn.Conv2d(8, 16, 3, padding="same"))
            self.convs4.append(nn.Conv2d(16, 16, 3, padding="same"))

            # 16 channels * 8 * 8: a 32x32 input halved by each of the two pools.
            self.fcs1.append(nn.Linear(16 * 8 * 8, 480))
            self.fcs2.append(nn.Linear(480, 320))

        self.pool = nn.MaxPool2d(2, 2)
        self.fc3 = nn.Linear(320, num_classes)

    def forward(self, x):
        """Map a batch of images to a batch of ``num_classes`` logits."""
        x = self.activation(x, self.convs1)
        x = self.pool(self.activation(x, self.convs2))
        x = self.activation(x, self.convs3)
        x = self.pool(self.activation(x, self.convs4))

        x = torch.flatten(x, 1) # flatten all dimensions except batch

        # Dropout is only active while the module is in training mode.
        x = F.dropout(self.activation(x, self.fcs1), training=self.training)
        x = F.dropout(self.activation(x, self.fcs2), training=self.training)
        x = self.fc3(x)

        return x

In [None]:
import torch.nn as nn
import torch.nn.functional as F


class Net64(nn.Module):
    """Small CNN for 3x64x64 images with a selectable activation function.

    Layout: four "same"-padded convolutions (3->8->8->16->16 channels) with a
    2x2 max-pool after the second and the fourth, then two hidden fully
    connected layers (960 and 380 units, each followed by dropout) and a
    linear output layer producing ``num_classes`` logits.

    Every hidden layer is stored as an ``nn.ModuleList`` of parallel layers.
    For ordinary activations the list holds one layer; for ``'maxout'`` it
    holds two and the element-wise maximum of their outputs is used.

    Args:
        num_classes: Number of output logits.
        activation_type: One of ``SUPPORTED_ACTIVATIONS``.

    Raises:
        ValueError: If ``activation_type`` is not supported.
    """

    # Valid values for ``activation_type``; each one is also the name of the
    # helper method below that implements it.
    SUPPORTED_ACTIVATIONS = ('relu', 'leaky_relu', 'sigmoid', 'tanh',
                             'softsign', 'softplus', 'maxout')

    # Helper functions so that the other activation functions
    # have the same format as maxout: they receive the input and a list of
    # parallel layers, and apply the activation to the first layer's output.
    def relu(self, x, layers):
        return F.relu(layers[0](x))

    def leaky_relu(self, x, layers):
        return F.leaky_relu(layers[0](x))

    def sigmoid(self, x, layers):
        # torch.sigmoid replaces the deprecated F.sigmoid alias.
        return torch.sigmoid(layers[0](x))

    def tanh(self, x, layers):
        # torch.tanh replaces the deprecated F.tanh alias.
        return torch.tanh(layers[0](x))

    def softsign(self, x, layers):
        return F.softsign(layers[0](x))

    def softplus(self, x, layers):
        return F.softplus(layers[0](x))

    def maxout(self, x, layers):
        """Return the element-wise maximum over the outputs of all `layers`."""
        max_output = layers[0](x)
        for layer in layers[1:]:
            max_output = torch.maximum(max_output, layer(x))
        return max_output

    def __init__(self, num_classes=10, activation_type='relu'):
        super().__init__()
        # Reject unknown names here; otherwise `self.activation` would stay
        # unset and the error would only appear on the first forward pass.
        if activation_type not in self.SUPPORTED_ACTIVATIONS:
            raise ValueError(
                "Unsupported activation_type %r; expected one of: %s"
                % (activation_type, ', '.join(self.SUPPORTED_ACTIVATIONS)))

        self.activation_type = activation_type
        # The helper methods are named after the activation they implement.
        self.activation = getattr(self, activation_type)
        # Maxout needs two parallel layers per stage; everything else needs one.
        maxout_num_units = 2 if activation_type == 'maxout' else 1

        self.convs1 = nn.ModuleList()
        self.convs2 = nn.ModuleList()
        self.convs3 = nn.ModuleList()
        self.convs4 = nn.ModuleList()

        self.fcs1 = nn.ModuleList()
        self.fcs2 = nn.ModuleList()

        for _ in range(maxout_num_units):
            self.convs1.append(nn.Conv2d(3, 8, 5, padding="same"))
            self.convs2.append(nn.Conv2d(8, 8, 5, padding="same"))
            self.convs3.append(nn.Conv2d(8, 16, 3, padding="same"))
            self.convs4.append(nn.Conv2d(16, 16, 3, padding="same"))

            # 16 channels * 16 * 16: a 64x64 input halved by each of the two pools.
            self.fcs1.append(nn.Linear(16 * 16 * 16, 960))
            self.fcs2.append(nn.Linear(960, 380))

        self.pool = nn.MaxPool2d(2, 2)
        self.fc3 = nn.Linear(380, num_classes)

    def forward(self, x):
        """Map a batch of images to a batch of ``num_classes`` logits."""
        x = self.activation(x, self.convs1)
        x = self.pool(self.activation(x, self.convs2))
        x = self.activation(x, self.convs3)
        x = self.pool(self.activation(x, self.convs4))

        x = torch.flatten(x, 1) # flatten all dimensions except batch

        # Dropout is only active while the module is in training mode.
        x = F.dropout(self.activation(x, self.fcs1), training=self.training)
        x = F.dropout(self.activation(x, self.fcs2), training=self.training)
        x = self.fc3(x)

        return x

# Initialization methods


In [None]:
def init_orthogonal(m):
  """Orthogonally initialise the weight of a Conv2d/Linear module, in place.

  Intended for `net.apply(...)`; modules of any other type are left untouched.
  """
  if not isinstance(m, (torch.nn.Conv2d, torch.nn.Linear)):
    return
  torch.nn.init.orthogonal_(m.weight)

def init_normalized(m):
  """Draw the weight of a Conv2d/Linear module from N(mean=0.0, std=0.1), in place.

  Intended for `net.apply(...)`; modules of any other type are left untouched.
  """
  if not isinstance(m, (torch.nn.Conv2d, torch.nn.Linear)):
    return
  torch.nn.init.normal_(m.weight, 0.0, 0.1)

def init_xavier(m):
  """Xavier-uniform initialise a Conv2d/Linear weight and zero its bias, in place.

  Intended for `net.apply(...)`; modules of any other type are left untouched.
  """
  if not isinstance(m, (torch.nn.Conv2d, torch.nn.Linear)):
    return
  torch.nn.init.xavier_uniform_(m.weight)
  # Layers built with bias=False have no bias tensor to reset.
  if m.bias is None:
    return
  torch.nn.init.zeros_(m.bias)

# Train methods


In [None]:
import torch.optim as optim
import copy
import matplotlib.pyplot as plt
import os
import numpy as np


In [None]:
def train(trainloader, criterion, optimizer, net, num_epochs, valid_freq):
  """Train `net` and restore the weights that gave the lowest validation loss.

  Relies on two notebook-level globals: `device` (where batches are moved) and
  `validloader` (the validation data).

  Args:
    trainloader: Iterable of (inputs, labels) training batches; must support len().
    criterion: Loss function called as criterion(outputs, labels).
    optimizer: Optimizer already bound to the parameters of `net`.
    net: Model to train; modified in place.
    num_epochs: Number of passes over `trainloader`.
    valid_freq: Validation runs on every epoch whose 0-based index is a
      multiple of this value.

  Returns:
    (training_loss, validation_loss): the mean batch loss of every epoch, and
    the mean validation batch loss of every validated epoch.
  """
  # Remember the caller's train/eval mode so it can be restored on return.
  initial_mode = net.training

  best_model_wts = copy.deepcopy(net.state_dict())
  best_valid_loss = float("inf")
  best_epoch = 0

  training_loss=[]
  validation_loss=[]

  for epoch in range(num_epochs):  # loop over the dataset multiple times
    # Validation switches the model to eval mode, so re-enable training
    # behaviour (e.g. dropout) at the start of every epoch.
    net.train()
    epoch_loss = 0.0
    running_loss = 0.0
    for i, data in enumerate(trainloader, 0):
      # get the inputs; data is a list of [inputs, labels]
      inputs, labels = data
      inputs, labels = inputs.to(device), labels.to(device)

      # zero the parameter gradients
      optimizer.zero_grad()

      # forward + backward + optimize
      outputs = net(inputs)
      loss = criterion(outputs, labels)
      loss.backward()
      optimizer.step()

      # print statistics
      batch_loss = loss.item()
      running_loss += batch_loss
      epoch_loss += batch_loss
      if i % 2000 == 1999:    # print every 2000 mini-batches
        print('Epoch %d, Batch %5d, Batch loss: %.3f' %
              (epoch + 1, i + 1, running_loss / 2000))
        running_loss = 0.0

    training_loss.append(epoch_loss/len(trainloader))
    print('Epoch %d, Training loss: %.3f' %
          (epoch + 1, training_loss[-1]))

    if (epoch%valid_freq == 0):
      valid_loss_sum = 0.0

      # Evaluate in eval mode so dropout is disabled; otherwise the validation
      # loss (and the choice of best checkpoint) is perturbed by random masks.
      net.eval()
      with torch.no_grad():
        for data in validloader:
          images, labels = data
          images, labels = images.to(device), labels.to(device)
          # calculate outputs by running images through the network
          outputs = net(images)
          loss = criterion(outputs, labels)

          valid_loss_sum += loss.item()

      validation_loss.append(valid_loss_sum/len(validloader))
      print('Epoch %d, Validation loss: %.3f' %
            (epoch + 1, validation_loss[-1]))

      # Keep a private copy of the best weights seen so far.
      if validation_loss[-1] < best_valid_loss:
        best_model_wts = copy.deepcopy(net.state_dict())
        best_valid_loss = validation_loss[-1]
        best_epoch = epoch

  print('Finished Training')

  net.load_state_dict(best_model_wts)
  net.train(initial_mode)
  print("model returned at epoch =", (best_epoch+1))

  return training_loss, validation_loss


In [None]:

# Train one network per (initialisation, activation) pair and save its loss
# curves, plots and best weights.
for init_type in INIT_TYPE_LIST:
  for activation_type in ACTIVATION_TYPE_LIST:

    if dataset_name == 'CIFAR-10':
      net = Net(NUM_CLASSES, activation_type)
    elif dataset_name == 'tiny-imagenet' or dataset_name == 'Caltech101':
      net = Net64(NUM_CLASSES, activation_type)
    else:
      # Without this, `net` would be undefined or left over from a previous run.
      raise ValueError("Unsupported dataset_name %r" % (dataset_name,))

    # 'default' keeps PyTorch's built-in initialisation.
    if init_type == 'normalized':
      net.apply(init_normalized)
    elif init_type == 'orthogonal':
      net.apply(init_orthogonal)
    elif init_type == 'xavier':
      net.apply(init_xavier)
    net.to(device)

    # Build the optimizer only after the model is initialised and on its device.
    optimizer = optim.SGD(net.parameters(), lr=LR, momentum=0.9)
    training_loss, validation_loss = train(trainloader, criterion, optimizer, net, NUM_EPOCHS, VALID_FREQ)

    # makedirs(exist_ok=True) also covers the case where 'Results/' already
    # exists for another dataset, where os.mkdir('Results/') would raise.
    results_dir = 'Results/'+dataset_name+'/'
    os.makedirs(results_dir, exist_ok=True)
    run_tag = init_type + '_'+activation_type

    np.save(results_dir+'training_loss_'+run_tag+'.npy', np.array(training_loss))
    np.save(results_dir+'validation_loss_'+run_tag+'.npy', np.array(validation_loss))

    fig = plt.figure()
    plt.plot(range(1, len(training_loss)+1), training_loss)
    plt.xlabel("Epoch")
    plt.ylabel("Loss")
    plt.title("Training loss")
    plt.savefig(results_dir+'Training_loss_'+run_tag+'.png', bbox_inches='tight')
    # Show, then close: the loop would otherwise keep two figures open per run.
    plt.show()
    plt.close(fig)

    fig = plt.figure()
    # Validation only runs every VALID_FREQ epochs, starting with epoch 1.
    plt.plot(range(1, VALID_FREQ*len(validation_loss)+1, VALID_FREQ), validation_loss)
    plt.xlabel("Epoch")
    plt.ylabel("Loss")
    plt.title("Validation Loss")
    plt.savefig(results_dir+'Validation_loss_'+run_tag+'.png', bbox_inches='tight')
    plt.show()
    plt.close(fig)

    os.makedirs('Models/', exist_ok=True)

    torch.save(net.state_dict(), 'Models/'+dataset_name+'_'+run_tag+'.pth')


# Convergence Plots



In [None]:
## Training Loss
# Overlay the saved training-loss curve of every activation function for one
# initialisation scheme.

plt.figure()
# Initialisation scheme to plot; it selects the files that are loaded and
# names the saved figure.
INIT_TYPE = 'default'

for activation_type in ACTIVATION_TYPE_LIST:
  # Use INIT_TYPE here, not the `init_type` loop variable left over from the
  # training cell (which holds the last scheme that was trained).
  training_loss = np.load('Results/'+dataset_name+'/training_loss_'+INIT_TYPE + '_'+activation_type+'.npy')
  # One point per epoch; derive the axis from the saved data so the plot still
  # works if NUM_EPOCHS changed after the results were written.
  x_axis = range(1, len(training_loss)+1)
  plt.plot(x_axis, training_loss, label = activation_type, linewidth=2)

plt.xlabel('Epoch', fontsize = 13)
plt.ylabel('Loss', fontsize = 13)
plt.title('Training Loss on '+dataset_name, fontsize = 15)
plt.legend(fontsize = 11)
plt.savefig(dataset_name+'_'+INIT_TYPE+"_training_loss.png", bbox_inches='tight')


In [None]:
## Validation Loss
# Overlay the saved validation-loss curve of every activation function for one
# initialisation scheme.

plt.figure()
# Initialisation scheme to plot; it selects the files that are loaded and
# names the saved figure.
INIT_TYPE = 'default'

for activation_type in ACTIVATION_TYPE_LIST:
  # Use INIT_TYPE here, not the `init_type` loop variable left over from the
  # training cell (which holds the last scheme that was trained).
  validation_loss = np.load('Results/'+dataset_name+'/validation_loss_'+INIT_TYPE + '_'+activation_type+'.npy')
  # Validation is recorded every VALID_FREQ epochs starting at epoch 1, so the
  # x axis must step by VALID_FREQ to match the number of saved points.
  x_axis = range(1, VALID_FREQ*len(validation_loss)+1, VALID_FREQ)
  plt.plot(x_axis, validation_loss, label = activation_type, linewidth=2)

plt.xlabel('Epoch', fontsize = 13)
plt.ylabel('Loss', fontsize = 13)
plt.title('Validation Loss on '+dataset_name, fontsize = 15)
plt.legend(fontsize = 11)
plt.savefig(dataset_name+'_'+INIT_TYPE+"_validation_loss.png", bbox_inches='tight')


# Test Results

In [None]:
# Reload every saved model and report its accuracy and mean batch loss on the
# test set.
for init_type in INIT_TYPE_LIST:
  for activation_type in ACTIVATION_TYPE_LIST:

    if dataset_name == 'CIFAR-10':
      net = Net(NUM_CLASSES, activation_type)
    elif dataset_name == 'tiny-imagenet' or dataset_name == 'Caltech101':
      net = Net64(NUM_CLASSES, activation_type)
    else:
      # Without this, `net` would be undefined or left over from a previous run.
      raise ValueError("Unsupported dataset_name %r" % (dataset_name,))

    # map_location lets weights saved on a GPU be loaded on a CPU-only machine.
    net.load_state_dict(torch.load('Models/'+dataset_name+'_'+init_type + '_'+activation_type+'.pth',
                                   map_location=device))
    net.to(device)
    # A freshly constructed module is in training mode; switch to eval mode so
    # dropout is disabled while measuring test performance.
    net.eval()

    correct = 0
    total = 0
    test_loss = 0.0
    # since we're not training, we don't need to calculate the gradients for our outputs
    with torch.no_grad():
      for data in testloader:
        images, labels = data
        images, labels = images.to(device), labels.to(device)
        # calculate outputs by running images through the network
        outputs = net(images)
        # the class with the highest energy is what we choose as prediction
        _, predicted = torch.max(outputs, 1)
        total += labels.size(0)
        correct += (predicted == labels).sum().item()
        # .item() keeps the running sum a Python float, not a device tensor.
        test_loss += criterion(outputs, labels).item()

    # Report the real number of test images (it differs between datasets) and
    # name the initialisation, since each activation is evaluated once per scheme.
    print('Accuracy of the %s/%s network on the %d test images: %.2f %%' % (
        init_type, activation_type, total, 100 * correct / total))

    print('Test loss on the %d test images: %.2f ' % (
        total, test_loss/len(testloader)))