In [1]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

import os
print(os.listdir("../input"))

import time

# import pytorch
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.optim import SGD,Adam,lr_scheduler
from torch.utils.data import random_split
import torchvision
from torchvision import transforms, datasets
from torch.utils.data import DataLoader

['cifar-10-python.tar.gz']


In [2]:
# Training-time preprocessing: random flip and rotation for augmentation,
# then tensor conversion and per-channel normalisation.
train_transform = transforms.Compose([
    transforms.RandomHorizontalFlip(p=.40),
    transforms.RandomRotation(30),
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.5] * 3, std=[0.5] * 3),
])

# Test-time preprocessing: no augmentation, same tensor conversion and
# normalisation as the training pipeline.
test_transform = transforms.Compose([
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.5] * 3, std=[0.5] * 3),
])

# define training dataloader
def get_training_dataloader(train_transform, batch_size=128, num_workers=0, shuffle=True, root='.'):
    """Build a DataLoader over the CIFAR-10 training split.

    The dataset is downloaded into ``root`` if it is not already present
    (``download=True`` is always passed to torchvision).

    Args:
        train_transform: transform applied to every training image
        batch_size: number of samples per batch
        num_workers: number of DataLoader worker processes
        shuffle: whether to reshuffle the data every epoch
        root: directory where the CIFAR-10 archive is stored / downloaded
            (defaults to the current directory, as before)
    Returns: torch DataLoader over the CIFAR-10 training set
    """
    cifar10_training = torchvision.datasets.CIFAR10(
        root=root, train=True, download=True, transform=train_transform)

    return DataLoader(
        cifar10_training, shuffle=shuffle, num_workers=num_workers, batch_size=batch_size)

# define test dataloader
def get_testing_dataloader(test_transform, batch_size=128, num_workers=0, shuffle=True, root='.'):
    """Build a DataLoader over the CIFAR-10 test split.

    The dataset is downloaded into ``root`` if it is not already present
    (``download=True`` is always passed to torchvision).

    Args:
        test_transform: transform applied to every test image
        batch_size: number of samples per batch
        num_workers: number of DataLoader worker processes
        shuffle: whether to reshuffle the data on every pass; the default
            stays True for backward compatibility, callers that want a
            fixed evaluation order should pass ``shuffle=False``
        root: directory where the CIFAR-10 archive is stored / downloaded
            (defaults to the current directory, as before)
    Returns: torch DataLoader over the CIFAR-10 test set
    """
    cifar10_test = torchvision.datasets.CIFAR10(
        root=root, train=False, download=True, transform=test_transform)

    return DataLoader(
        cifar10_test, shuffle=shuffle, num_workers=num_workers, batch_size=batch_size)

In [3]:
# implement mish activation function
def f_mish(input, inplace = False):
    '''
    Element-wise mish activation:
    mish(x) = x * tanh(softplus(x)) = x * tanh(ln(1 + exp(x)))

    `inplace` is accepted for signature compatibility but is not used:
    a new tensor is always returned and `input` is left untouched.
    '''
    softplus_out = F.softplus(input)
    gate = torch.tanh(softplus_out)
    return input * gate

# implement class wrapper for mish activation function
class mish(nn.Module):
    '''
    Module wrapper around `f_mish`, so mish can be used as a layer:
    mish(x) = x * tanh(softplus(x)) = x * tanh(ln(1 + exp(x)))

    Shape:
        - Input: (N, *), where * stands for any number of extra dimensions
        - Output: (N, *), identical to the input shape

    Examples:
        >>> m = mish()
        >>> input = torch.randn(2)
        >>> output = m(input)

    '''
    def __init__(self, inplace = False):
        '''
        Store the `inplace` flag; it is forwarded to `f_mish` on each call.
        '''
        super().__init__()
        self.inplace = inplace

    def forward(self, input):
        '''
        Apply mish element-wise to `input` and return the result.
        '''
        activated = f_mish(input, inplace = self.inplace)
        return activated

In [4]:
# implement swish activation function
def f_swish(input, inplace = False):
    '''
    Element-wise swish activation:
    swish(x) = x * sigmoid(x)

    `inplace` is accepted for signature compatibility but is not used:
    a new tensor is always returned and `input` is left untouched.
    '''
    gate = torch.sigmoid(input)
    return input * gate

# implement class wrapper for swish activation function
class swish(nn.Module):
    '''
    Module wrapper around `f_swish`, so swish can be used as a layer:
    swish(x) = x * sigmoid(x)

    Shape:
        - Input: (N, *), where * stands for any number of extra dimensions
        - Output: (N, *), identical to the input shape

    Examples:
        >>> m = swish()
        >>> input = torch.randn(2)
        >>> output = m(input)

    '''
    def __init__(self, inplace = False):
        '''
        Store the `inplace` flag; it is forwarded to `f_swish` on each call.
        '''
        super().__init__()
        self.inplace = inplace

    def forward(self, input):
        '''
        Apply swish element-wise to `input` and return the result.
        '''
        activated = f_swish(input, inplace = self.inplace)
        return activated

In [5]:
class BasicConv2d(nn.Module):
    """Convolution (without bias) -> BatchNorm2d -> activation.

    Args:
        input_channels: number of channels of the input feature map
        output_channels: number of channels produced by the convolution
        activation: name of the activation, one of 'relu', 'swish', 'mish'
        **kwargs: forwarded unchanged to ``nn.Conv2d``
            (e.g. kernel_size, stride, padding)

    Raises:
        ValueError: if ``activation`` is not one of the supported names
    """

    SUPPORTED_ACTIVATIONS = ('relu', 'swish', 'mish')

    def __init__(self, input_channels, output_channels, activation = 'relu', **kwargs):
        super().__init__()

        # Fail at construction time: previously an unknown name left
        # `self.relu` undefined and only surfaced as an AttributeError
        # during the first forward pass.
        if activation not in self.SUPPORTED_ACTIVATIONS:
            raise ValueError(
                f"Unsupported activation {activation!r}; "
                f"expected one of {self.SUPPORTED_ACTIVATIONS}")

        self.conv = nn.Conv2d(input_channels, output_channels, bias=False, **kwargs)
        self.bn = nn.BatchNorm2d(output_channels)

        # The attribute keeps the name `relu` whatever the activation is,
        # so existing code that accesses `block.relu` keeps working.
        if activation == 'relu':
            self.relu = nn.ReLU(inplace=True)
        elif activation == 'swish':
            self.relu = swish(inplace = True)
        else:
            self.relu = mish(inplace = True)

    def forward(self, x):
        """Apply convolution, batch normalisation and the activation, in that order."""
        x = self.conv(x)
        x = self.bn(x)
        x = self.relu(x)

        return x

#same naive inception module
class InceptionA(nn.Module):
    """Inception block with four parallel, size-preserving branches.

    Branches: 1x1 conv, 1x1 -> 5x5 conv, 1x1 -> 3x3 -> 3x3 conv, and
    3x3 average pooling followed by a 3x3 conv. Their outputs are
    concatenated along the channel dimension, giving
    64 + 64 + 96 + pool_features output channels.
    """

    def __init__(self, input_channels, pool_features, activation = 'relu'):
        super().__init__()

        def conv(in_ch, out_ch, **conv_kwargs):
            # Every conv block in this module shares the same activation.
            return BasicConv2d(in_ch, out_ch, activation = activation, **conv_kwargs)

        self.branch1x1 = conv(input_channels, 64, kernel_size=1)

        self.branch5x5 = nn.Sequential(
            conv(input_channels, 48, kernel_size=1),
            conv(48, 64, kernel_size=5, padding=2),
        )

        self.branch3x3 = nn.Sequential(
            conv(input_channels, 64, kernel_size=1),
            conv(64, 96, kernel_size=3, padding=1),
            conv(96, 96, kernel_size=3, padding=1),
        )

        self.branchpool = nn.Sequential(
            nn.AvgPool2d(kernel_size=3, stride=1, padding=1),
            conv(input_channels, pool_features, kernel_size=3, padding=1),
        )

    def forward(self, x):
        """Run all four branches on `x` and concatenate them on dim 1."""
        branch_outputs = [
            self.branch1x1(x),   # x -> 1x1 (same size)
            self.branch5x5(x),   # x -> 1x1 -> 5x5 (same size)
            self.branch3x3(x),   # x -> 1x1 -> 3x3 -> 3x3 (same size)
            self.branchpool(x),  # x -> avgpool -> 3x3 (same size)
        ]
        return torch.cat(branch_outputs, 1)

#downsample
#Factorization into smaller convolutions
class InceptionB(nn.Module):
    """Grid-reduction block: three parallel stride-2 branches.

    Branches: a stride-2 3x3 conv, a 1x1 -> 3x3 -> stride-2 3x3 conv
    stack, and a stride-2 3x3 max pooling. Their outputs are concatenated
    along the channel dimension, giving 384 + 96 + input_channels output
    channels.
    """

    def __init__(self, input_channels, activation = 'relu'):
        super().__init__()

        def conv(in_ch, out_ch, **conv_kwargs):
            # Every conv block in this module shares the same activation.
            return BasicConv2d(in_ch, out_ch, activation = activation, **conv_kwargs)

        self.branch3x3 = conv(input_channels, 384, kernel_size=3, stride=2)

        self.branch3x3stack = nn.Sequential(
            conv(input_channels, 64, kernel_size=1),
            conv(64, 96, kernel_size=3, padding=1),
            conv(96, 96, kernel_size=3, stride=2),
        )

        self.branchpool = nn.MaxPool2d(kernel_size=3, stride=2)

    def forward(self, x):
        """Run the three downsampling branches and concatenate them on dim 1."""
        # Quoted from the paper in the original code:
        # "We can use two parallel stride 2 blocks: P and C. P is a pooling
        # layer (either average or maximum pooling) the activation, both of
        # them are stride 2 the filter banks of which are concatenated as in
        # figure 10."
        branch_outputs = [
            self.branch3x3(x),       # x -> 3x3 (downsample)
            self.branch3x3stack(x),  # x -> 1x1 -> 3x3 -> 3x3 (downsample)
            self.branchpool(x),      # x -> maxpool (downsample)
        ]
        return torch.cat(branch_outputs, 1)
    
#Factorizing Convolutions with Large Filter Size
class InceptionC(nn.Module):
    """Inception block built from factorised 7x7 convolutions.

    Each 7x7 convolution is expressed as a 7x1 conv and a 1x7 conv applied
    one after the other. There are four size-preserving branches, each
    ending with 192 channels, so the concatenated output has 768 channels.
    `channels_7x7` is the width of the intermediate layers in the two
    factorised branches.
    """

    def __init__(self, input_channels, channels_7x7, activation = 'relu'):
        super().__init__()

        def conv(in_ch, out_ch, **conv_kwargs):
            # Every conv block in this module shares the same activation.
            return BasicConv2d(in_ch, out_ch, activation = activation, **conv_kwargs)

        mid = channels_7x7
        # Keyword sets for the two halves of a factorised 7x7 convolution.
        vertical = dict(kernel_size=(7, 1), padding=(3, 0))
        horizontal = dict(kernel_size=(1, 7), padding=(0, 3))

        self.branch1x1 = conv(input_channels, 192, kernel_size=1)

        # Quoted from the paper in the original code:
        # "In theory, we could go even further and argue that one can replace
        # any n x n convolution by a 1 x n convolution followed by a n x 1
        # convolution and the computational cost saving increases
        # dramatically as n grows (see figure 6)."
        self.branch7x7 = nn.Sequential(
            conv(input_channels, mid, kernel_size=1),
            conv(mid, mid, **vertical),
            conv(mid, 192, **horizontal),
        )

        self.branch7x7stack = nn.Sequential(
            conv(input_channels, mid, kernel_size=1),
            conv(mid, mid, **vertical),
            conv(mid, mid, **horizontal),
            conv(mid, mid, **vertical),
            conv(mid, 192, **horizontal),
        )

        self.branch_pool = nn.Sequential(
            nn.AvgPool2d(kernel_size=3, stride=1, padding=1),
            conv(input_channels, 192, kernel_size=1),
        )

    def forward(self, x):
        """Run all four branches on `x` and concatenate them on dim 1."""
        branch_outputs = [
            self.branch1x1(x),       # x -> 1x1 (same size)
            self.branch7x7(x),       # x -> 1x1 -> one 7x1/1x7 pair (same size)
            self.branch7x7stack(x),  # x -> 1x1 -> two 7x1/1x7 pairs (same size)
            self.branch_pool(x),     # x -> avgpool -> 1x1 (same size)
        ]
        return torch.cat(branch_outputs, 1)

class InceptionD(nn.Module):
    """Grid-reduction block with a factorised 7x7 branch.

    Branches: 1x1 -> stride-2 3x3 conv (320 channels), 1x1 -> 1x7 -> 7x1 ->
    stride-2 3x3 conv (192 channels), and a stride-2 3x3 average pooling
    (input_channels channels). Outputs are concatenated on the channel
    dimension.
    """

    def __init__(self, input_channels, activation = 'relu'):
        super().__init__()

        def conv(in_ch, out_ch, **conv_kwargs):
            # Every conv block in this module shares the same activation.
            return BasicConv2d(in_ch, out_ch, activation = activation, **conv_kwargs)

        self.branch3x3 = nn.Sequential(
            conv(input_channels, 192, kernel_size=1),
            conv(192, 320, kernel_size=3, stride=2),
        )

        self.branch7x7 = nn.Sequential(
            conv(input_channels, 192, kernel_size=1),
            conv(192, 192, kernel_size=(1, 7), padding=(0, 3)),
            conv(192, 192, kernel_size=(7, 1), padding=(3, 0)),
            conv(192, 192, kernel_size=3, stride=2),
        )

        self.branchpool = nn.AvgPool2d(kernel_size=3, stride=2)

    def forward(self, x):
        """Run the three downsampling branches and concatenate them on dim 1."""
        branch_outputs = [
            self.branch3x3(x),   # x -> 1x1 -> 3x3 (downsample)
            self.branch7x7(x),   # x -> 1x1 -> 1x7 -> 7x1 -> 3x3 (downsample)
            self.branchpool(x),  # x -> avgpool (downsample)
        ]
        return torch.cat(branch_outputs, 1)
    

#same
class InceptionE(nn.Module):
    """Inception block with expanded filter banks (size-preserving).

    Two of the four branches end in a pair of parallel 1x3 and 3x1
    convolutions whose outputs are concatenated. The block outputs
    320 + (384 + 384) + (384 + 384) + 192 = 2048 channels.
    """

    def __init__(self, input_channels, activation = 'relu'):
        super().__init__()

        def conv(in_ch, out_ch, **conv_kwargs):
            # Every conv block in this module shares the same activation.
            return BasicConv2d(in_ch, out_ch, activation = activation, **conv_kwargs)

        # Keyword sets for the parallel 1x3 / 3x1 convolution pair.
        wide = dict(kernel_size=(1, 3), padding=(0, 1))
        tall = dict(kernel_size=(3, 1), padding=(1, 0))

        self.branch1x1 = conv(input_channels, 320, kernel_size=1)

        self.branch3x3_1 = conv(input_channels, 384, kernel_size=1)
        self.branch3x3_2a = conv(384, 384, **wide)
        self.branch3x3_2b = conv(384, 384, **tall)

        self.branch3x3stack_1 = conv(input_channels, 448, kernel_size=1)
        self.branch3x3stack_2 = conv(448, 384, kernel_size=3, padding=1)
        self.branch3x3stack_3a = conv(384, 384, **wide)
        self.branch3x3stack_3b = conv(384, 384, **tall)

        self.branch_pool = nn.Sequential(
            nn.AvgPool2d(kernel_size=3, stride=1, padding=1),
            conv(input_channels, 192, kernel_size=1),
        )

    def forward(self, x):
        """Run all four branches on `x` and concatenate them on dim 1."""
        # x -> 1x1 (same size)
        out_1x1 = self.branch1x1(x)

        # x -> 1x1 -> {1x3, 3x1}, concatenated.
        # Quoted from the paper in the original code:
        # "7. Inception modules with expanded the filter bank outputs.
        # This architecture is used on the coarsest (8 x 8) grids to promote
        # high dimensional representations, as suggested by principle
        # 2 of Section 2."
        reduced = self.branch3x3_1(x)
        out_3x3 = torch.cat(
            [self.branch3x3_2a(reduced), self.branch3x3_2b(reduced)], 1)

        # x -> 1x1 -> 3x3 -> {1x3, 3x1}, concatenated.
        stacked = self.branch3x3stack_2(self.branch3x3stack_1(x))
        out_3x3stack = torch.cat(
            [self.branch3x3stack_3a(stacked), self.branch3x3stack_3b(stacked)], 1)

        # x -> avgpool -> 1x1 (same size)
        out_pool = self.branch_pool(x)

        return torch.cat([out_1x1, out_3x3, out_3x3stack, out_pool], 1)

class InceptionV3(nn.Module):
    """Inception-v3 style classifier laid out for small (32x32) images.

    Structure: a five-layer convolutional stem, three InceptionA blocks,
    an InceptionB reduction, four InceptionC blocks, an InceptionD
    reduction, two InceptionE blocks, then global average pooling, dropout
    and a linear layer producing `num_classes` scores (raw logits; no
    softmax is applied here).

    Args:
        num_classes: size of the output layer
        activation: activation name passed to every conv block
    """

    def __init__(self, num_classes=10, activation = 'relu'):
        super().__init__()
        act = dict(activation = activation)

        # Stem. Only the last 3x3 conv is unpadded.
        self.Conv2d_1a_3x3 = BasicConv2d(3, 32, kernel_size=3, padding=1, **act)
        self.Conv2d_2a_3x3 = BasicConv2d(32, 32, kernel_size=3, padding=1, **act)
        self.Conv2d_2b_3x3 = BasicConv2d(32, 64, kernel_size=3, padding=1, **act)
        self.Conv2d_3b_1x1 = BasicConv2d(64, 80, kernel_size=1, **act)
        self.Conv2d_4a_3x3 = BasicConv2d(80, 192, kernel_size=3, **act)

        # Size-preserving inception modules.
        self.Mixed_5b = InceptionA(192, pool_features=32, **act)
        self.Mixed_5c = InceptionA(256, pool_features=64, **act)
        self.Mixed_5d = InceptionA(288, pool_features=64, **act)

        # Downsample.
        self.Mixed_6a = InceptionB(288, **act)

        # Factorised 7x7 modules.
        self.Mixed_6b = InceptionC(768, channels_7x7=128, **act)
        self.Mixed_6c = InceptionC(768, channels_7x7=160, **act)
        self.Mixed_6d = InceptionC(768, channels_7x7=160, **act)
        self.Mixed_6e = InceptionC(768, channels_7x7=192, **act)

        # Downsample.
        self.Mixed_7a = InceptionD(768, **act)

        # Expanded filter-bank modules.
        self.Mixed_7b = InceptionE(1280, **act)
        self.Mixed_7c = InceptionE(2048, **act)

        # Classifier head: pool the final feature map down to 1x1.
        self.avgpool = nn.AdaptiveAvgPool2d((1, 1))
        self.dropout = nn.Dropout2d()
        self.linear = nn.Linear(2048, num_classes)

    def forward(self, x):
        """Return class scores for a batch of images.

        The spatial sizes in the comments below are for 32x32 inputs.
        """
        # Stem: 32 -> 30
        stem = (self.Conv2d_1a_3x3, self.Conv2d_2a_3x3, self.Conv2d_2b_3x3,
                self.Conv2d_3b_1x1, self.Conv2d_4a_3x3)
        for layer in stem:
            x = layer(x)

        # 30 -> 30
        for block in (self.Mixed_5b, self.Mixed_5c, self.Mixed_5d):
            x = block(x)

        # 30 -> 14
        # Efficient grid size reduction to avoid a representation bottleneck.
        x = self.Mixed_6a(x)

        # 14 -> 14
        # Quoted from the paper in the original code:
        # "In practice, we have found that employing this factorization does
        # not work well on early layers, but it gives very good results on
        # medium grid-sizes (On m x m feature maps, where m ranges between 12
        # and 20). On that level, very good results can be achieved by using
        # 1 x 7 convolutions followed by 7 x 1 convolutions."
        for block in (self.Mixed_6b, self.Mixed_6c, self.Mixed_6d, self.Mixed_6e):
            x = block(x)

        # 14 -> 6
        # Efficient grid size reduction.
        x = self.Mixed_7a(x)

        # 6 -> 6
        # Quoted from the paper in the original code:
        # "We are using this solution only on the coarsest grid, since that
        # is the place where producing high dimensional sparse representation
        # is the most critical as the ratio of local processing (by 1 x 1
        # convolutions) is increased compared to the spatial aggregation."
        for block in (self.Mixed_7b, self.Mixed_7c):
            x = block(x)

        # 6 -> 1, then flatten to (batch, features) for the linear layer.
        x = self.avgpool(x)
        x = self.dropout(x)
        x = torch.flatten(x, 1)
        return self.linear(x)


def inceptionv3(activation = 'relu', num_classes = 10):
    """Factory for the InceptionV3 model defined in this notebook.

    Args:
        activation: activation name forwarded to every conv block
        num_classes: size of the final linear layer (default 10, which is
            what the factory always produced before this argument existed)
    Returns: a freshly constructed InceptionV3 instance
    """
    return InceptionV3(num_classes = num_classes, activation = activation)

In [6]:
# Training batches are reshuffled every epoch (the loader's default).
trainloader = get_training_dataloader(train_transform)
# Evaluation gains nothing from shuffling; a fixed order keeps the per-batch
# composition identical from one epoch to the next.
testloader = get_testing_dataloader(test_transform, shuffle=False)

0it [00:00, ?it/s]

Downloading https://www.cs.toronto.edu/~kriz/cifar-10-python.tar.gz to ./cifar-10-python.tar.gz


170500096it [00:06, 28307736.60it/s]                               


Files already downloaded and verified


In [7]:
# Training hyper-parameters.
epochs, batch_size = 100, 128
learning_rate = 1e-3

# Use the first CUDA device when one is available, otherwise the CPU.
device = torch.device('cuda:0') if torch.cuda.is_available() else torch.device('cpu')
device

device(type='cuda', index=0)

In [8]:
# Build the network with mish selected as the activation for its conv blocks.
model = inceptionv3(activation = 'mish')

In [9]:
# set loss function
criterion = nn.CrossEntropyLoss()

# set optimizer; all model parameters are handed to Adam, so the whole
# network is trained (nothing is frozen here)
optimizer = Adam(model.parameters(), lr=learning_rate)

In [10]:
# Empty per-epoch metrics table; the training cell fills it in.
train_stats = pd.DataFrame(
    columns=[
        'Epoch',
        'Time per epoch',
        'Avg time per step',
        'Train loss',
        'Train accuracy',
        'Train top-3 accuracy',
        'Test loss',
        'Test accuracy',
        'Test top-3 accuracy',
    ]
)

In [11]:
#train the model
def _count_topk_hits(scores, targets, k):
    """Count the samples whose target class is among the k best-scoring classes.

    Args:
        scores: 2-D tensor of class scores, one row per sample
        targets: 1-D tensor holding one class index per sample
        k: how many of the highest-scoring classes to accept
    Returns: number of hits as a Python int
    """
    # Ranking the raw scores gives the same top-k classes as ranking
    # exp(scores) or softmax(scores), without the risk of exp() overflowing
    # to inf and producing ties.
    topk_classes = scores.topk(k, dim=1)[1]
    return (topk_classes == targets.view(-1, 1)).any(dim=1).sum().item()


model.to(device)

steps = 0           # total number of optimisation steps taken so far
epoch_records = []  # one dict of metrics per finished epoch
for epoch in range(epochs):

    # ---- training pass ----
    model.train()  # (re-)enable dropout / batch-norm updates after evaluation
    since = time.time()

    running_loss = 0.0
    train_hits = 0
    top3_train_hits = 0
    train_samples = 0
    for inputs, labels in trainloader:
        steps += 1
        # Move input and label tensors to the selected device
        inputs, labels = inputs.to(device), labels.to(device)

        optimizer.zero_grad()

        logits = model(inputs)
        loss = criterion(logits, labels)
        loss.backward()
        optimizer.step()

        # criterion returns the batch mean, so weight it by the batch size;
        # this keeps the smaller final batch from being over-represented.
        batch_len = labels.size(0)
        running_loss += loss.item() * batch_len
        train_samples += batch_len

        scores = logits.detach()
        train_hits += _count_topk_hits(scores, labels, 1)
        top3_train_hits += _count_topk_hits(scores, labels, 3)

    # Covers the training pass only, not the evaluation below.
    time_elapsed = time.time() - since

    # ---- evaluation pass ----
    test_loss_sum = 0.0
    test_hits = 0
    top3_test_hits = 0
    test_samples = 0
    model.eval()
    with torch.no_grad():
        for inputs, labels in testloader:
            inputs, labels = inputs.to(device), labels.to(device)
            logits = model(inputs)
            batch_loss = criterion(logits, labels)

            batch_len = labels.size(0)
            test_loss_sum += batch_loss.item() * batch_len
            test_samples += batch_len

            test_hits += _count_topk_hits(logits, labels, 1)
            top3_test_hits += _count_topk_hits(logits, labels, 3)

    # ---- per-sample averages (max(..., 1) guards against an empty loader) ----
    train_denominator = max(train_samples, 1)
    test_denominator = max(test_samples, 1)
    avg_step_time = time_elapsed / max(len(trainloader), 1)

    train_loss = running_loss / train_denominator
    train_accuracy = train_hits / train_denominator
    top3_train_accuracy = top3_train_hits / train_denominator
    test_loss = test_loss_sum / test_denominator
    test_accuracy = test_hits / test_denominator
    top3_test_accuracy = top3_test_hits / test_denominator

    print(f"Epoch {epoch+1}/{epochs}.. "
          f"Time per epoch: {time_elapsed:.4f}.. "
          f"Average time per step: {avg_step_time:.4f}.. "
          f"Train loss: {train_loss:.4f}.. "
          f"Train accuracy: {train_accuracy:.4f}.. "
          f"Top-3 train accuracy: {top3_train_accuracy:.4f}.. "
          f"Test loss: {test_loss:.4f}.. "
          f"Test accuracy: {test_accuracy:.4f}.. "
          f"Top-3 test accuracy: {top3_test_accuracy:.4f}")

    # 'Epoch' is stored zero-based (the printed epoch number is one-based).
    epoch_records.append({
        'Epoch': epoch,
        'Time per epoch': time_elapsed,
        'Avg time per step': avg_step_time,
        'Train loss': train_loss,
        'Train accuracy': train_accuracy,
        'Train top-3 accuracy': top3_train_accuracy,
        'Test loss': test_loss,
        'Test accuracy': test_accuracy,
        'Test top-3 accuracy': top3_test_accuracy,
    })
    # DataFrame.append no longer exists in pandas >= 2.0. Rebuilding the
    # frame from the record list keeps `train_stats` up to date after every
    # epoch, so an interrupted run still leaves a usable log.
    train_stats = pd.DataFrame(epoch_records)

Epoch 1/100.. Time per epoch: 258.2529.. Average time per step: 0.6605.. Train loss: 1.6494.. Train accuracy: 0.3758.. Top-3 train accuracy: 0.7361.. Test loss: 1.6536.. Test accuracy: 0.4223.. Top-3 test accuracy: 0.7733
Epoch 2/100.. Time per epoch: 256.9586.. Average time per step: 0.6572.. Train loss: 1.2841.. Train accuracy: 0.5294.. Top-3 train accuracy: 0.8424.. Test loss: 1.1048.. Test accuracy: 0.5961.. Top-3 test accuracy: 0.8795
Epoch 3/100.. Time per epoch: 257.0979.. Average time per step: 0.6575.. Train loss: 1.1006.. Train accuracy: 0.6029.. Top-3 train accuracy: 0.8786.. Test loss: 0.9879.. Test accuracy: 0.6528.. Top-3 test accuracy: 0.8989
Epoch 4/100.. Time per epoch: 257.2303.. Average time per step: 0.6579.. Train loss: 0.9694.. Train accuracy: 0.6528.. Top-3 train accuracy: 0.9037.. Test loss: 0.8478.. Test accuracy: 0.7012.. Top-3 test accuracy: 0.9224
Epoch 5/100.. Time per epoch: 257.2799.. Average time per step: 0.6580.. Train loss: 0.8626.. Train accuracy: 0.

In [12]:
# Write the collected per-epoch metrics to a CSV file (relative path).
train_stats.to_csv('train_log_InceptionV3_Mish.csv')