### Assignment 5: Crating your own DNN for CIFAR-10

In [41]:
import argparse
import os, sys
import time
import datetime
import matplotlib.pyplot as plt
import numpy as np
import os
from PIL import Image
%matplotlib inline

# Import pytorch dependencies
import torch
import torch.nn as nn
import torch.nn.functional as F
import torchvision
import torchvision.transforms as transforms
import torch.optim as optim
from tqdm import tqdm_notebook as tqdm
from torch.utils.data import Dataset as Dataset

# You cannot change this line.
from tools.dataloader import CIFAR10

In [42]:
#Hyper-parameter
TRAIN_BATCH_SIZE = 128
VAL_BATCH_SIZE = 100
TEST_BATCH_SIZE = 100
INITIAL_LR = 0.01
MOMENTUM = 0.9
REG = 1e-2
EPOCHS = 35
CHECKPOINT_PATH = "./saved_models"

In [43]:
#loading datasets
data_path = './data/cifar10_train_val/cifar10-batches-'
def load_dataset(dataset_type, data_path):
    if dataset_type == 'train':
        X = np.load(data_path + 'images-train.npy')
        y = np.load(data_path + 'labels-train.npy')
        return (X,y)
        
    if dataset_type == 'val':
        X = np.load(data_path + 'images-val.npy')
        y = np.load(data_path + 'labels-val.npy')
        return (X,y)
    
    if dataset_type  == 'test':
        X = np.load('./data/cifar10-batches-' + 'images-test.npy')
        return X

In [44]:
#transform function
transform_train = transforms.Compose([
    transforms.RandomCrop(32, padding = 4),
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.4914, 0.4822, 0.4465],
                        std=[0.2023, 0.1994, 0.2010])])

transform_val = transforms.Compose([
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.4914, 0.4822, 0.4465],
                        std=[0.2023, 0.1994, 0.2010])])
transform_test = transforms.Compose([
    #transforms.ToPILImage(),
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.4914, 0.4822, 0.4465],
                        std=[0.2023, 0.1994, 0.2010])])

In [45]:
#data loader
class convert_data(Dataset):
    def __init__(self, imgs, labels, transform):
        self.X = imgs
        self.y = labels
        self.transform = transform
        
    def __len__(self):
        return len(self.y)
    
    def __getitem__(self, idx):
        img = Image.fromarray(self.X[idx])
        if self.transform != None:
            img = self.transform(img)
        return img, self.y[idx]

train_dataset = convert_data(load_dataset('train', data_path)[0], load_dataset('train', data_path)[1], transform = transform_train)
val_dataset = convert_data(load_dataset('val', data_path)[0], load_dataset('val', data_path)[1], transform = transform_val)


y_test_placeholder = [-1] * load_dataset('test', data_path).shape[0]
test_dataset = convert_data(load_dataset('test', data_path), y_test_placeholder, transform = transform_test)


train_loader = torch.utils.data.DataLoader(train_dataset, shuffle = True, batch_size = TRAIN_BATCH_SIZE, num_workers = 1)
val_loader = torch.utils.data.DataLoader(val_dataset, shuffle = True, batch_size = VAL_BATCH_SIZE, num_workers = 1)
test_loader = torch.utils.data.DataLoader(test_dataset, shuffle = False, batch_size = TEST_BATCH_SIZE, num_workers = 1)

In [46]:
device = 'cuda' if torch.cuda.is_available() else 'cpu'
if device =='cuda':
    print("Train on GPU...")
else:
    print("Train on CPU...")

Train on GPU...


In [51]:
# model
# basicBlock is the basic unit of resNet, each block contains two convolution layers
class basicBlock(nn.Module):
    
    def __init__(self, inputChannel, outputChannel, stride):
        super(basicBlock, self).__init__()
       
        # the first convolution layer has stride of 1 or 2, depends on the function parameter 
        self.conv1 = nn.Conv2d(inputChannel, outputChannel, kernel_size = (3,3), stride = stride, padding = 1, bias=False)
        self.bn1 = nn.BatchNorm2d(outputChannel)
        self.relu1 = nn.ReLU(inplace = True)
        
        # the second convolution layer always has stride of 1
        self.conv2 = nn.Conv2d(outputChannel, outputChannel, kernel_size = (3,3), stride = 1, padding = 1, bias=False)
        self.bn2= nn.BatchNorm2d(outputChannel)
        
        # x and F(x) are supposed to maintain the same dimension (inputchannel == outputchannel), then no downsampling needed
        self.downsample = None
    
        if inputChannel != outputChannel or stride == 2:
            # need downsampling on F(x) to match dimension, apply 1*1 kernel size, stride = 2
            self.downsample = nn.Sequential(nn.Conv2d(inputChannel, outputChannel, kernel_size=(1,1), stride=stride, bias=False),
                                            nn.BatchNorm2d(outputChannel))
                                          
        
    def forward(self, x):
        
        out = self.conv1(x)
        out = self.bn1(out)
        out = self.relu1(out)
        out = self.conv2(out)
        out = self.bn2(out)
        
        if self.downsample == None:
            out += x       
        else:
            out += self.downsample(x)
            
        out = F.relu(out)
        return out

class resNet18(nn.Module):
    """
    a resNet18 has one convolution layer and 4 other layers, each of them contains two blocks
    """
    def __init__(self, basicBlock):
        super(resNet18, self).__init__()
        
        #original channel
        self.inputChannel = 64
        
        # first convolutional
        self.conv0 = nn.Conv2d(3, 64, kernel_size = (3,3), stride = 1, padding = 1, bias=False)
        self.batch0 = nn.BatchNorm2d(64)
        self.relu0 = nn.ReLU()
        
        # each layer has two blocks, call generate_layer to connect each block in a layer       
        self.layer1 = self.generate_layer(basicBlock, 64, 1)
        self.layer2 = self.generate_layer(basicBlock, 128, 2)
        self.layer3 = self.generate_layer(basicBlock, 256, 2)
        self.layer4 = self.generate_layer(basicBlock, 512, 2)
        
        # output feature nnumber = 10
        self.fully_connect = nn.Linear(512,10)
        
    def generate_layer(self, basicBlock, layerChannel, stride):
        """ connect each block in a layer 
        parameters:
        basicBlock: the minimal unit defined above of a resNet
        layerChannel: number of channels in different layers
        """
        # the first block starts with the stride defined in the funtion parameter for the first conv, and 1 for the secon conv
        # the second block in each layer has a stride of 1 for both convs      
        strideList = [stride, 1] 
        blockList = []
        for i in range(2):
            blockList.append(basicBlock(self.inputChannel, layerChannel, strideList[i]))
            # update the input channel for the next layer
            self.inputChannel = layerChannel
            
        return nn.Sequential(blockList[0], blockList[1])

    def forward(self, x):
        out = self.conv0(x)
        out = self.batch0(out)
        out = self.relu0(out)
        out = self.layer1(out)
        out = self.layer2(out)
        out = self.layer3(out)
        out = self.layer4(out)
        out = F.avg_pool2d(out, 4)
        out = out.view(out.size(0), -1)
        out = self.fully_connect(out)
        return out

In [52]:
#send the model to GPU
net = resNet18(basicBlock)
net = net.to(device)

In [53]:
#Define loss function
criterion = nn.CrossEntropyLoss()
# Add optimizer
optimizer = optim.SGD(net.parameters(), lr=INITIAL_LR, momentum=0.9, weight_decay=REG)

In [54]:
# FLAG for loading the pretrained model
TRAIN_FROM_SCRATCH = False
# Code for loading checkpoint and recover epoch id.
CKPT_PATH = "./saved_model/model_fz49.h5"
def get_checkpoint(ckpt_path):
    try:
        ckpt = torch.load(ckpt_path)
    except Exception as e:
        print(e)
        return None
    return ckpt

ckpt = get_checkpoint(CKPT_PATH)
if ckpt is None or TRAIN_FROM_SCRATCH:
    if not TRAIN_FROM_SCRATCH:
        print("Checkpoint not found.")
    print("Training from scratch ...")
    start_epoch = 0
    current_learning_rate = INITIAL_LR
else:
    print("Successfully loaded checkpoint: %s" %CKPT_PATH)
    net.load_state_dict(ckpt['net'])
    start_epoch = ckpt['epoch'] + 1
    current_learning_rate = ckpt['lr']
    print("Starting from epoch %d " %start_epoch)

print("Starting from learning rate %f:" %current_learning_rate)

global_step = 0
best_val_acc = 0

trainloss_list = list()
valloss_list = list()
trainacc_list = list()
valacc_list = list()

for i in range(start_epoch, EPOCHS):
    print(datetime.datetime.now())
    # Switch to train mode
    net.train()
    print("Epoch %d:" %i)

    total_examples = 0
    correct_examples = 0

    train_loss = 0
    train_acc = 0
    # Train the training dataset for 1 epoch.
    for batch_idx, (inputs, targets) in enumerate(train_loader):
        # Copy inputs to device
        inputs = inputs.to(device)
        targets = targets.to(device)
        # Zero the gradient
        optimizer.zero_grad()
        # Generate output
        outputs = net(inputs)
        loss = criterion(outputs, targets)
        # Now backward loss
        loss.backward()
        # Apply gradient
        optimizer.step()
        # Calculate predicted labels
        _, predicted = torch.max(outputs.data, 1)
        # Calculate accuracy
        total_examples += targets.size(0)
        correct_examples += torch.sum(predicted == targets.data).float()

        train_loss += loss

        global_step += 1
        if global_step % 100 == 0:
            avg_loss = train_loss / (batch_idx + 1)
        pass
    avg_acc = correct_examples / total_examples
    
    trainloss_list.append(avg_loss)
    trainacc_list.append(avg_acc)
    
    print("Training loss: %.4f, Training accuracy: %.4f" %(avg_loss, avg_acc))
    print(datetime.datetime.now())
    # Validate on the validation dataset
    print("Validation...")
    total_examples = 0
    correct_examples = 0
    
    net.eval()

    val_loss = 0
    val_acc = 0
    # Disable gradient during validation
    with torch.no_grad():
        for batch_idx, (inputs, targets) in enumerate(val_loader):
            # Copy inputs to device
            inputs = inputs.to(device)
            targets = targets.to(device)
            # Zero the gradient
            optimizer.zero_grad()
            # Generate output from the DNN.
            outputs = net(inputs)
            loss = criterion(outputs, targets)
            # Calculate predicted labels
            _, predicted = torch.max(outputs.data, 1)
            # Calculate accuracy
            total_examples += targets.size(0)
            correct_examples += torch.sum(predicted == targets.data).float()
            val_loss += loss

    avg_loss = val_loss / len(val_loader)
    avg_acc = correct_examples / total_examples
    print("Validation loss: %.4f, Validation accuracy: %.4f" % (avg_loss, avg_acc))
    
    valloss_list.append(avg_loss)
    valacc_list.append(avg_acc)
    
    DECAY_EPOCHS = 2
    DECAY = 0.8
    if i % DECAY_EPOCHS == 0 and i != 0:
        for param_group in optimizer.param_groups:
            # Assign the learning rate parameter
            current_learning_rate = current_learning_rate*DECAY
            param_group['lr'] = current_learning_rate
            print("Current learning rate has decayed to %f" %current_learning_rate)
    
    # Save for checkpoint
    if avg_acc > best_val_acc:
        best_val_acc = avg_acc
        if not os.path.exists(CHECKPOINT_PATH):
            os.makedirs(CHECKPOINT_PATH)
        print("Saving ...")
        state = {'net': net.state_dict(),
                 'epoch': i,
                 'lr': current_learning_rate}
        torch.save(state, os.path.join(CHECKPOINT_PATH, 'model_fz49.h5'))

print("Optimization finished.")

[Errno 2] No such file or directory: './saved_model/model_fz49.h5'
Checkpoint not found.
Training from scratch ...
Starting from learning rate 0.010000:
2019-10-11 01:42:40.662652
Epoch 0:
Training loss: 1.4812, Training accuracy: 0.4755
2019-10-11 01:43:01.255472
Validation...
Validation loss: 1.1993, Validation accuracy: 0.5766
Saving ...
2019-10-11 01:43:03.157194
Epoch 1:
Training loss: 0.9400, Training accuracy: 0.6679
2019-10-11 01:43:23.726380
Validation...
Validation loss: 0.8836, Validation accuracy: 0.6808
Saving ...
2019-10-11 01:43:25.833775
Epoch 2:
Training loss: 0.7595, Training accuracy: 0.7367
2019-10-11 01:43:46.544914
Validation...
Validation loss: 0.8542, Validation accuracy: 0.7076
Current learning rate has decayed to 0.008000
Saving ...
2019-10-11 01:43:48.243544
Epoch 3:
Training loss: 0.6325, Training accuracy: 0.7842
2019-10-11 01:44:09.023547
Validation...
Validation loss: 0.7120, Validation accuracy: 0.7528
Saving ...
2019-10-11 01:44:11.111736
Epoch 4:
Train

In [55]:
#load the model
CKPT_PATH = "./saved_models/model_fz49.h5"
ckpt = get_checkpoint(CKPT_PATH)
net.load_state_dict(ckpt['net'])
start_epoch = ckpt['epoch'] + 1
current_learning_rate = ckpt['lr']

def make_prediction(test_loader, TEST_BATCH_SIZE = TEST_BATCH_SIZE):
    prediction = np.zeros([10000,2])
    prediction[:,0] = np.arange(10000)
    net.eval()
    with torch.no_grad():
        for idx, (inputs, targets) in enumerate(test_loader):
            inputs = inputs.to(device)
            optimizer.zero_grad()
            outputs = net(inputs)
            _, predicted = torch.max(outputs.data, 1)
            prediction[idx*TEST_BATCH_SIZE:(idx*TEST_BATCH_SIZE+len(predicted)), 1] =predicted.cpu().data.numpy()
    np.savetxt('ECE590_prediction.csv', prediction.astype(int), fmt = "%d", delimiter=',', header='Id,Category', comments='')

In [56]:
make_prediction(test_loader)