
# **Training a deep Convolutional Neural Network:**
In this notebook, we will create an implementation of the AlexNet architecture and use it to classify images from the CIFAR10 dataset. We will train the network using two different optimization methods on a subset of the training data.

**Q1:** 

We will use stochastic gradient descent (SGD) with a learning rate of $0.005$ to train the network on $10%$ of the training data.
Then, we will use the Adam optimizer with a learning rate of $0.00005$ to train the network on the same $10%$ of the training data.



In [1]:
import numpy as np
import torch
import torch.nn as nn
from torchvision import datasets
from torchvision import transforms
from torch.utils.data.sampler import SubsetRandomSampler


# Device configuration
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(device)

cuda


In [2]:
def get_train_valid_loader(data_dir,
                           batch_size,
                           augment,
                           random_seed,
                           valid_size=0.1,
                           shuffle=True):
    normalize = transforms.Normalize(
        mean=[0.4914, 0.4822, 0.4465],
        std=[0.2023, 0.1994, 0.2010],
    )

    # define transforms
    valid_transform = transforms.Compose([
            transforms.Resize((227,227)),
            transforms.ToTensor(),
            normalize,
    ])
    if augment:
        train_transform = transforms.Compose([
            transforms.RandomCrop(32, padding=4),
            transforms.RandomHorizontalFlip(),
            transforms.ToTensor(),
            normalize,
        ])
    else:
        train_transform = transforms.Compose([
            transforms.Resize((227,227)),
            transforms.ToTensor(),
            normalize,
        ])

    # load the dataset
    train_dataset = datasets.CIFAR10(
        root=data_dir, train=True,
        download=True, transform=train_transform,
    )

    valid_dataset = datasets.CIFAR10(
        root=data_dir, train=True,
        download=True, transform=valid_transform,
    )

    num_train = len(train_dataset)

    indices = list(range(num_train))
    split = int(np.floor(valid_size * num_train))

    if shuffle:
        np.random.seed(random_seed)
        np.random.shuffle(indices)

    train_idx, valid_idx = indices[split:], indices[:split]
    train_idx = train_idx[: int(np.floor(len(train_idx) * 0.1))]
    train_sampler = SubsetRandomSampler(train_idx)
    valid_sampler = SubsetRandomSampler(valid_idx)


    train_loader = torch.utils.data.DataLoader(
        train_dataset, batch_size=batch_size, sampler=train_sampler)
 
    valid_loader = torch.utils.data.DataLoader(
        valid_dataset, batch_size=batch_size, sampler=valid_sampler)

    return (train_loader, valid_loader)


def get_test_loader(data_dir,
                    batch_size,
                    shuffle=True):
    normalize = transforms.Normalize(
        mean=[0.485, 0.456, 0.406],
        std=[0.229, 0.224, 0.225],
    )

    # define transform
    transform = transforms.Compose([
        transforms.Resize((227,227)),
        transforms.ToTensor(),
        normalize,
    ])

    dataset = datasets.CIFAR10(
        root=data_dir, train=False,
        download=True, transform=transform,
    )

    data_loader = torch.utils.data.DataLoader(
        dataset, batch_size=batch_size, shuffle=shuffle
    )

    return data_loader


# CIFAR10 dataset 
train_loader, valid_loader = get_train_valid_loader(data_dir = './data', batch_size = 64,
                       augment = False, random_seed = 1)

test_loader = get_test_loader(data_dir = './data',
                              batch_size = 64)

Downloading https://www.cs.toronto.edu/~kriz/cifar-10-python.tar.gz to ./data/cifar-10-python.tar.gz


100%|██████████| 170498071/170498071 [00:03<00:00, 47595602.67it/s]


Extracting ./data/cifar-10-python.tar.gz to ./data
Files already downloaded and verified
Files already downloaded and verified


In [3]:
class AlexNet(nn.Module):
    def __init__(self, num_classes=10):
        super(AlexNet, self).__init__()
        self.layer1 = nn.Sequential(
            nn.Conv2d(3, 96, kernel_size=11, stride=4, padding=0),
            nn.BatchNorm2d(96),
            nn.ReLU(),
            nn.MaxPool2d(kernel_size = 3, stride = 2))
        self.layer2 = nn.Sequential(
            nn.Conv2d(96, 256, kernel_size=5, stride=1, padding=2),
            nn.BatchNorm2d(256),
            nn.ReLU(),
            nn.MaxPool2d(kernel_size = 3, stride = 2))
        self.layer3 = nn.Sequential(
            nn.Conv2d(256, 384, kernel_size=3, stride=1, padding=1),
            nn.BatchNorm2d(384),
            nn.ReLU())
        self.layer4 = nn.Sequential(
            nn.Conv2d(384, 384, kernel_size=3, stride=1, padding=1),
            nn.BatchNorm2d(384),
            nn.ReLU())
        self.layer5 = nn.Sequential(
            nn.Conv2d(384, 256, kernel_size=3, stride=1, padding=1),
            nn.BatchNorm2d(256),
            nn.ReLU(),
            nn.MaxPool2d(kernel_size = 3, stride = 2))
        self.fc = nn.Sequential(
            nn.Dropout(0.5),
            nn.Linear(9216, 4096),
            nn.ReLU())
        self.fc1 = nn.Sequential(
            nn.Dropout(0.5),
            nn.Linear(4096, 4096),
            nn.ReLU())
        self.fc2= nn.Sequential(
            nn.Linear(4096, num_classes))
        
    def forward(self, x):
        out = self.layer1(x)
        out = self.layer2(out)
        out = self.layer3(out)
        out = self.layer4(out)
        out = self.layer5(out)
        out = out.reshape(out.size(0), -1)
        out = self.fc(out)
        out = self.fc1(out)
        out = self.fc2(out)
        return out

## **Q1:**
The network will be trained using the SGD optimizer on the training data with a learning rate of $0.005$, and the output of the section below will print the Loss of each Epoch and Accuracy of the network on the $5000$ validation images of each epoch.

In [4]:
num_classes = 10
num_epochs = 10
batch_size = 64
learning_rate = 0.005

model = AlexNet(num_classes).to(device)


# Loss and optimizer
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.SGD(model.parameters(), lr=learning_rate)  


In [5]:
# Train the model

total_step = len(train_loader)


for epoch in range(num_epochs):
    running_loss = 0.0
    for i, (images, labels) in enumerate(train_loader):  
        # Move tensors to the configured device
        images = images.to(device)
        labels = labels.to(device)
        
        # Forward pass
        outputs = model(images)
        loss = criterion(outputs, labels)
        running_loss += loss.item()
        
        # Backward and optimize
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

    print ('Epoch [{}/{}], Step [{}/{}], Loss: {:.4f}' 
                   .format(epoch+1, num_epochs, i+1, total_step, running_loss / len(train_loader)))
            
    # Validation
    with torch.no_grad():
        correct = 0
        total = 0
        for images, labels in valid_loader:
            images = images.to(device)
            labels = labels.to(device)
            outputs = model(images)
            _, predicted = torch.max(outputs.data, 1)
            total += labels.size(0)
            correct += (predicted == labels).sum().item()
            del images, labels, outputs
    
        print('Accuracy of the network on the {} validation images: {} %'.format(5000, 100 * correct / total))

Epoch [1/10], Step [71/71], Loss: 2.1869
Accuracy of the network on the 5000 validation images: 25.34 %
Epoch [2/10], Step [71/71], Loss: 1.8636
Accuracy of the network on the 5000 validation images: 29.64 %
Epoch [3/10], Step [71/71], Loss: 1.6622
Accuracy of the network on the 5000 validation images: 35.84 %
Epoch [4/10], Step [71/71], Loss: 1.5509
Accuracy of the network on the 5000 validation images: 39.3 %
Epoch [5/10], Step [71/71], Loss: 1.4531
Accuracy of the network on the 5000 validation images: 39.84 %
Epoch [6/10], Step [71/71], Loss: 1.3914
Accuracy of the network on the 5000 validation images: 39.58 %
Epoch [7/10], Step [71/71], Loss: 1.3356
Accuracy of the network on the 5000 validation images: 41.9 %
Epoch [8/10], Step [71/71], Loss: 1.2625
Accuracy of the network on the 5000 validation images: 45.22 %
Epoch [9/10], Step [71/71], Loss: 1.2145
Accuracy of the network on the 5000 validation images: 43.3 %
Epoch [10/10], Step [71/71], Loss: 1.1383
Accuracy of the network o

In [6]:
with torch.no_grad():
    correct = 0
    total = 0
    for images, labels in test_loader:
        images = images.to(device)
        labels = labels.to(device)
        outputs = model(images)
        _, predicted = torch.max(outputs.data, 1)
        total += labels.size(0)
        correct += (predicted == labels).sum().item()
        del images, labels, outputs

    print('Accuracy of the network on the {} test images: {} %'.format(10000, 100 * correct / total)) 

Accuracy of the network on the 10000 test images: 49.41 %



The network will be trained using the Adam optimizer on the training data with a learning rate of $0.00005$, and the output of the section below will print the Loss of each Epoch and Accuracy of the network on the $5000$ validation images of each epoch.

In [7]:
learning_rate = 0.00005


model = AlexNet(num_classes).to(device)

# Loss and optimizer

criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)  


In [8]:
# Train the model

total_step = len(train_loader)


for epoch in range(num_epochs):
    running_loss = 0.0
    for i, (images, labels) in enumerate(train_loader):  
        # Move tensors to the configured device
        images = images.to(device)
        labels = labels.to(device)
        
        # Forward pass
        outputs = model(images)
        loss = criterion(outputs, labels)
        running_loss += loss.item()
        
        # Backward and optimize
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

    print ('Epoch [{}/{}], Step [{}/{}], Loss: {:.4f}' 
                   .format(epoch+1, num_epochs, i+1, total_step, running_loss / len(train_loader)))
            
    # Validation
    with torch.no_grad():
        correct = 0
        total = 0
        for images, labels in valid_loader:
            images = images.to(device)
            labels = labels.to(device)
            outputs = model(images)
            _, predicted = torch.max(outputs.data, 1)
            total += labels.size(0)
            correct += (predicted == labels).sum().item()
            del images, labels, outputs
    
        print('Accuracy of the network on the {} validation images: {} %'.format(5000, 100 * correct / total))

Epoch [1/10], Step [71/71], Loss: 1.8515
Accuracy of the network on the 5000 validation images: 39.3 %
Epoch [2/10], Step [71/71], Loss: 1.4774
Accuracy of the network on the 5000 validation images: 43.24 %
Epoch [3/10], Step [71/71], Loss: 1.3239
Accuracy of the network on the 5000 validation images: 49.3 %
Epoch [4/10], Step [71/71], Loss: 1.1852
Accuracy of the network on the 5000 validation images: 51.4 %
Epoch [5/10], Step [71/71], Loss: 1.0820
Accuracy of the network on the 5000 validation images: 54.82 %
Epoch [6/10], Step [71/71], Loss: 0.9717
Accuracy of the network on the 5000 validation images: 56.08 %
Epoch [7/10], Step [71/71], Loss: 0.8601
Accuracy of the network on the 5000 validation images: 57.02 %
Epoch [8/10], Step [71/71], Loss: 0.8062
Accuracy of the network on the 5000 validation images: 60.98 %
Epoch [9/10], Step [71/71], Loss: 0.7059
Accuracy of the network on the 5000 validation images: 60.62 %
Epoch [10/10], Step [71/71], Loss: 0.6086
Accuracy of the network o

In [9]:
with torch.no_grad():
    correct = 0
    total = 0
    for images, labels in test_loader:
        images = images.to(device)
        labels = labels.to(device)
        outputs = model(images)
        _, predicted = torch.max(outputs.data, 1)
        total += labels.size(0)
        correct += (predicted == labels).sum().item()
        del images, labels, outputs

    print('Accuracy of the network on the {} test images: {} %'.format(10000, 100 * correct / total)) 

Accuracy of the network on the 10000 test images: 62.49 %


##**Q2:**
The network will be trained using the Adam optimizer on the training data with a learning rate of $0.005$, and the output of the section below will print the Loss of each Epoch and Accuracy of the network on the $5000$ validation images of each epoch.

In [10]:
learning_rate = 0.005

model = AlexNet(num_classes).to(device)
# Loss and optimizer
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)  


In [11]:
# Train the model

total_step = len(train_loader)


for epoch in range(num_epochs):
    running_loss = 0.0
    for i, (images, labels) in enumerate(train_loader):  
        # Move tensors to the configured device
        images = images.to(device)
        labels = labels.to(device)
        
        # Forward pass
        outputs = model(images)
        loss = criterion(outputs, labels)
        running_loss += loss.item()
        
        # Backward and optimize
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

    print ('Epoch [{}/{}], Step [{}/{}], Loss: {:.4f}' 
                   .format(epoch+1, num_epochs, i+1, total_step, running_loss / len(train_loader)))
            
    # Validation
    with torch.no_grad():
        correct = 0
        total = 0
        for images, labels in valid_loader:
            images = images.to(device)
            labels = labels.to(device)
            outputs = model(images)
            _, predicted = torch.max(outputs.data, 1)
            total += labels.size(0)
            correct += (predicted == labels).sum().item()
            del images, labels, outputs
    
        print('Accuracy of the network on the {} validation images: {} %'.format(5000, 100 * correct / total))

Epoch [1/10], Step [71/71], Loss: 19.3668
Accuracy of the network on the 5000 validation images: 13.18 %
Epoch [2/10], Step [71/71], Loss: 2.4899
Accuracy of the network on the 5000 validation images: 14.44 %
Epoch [3/10], Step [71/71], Loss: 2.3805
Accuracy of the network on the 5000 validation images: 15.18 %
Epoch [4/10], Step [71/71], Loss: 2.2670
Accuracy of the network on the 5000 validation images: 14.34 %
Epoch [5/10], Step [71/71], Loss: 2.2545
Accuracy of the network on the 5000 validation images: 14.1 %
Epoch [6/10], Step [71/71], Loss: 2.2376
Accuracy of the network on the 5000 validation images: 14.66 %
Epoch [7/10], Step [71/71], Loss: 2.1805
Accuracy of the network on the 5000 validation images: 15.12 %
Epoch [8/10], Step [71/71], Loss: 2.1845
Accuracy of the network on the 5000 validation images: 14.66 %
Epoch [9/10], Step [71/71], Loss: 2.1951
Accuracy of the network on the 5000 validation images: 15.32 %
Epoch [10/10], Step [71/71], Loss: 2.2486
Accuracy of the networ

Using the Adam optimizer with a learning rate of $0.005$ resulted in suboptimal performance, as reflected in a higher loss function and lower accuracy on the validation images compared to learning rate of $0.00005$.

In [12]:
with torch.no_grad():
    correct = 0
    total = 0
    for images, labels in test_loader:
        images = images.to(device)
        labels = labels.to(device)
        outputs = model(images)
        _, predicted = torch.max(outputs.data, 1)
        total += labels.size(0)
        correct += (predicted == labels).sum().item()
        del images, labels, outputs

    print('Accuracy of the network on the {} test images: {} %'.format(10000, 100 * correct / total)) 

Accuracy of the network on the 10000 test images: 13.63 %


##**Q3:**
The network architecture so that it doesn't include Batch Normalization or Dropout layers.
we will train using the Adam optimizer on the training data with a learning rate of $0.00005$, and the output of the section below will print the Loss of each Epoch and Accuracy of the network on the $5000$ validation images of each epoch.

In [13]:
class AlexNet(nn.Module):
    def __init__(self, num_classes=10):
        super(AlexNet, self).__init__()
        self.layer1 = nn.Sequential(
            nn.Conv2d(3, 96, kernel_size=11, stride=4, padding=0),
            # nn.BatchNorm2d(96),
            nn.ReLU(),
            nn.MaxPool2d(kernel_size = 3, stride = 2))
        self.layer2 = nn.Sequential(
            nn.Conv2d(96, 256, kernel_size=5, stride=1, padding=2),
            # nn.BatchNorm2d(256),
            nn.ReLU(),
            nn.MaxPool2d(kernel_size = 3, stride = 2))
        self.layer3 = nn.Sequential(
            nn.Conv2d(256, 384, kernel_size=3, stride=1, padding=1),
            # nn.BatchNorm2d(384),
            nn.ReLU())
        self.layer4 = nn.Sequential(
            nn.Conv2d(384, 384, kernel_size=3, stride=1, padding=1),
            # nn.BatchNorm2d(384),
            nn.ReLU())
        self.layer5 = nn.Sequential(
            nn.Conv2d(384, 256, kernel_size=3, stride=1, padding=1),
            nn.BatchNorm2d(256),
            nn.ReLU(),
            nn.MaxPool2d(kernel_size = 3, stride = 2))
        self.fc = nn.Sequential(
            # nn.Dropout(0.5),
            nn.Linear(9216, 4096),
            nn.ReLU())
        self.fc1 = nn.Sequential(
            # nn.Dropout(0.5),
            nn.Linear(4096, 4096),
            nn.ReLU())
        self.fc2= nn.Sequential(
            nn.Linear(4096, num_classes))
        
    def forward(self, x):
        out = self.layer1(x)
        out = self.layer2(out)
        out = self.layer3(out)
        out = self.layer4(out)
        out = self.layer5(out)
        out = out.reshape(out.size(0), -1)
        out = self.fc(out)
        out = self.fc1(out)
        out = self.fc2(out)
        return out

In [14]:
learning_rate = 0.00005


model = AlexNet(num_classes).to(device)

# Loss and optimizer

criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)  


In [15]:
# Train the model

total_step = len(train_loader)


for epoch in range(num_epochs):
    running_loss = 0.0
    for i, (images, labels) in enumerate(train_loader):  
        # Move tensors to the configured device
        images = images.to(device)
        labels = labels.to(device)
        
        # Forward pass
        outputs = model(images)
        loss = criterion(outputs, labels)
        running_loss += loss.item()
        
        # Backward and optimize
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

    print ('Epoch [{}/{}], Step [{}/{}], Loss: {:.4f}' 
                   .format(epoch+1, num_epochs, i+1, total_step, running_loss / len(train_loader)))
            
    # Validation
    with torch.no_grad():
        correct = 0
        total = 0
        for images, labels in valid_loader:
            images = images.to(device)
            labels = labels.to(device)
            outputs = model(images)
            _, predicted = torch.max(outputs.data, 1)
            total += labels.size(0)
            correct += (predicted == labels).sum().item()
            del images, labels, outputs
    
        print('Accuracy of the network on the {} validation images: {} %'.format(5000, 100 * correct / total))

Epoch [1/10], Step [71/71], Loss: 1.8002
Accuracy of the network on the 5000 validation images: 39.96 %
Epoch [2/10], Step [71/71], Loss: 1.4762
Accuracy of the network on the 5000 validation images: 48.74 %
Epoch [3/10], Step [71/71], Loss: 1.2711
Accuracy of the network on the 5000 validation images: 51.84 %
Epoch [4/10], Step [71/71], Loss: 1.1043
Accuracy of the network on the 5000 validation images: 54.6 %
Epoch [5/10], Step [71/71], Loss: 0.9780
Accuracy of the network on the 5000 validation images: 53.5 %
Epoch [6/10], Step [71/71], Loss: 0.8312
Accuracy of the network on the 5000 validation images: 57.66 %
Epoch [7/10], Step [71/71], Loss: 0.6920
Accuracy of the network on the 5000 validation images: 56.42 %
Epoch [8/10], Step [71/71], Loss: 0.5758
Accuracy of the network on the 5000 validation images: 62.44 %
Epoch [9/10], Step [71/71], Loss: 0.4355
Accuracy of the network on the 5000 validation images: 60.12 %
Epoch [10/10], Step [71/71], Loss: 0.3141
Accuracy of the network 

In [16]:
with torch.no_grad():
    correct = 0
    total = 0
    for images, labels in test_loader:
        images = images.to(device)
        labels = labels.to(device)
        outputs = model(images)
        _, predicted = torch.max(outputs.data, 1)
        total += labels.size(0)
        correct += (predicted == labels).sum().item()
        del images, labels, outputs

    print('Accuracy of the network on the {} test images: {} %'.format(10000, 100 * correct / total)) 

Accuracy of the network on the 10000 test images: 61.86 %


it is seems like there is does'nt change the accelerate the learning process. 

##**Bonus Section**