In [None]:
import os
import torch
import torch.nn as nn
import torch.optim  as optim
import torch.nn.functional as F
from torch.utils.data import DataLoader, random_split
import torchvision.datasets as datasets
import torchvision.transforms as transforms
from PIL import Image
from tqdm import tqdm


![VGG Architecture](architecture_images\VGG_16_architecture.png)

In [2]:
VGG_types = {
    "VGG11": [64, "M", 128, "M", 256, 256, "M", 512, 512, "M", 512, 512, "M"],
    "VGG13": [64, 64, "M", 128, 128, "M", 256, 256, "M", 512, 512, "M", 512, 512, "M"],
    "VGG16": [
        64,
        64,
        "M",
        128,
        128,
        "M",
        256,
        256,
        256,
        "M",
        512,
        512,
        512,
        "M",
        512,
        512,
        512,
        "M",
    ],
    "VGG19": [
        64,
        64,
        "M",
        128,
        128,
        "M",
        256,
        256,
        256,
        256,
        "M",
        512,
        512,
        512,
        512,
        "M",
        512,
        512,
        512,
        512,
        "M",
    ],
} 

# k=3, p=1, s=1 for conv layers and max pools k=2,s=2
# Remaining --> 'FC - 4096', 'FC- 4096', 'FC- 1000'

In [3]:
class VGG_Net(nn.Module):
    def __init__(self, in_channels=3, num_classes=1000, architecture='VGG16'):
        super(VGG_Net, self).__init__()
        self.in_channels = in_channels
        self.conv_layers = self.create_conv_layers(VGG_types[architecture])
        self.fcs = nn.Sequential(
            nn.Linear(512*7*7, 4096),
            nn.ReLU(),
            nn.Dropout(p=0.5), ## Dropouts only in linear layers
            nn.Linear(4096, 4096),
            nn.ReLU(),
            nn.Dropout(p=0.5),
            nn.Linear(4096, num_classes),
        )
        
    def forward(self, x):
        x = self.conv_layers(x)
        x = x.reshape(x.shape[0], -1)
        x = self.fcs(x)
        return x

    def create_conv_layers(self, architecture):

        layers = []
        in_channels = self.in_channels

        for x in architecture:
            if type(x) == int:
                out_channels = x

                layers += [nn.Conv2d(in_channels=in_channels, out_channels=out_channels, 
                                     kernel_size=(3,3), padding=(1,1), stride=(1,1)),
                                     nn.BatchNorm2d(x), ## this wasn't included in the paper
                                     nn.ReLU()]
                in_channels = x
            elif x == 'M':
                layers += [nn.MaxPool2d(kernel_size=(2,2), stride=(2,2))]

        return nn.Sequential(*layers)
                        

                

In [4]:
model = VGG_Net(architecture='VGG16')

In [5]:
x = torch.randn(1, 3, 224, 224)
output = model(x)
print(output.shape)

torch.Size([1, 1000])


## Creating train, val and test split

In [6]:
data_transforms = {
    'train': transforms.Compose([
        transforms.Resize((224, 224)),
        transforms.RandomHorizontalFlip(),
        transforms.ToTensor(),
        transforms.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225])
    ]),
    'val': transforms.Compose([
        transforms.Resize((224, 224)),
        transforms.ToTensor(),
        transforms.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225])
    ]),
    'test': transforms.Compose([
        transforms.Resize((224, 224)),
        transforms.ToTensor(),
        transforms.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225])
    ]),
}

In [7]:
data_dir = 'cats_vs_dogs_mini_dataset'
dataset = datasets.ImageFolder(os.path.join(data_dir), transform=data_transforms['train'])

In [8]:
train_size = int(0.7 * len(dataset))
val_size = int(0.15 * len(dataset))
test_size = len(dataset) - train_size - val_size

In [9]:
train_dataset, val_dataset, test_dataset = random_split(dataset, [train_size, val_size, test_size])

In [10]:
val_dataset.dataset.transform = data_transforms['val']
test_dataset.dataset.transform = data_transforms['test']

In [11]:
batch_size = 24

train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=True)

In [12]:
train_data_iter = iter(train_loader)

# Get the next batch
images, labels = next(train_data_iter)
print(images.shape)
print(labels)
print(train_dataset.dataset.class_to_idx)

torch.Size([24, 3, 224, 224])
tensor([0, 0, 1, 1, 0, 1, 0, 0, 1, 1, 0, 1, 1, 1, 0, 1, 0, 1, 0, 0, 1, 1, 0, 0])
{'cats_set': 0, 'dogs_set': 1}


In [13]:
train_dataset.dataset.classes

['cats_set', 'dogs_set']

## Setting up the device

In [14]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(device)

cuda


In [15]:
model = VGG_Net(num_classes=len(train_dataset.dataset.classes), architecture='VGG11').to(device)

In [16]:
# Define loss function and optimizer
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.0001)

## Training

In [None]:
def train_model(model, train_loader, val_loader, criterion, optimizer, num_epochs=10, patience=3, save_path="saved_best_models/vgg_best_model.pth"):
    best_val_loss = float('inf')  # Initialize best validation loss to a large value
    epochs_no_improve = 0  # Counter for early stopping
    
    for epoch in range(num_epochs):
        # Training phase
        model.train()
        train_loss = 0.0
        correct = 0
        total = 0
        for images, labels in tqdm(train_loader):
            images, labels = images.to(device), labels.to(device)
            optimizer.zero_grad()
            
            # Forward pass
            outputs = model(images)
            loss = criterion(outputs, labels)
            train_loss += loss.item()
            
            # Backward and optimize
            loss.backward()
            optimizer.step()
            
            # Calculate train accuracy
            _, predicted = torch.max(outputs, 1)
            total += labels.size(0)
            correct += (predicted == labels).sum().item()
        
        train_loss /= len(train_loader)
        train_accuracy = 100 * correct / total
        
        # Validation phase
        model.eval()
        val_loss = 0.0
        correct = 0
        total = 0
        with torch.no_grad():
            for images, labels in val_loader:
                images, labels = images.to(device), labels.to(device)
                
                outputs = model(images)
                loss = criterion(outputs, labels)
                val_loss += loss.item()
                
                # Calculate validation accuracy
                _, predicted = torch.max(outputs, 1)
                total += labels.size(0)
                correct += (predicted == labels).sum().item()
        
        val_loss /= len(val_loader)
        val_accuracy = 100 * correct / total
        
        # Check if validation loss improved
        if val_loss < best_val_loss:
            best_val_loss = val_loss
            epochs_no_improve = 0  # Reset early stopping counter
            torch.save(model.state_dict(), save_path)  # Save best model
            print(f"Best model saved with val_loss: {best_val_loss:.4f}")
        # else:
        #     epochs_no_improve += 1
        
        # # Early stopping check
        # if epochs_no_improve >= patience:
        #     print("Early stopping triggered.")
        #     break
        
        print(f"Epoch [{epoch + 1}/{num_epochs}], "
              f"Train Loss: {train_loss:.4f}, Train Acc: {train_accuracy:.2f}%, "
              f"Val Loss: {val_loss:.4f}, Val Acc: {val_accuracy:.2f}%")

In [21]:
num_epochs = 10
train_model(model, train_loader, val_loader, criterion, optimizer, num_epochs)


100%|██████████| 30/30 [00:15<00:00,  1.90it/s]


Best model saved with val_loss: 0.7739
Epoch [1/10], Train Loss: 0.8781, Train Acc: 51.86%, Val Loss: 0.7739, Val Acc: 52.67%


100%|██████████| 30/30 [00:14<00:00,  2.08it/s]


Best model saved with val_loss: 0.6696
Epoch [2/10], Train Loss: 0.8622, Train Acc: 53.14%, Val Loss: 0.6696, Val Acc: 60.00%


100%|██████████| 30/30 [00:13<00:00,  2.28it/s]


Best model saved with val_loss: 0.6108
Epoch [3/10], Train Loss: 0.8004, Train Acc: 52.86%, Val Loss: 0.6108, Val Acc: 62.67%


100%|██████████| 30/30 [00:12<00:00,  2.32it/s]


Epoch [4/10], Train Loss: 0.7037, Train Acc: 57.86%, Val Loss: 0.6624, Val Acc: 62.67%


100%|██████████| 30/30 [00:12<00:00,  2.39it/s]


Epoch [5/10], Train Loss: 0.6664, Train Acc: 62.43%, Val Loss: 0.7297, Val Acc: 51.33%


100%|██████████| 30/30 [00:12<00:00,  2.42it/s]


Epoch [6/10], Train Loss: 0.6192, Train Acc: 64.00%, Val Loss: 0.9732, Val Acc: 53.33%


100%|██████████| 30/30 [00:12<00:00,  2.39it/s]


Epoch [7/10], Train Loss: 0.6844, Train Acc: 63.14%, Val Loss: 0.6339, Val Acc: 60.67%


100%|██████████| 30/30 [00:13<00:00,  2.25it/s]


Epoch [8/10], Train Loss: 0.5802, Train Acc: 68.43%, Val Loss: 0.7147, Val Acc: 61.33%


100%|██████████| 30/30 [00:14<00:00,  2.13it/s]


Epoch [9/10], Train Loss: 0.5940, Train Acc: 67.14%, Val Loss: 0.6434, Val Acc: 58.67%


100%|██████████| 30/30 [00:14<00:00,  2.04it/s]


Epoch [10/10], Train Loss: 0.5585, Train Acc: 70.57%, Val Loss: 0.6164, Val Acc: 71.33%


## Test set eval

In [22]:
def evaluate_model(model, test_loader):
    model.eval()
    correct = 0
    total = 0
    with torch.no_grad():
        for images, labels in test_loader:
            images, labels = images.to(device), labels.to(device)
            outputs = model(images)
            _, predicted = torch.max(outputs, 1)
            total += labels.size(0)
            correct += (predicted == labels).sum().item()
    
    test_accuracy = 100 * correct / total
    print(f"Test Accuracy: {test_accuracy:.2f}%")

# Run evaluation
evaluate_model(model, test_loader)

Test Accuracy: 72.67%


## Load and Inference

In [37]:
def load_model(model, load_path="best_model.pth"):
    model.load_state_dict(torch.load(load_path, weights_only=True))
    model.eval()
    return model

In [38]:
def infer(model, image):
    model = load_model(model, load_path=r"saved_best_models\vgg_best_model.pth")
    image = image.to(device)
    with torch.no_grad():
        output = model(image.unsqueeze(0))
        _, predicted = torch.max(output, 1)
    return predicted.item()

In [39]:
def load_image(image_path):
    image = Image.open(image_path).convert('RGB')
    image = data_transforms['test'](image)
    return image

In [40]:
image_path = "cats_vs_dogs_mini_dataset\dogs_set\dog.4014.jpg"
class_index = {value:key.split("_")[0] for key, value in train_dataset.dataset.class_to_idx.items()}
image_tensor = load_image(image_path)
predicted_label = infer(model, image_tensor)

print("Predicted Class:", class_index[predicted_label])

Predicted Class: dogs
