In [1]:
import os
import torch
import torch.nn as nn
import torch.optim  as optim
import torch.nn.functional as F
from torch.utils.data import DataLoader, random_split
import torchvision.datasets as datasets
import torchvision.transforms as transforms
from PIL import Image
from tqdm import tqdm


The core idea behind InceptionNet is the Inception Module. Traditional CNNs stack layers like convolutions and pooling sequentially. However, in the Inception module, multiple convolution operations (with different filter sizes) and a pooling operation are performed in parallel within a single layer. These operations capture information at different spatial scales simultaneously, which improves feature extraction. The outputs are then concatenated along the depth (channel) dimension.

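- 1x1 Convolution: Used as its own branch to mix information across channels, and also placed before the 3x3 and 5x5 convolutions (and after the pooling branch) to reduce the number of channels, which keeps the computational cost of the wider filters manageable.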
- 3x3 Convolution: This helps capture medium-sized spatial features.
- 5x5 Convolution: This captures larger spatial features.
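- Max Pooling: A 3x3 max pool with stride 1 and padding 1 that keeps the strongest local responses. Inside the Inception module it preserves the spatial size (downsampling is done by separate stride-2 pooling layers between stages) and is followed by a 1x1 convolution that sets the number of output channels.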

In [2]:
class GoogleNet(nn.Module):
    """GoogLeNet-style classifier assembled from ``conv_block`` and ``Inception_block``.

    Layout: a convolutional stem, nine Inception blocks grouped into three
    stages separated by stride-2 max pooling, global average pooling, dropout
    and one linear layer producing the class logits. No auxiliary classifier
    heads are built.

    Args:
        in_channels: Number of channels of the input images.
        num_classes: Number of logits produced per image.
    """

    def __init__(self, in_channels=3,num_classes=1000):
        super(GoogleNet, self).__init__()

        # Stem: every stride-2 layer halves the spatial resolution.
        self.conv1 = conv_block(in_channels=in_channels, out_channels=64, kernel_size=(7,7),
                                stride=(2,2), padding=(3,3))
        self.maxpool1 = nn.MaxPool2d(kernel_size=3, stride=2, padding=1)
        self.conv2 = conv_block(64, 192, kernel_size=3, stride=1, padding=1)
        self.maxpool2 = nn.MaxPool2d(kernel_size=3, stride=2, padding=1)

        # Argument order: in_channels, out_1x1, red_3x3, out_3x3, red_5x5, out_5x5, out_1x1pool.
        # A block outputs out_1x1 + out_3x3 + out_5x5 + out_1x1pool channels, which must
        # equal the in_channels of the block that follows it.
        self.inception3a = Inception_block(192, 64, 96, 128, 16, 32, 32)      # -> 256
        self.inception3b = Inception_block(256, 128, 128, 192, 32,96, 64)     # -> 480
        self.maxpool3 = nn.MaxPool2d(kernel_size=3, stride=2, padding=1)

        self.inception4a = Inception_block(480, 192, 96, 208, 16, 48, 64)     # -> 512
        self.inception4b = Inception_block(512, 160, 112, 224, 24, 64, 64)    # -> 512
        self.inception4c = Inception_block(512, 128, 128, 256, 24, 64, 64)    # -> 512
        self.inception4d = Inception_block(512, 112, 144, 288, 32, 64, 64)    # -> 528
        self.inception4e = Inception_block(528, 256, 160, 320, 32, 128, 128)  # -> 832
        self.maxpool4 = nn.MaxPool2d(kernel_size=3, stride=2, padding=1)

        self.inception5a = Inception_block(832, 256, 160, 320, 32, 128, 128)  # -> 832
        self.inception5b = Inception_block(832, 384, 192, 384, 48, 128, 128)  # -> 1024

        # Global average pooling. The adaptive layer always emits a 1x1 map, so the
        # classifier is no longer tied to the 7x7 feature map that only a 224x224
        # input produces; for 224x224 inputs the result is the same 7x7 average.
        self.avgpool = nn.AdaptiveAvgPool2d((1, 1))
        self.dropout = nn.Dropout(p=0.4)
        self.linear = nn.Linear(1024, num_classes)

    def forward(self, x):
        """Return logits of shape ``(batch, num_classes)`` for an image batch ``x``."""
        # Stem
        x = self.conv1(x)
        x = self.maxpool1(x)
        x = self.conv2(x)
        x = self.maxpool2(x)

        # Inception stage 3
        x = self.inception3a(x)
        x = self.inception3b(x)
        x = self.maxpool3(x)

        # Inception stage 4
        x = self.inception4a(x)
        x = self.inception4b(x)
        x = self.inception4c(x)
        x = self.inception4d(x)
        x = self.inception4e(x)
        x = self.maxpool4(x)

        # Inception stage 5
        x = self.inception5a(x)
        x = self.inception5b(x)

        # Classifier head: (batch, 1024, 1, 1) -> (batch, 1024) -> (batch, num_classes)
        x = self.avgpool(x)
        x = torch.flatten(x, 1)
        x = self.dropout(x)
        x = self.linear(x)

        return x


class Inception_block(nn.Module):
    """Inception module: four parallel branches concatenated along the channel axis.

    Args:
        in_channels: Channels of the incoming feature map.
        out_1x1: Output channels of the plain 1x1 branch.
        red_3x3: Channels after the 1x1 reduction that precedes the 3x3 convolution.
        out_3x3: Output channels of the 3x3 branch.
        red_5x5: Channels after the 1x1 reduction that precedes the 5x5 convolution.
        out_5x5: Output channels of the 5x5 branch.
        out_1x1pool: Output channels of the 1x1 convolution that follows the max pool.
    """

    def __init__(self, in_channels, out_1x1, red_3x3, out_3x3, red_5x5, out_5x5, out_1x1pool):
        super().__init__()

        # Branch 1: a single 1x1 convolution.
        self.branch1 = conv_block(in_channels, out_1x1, kernel_size=(1,1))

        # Branch 2: 1x1 channel reduction, then a 3x3 convolution (padding keeps H x W).
        reduce_3x3 = conv_block(in_channels, red_3x3, kernel_size=1)
        expand_3x3 = conv_block(red_3x3, out_3x3, kernel_size=3, stride=1, padding=1)
        self.branch2 = nn.Sequential(reduce_3x3, expand_3x3)

        # Branch 3: 1x1 channel reduction, then a 5x5 convolution (padding keeps H x W).
        reduce_5x5 = conv_block(in_channels, red_5x5, kernel_size=1)
        expand_5x5 = conv_block(red_5x5, out_5x5, kernel_size=5, stride=1, padding=2)
        self.branch3 = nn.Sequential(reduce_5x5, expand_5x5)

        # Branch 4: stride-1 max pool (keeps H x W), then a 1x1 projection.
        pool = nn.MaxPool2d(kernel_size=3, stride=1, padding=1)
        pool_projection = conv_block(in_channels, out_1x1pool, kernel_size=1, stride=1)
        self.branch4 = nn.Sequential(pool, pool_projection)

    def forward(self, x):
        """Run every branch on ``x`` and stack the results on the channel dimension."""
        # All branches preserve the spatial size, so the outputs only differ in
        # channel count and can be concatenated on dim 1.
        branches = (self.branch1, self.branch2, self.branch3, self.branch4)
        branch_outputs = [branch(x) for branch in branches]
        return torch.cat(branch_outputs, 1)

class conv_block(nn.Module):
    """Convolution followed by batch normalisation and ReLU.

    Args:
        in_channels: Channels of the input feature map.
        out_channels: Channels produced by the convolution.
        **kwargs: Forwarded unchanged to ``nn.Conv2d`` (e.g. ``kernel_size``,
            ``stride``, ``padding``).
    """

    def __init__(self, in_channels, out_channels, **kwargs):
        super().__init__()
        self.relu = nn.ReLU()
        self.conv = nn.Conv2d(in_channels, out_channels, **kwargs)
        self.batchnorm = nn.BatchNorm2d(out_channels)

    def forward(self, x):
        """Apply conv -> batch norm -> ReLU to ``x``."""
        features = self.conv(x)
        normalized = self.batchnorm(features)
        return self.relu(normalized)

In [3]:
# Smoke test: push one random 224x224 RGB image through an untrained network
# and confirm that the logits have shape (batch, num_classes).
x = torch.randn(1, 3, 224, 224)
model = GoogleNet()
logits = model(x)
print(logits.shape)

torch.Size([1, 1000])


## Loading Dataset

In [4]:
# Preprocessing shared by every split: resize to the network's input size, convert
# to a tensor, then normalise each channel (these mean/std values are the ones
# commonly quoted for ImageNet).
_INPUT_SIZE = (224, 224)
_CHANNEL_MEAN = [0.485, 0.456, 0.406]
_CHANNEL_STD = [0.229, 0.224, 0.225]


def _make_transform(augment):
    """Build the preprocessing pipeline; ``augment`` adds a random horizontal flip."""
    steps = [transforms.Resize(_INPUT_SIZE)]
    if augment:
        steps.append(transforms.RandomHorizontalFlip())
    steps.append(transforms.ToTensor())
    steps.append(transforms.Normalize(_CHANNEL_MEAN, _CHANNEL_STD))
    return transforms.Compose(steps)


# Only the training pipeline is augmented; validation and test are deterministic.
data_transforms = {
    'train': _make_transform(augment=True),
    'val': _make_transform(augment=False),
    'test': _make_transform(augment=False),
}

In [5]:
# Root folder containing one sub-directory per class (ImageFolder layout).
data_dir = 'cats_vs_dogs_mini_dataset'
dataset = datasets.ImageFolder(root=data_dir, transform=data_transforms['train'])

In [6]:
# 70 % train / 15 % validation; the test split takes whatever remains so that the
# three sizes always add up to the full dataset despite the int() truncation.
num_samples = len(dataset)
train_size = int(0.7 * num_samples)
val_size = int(0.15 * num_samples)
test_size = num_samples - (train_size + val_size)

In [7]:
# Seed the split so that train/val/test membership is identical on every run.
# Without a fixed generator the split changes whenever the kernel restarts, so a
# checkpoint reloaded later could be "tested" on images it was trained on.
split_generator = torch.Generator().manual_seed(42)
train_dataset, val_dataset, test_dataset = random_split(
    dataset, [train_size, val_size, test_size], generator=split_generator
)

In [8]:
# random_split returns Subset views that all wrap the SAME ImageFolder object, so
# assigning `val_dataset.dataset.transform` would also replace the transform used
# by train_dataset and silently remove the training augmentation. Give the
# validation and test subsets their own ImageFolder instead, reusing the sample
# indices chosen by the split so that membership of each split is unchanged.
val_dataset = torch.utils.data.Subset(
    datasets.ImageFolder(data_dir, transform=data_transforms['val']),
    val_dataset.indices,
)
test_dataset = torch.utils.data.Subset(
    datasets.ImageFolder(data_dir, transform=data_transforms['test']),
    test_dataset.indices,
)

In [9]:
batch_size = 24

# Only the training data needs a fresh random order every epoch. Validation and
# test metrics do not depend on batch order, so those loaders are left unshuffled
# to keep evaluation deterministic.
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False)
test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)

In [10]:
# Pull a single batch from the training loader as a sanity check.
train_data_iter = iter(train_loader)
images, labels = next(train_data_iter)

# Show the batch shape, the integer labels, and the class-name -> index mapping.
for item in (images.shape, labels, train_dataset.dataset.class_to_idx):
    print(item)

torch.Size([24, 3, 224, 224])
tensor([1, 1, 0, 0, 0, 0, 0, 0, 1, 0, 1, 1, 0, 1, 0, 0, 1, 0, 1, 0, 1, 1, 1, 0])
{'cats_set': 0, 'dogs_set': 1}


In [11]:
# Class names of the dataset that the training subset wraps (displayed as the cell output).
train_dataset.dataset.classes

['cats_set', 'dogs_set']

In [12]:
# Prefer the GPU when CUDA is available; otherwise fall back to the CPU.
device_name = "cuda" if torch.cuda.is_available() else "cpu"
device = torch.device(device_name)
print(device)

cuda


In [14]:
# Size the network from the data: input channels come from the sampled batch
# (dimension 1 of the image tensor), output units from the number of classes.
in_channels = images.shape[1]
num_classes = len(train_dataset.dataset.classes)
model = GoogleNet(in_channels, num_classes=num_classes).to(device)

In [15]:
# Define loss function and optimizer
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.0001)

## Training

In [16]:
def train_model(model, train_loader, val_loader, criterion, optimizer, num_epochs=10, patience=3, save_path="saved_best_models/googlenet_best_model.pth"):
    """Train ``model`` and checkpoint the weights with the lowest validation loss.

    Each epoch runs one optimisation pass over ``train_loader`` and one
    evaluation pass over ``val_loader``. Whenever the mean validation loss
    improves on the best value seen so far, the model's ``state_dict`` is
    written to ``save_path``. A summary line is printed after every epoch.

    Batches are moved to the notebook-level ``device`` before the forward pass.

    Args:
        model: Network to train; updated in place.
        train_loader: Iterable of ``(images, labels)`` training batches.
        val_loader: Iterable of ``(images, labels)`` validation batches.
        criterion: Loss called as ``criterion(outputs, labels)``.
        optimizer: Optimizer that updates ``model``'s parameters.
        num_epochs: Number of epochs to run.
        patience: Accepted for interface compatibility only. Early stopping is
            not applied, so this value currently has no effect and training
            always runs for ``num_epochs`` epochs.
        save_path: File that receives the best ``state_dict``.
    """
    # torch.save does not create missing parent directories, so make sure the
    # checkpoint folder exists before the first save is attempted.
    save_dir = os.path.dirname(save_path)
    if save_dir:
        os.makedirs(save_dir, exist_ok=True)

    best_val_loss = float('inf')  # Any real validation loss improves on this

    for epoch in range(num_epochs):
        train_loss, train_accuracy = _run_epoch(
            model, train_loader, criterion, optimizer=optimizer, show_progress=True
        )
        val_loss, val_accuracy = _run_epoch(model, val_loader, criterion)

        # Keep only the checkpoint with the lowest validation loss.
        if val_loss < best_val_loss:
            best_val_loss = val_loss
            torch.save(model.state_dict(), save_path)
            print(f"Best model saved with val_loss: {best_val_loss:.4f}")

        print(f"Epoch [{epoch + 1}/{num_epochs}], "
              f"Train Loss: {train_loss:.4f}, Train Acc: {train_accuracy:.2f}%, "
              f"Val Loss: {val_loss:.4f}, Val Acc: {val_accuracy:.2f}%")


def _run_epoch(model, loader, criterion, optimizer=None, show_progress=False):
    """Run one pass over ``loader``; return ``(mean per-batch loss, accuracy in percent)``.

    With an ``optimizer`` the model is put in training mode and updated after
    every batch. Without one the model is put in eval mode and gradient
    tracking is disabled.
    """
    training = optimizer is not None
    model.train(training)  # model.train(False) is equivalent to model.eval()

    loss_sum = 0.0
    correct = 0
    total = 0
    batches = tqdm(loader) if show_progress else loader

    with torch.set_grad_enabled(training):
        for images, labels in batches:
            images, labels = images.to(device), labels.to(device)
            if training:
                optimizer.zero_grad()

            # Forward pass
            outputs = model(images)
            loss = criterion(outputs, labels)
            loss_sum += loss.item()

            # Backward pass and parameter update (training only)
            if training:
                loss.backward()
                optimizer.step()

            # The predicted class is the index of the largest logit
            _, predicted = torch.max(outputs, 1)
            total += labels.size(0)
            correct += (predicted == labels).sum().item()

    return loss_sum / len(loader), 100 * correct / total

In [17]:
# Train for a fixed number of epochs; the best checkpoint is written by train_model.
num_epochs = 10
train_model(model, train_loader, val_loader, criterion, optimizer, num_epochs=num_epochs)

100%|██████████| 30/30 [00:16<00:00,  1.84it/s]


Best model saved with val_loss: 0.7554
Epoch [1/10], Train Loss: 0.6778, Train Acc: 58.43%, Val Loss: 0.7554, Val Acc: 44.67%


100%|██████████| 30/30 [00:17<00:00,  1.74it/s]


Epoch [2/10], Train Loss: 0.5561, Train Acc: 71.14%, Val Loss: 0.9952, Val Acc: 61.33%


100%|██████████| 30/30 [00:21<00:00,  1.43it/s]


Epoch [3/10], Train Loss: 0.4811, Train Acc: 77.00%, Val Loss: 0.9331, Val Acc: 62.67%


100%|██████████| 30/30 [00:25<00:00,  1.19it/s]


Epoch [4/10], Train Loss: 0.3339, Train Acc: 85.00%, Val Loss: 1.6763, Val Acc: 50.67%


100%|██████████| 30/30 [00:19<00:00,  1.54it/s]


Epoch [5/10], Train Loss: 0.2856, Train Acc: 87.14%, Val Loss: 1.4459, Val Acc: 57.33%


100%|██████████| 30/30 [00:19<00:00,  1.54it/s]


Epoch [6/10], Train Loss: 0.2390, Train Acc: 90.71%, Val Loss: 0.8266, Val Acc: 64.67%


100%|██████████| 30/30 [00:18<00:00,  1.62it/s]


Epoch [7/10], Train Loss: 0.2813, Train Acc: 88.71%, Val Loss: 0.8432, Val Acc: 67.33%


100%|██████████| 30/30 [00:22<00:00,  1.35it/s]


Epoch [8/10], Train Loss: 0.2194, Train Acc: 90.86%, Val Loss: 1.3733, Val Acc: 58.67%


100%|██████████| 30/30 [00:22<00:00,  1.31it/s]


Epoch [9/10], Train Loss: 0.1304, Train Acc: 95.29%, Val Loss: 1.9211, Val Acc: 58.00%


100%|██████████| 30/30 [00:23<00:00,  1.27it/s]


Epoch [10/10], Train Loss: 0.1023, Train Acc: 95.71%, Val Loss: 1.1053, Val Acc: 68.67%


## Test set eval

In [18]:
def evaluate_model(model, test_loader):
    """Print and return the classification accuracy of ``model`` on ``test_loader``.

    The model is evaluated as passed in (in eval mode, without gradient
    tracking); no checkpoint is loaded here. Batches are moved to the
    notebook-level ``device`` before the forward pass.

    Args:
        model: Network to evaluate.
        test_loader: Iterable of ``(images, labels)`` batches.

    Returns:
        Accuracy as a percentage in the range 0-100.

    Raises:
        ValueError: If ``test_loader`` yields no samples.
    """
    model.eval()
    correct = 0
    total = 0
    with torch.no_grad():
        for images, labels in test_loader:
            images, labels = images.to(device), labels.to(device)
            outputs = model(images)
            # The predicted class is the index of the largest logit
            _, predicted = torch.max(outputs, 1)
            total += labels.size(0)
            correct += (predicted == labels).sum().item()

    if total == 0:
        # Fail with a clear message instead of a bare ZeroDivisionError.
        raise ValueError("test_loader yielded no samples; cannot compute accuracy")

    test_accuracy = 100 * correct / total
    print(f"Test Accuracy: {test_accuracy:.2f}%")
    return test_accuracy

# Run evaluation on the in-memory model (the weights left after the final epoch)
test_accuracy = evaluate_model(model, test_loader)

Test Accuracy: 70.67%


## Load and infer

In [19]:
def load_model(model, load_path="best_model.pth"):
    """Load a saved ``state_dict`` into ``model`` and switch it to eval mode.

    Args:
        model: Module whose architecture matches the saved weights; updated in place.
        load_path: Path of a file written with ``torch.save(model.state_dict(), ...)``.

    Returns:
        The same ``model`` instance, with the loaded weights, in eval mode.
    """
    # Deserialize onto the CPU first: a checkpoint saved from a CUDA model would
    # otherwise fail to load on a machine without CUDA. load_state_dict then
    # copies the values into the model's existing parameters on whatever device
    # they already live on.
    state_dict = torch.load(load_path, map_location="cpu", weights_only=True)
    model.load_state_dict(state_dict)
    model.eval()
    return model

In [20]:
def infer(model, image, load_path="saved_best_models/googlenet_best_model.pth"):
    """Predict the class index of a single preprocessed image.

    The checkpoint at ``load_path`` is loaded into ``model`` on every call
    (via ``load_model``), which also puts the model in eval mode.

    Args:
        model: Network whose architecture matches the checkpoint; its weights
            are overwritten by the checkpoint.
        image: Image tensor without a batch dimension; a batch dimension of
            size 1 is added before the forward pass.
        load_path: Checkpoint file to load. The default uses forward slashes,
            which work on Windows as well as POSIX systems, and matches the
            default ``save_path`` of ``train_model``.

    Returns:
        The predicted class index as a Python ``int``.
    """
    model = load_model(model, load_path=load_path)
    image = image.to(device)
    with torch.no_grad():
        output = model(image.unsqueeze(0))
        # The predicted class is the index of the largest logit
        _, predicted = torch.max(output, 1)
    return predicted.item()

In [21]:
def load_image(image_path):
    """Open the image at ``image_path`` as RGB and apply the 'test' preprocessing pipeline."""
    rgb_image = Image.open(image_path).convert('RGB')
    preprocess = data_transforms['test']
    return preprocess(rgb_image)

In [29]:
# Build the path from its components. The previous backslash-separated literal
# only worked on Windows and contained the invalid escape sequence "\c".
image_path = os.path.join("cats_vs_dogs_mini_dataset", "cats_set", "cat.4001.jpg")

# Invert class_to_idx and keep the text before the first underscore,
# e.g. {'cats_set': 0, 'dogs_set': 1} -> {0: 'cats', 1: 'dogs'}.
class_index = {value:key.split("_")[0] for key, value in train_dataset.dataset.class_to_idx.items()}
image_tensor = load_image(image_path)
predicted_label = infer(model, image_tensor)

print("Predicted Class:", class_index[predicted_label])

Predicted Class: cats
