In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader
from torchvision import datasets, transforms

# Step 1: Build the CIFAR-10 input pipeline.
# Images are converted to tensors and then normalized with 0.5 as both the
# per-channel mean and the per-channel std.
transform = transforms.Compose(
    [
        transforms.ToTensor(),
        transforms.Normalize(mean=(0.5, 0.5, 0.5), std=(0.5, 0.5, 0.5)),
    ]
)

# Training split first, then the held-out test split; both share `transform`.
train_dataset, test_dataset = (
    datasets.CIFAR10(root='./data', train=is_train, download=True, transform=transform)
    for is_train in (True, False)
)

# Only the training batches are shuffled; the test order stays fixed.
train_loader = DataLoader(dataset=train_dataset, batch_size=128, shuffle=True)
test_loader = DataLoader(dataset=test_dataset, batch_size=128, shuffle=False)

# Step 2: Define the architecture of the MLP
class MLP(nn.Module):
    """Fully connected classifier with two hidden ReLU layers.

    Every dimension after the batch dimension is flattened before the first
    linear layer, so image batches can be passed in directly.

    Args:
        input_size: Number of features per sample after flattening.
        hidden_size1: Width of the first hidden layer.
        hidden_size2: Width of the second hidden layer.
        num_classes: Number of output classes.
    """

    def __init__(self, input_size, hidden_size1, hidden_size2, num_classes):
        super(MLP, self).__init__()
        self.fc1 = nn.Linear(input_size, hidden_size1)
        self.fc2 = nn.Linear(hidden_size1, hidden_size2)
        self.fc3 = nn.Linear(hidden_size2, num_classes)
        self.relu = nn.ReLU()
        # Only used by predict_proba(); deliberately NOT applied in forward().
        self.softmax = nn.Softmax(dim=1)

    def forward(self, x):
        """Return unnormalized class scores (logits), shape (batch, num_classes).

        The scores are left unnormalized because nn.CrossEntropyLoss applies
        log-softmax itself; feeding it already-softmaxed values squashes the
        scores twice, which flattens the gradients and stalls training.
        The arg-max of the logits equals the arg-max of the probabilities, so
        class predictions are unaffected.
        """
        x = x.view(x.size(0), -1)  # Flatten everything except the batch dim
        x = self.relu(self.fc1(x))
        x = self.relu(self.fc2(x))
        return self.fc3(x)

    def predict_proba(self, x):
        """Return class probabilities (each row sums to 1) for inference use."""
        return self.softmax(self.forward(x))

# Step 3: Instantiate the model
input_size = 32 * 32 * 3
hidden_size1 = 512
hidden_size2 = 256
num_classes = 10

# Run on the GPU when one is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

model = MLP(input_size, hidden_size1, hidden_size2, num_classes).to(device)

# Step 4: Define loss function and optimizer
# The optimizer is created after the model has been moved to `device`.
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

# Step 5: Train the MLP model
num_epochs = 10

for epoch in range(num_epochs):
    model.train()
    running_loss = 0.0  # sum of per-batch mean losses for this epoch
    correct = 0
    total = 0

    for images, labels in train_loader:
        # Each batch must live on the same device as the model.
        images, labels = images.to(device), labels.to(device)

        optimizer.zero_grad()
        outputs = model(images)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()

        running_loss += loss.item()
        # Predicted class = index of the largest score. argmax returns plain
        # indices, so the legacy `.data` attribute is not needed.
        predicted = outputs.argmax(dim=1)
        total += labels.size(0)
        correct += (predicted == labels).sum().item()

    print(f'Epoch [{epoch+1}/{num_epochs}], Loss: {running_loss/len(train_loader):.4f}, Accuracy: {(correct/total)*100:.2f}%')

# Step 6: Evaluate the model on the test set
model.eval()
correct = 0
total = 0

# No gradients are needed for evaluation.
with torch.no_grad():
    for images, labels in test_loader:
        images, labels = images.to(device), labels.to(device)
        outputs = model(images)
        predicted = outputs.argmax(dim=1)
        total += labels.size(0)
        correct += (predicted == labels).sum().item()

print(f'Test Accuracy: {(correct/total)*100:.2f}%')


Files already downloaded and verified
Files already downloaded and verified
Epoch [1/10], Loss: 2.1049, Accuracy: 34.87%
Epoch [2/10], Loss: 2.0661, Accuracy: 39.12%
Epoch [3/10], Loss: 2.0502, Accuracy: 40.77%
Epoch [4/10], Loss: 2.0468, Accuracy: 41.17%
Epoch [5/10], Loss: 2.0470, Accuracy: 41.18%
Epoch [6/10], Loss: 2.0377, Accuracy: 42.17%
Epoch [7/10], Loss: 2.0356, Accuracy: 42.36%
Epoch [8/10], Loss: 2.0373, Accuracy: 42.27%
Epoch [9/10], Loss: 2.0376, Accuracy: 42.27%
Epoch [10/10], Loss: 2.0368, Accuracy: 42.30%
Test Accuracy: 42.45%


In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader
from torchvision import datasets, transforms

# Step 1: Prepare the CIFAR-10 datasets and their batch loaders.
# Each image becomes a tensor and is normalized with mean 0.5 / std 0.5 on
# all three channels.
transform = transforms.Compose([
    transforms.ToTensor(),
    transforms.Normalize(mean=(0.5,) * 3, std=(0.5,) * 3),
])

# Both splits are stored under ./data and downloaded on first use.
train_dataset = datasets.CIFAR10(
    root='./data', train=True, download=True, transform=transform
)
test_dataset = datasets.CIFAR10(
    root='./data', train=False, download=True, transform=transform
)

# Shuffle training batches only; evaluation order is kept deterministic.
train_loader = DataLoader(dataset=train_dataset, batch_size=128, shuffle=True)
test_loader = DataLoader(dataset=test_dataset, batch_size=128, shuffle=False)

# Step 2: Define the architecture of the CNN
class CNN(nn.Module):
    """Small convolutional classifier with 10 outputs for 3-channel images.

    Two 3x3 convolutions (padding 1), each followed by ReLU and 2x2 max
    pooling, feed a 128-unit hidden layer and a 10-way output layer.
    ``fc1`` expects 64 * 8 * 8 features per sample, which is what a 32x32
    input yields after the two pooling steps.
    """

    def __init__(self):
        super(CNN, self).__init__()
        self.conv1 = nn.Conv2d(in_channels=3, out_channels=32, kernel_size=3, padding=1)
        self.conv2 = nn.Conv2d(in_channels=32, out_channels=64, kernel_size=3, padding=1)
        self.pool = nn.MaxPool2d(kernel_size=2, stride=2)
        self.fc1 = nn.Linear(64 * 8 * 8, 128)
        self.fc2 = nn.Linear(128, 10)

    def forward(self, x):
        """Return unnormalized class scores (logits) of shape (batch, 10).

        Raises:
            RuntimeError: If the per-sample feature count after pooling does
                not match what ``fc1`` expects (e.g. input is not 32x32).
        """
        x = self.pool(nn.functional.relu(self.conv1(x)))
        x = self.pool(nn.functional.relu(self.conv2(x)))
        # Flatten per sample. Using view(-1, 64 * 8 * 8) instead would let a
        # wrongly sized input silently merge several samples into one row
        # whenever the total element count happens to divide evenly.
        x = x.view(x.size(0), -1)
        x = nn.functional.relu(self.fc1(x))
        return self.fc2(x)

# Step 3: Instantiate the model
# Run on the GPU when one is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = CNN().to(device)

# Step 4: Define loss function and optimizer
# The optimizer is created after the model has been moved to `device`.
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

# Step 5: Train the CNN model
num_epochs = 10

for epoch in range(num_epochs):
    model.train()
    running_loss = 0.0  # sum of per-batch mean losses for this epoch
    correct = 0
    total = 0

    for images, labels in train_loader:
        # Each batch must live on the same device as the model.
        images, labels = images.to(device), labels.to(device)

        optimizer.zero_grad()
        outputs = model(images)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()

        running_loss += loss.item()
        # Predicted class = index of the largest score. argmax returns plain
        # indices, so the legacy `.data` attribute is not needed.
        predicted = outputs.argmax(dim=1)
        total += labels.size(0)
        correct += (predicted == labels).sum().item()

    print(f'Epoch [{epoch+1}/{num_epochs}], Loss: {running_loss/len(train_loader):.4f}, Accuracy: {(correct/total)*100:.2f}%')

# Step 6: Evaluate the model on the test set
model.eval()
correct = 0
total = 0

# No gradients are needed for evaluation.
with torch.no_grad():
    for images, labels in test_loader:
        images, labels = images.to(device), labels.to(device)
        outputs = model(images)
        predicted = outputs.argmax(dim=1)
        total += labels.size(0)
        correct += (predicted == labels).sum().item()

print(f'Test Accuracy: {(correct/total)*100:.2f}%')



Downloading https://www.cs.toronto.edu/~kriz/cifar-10-python.tar.gz to ./data/cifar-10-python.tar.gz


100%|██████████| 170498071/170498071 [00:04<00:00, 40258306.67it/s]


Extracting ./data/cifar-10-python.tar.gz to ./data
Files already downloaded and verified
Epoch [1/10], Loss: 1.4406, Accuracy: 48.18%
Epoch [2/10], Loss: 1.0535, Accuracy: 62.66%
Epoch [3/10], Loss: 0.8954, Accuracy: 68.48%
Epoch [4/10], Loss: 0.7915, Accuracy: 72.22%
Epoch [5/10], Loss: 0.7028, Accuracy: 75.35%
Epoch [6/10], Loss: 0.6302, Accuracy: 77.89%
Epoch [7/10], Loss: 0.5570, Accuracy: 80.44%
Epoch [8/10], Loss: 0.4873, Accuracy: 82.91%
Epoch [9/10], Loss: 0.4096, Accuracy: 85.85%
Epoch [10/10], Loss: 0.3495, Accuracy: 87.79%
Test Accuracy: 72.30%


In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader
from torchvision import datasets, transforms, models

# Step 1: Load CIFAR-10 or CIFAR-100 dataset and define transforms
# The pre-trained VGG weights were learned on ImageNet images normalized with
# these per-channel statistics, so the same normalization is used here rather
# than a generic 0.5 / 0.5 rescaling.
IMAGENET_MEAN = (0.485, 0.456, 0.406)
IMAGENET_STD = (0.229, 0.224, 0.225)

transform = transforms.Compose([
    transforms.Resize((224, 224)),  # Resize images to VGG input size
    transforms.ToTensor(),
    transforms.Normalize(IMAGENET_MEAN, IMAGENET_STD)
])

# Choose between CIFAR-10 and CIFAR-100 by picking the dataset class once, so
# the training data and the held-out test data always come from the same dataset.
# dataset_cls = datasets.CIFAR10
dataset_cls = datasets.CIFAR100
CIFAR_dataset = dataset_cls(root='./data', train=True, download=True, transform=transform)
# Official test split (train=False): these images are never seen in training.
test_dataset = dataset_cls(root='./data', train=False, download=True, transform=transform)

# Split the training data into train and validation sets
train_size = int(0.8 * len(CIFAR_dataset))
val_size = len(CIFAR_dataset) - train_size
train_dataset, val_dataset = torch.utils.data.random_split(CIFAR_dataset, [train_size, val_size])

# Create DataLoader for training, validation, and test sets
train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=32, shuffle=False)
test_loader = DataLoader(test_dataset, batch_size=32, shuffle=False)

# Check if CUDA is available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Step 2: Load pre-trained VGG model
# NOTE(review): newer torchvision releases deprecate `pretrained=` in favour of
# `weights=`; kept as-is for compatibility with the installed version.
vgg = models.vgg16(pretrained=True)

# Step 3: Replace the final classification layer with new layer(s)
# The last fully connected layer is swapped for one sized to this dataset.
num_features = vgg.classifier[6].in_features
vgg.classifier[6] = nn.Linear(num_features, len(CIFAR_dataset.classes))

# Step 4: Freeze the convolutional feature extractor
# Only `vgg.features` is frozen; every layer in `vgg.classifier` (including the
# pre-trained fully connected ones) stays trainable.
for param in vgg.features.parameters():
    param.requires_grad = False

# Move the fully assembled model to the target device in one step.
vgg = vgg.to(device)

# Step 5: Define loss function and optimizer
# Hand the optimizer only the parameters that are actually being trained.
criterion = nn.CrossEntropyLoss()
trainable_params = [p for p in vgg.parameters() if p.requires_grad]
optimizer = optim.SGD(trainable_params, lr=0.001, momentum=0.9)


def evaluate(model, loader):
    """Return the accuracy (in percent) of `model` over all batches of `loader`.

    Puts the model in eval mode and runs without gradient tracking; callers
    that continue training must switch back with `model.train()`.
    """
    model.eval()
    n_correct = 0
    n_seen = 0
    with torch.no_grad():
        for images, labels in loader:
            images, labels = images.to(device), labels.to(device)
            predicted = model(images).argmax(dim=1)
            n_seen += labels.size(0)
            n_correct += (predicted == labels).sum().item()
    return (n_correct / n_seen) * 100


# Step 6: Train the modified VGG model
num_epochs = 5

for epoch in range(num_epochs):
    vgg.train()  # re-enable training mode after the previous validation pass
    running_loss = 0.0  # sum of per-batch mean losses for this epoch
    correct = 0
    total = 0

    for images, labels in train_loader:
        images, labels = images.to(device), labels.to(device)
        optimizer.zero_grad()
        outputs = vgg(images)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()

        running_loss += loss.item()
        # Predicted class = index of the largest score.
        predicted = outputs.argmax(dim=1)
        total += labels.size(0)
        correct += (predicted == labels).sum().item()

    print(f'Training - Epoch [{epoch+1}/{num_epochs}], Loss: {running_loss/len(train_loader):.4f}, Accuracy: {(correct/total)*100:.2f}%')

    # Report accuracy on the held-out validation split after every epoch.
    val_accuracy = evaluate(vgg, val_loader)
    print(f'Validation - Epoch [{epoch+1}/{num_epochs}], Accuracy: {val_accuracy:.2f}%')

# Step 7: Evaluate the model on the test set
test_accuracy = evaluate(vgg, test_loader)

print(f'Test Accuracy: {test_accuracy:.2f}%')


Downloading https://www.cs.toronto.edu/~kriz/cifar-100-python.tar.gz to ./data/cifar-100-python.tar.gz


100%|██████████| 169001437/169001437 [00:03<00:00, 42751676.53it/s]


Extracting ./data/cifar-100-python.tar.gz to ./data


Downloading: "https://download.pytorch.org/models/vgg16-397923af.pth" to /root/.cache/torch/hub/checkpoints/vgg16-397923af.pth
100%|██████████| 528M/528M [00:06<00:00, 83.1MB/s]


Training - Epoch [1/5], Loss: 2.3413, Accuracy: 39.64%
Training - Epoch [2/5], Loss: 1.5816, Accuracy: 55.09%
Training - Epoch [3/5], Loss: 1.3380, Accuracy: 61.26%
Training - Epoch [4/5], Loss: 1.1555, Accuracy: 66.03%
Training - Epoch [5/5], Loss: 1.0014, Accuracy: 70.14%
Test Accuracy: 77.73%


The differences in performance among the MLP, CNN, and VGG-based models can be attributed to several factors:

Model Complexity: CNNs and VGG-based models are architecturally better suited to images than MLPs, although they are not necessarily larger: the MLP above has roughly 1.7 million trainable parameters, while the two-convolution CNN has roughly 0.5 million. CNNs are specifically designed to handle image data and leverage the spatial structure of images for feature extraction. VGG, being a much deeper CNN architecture, has more capacity to learn intricate patterns and features in images than either the MLP or the small CNN. This combination of image-specific structure and depth allows CNNs and VGG to capture more nuanced information from images, leading to better performance.

Feature Extraction: CNNs, including the VGG architecture, utilize convolutional layers that are capable of capturing spatial hierarchies of features in images. These layers apply filters across small regions of the input image, enabling the model to detect local patterns such as edges, textures, and shapes. As the network progresses through the layers, it learns to extract increasingly complex and abstract features. In contrast, MLPs treat images as flattened vectors, thereby disregarding the spatial relationships between pixels. This limitation makes it challenging for MLPs to effectively capture the rich structure present in images.

Parameter Sharing and Pooling: CNNs benefit from parameter sharing and pooling operations, which contribute to their ability to generalize well to new data. Parameter sharing refers to the sharing of weights across different regions of the input image, which helps reduce the number of parameters in the model and makes it more robust to variations in the input. Pooling operations (e.g., max pooling) further downsample feature maps, retaining the most relevant information while reducing computational complexity. These operations are absent in MLPs, making them more prone to overfitting and less efficient in handling image data.

Transfer Learning (for VGG): VGG is a CNN architecture, and the model used here starts from weights that were pre-trained on ImageNet, a large-scale image dataset. Transfer learning involves leveraging the knowledge gained by a model trained on one task (e.g., ImageNet classification) and applying it to a related task (e.g., image classification on a different dataset). By using pre-trained weights from VGG as initialization, the model starts with learned features that are likely to be relevant for the new dataset. This initialization helps speed up convergence during training and can lead to better performance, especially when the new dataset is small or similar to the original dataset used for pre-training. Note that the VGG experiment above was run on CIFAR-100 (100 classes), whereas the MLP and CNN were trained on CIFAR-10 (10 classes), so its accuracy figure is not directly comparable with the other two.

In summary, CNNs and VGG-based models outperform MLPs in image classification tasks due to their ability to capture spatial relationships, extract hierarchical features, and leverage parameter sharing and pooling operations. Transfer learning, in the case of VGG, further enhances performance by leveraging pre-trained representations learned from large-scale datasets.