CNN on fashion MNIST dataset.
1. Basic CNN.
2. Optimized CNN.
3. Using trained CNN.

In [1]:
import pandas as pd
import torch
from torch.utils.data import Dataset, DataLoader
import torch.nn as nn
import torch.optim as optim
import matplotlib.pyplot as plt

In [2]:
torch.manual_seed(42)
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f"Using device: {device}")
from google.colab import drive
drive.mount('/content/drive')

Using device: cuda
Mounted at /content/drive


In [3]:
file_location_train = "/content/drive/MyDrive/PyTorch/Dataset/fashion-mnist_train.csv"
file_location_test = "/content/drive/MyDrive/PyTorch/Dataset/fashion-mnist_test.csv"
train_data = pd.read_csv(file_location_train)
test_data = pd.read_csv(file_location_test)
X_train = train_data.iloc[: , 1 : ].values
y_train = train_data.iloc[:, 0].values
X_test = test_data.iloc[ : , 1 : ].values
y_test = test_data.iloc[ : , 0].values
print(f"Shape of X_train, y_train, X_test, y_test: {X_train.shape, y_train.shape, X_test.shape, y_test.shape}")

Shape of X_train, y_train, X_test, y_test: ((60000, 784), (60000,), (10000, 784), (10000,))


In [4]:
X_train = X_train/255.0
X_test = X_test/255.0

In [None]:
class CustomDataset(Dataset):

  def __init__(self, features, labels):

    #Converting 1d input into 2d
    self.features = torch.tensor(features, dtype = torch.float32).reshape(-1, 1, 28, 28) # Here -1 --> place holder coz we don't know our batch size yet. 1 --> number of Channels.
    self.labels = torch.tensor(labels, dtype = torch.long)

  def __len__(self):
    return len(self.features)

  def __getitem__(self, idx):
    return self.features[idx], self.labels[idx]

In [None]:
train_dataset = CustomDataset(X_train, y_train)
test_dataset = CustomDataset(X_test, y_test)
train_loader = DataLoader(train_dataset, batch_size = 32, shuffle = True, pin_memory = True)
test_loader = DataLoader(test_dataset, batch_size = 32, shuffle = True, pin_memory = True)

In [None]:
class MyNN(nn.Module):

  def __init__(self, input_features):
    super().__init__()

    self.features = nn.Sequential(
        nn.Conv2d(input_features, 32, kernel_size = 3, padding = 'same'),
        nn.BatchNorm2d(32),
        nn.ReLU(),
        nn.MaxPool2d(kernel_size = 2, stride = 2),

        nn.Conv2d(32, 64, kernel_size = 3, padding = 'same'),
        nn.BatchNorm2d(64),
        nn.ReLU(),
        nn.MaxPool2d(kernel_size = 2, stride = 2)
    )
    self.classifier = nn.Sequential(
        nn.Flatten(),
        nn.Linear(64*7*7, 128),
        nn.ReLU(),
        nn.Dropout(p = 0.4),
        nn.Linear(128, 64),
        nn.ReLU(),
        nn.Dropout(p = 0.4),

        nn.Linear(64, 10),
    )

  def forward(self, x):

    x = self.features(x)
    x = self.classifier(x)

    return x

In [None]:
learning_rate = 0.01
epochs = 100

In [None]:
model = MyNN(1)

model.to(device)

criterion = nn.CrossEntropyLoss()
optimizer = optim.SGD(model.parameters(), lr=0.1, weight_decay=1e-4)

In [None]:
# training loop

for epoch in range(epochs):

  total_epoch_loss = 0

  for batch_features, batch_labels in train_loader:

    # move data to gpu
    batch_features, batch_labels = batch_features.to(device), batch_labels.to(device)

    # forward pass
    outputs = model(batch_features)

    # calculate loss
    loss = criterion(outputs, batch_labels)

    # back pass
    optimizer.zero_grad()
    loss.backward()

    # update grads
    optimizer.step()

    total_epoch_loss = total_epoch_loss + loss.item()

  avg_loss = total_epoch_loss/len(train_loader)
  print(f'Epoch: {epoch + 1} , Loss: {avg_loss}')

Epoch: 1 , Loss: 0.6268563764969508
Epoch: 2 , Loss: 0.41080416830380756
Epoch: 3 , Loss: 0.34822434592644375
Epoch: 4 , Loss: 0.3114048192679882
Epoch: 5 , Loss: 0.2893591184139252
Epoch: 6 , Loss: 0.27568318490882715
Epoch: 7 , Loss: 0.2557085605661074
Epoch: 8 , Loss: 0.24452517103354135
Epoch: 9 , Loss: 0.23053308239926895
Epoch: 10 , Loss: 0.22152272637784481
Epoch: 11 , Loss: 0.21389741609891255
Epoch: 12 , Loss: 0.20482621880571047
Epoch: 13 , Loss: 0.20119932728509107
Epoch: 14 , Loss: 0.19400162131289642
Epoch: 15 , Loss: 0.18478751401901244
Epoch: 16 , Loss: 0.17999691482335328
Epoch: 17 , Loss: 0.17381538389002282
Epoch: 18 , Loss: 0.16786805733342966
Epoch: 19 , Loss: 0.16518814025223255
Epoch: 20 , Loss: 0.1568270253246029
Epoch: 21 , Loss: 0.1555611581961314
Epoch: 22 , Loss: 0.15201763098413745
Epoch: 23 , Loss: 0.147805726424853
Epoch: 24 , Loss: 0.1450796476751566
Epoch: 25 , Loss: 0.1392447899316748
Epoch: 26 , Loss: 0.14015088228322564
Epoch: 27 , Loss: 0.13878519051

In [None]:
model.eval()

MyNN(
  (features): Sequential(
    (0): Conv2d(1, 32, kernel_size=(3, 3), stride=(1, 1), padding=same)
    (1): BatchNorm2d(32, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (2): ReLU()
    (3): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
    (4): Conv2d(32, 64, kernel_size=(3, 3), stride=(1, 1), padding=same)
    (5): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (6): ReLU()
    (7): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
  )
  (classifier): Sequential(
    (0): Flatten(start_dim=1, end_dim=-1)
    (1): Linear(in_features=3136, out_features=128, bias=True)
    (2): ReLU()
    (3): Dropout(p=0.4, inplace=False)
    (4): Linear(in_features=128, out_features=64, bias=True)
    (5): ReLU()
    (6): Dropout(p=0.4, inplace=False)
    (7): Linear(in_features=64, out_features=10, bias=True)
  )
)

In [None]:
# evaluation on test data
total = 0
correct = 0

with torch.no_grad():

  for batch_features, batch_labels in test_loader:

    # move data to gpu
    batch_features, batch_labels = batch_features.to(device), batch_labels.to(device)

    outputs = model(batch_features)

    _, predicted = torch.max(outputs, 1)

    total = total + batch_labels.shape[0]

    correct = correct + (predicted == batch_labels).sum().item()

print(correct/total)

0.9273


In [None]:
# evaluation on training data
total = 0
correct = 0

with torch.no_grad():

  for batch_features, batch_labels in train_loader:

    # move data to gpu
    batch_features, batch_labels = batch_features.to(device), batch_labels.to(device)

    outputs = model(batch_features)

    _, predicted = torch.max(outputs, 1)

    total = total + batch_labels.shape[0]

    correct = correct + (predicted == batch_labels).sum().item()

print(correct/total)

0.9956166666666667


## Here we can see that our model is overfitting. To solve that problem we are going to do some changes in our code.

1. Data Augmentation.
2. Optuna.
  - Number of conv. layers.
  - Number of filters.
  - Kernel size.
  - Number of fully connected layer (fc).
  - Dropout rate.
  - Weight decay.
  - Learning rate.
  - optimizer name.
  - batch_size.
  - num_epochs.

In [5]:
from torchvision import transforms

# Define data augmentations for the training dataset
train_transform = transforms.Compose([
    transforms.ToPILImage(),
    transforms.RandomRotation(10),
    transforms.RandomHorizontalFlip(p = 0.5),
    transforms.RandomAffine(0, translate = (0.1, 0.1)),
    transforms.ToTensor()
])

test_transform = transforms.Compose([
    transforms.ToPILImage(),
    transforms.ToTensor()
])

In [6]:
class CustomDataset(Dataset):

  def __init__(self, features, labels, transform = None):
    self.features = torch.tensor(features, dtype = torch.float32).reshape(-1, 1, 28, 28)
    self.labels = torch.tensor(labels, dtype = torch.long)
    self.transform = transform

  def __len__(self):
    return len(self.features)

  def __getitem__(self, index):
    feature, label = self.features[index], self.labels[index]
    if self.transform:
      feature =  self.transform(feature.squeeze(0).numpy())  # Transformation applied here.
    return feature, label

In [7]:
# Use augmentations in training data

train_dataset = CustomDataset(X_train, y_train, transform = train_transform)
test_dataset = CustomDataset(X_test, y_test, transform = test_transform)

In [8]:
# Define dynamic CNN model class

class DynamicCNN(nn.Module):
  def __init__(self, num_conv_layers, num_filters, kernel_size, num_fc_layers, fc_layer_size, dropout_rate):
    super(DynamicCNN, self).__init__()
    layers = []
    in_channels = 1        # Grayscale images have 1 input channel

    # Convolutional layers
    for _ in range(num_conv_layers):
      layers.append(nn.Conv2d(in_channels, num_filters, kernel_size = kernel_size, padding = 'same'))
      layers.append(nn.BatchNorm2d(num_filters))
      layers.append(nn.ReLU())
      layers.append(nn.MaxPool2d(kernel_size = 2, stride = 2))
      in_channels = num_filters  # Update input channel for next layer

    self.features = nn.Sequential(*layers)

    # fully connected layers
    fc_layers = [nn.Flatten()]
    input_size = num_filters * (28 // (2 ** num_conv_layers)) ** 2
    for _ in range(num_fc_layers):
      fc_layers.append(nn.Linear(input_size, fc_layer_size))
      fc_layers.append(nn.ReLU())
      fc_layers.append(nn.Dropout(dropout_rate))
      input_size = fc_layer_size
    fc_layers.append(nn.Linear(input_size, 10))    # Final layer for 10 classes

    self.classifier = nn.Sequential(*fc_layers)

  def forward(self, x):
    x = self.features(x)
    x = self.classifier(x)
    return x

In [9]:
!pip install optuna

Collecting optuna
  Downloading optuna-4.3.0-py3-none-any.whl.metadata (17 kB)
Collecting alembic>=1.5.0 (from optuna)
  Downloading alembic-1.16.1-py3-none-any.whl.metadata (7.3 kB)
Collecting colorlog (from optuna)
  Downloading colorlog-6.9.0-py3-none-any.whl.metadata (10 kB)
Downloading optuna-4.3.0-py3-none-any.whl (386 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m386.6/386.6 kB[0m [31m12.6 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading alembic-1.16.1-py3-none-any.whl (242 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m242.5/242.5 kB[0m [31m24.6 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading colorlog-6.9.0-py3-none-any.whl (11 kB)
Installing collected packages: colorlog, alembic, optuna
Successfully installed alembic-1.16.1 colorlog-6.9.0 optuna-4.3.0


In [12]:
import optuna

In [10]:
# Define the objective function for Optuna
def objective(trial):
    # Hyperparameters to tune
    num_conv_layers = trial.suggest_int('num_conv_layers', 1, 3)
    num_filters = trial.suggest_categorical('num_filters', [16, 32, 64, 128])
    kernel_size = trial.suggest_categorical('kernel_size', [3, 5])
    num_fc_layers = trial.suggest_int('num_fc_layers', 1, 3)
    fc_layer_size = trial.suggest_categorical('fc_layer_size', [64, 128, 256])
    dropout_rate = trial.suggest_uniform('dropout_rate', 0.2, 0.5)
    weight_decay = trial.suggest_loguniform('weight_decay', 1e-5, 1e-2)
    learning_rate = trial.suggest_loguniform('learning_rate', 1e-4, 1e-2)
    optimizer_name = trial.suggest_categorical('optimizer', ['SGD', 'Adam', 'RMSprop'])
    batch_size = trial.suggest_categorical('batch_size', [32, 64, 128])
    num_epochs = trial.suggest_int('num_epochs', 10, 30)

    # Model
    model = DynamicCNN(num_conv_layers, num_filters, kernel_size, num_fc_layers, fc_layer_size, dropout_rate).to(device)

    # Data
    train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
    test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)

    # Optimizer
    if optimizer_name == 'SGD':
        optimizer = optim.SGD(model.parameters(), lr=learning_rate, weight_decay=weight_decay)
    elif optimizer_name == 'Adam':
        optimizer = optim.Adam(model.parameters(), lr=learning_rate, weight_decay=weight_decay)
    elif optimizer_name == 'RMSprop':
        optimizer = optim.RMSprop(model.parameters(), lr=learning_rate, weight_decay=weight_decay)

    # Loss function
    criterion = nn.CrossEntropyLoss()

    # Training loop
    for epoch in range(num_epochs):
        model.train()
        for batch_features, batch_labels in train_loader:
            batch_features, batch_labels = batch_features.to(device), batch_labels.to(device)
            optimizer.zero_grad()
            outputs = model(batch_features)
            loss = criterion(outputs, batch_labels)
            loss.backward()
            optimizer.step()

    # Validation loop
    model.eval()
    correct = 0
    total = 0
    with torch.no_grad():
        for batch_features, batch_labels in test_loader:
            batch_features, batch_labels = batch_features.to(device), batch_labels.to(device)
            outputs = model(batch_features)
            _, predicted = torch.max(outputs, 1)
            total += batch_labels.size(0)
            correct += (predicted == batch_labels).sum().item()

    accuracy = correct / total
    return accuracy

In [None]:
# Run the Optuna study
pruner = optuna.pruners.MedianPruner()
study = optuna.create_study(direction='maximize', pruner=pruner)
study.optimize(objective, n_trials=50)  # Run 50 trials

[I 2025-06-08 22:38:02,773] A new study created in memory with name: no-name-a555c82a-c390-463b-974c-00ef918dd8f0
  dropout_rate = trial.suggest_uniform('dropout_rate', 0.2, 0.5)
  weight_decay = trial.suggest_loguniform('weight_decay', 1e-5, 1e-2)
  learning_rate = trial.suggest_loguniform('learning_rate', 1e-4, 1e-2)
[I 2025-06-08 22:50:42,488] Trial 0 finished with value: 0.8907 and parameters: {'num_conv_layers': 2, 'num_filters': 64, 'kernel_size': 3, 'num_fc_layers': 1, 'fc_layer_size': 64, 'dropout_rate': 0.37495797280676824, 'weight_decay': 0.00015167182866328903, 'learning_rate': 0.007964721148261892, 'optimizer': 'Adam', 'batch_size': 128, 'num_epochs': 30}. Best is trial 0 with value: 0.8907.
[I 2025-06-08 23:03:37,670] Trial 1 finished with value: 0.8835 and parameters: {'num_conv_layers': 2, 'num_filters': 128, 'kernel_size': 5, 'num_fc_layers': 3, 'fc_layer_size': 256, 'dropout_rate': 0.30935648294407625, 'weight_decay': 0.00021532768391958215, 'learning_rate': 0.00252491

In [None]:
# Print the best hyperparameters and accuracy
print("Best hyperparameters:", study.best_params)
print("Best accuracy:", study.best_value)