In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sea
from tqdm.notebook import tqdm
import os
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
import torchvision
from torchvision import transforms
import cv2
import wandb

from sklearn.datasets import fetch_openml
from scipy.io import loadmat

device = 'cuda' if torch.cuda.is_available() else 'cpu'

plt.style.use('seaborn')
np.__version__, device

('1.21.2', 'cuda')

In [4]:
PATH = '/scratch/fk/permuted_mnist.npz'

def load_data(PATH, split):
    data = list(np.load(PATH).values())
    return data[split], data[split + 1]
 
X_train, y_train = load_data(PATH, split=0)
X_val, y_val = load_data(PATH, split=2)

X_train.shape, y_train.shape, X_val.shape, y_val.shape

 

((60000, 28, 28), (60000,), (10000, 28, 28), (10000,))

In [5]:
def load_image(file):
    img = cv2.resize(file, (32, 32))
    return img


transform = transforms.Compose([
    transforms.ToTensor(),
    transforms.Normalize(0, 1)
])

class PermtedMNISTDataset:
    def __init__(self, X, y, transform=None):
        self.X, self.y = X, y
        self.n_samples = len(self.X)
        self.transform = transform 
        
    def __len__(self):
        return self.n_samples

    def __getitem__(self, idx):
        x = load_image(self.X[idx])
        if self.transform:
            x = self.transform(x)
            
        y = self.y[idx]        
        return x, torch.tensor(y, dtype=torch.int64)
        

In [6]:
train_dataset = PermtedMNISTDataset(X_train, y_train, transform)
val_dataset = PermtedMNISTDataset(X_val, y_val, transform)

batch_size = 512

train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False)



In [7]:
a, b = next(iter(train_loader))

a.shape, b.shape

(torch.Size([512, 1, 32, 32]), torch.Size([512]))

In [27]:
class MlpModel(nn.Module):
    def __init__(self):
        super().__init__()
        self.layers = nn.ModuleList(
            [
                self.get_block(1024, 512),
                self.get_block(512, 256),
                self.get_block(256, 128),
                self.get_block(128, 64)       
            ]
        )
        
        self.drop = nn.Dropout(0.2)
        self.classiifer = nn.Linear(64, 10)
    
    def get_block(self, in_c, out_c):
        return nn.Sequential(
            nn.Linear(in_c, out_c, bias=False),
            nn.BatchNorm1d(out_c),
            nn.ReLU(),
            nn.Dropout(0.2)
        )    
        
    def forward(self, x):
        x = x.view(-1, 32*32)
        
        for i, layer in enumerate(self.layers):
            x = layer(x)
            
        return self.classiifer(x)
    


In [29]:
from sklearn.metrics import accuracy_score

class Trainer:
    def __init__(self,
                 model,
                 train_loader,
                 val_loader,
                 device,
                 loss_fxn, 
                 logger,
                 params):

        self.device = device
        self.params = params        
        self.model = model.to(self.device)
        self.train_loader = train_loader
        self.val_loader = val_loader
        self.loss_fxn = loss_fxn
        self.optimizer = torch.optim.Adam(self.model.parameters(), lr = self.params['lr'])
        self.logger = logger
        
    def training_step(self, x, y):
        y_pred = self.model(x)
        loss = self.loss_fxn(y_pred, y)
        y_pred_bin = (y_pred > 0.5).to(torch.int64)
        acc = accuracy_score(torch.argmax(y_pred_bin.detach().cpu(), -1), y.detach().cpu())
        
        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()
        
        return loss, acc
    
    def val_step(self, x, y):
        with torch.no_grad():
            y_pred = self.model(x)
            
        loss = self.loss_fxn(y_pred, y)
        y_pred_bin = (y_pred > 0.5).to(torch.int64)
        acc = accuracy_score(torch.argmax(y_pred_bin.detach().cpu(), -1), y.detach().cpu())
        
        return loss, acc
    
    def go_over_one_batch(self, loader, step_fxn):
        loss, acc = 0, 0
        for x, y in tqdm(loader):
            x, y = x.to(self.device), y.to(self.device)
            l, a = step_fxn(x, y)
            loss, acc = loss + l, acc + a
            
        return loss/len(loader), acc/len(loader)
    
    def train(self, epochs = 10):
        for epoch in tqdm(range(epochs)):
            
            train_loss, train_acc = self.go_over_one_batch(self.train_loader, self.training_step)
            val_loss, val_acc = self.go_over_one_batch(self.val_loader, self.val_step)
        
            print(f"[Epoch: {epoch}] Training:[loss:{train_loss:.4f} acc:{train_acc:.3f}] Val:[loss:{val_loss:.4f} acc:{val_acc:.3f}]" )
            if self.logger:
                self.logger.log({
                    'train_loss':train_loss,
                    'val_loss':val_loss,
                    'train_acc': train_acc,
                    'val_acc': val_acc
                })
         
        if self.logger:       
            self.logger.finish()
            

In [None]:
params = {
    'lr' : 1e-3,
    'batch_size':512,
    'epoch': 10,    
}


wandb.init(
    project="SMAI-A3-permuted-mnist-ANN",
    config=params,
    name=f"3-layer|tanh"
)

trainer = Trainer(
    model = MlpModel(),
    train_loader = train_loader,
    val_loader = val_loader, 
    device = device, 
    loss_fxn = nn.CrossEntropyLoss(),
    logger=wandb,
    params=params
)

trainer.train()

In [30]:
def test_model(model, loader):
    acc = 0
    for x, y in tqdm(loader):
        x, y = x.to(device), y.to(device)
        with torch.no_grad():
            y_pred = model(x)
            acc += accuracy_score(torch.argmax(y_pred.detach().cpu(), -1), y.detach().cpu())
    
    return acc/len(loader)

params = {
    'lr' : 1e-3,
    'batch_size':512,
    'epoch': 10,    
}

trainer = Trainer(
    model = MlpModel(),
    train_loader = train_loader,
    val_loader = val_loader, 
    device = device, 
    loss_fxn = nn.CrossEntropyLoss(),
    logger=None,
    params=params
)

trainer.train()


  0%|          | 0/10 [00:00<?, ?it/s]

  0%|          | 0/118 [00:00<?, ?it/s]

  0%|          | 0/20 [00:00<?, ?it/s]

[Epoch: 0] Training:[loss:0.5357 acc:0.534] Val:[loss:0.2692 acc:0.533]


  0%|          | 0/118 [00:00<?, ?it/s]

  0%|          | 0/20 [00:00<?, ?it/s]

[Epoch: 1] Training:[loss:0.2204 acc:0.517] Val:[loss:0.1933 acc:0.515]


  0%|          | 0/118 [00:00<?, ?it/s]

  0%|          | 0/20 [00:00<?, ?it/s]

[Epoch: 2] Training:[loss:0.1638 acc:0.504] Val:[loss:0.1624 acc:0.514]


  0%|          | 0/118 [00:00<?, ?it/s]

  0%|          | 0/20 [00:00<?, ?it/s]

[Epoch: 3] Training:[loss:0.1307 acc:0.496] Val:[loss:0.1456 acc:0.475]


  0%|          | 0/118 [00:00<?, ?it/s]

  0%|          | 0/20 [00:00<?, ?it/s]

[Epoch: 4] Training:[loss:0.1128 acc:0.495] Val:[loss:0.1480 acc:0.495]


  0%|          | 0/118 [00:00<?, ?it/s]

  0%|          | 0/20 [00:00<?, ?it/s]

[Epoch: 5] Training:[loss:0.0963 acc:0.482] Val:[loss:0.1303 acc:0.491]


  0%|          | 0/118 [00:00<?, ?it/s]

  0%|          | 0/20 [00:00<?, ?it/s]

[Epoch: 6] Training:[loss:0.0869 acc:0.482] Val:[loss:0.1215 acc:0.480]


  0%|          | 0/118 [00:00<?, ?it/s]

  0%|          | 0/20 [00:00<?, ?it/s]

[Epoch: 7] Training:[loss:0.0752 acc:0.481] Val:[loss:0.1165 acc:0.471]


  0%|          | 0/118 [00:00<?, ?it/s]

  0%|          | 0/20 [00:00<?, ?it/s]

[Epoch: 8] Training:[loss:0.0690 acc:0.479] Val:[loss:0.1070 acc:0.488]


  0%|          | 0/118 [00:00<?, ?it/s]

  0%|          | 0/20 [00:00<?, ?it/s]

[Epoch: 9] Training:[loss:0.0628 acc:0.468] Val:[loss:0.1153 acc:0.488]


  0%|          | 0/118 [00:00<?, ?it/s]

Training accuracy: 0.9828411899717514


  0%|          | 0/20 [00:00<?, ?it/s]

Validation accuracy 0.9650390625


In [32]:
class CNNModel(nn.Module):
    def __init__(self, params):
        super().__init__()
        self.params = params
                
        self.layers = nn.ModuleList(
            [
                self.get_block(1, 32, kernel_size=self.params['kernel_size'], padding=self.params['padding']), # 32, 16, 16
                self.get_block(32, 64, kernel_size=self.params['kernel_size'], padding=self.params['padding']), # 64, 8, 8 
                self.get_block(64, 128, kernel_size=self.params['kernel_size'], padding=self.params['padding']), # 128, 4, 4
                self.get_block(128, 256, kernel_size=self.params['kernel_size'], padding=self.params['padding']), # 256, 2, 2
            ]
        )
        
        self.fc1 = nn.Linear(256 * 2 * 2, 256 * 1 * 1)
        self.fc2 = nn.Linear(256, 128)
        self.fc3 = nn.Linear(128, 10)
        self.drop = nn.Dropout(0.2)
        
    def get_block(self, in_c, out_c, kernel_size=3, padding=1):
        return nn.Sequential(
            nn.Conv2d(in_c, in_c, kernel_size=kernel_size, stride=1, padding=padding, bias=not self.params['batchnorm']),
            nn.BatchNorm2d(in_c) if self.params['batchnorm'] else nn.Identity(),
            nn.ReLU(),
            
            nn.Conv2d(in_c, out_c, kernel_size=kernel_size, stride=1, padding=padding, bias=not self.params['batchnorm']),
            nn.BatchNorm2d(out_c) if self.params['batchnorm'] else nn.Identity(),
            nn.ReLU(),
            
            nn.MaxPool2d(2),
            nn.Dropout(0.2) if self.params['dropout'] else nn.Identity()
        )
        
    def forward(self, x):
        for layer in self.layers:
            x = layer(x)
            
        x = x.view(-1, 256*2*2)
        x = self.drop(nn.LeakyReLU()(self.fc1(x)))
        x = self.drop(nn.LeakyReLU()(self.fc2(x)))
        return self.fc3(x)
        
        

In [34]:

params = {
    'lr' : 1e-3,
    'batch_size':32,
    'epoch': 10,
    'dropout': True,
    'batchnorm': True,
    'kernel_size': 3,
    'padding': 1,   
}


wandb.init(
    project="SMAI-A3-permuted-mnist-CNN",
    config=params,
    name=f"lr={1e-3}"
)

train_loader = DataLoader(train_dataset, batch_size=params['batch_size'], shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=params['batch_size'], shuffle=False)


trainer = Trainer(
    model = CNNModel(params),
    train_loader = train_loader,
    val_loader = val_loader, 
    device = device, 
    loss_fxn = nn.CrossEntropyLoss(),
    logger=None,
    params=params
)

trainer.train()


VBox(children=(Label(value=' 0.00MB of 0.00MB uploaded (0.00MB deduped)\r'), FloatProgress(value=1.0, max=1.0)…

[34m[1mwandb[0m: wandb version 0.15.12 is available!  To upgrade, please run:
[34m[1mwandb[0m:  $ pip install wandb --upgrade


  0%|          | 0/10 [00:00<?, ?it/s]

  0%|          | 0/1875 [00:00<?, ?it/s]

  0%|          | 0/313 [00:00<?, ?it/s]

[Epoch: 0] Training:[loss:0.6290 acc:0.475] Val:[loss:0.3883 acc:0.552]


  0%|          | 0/1875 [00:00<?, ?it/s]

  0%|          | 0/313 [00:00<?, ?it/s]

[Epoch: 1] Training:[loss:0.2901 acc:0.539] Val:[loss:0.2481 acc:0.561]


  0%|          | 0/1875 [00:00<?, ?it/s]

  0%|          | 0/313 [00:00<?, ?it/s]

[Epoch: 2] Training:[loss:0.2354 acc:0.565] Val:[loss:0.1988 acc:0.540]


  0%|          | 0/1875 [00:00<?, ?it/s]

  0%|          | 0/313 [00:00<?, ?it/s]

[Epoch: 3] Training:[loss:0.1984 acc:0.607] Val:[loss:0.2146 acc:0.629]


  0%|          | 0/1875 [00:00<?, ?it/s]

  0%|          | 0/313 [00:00<?, ?it/s]

[Epoch: 4] Training:[loss:0.1759 acc:0.654] Val:[loss:0.1805 acc:0.632]


  0%|          | 0/1875 [00:00<?, ?it/s]

  0%|          | 0/313 [00:00<?, ?it/s]

[Epoch: 5] Training:[loss:0.1574 acc:0.656] Val:[loss:0.1929 acc:0.720]


  0%|          | 0/1875 [00:00<?, ?it/s]

  0%|          | 0/313 [00:00<?, ?it/s]

[Epoch: 6] Training:[loss:0.1413 acc:0.725] Val:[loss:0.1899 acc:0.767]


  0%|          | 0/1875 [00:00<?, ?it/s]

  0%|          | 0/313 [00:00<?, ?it/s]

[Epoch: 7] Training:[loss:0.1320 acc:0.754] Val:[loss:0.1645 acc:0.748]


  0%|          | 0/1875 [00:00<?, ?it/s]

  0%|          | 0/313 [00:00<?, ?it/s]

[Epoch: 8] Training:[loss:0.1175 acc:0.796] Val:[loss:0.1661 acc:0.779]


  0%|          | 0/1875 [00:00<?, ?it/s]

  0%|          | 0/313 [00:00<?, ?it/s]

[Epoch: 9] Training:[loss:0.1082 acc:0.839] Val:[loss:0.2172 acc:0.847]


### Analysis

1. For Double MNIST, CNN performed better than MLP, obviously because CNN filters were able to learn the features better than the MLP. 
   For Permuted MNIST, CNN didnot perform better as there was no spatial relationship in the images (since the pixels were permuted randomly). Hence, a MLP, which takes in all the pixels as a 
   flatten vector was able to learn the permutation mapping and hence performed better.
   
2. Observations:
    1. For Double MNIST, training a MLP is a difficult task, since now it has to classify not one but two digits in the dataset. Generally, classifying a single digit is already difficult 
    with MLP, so achieving a good accuracy with MLP on this task is quite difficult. But, a CNN could easily learn the features since it uses filters and shared parameters, hence achieiving a
    high score.
    2. For Permuted MNIST, the images no longer have any spatial relationship since the pixels are permuted randomly. Hence, using a MLP is a good idea for this task, since it doesn't 
    look for spatial relationships in the data. On the other hand, a CNN model wouldn't work well in this data becuase of no spatial information. This observation is reflected in the loss curves of Section 5 logs too.

<break>
3. We believe, it is possible to overfit on the Double MNIST data using a CNN because it performs well on the dataset and similarly, we can overfit MLP on the Permuted dataset. This can be 
   experimented by training the models for a large number of epochs (this couldn't be completed because of lack of compute power).

