# MINI PROJECT 4



In [None]:
pip install einops

Collecting einops
  Downloading einops-0.4.1-py3-none-any.whl (28 kB)
Installing collected packages: einops
Successfully installed einops-0.4.1


In [None]:
import PIL
import time
import torch
import torchvision
import torch.nn.functional as F
from einops import rearrange
from torch import nn
import torch.nn.init as init

In [None]:
def _weights_init(m):
    """Apply Kaiming-normal initialisation to linear and 2-D conv layers.

    Written to be passed to ``nn.Module.apply``: modules of any other type
    are left untouched.
    """
    if not isinstance(m, (nn.Linear, nn.Conv2d)):
        return
    init.kaiming_normal_(m.weight)

class LambdaLayer(nn.Module):
    """Adapter that lets an arbitrary callable be used as an ``nn.Module``."""

    def __init__(self, lambd):
        super().__init__()
        # Kept as a plain attribute and invoked on every forward pass.
        self.lambd = lambd

    def forward(self, x):
        wrapped = self.lambd
        return wrapped(x)


class BasicBlock(nn.Module):
    """Residual block: 3x3 conv -> BN -> ReLU -> 3x3 conv -> BN, plus a shortcut.

    The output is ``relu(bn2(conv2(relu(bn1(conv1(x))))) + shortcut(x))``.

    Args:
        in_planes: number of input channels.
        planes: number of output channels.
        stride: stride of the first convolution.
        option: shortcut used when ``stride != 1`` or ``in_planes != planes``.
            ``'A'`` is parameter-free (spatial subsampling plus zero-padded
            channels); ``'B'`` is a strided 1x1 convolution followed by
            batch norm. Ignored when the identity shortcut suffices.

    Raises:
        ValueError: if a non-identity shortcut is needed and ``option`` is
            neither ``'A'`` nor ``'B'``.
    """

    expansion = 1

    def __init__(self, in_planes, planes, stride=1, option='A'):
        super(BasicBlock, self).__init__()
        self.conv1 = nn.Conv2d(in_planes, planes, kernel_size=3, stride=stride, padding=1, bias=False)
        self.bn1 = nn.BatchNorm2d(planes)
        self.conv2 = nn.Conv2d(planes, planes, kernel_size=3, stride=1, padding=1, bias=False)
        self.bn2 = nn.BatchNorm2d(planes)

        # Identity shortcut unless the block changes resolution or width.
        self.shortcut = nn.Sequential()
        if stride != 1 or in_planes != planes:
            if option == 'A':
                # For CIFAR10 the ResNet paper uses option A.
                # Keeps every second row/column and zero-pads planes // 4
                # channels on each side, so shapes only line up when the
                # block halves the resolution and doubles the channel count.
                self.shortcut = LambdaLayer(lambda x:
                                            F.pad(x[:, :, ::2, ::2], (0, 0, 0, 0, planes//4, planes//4), "constant", 0))
            elif option == 'B':
                self.shortcut = nn.Sequential(
                     nn.Conv2d(in_planes, self.expansion * planes, kernel_size=1, stride=stride, bias=False),
                     nn.BatchNorm2d(self.expansion * planes)
                )
            else:
                # Previously an unknown option silently kept the identity
                # shortcut, which could only fail later with a shape
                # mismatch inside forward().
                raise ValueError(
                    "option must be 'A' or 'B' when the shortcut is not the identity, "
                    "got {!r}".format(option))

    def forward(self, x):
        out = F.relu(self.bn1(self.conv1(x)))
        out = self.bn2(self.conv2(out))
        out += self.shortcut(x)
        out = F.relu(out)
        return out



class Residual(nn.Module):
    """Skip connection: returns ``fn(x, **kwargs) + x``."""

    def __init__(self, fn):
        super().__init__()
        self.fn = fn

    def forward(self, x, **kwargs):
        # Extra keyword arguments are forwarded to the wrapped callable only.
        branch = self.fn(x, **kwargs)
        return branch + x

class LayerNormalize(nn.Module):
    """Pre-normalisation wrapper: applies ``nn.LayerNorm(dim)`` and then ``fn``."""

    def __init__(self, dim, fn):
        super().__init__()
        self.norm = nn.LayerNorm(dim)
        self.fn = fn

    def forward(self, x, **kwargs):
        # Keyword arguments go to the wrapped callable, not to the norm.
        normalized = self.norm(x)
        return self.fn(normalized, **kwargs)

class MLP_Block(nn.Module):
    """Feed-forward sub-layer: Linear -> GELU -> Dropout -> Linear -> Dropout.

    Args:
        dim: input and output width.
        hidden_dim: width of the intermediate projection.
        dropout: dropout probability used after the activation and after
            the second projection.
    """

    def __init__(self, dim, hidden_dim, dropout = 0.1):
        super().__init__()
        # Each linear layer is initialised right after it is created.
        self.nn1 = self._init_linear(nn.Linear(dim, hidden_dim))
        self.af1 = nn.GELU()
        self.do1 = nn.Dropout(dropout)
        self.nn2 = self._init_linear(nn.Linear(hidden_dim, dim))
        self.do2 = nn.Dropout(dropout)

    @staticmethod
    def _init_linear(layer):
        """Xavier-uniform weights and near-zero (std 1e-6) normal bias."""
        torch.nn.init.xavier_uniform_(layer.weight)
        torch.nn.init.normal_(layer.bias, std = 1e-6)
        return layer

    def forward(self, x):
        for stage in (self.nn1, self.af1, self.do1, self.nn2, self.do2):
            x = stage(x)
        return x

class Attention(nn.Module):
    """Multi-head self-attention with a fused query/key/value projection.

    Args:
        dim: width of the input tokens; must be divisible by ``heads``.
        heads: number of attention heads.
        dropout: dropout probability applied after the output projection.
    """

    def __init__(self, dim, heads = 8, dropout = 0.1):
        super().__init__()
        self.heads = heads
        # NOTE(review): this scales by the full token width rather than the
        # per-head width (dim // heads). Kept unchanged so results are not
        # altered; confirm whether per-head scaling was intended.
        self.scale = dim ** -0.5  # 1/sqrt(dim)

        # One projection produces Q, K and V for every head, hence dim * 3.
        self.to_qkv = nn.Linear(dim, dim * 3, bias = True)
        torch.nn.init.xavier_uniform_(self.to_qkv.weight)
        torch.nn.init.zeros_(self.to_qkv.bias)

        self.nn1 = nn.Linear(dim, dim)
        torch.nn.init.xavier_uniform_(self.nn1.weight)
        torch.nn.init.zeros_(self.nn1.bias)
        self.do1 = nn.Dropout(dropout)

    def forward(self, x, mask = None):
        """Attend over the token axis of ``x``.

        Args:
            x: tensor of shape (batch, tokens, dim).
            mask: optional boolean tensor. It is flattened to
                (batch, tokens - 1) and left-padded with ``True`` for the
                first token; position pairs where either side is ``False``
                are excluded from attention.

        Returns:
            Tensor with the same shape as ``x``.
        """
        b, n, _ = x.shape
        h = self.heads
        qkv = self.to_qkv(x)
        # (b, n, 3*h*d) -> (3, b, h, n, d), then split into q, k, v.
        q, k, v = qkv.reshape(b, n, 3, h, -1).permute(2, 0, 3, 1, 4)

        dots = torch.einsum('bhid,bhjd->bhij', q, k) * self.scale

        if mask is not None:
            mask = F.pad(mask.flatten(1), (1, 0), value = True)
            assert mask.shape[-1] == dots.shape[-1], 'mask has incorrect dimensions'
            # Pairwise mask of shape (b, n, n). The extra axis makes it
            # (b, 1, n, n) so it broadcasts over heads; without it the batch
            # axis was lined up against the head axis of `dots`.
            pair_mask = (mask[:, None, :] * mask[:, :, None])[:, None]
            dots.masked_fill_(~pair_mask, float('-inf'))
            del mask, pair_mask

        attn = dots.softmax(dim=-1)

        out = torch.einsum('bhij,bhjd->bhid', attn, v)
        # (b, h, n, d) -> (b, n, h*d): concatenate the heads again.
        out = out.permute(0, 2, 1, 3).reshape(b, n, -1)
        out = self.nn1(out)
        out = self.do1(out)
        return out

class Transformer(nn.Module):
    """Stack of ``depth`` encoder layers.

    Every layer is an attention sub-layer followed by an MLP sub-layer; each
    sub-layer is layer-normalised on its input and wrapped in a residual
    connection.
    """

    def __init__(self, dim, depth, heads, mlp_dim, dropout):
        super().__init__()

        def encoder_layer():
            attention = Residual(LayerNormalize(dim, Attention(dim, heads = heads, dropout = dropout)))
            feed_forward = Residual(LayerNormalize(dim, MLP_Block(dim, mlp_dim, dropout = dropout)))
            return nn.ModuleList([attention, feed_forward])

        self.layers = nn.ModuleList([encoder_layer() for _ in range(depth)])

    def forward(self, x, mask = None):
        for layer in self.layers:
            attention, mlp = layer
            # Only the attention sub-layer receives the mask.
            x = mlp(attention(x, mask = mask))
        return x
     

class ViTResNet(nn.Module):
    """Convolutional backbone, learned tokenizer and Transformer classifier.

    ``forward`` runs a conv stem and three residual stages, flattens the
    spatial positions, pools them into ``num_tokens`` tokens of width ``dim``
    using the learned ``token_wA`` / ``token_wV`` weights, prepends a class
    token, adds a positional embedding, runs the Transformer and classifies
    from the class token.

    Args:
        block: residual block class; must expose ``expansion`` and accept
            ``(in_planes, planes, stride)``.
        num_blocks: three integers, the number of blocks per residual stage.
        num_classes: number of output logits.
        dim: token width.
        num_tokens: number of tokens produced by the tokenizer.
        mlp_dim, heads, depth, dropout: forwarded to ``Transformer``.
        emb_dropout: dropout probability applied to the token embeddings.
        batch_size: leading dimension of the tokenizer weights, i.e. the
            largest batch ``forward`` accepts. Defaults to the module-level
            ``BATCH_SIZE_TRAIN``.

    NOTE(review): the tokenizer weights carry a per-batch-position leading
    dimension, so sample ``i`` of a batch uses weight slice ``i``. This looks
    unintended (presumably the weights should be shared across the batch);
    the layout is kept so existing checkpoints still load. Please confirm.
    """

    # Earlier runs of this notebook edited the defaults below by hand
    # (e.g. num_tokens = 8, num_classes = 50); prefer passing them explicitly.
    def __init__(self, block, num_blocks, num_classes=10, dim = 128, num_tokens = 16, mlp_dim = 256, heads = 8, depth = 6, emb_dropout = 0.1, dropout= 0.1, batch_size = None):
        super(ViTResNet, self).__init__()
        self.in_planes = 16
        self.L = num_tokens
        self.cT = dim

        self.conv1 = nn.Conv2d(3, 16, kernel_size=3, stride=1, padding=1, bias=False)
        self.bn1 = nn.BatchNorm2d(16)
        self.layer1 = self._make_layer(block, 16, num_blocks[0], stride=1)
        self.layer2 = self._make_layer(block, 32, num_blocks[1], stride=2)
        self.layer3 = self._make_layer(block, 64, num_blocks[2], stride=2)
        # Runs before the tokenizer, transformer and head exist, so only the
        # backbone modules created above are visited.
        self.apply(_weights_init)

        # Tokenization
        if batch_size is None:
            batch_size = BATCH_SIZE_TRAIN
        # Channel count leaving the backbone (64 with expansion == 1).
        feature_dim = self.in_planes
        self.token_wA = nn.Parameter(torch.empty(batch_size, self.L, feature_dim), requires_grad = True)
        torch.nn.init.xavier_uniform_(self.token_wA)
        self.token_wV = nn.Parameter(torch.empty(batch_size, feature_dim, self.cT), requires_grad = True)
        torch.nn.init.xavier_uniform_(self.token_wV)

        # One position per token plus one for the class token.
        self.pos_embedding = nn.Parameter(torch.empty(1, (num_tokens + 1), dim))
        torch.nn.init.normal_(self.pos_embedding, std = .02)

        self.cls_token = nn.Parameter(torch.zeros(1, 1, dim))
        self.dropout = nn.Dropout(emb_dropout)

        self.transformer = Transformer(dim, depth, heads, mlp_dim, dropout)

        self.to_cls_token = nn.Identity()

        # Classification head: a single linear layer on the class token.
        self.nn1 = nn.Linear(dim, num_classes)
        torch.nn.init.xavier_uniform_(self.nn1.weight)
        torch.nn.init.normal_(self.nn1.bias, std = 1e-6)

    def _make_layer(self, block, planes, num_blocks, stride):
        """Build one residual stage; only its first block uses ``stride``."""
        strides = [stride] + [1]*(num_blocks-1)
        layers = []
        for block_stride in strides:
            layers.append(block(self.in_planes, planes, block_stride))
            self.in_planes = planes * block.expansion

        return nn.Sequential(*layers)

    def forward(self, img, mask = None):
        """Return class logits of shape (batch, num_classes) for ``img``.

        Raises:
            ValueError: if the batch is larger than the tokenizer weights
                were allocated for.
        """
        x = F.relu(self.bn1(self.conv1(img)))
        x = self.layer1(x)
        x = self.layer2(x)
        x = self.layer3(x)

        # One feature vector per spatial position: (b, h*w, c).
        x = rearrange(x, 'b c h w -> b (h w) c')

        batch = x.shape[0]
        if batch > self.token_wA.shape[0]:
            raise ValueError(
                'batch of {} samples exceeds the {} the tokenizer weights were '
                'allocated for'.format(batch, self.token_wA.shape[0]))
        # Slicing lets smaller batches (e.g. a final partial batch) through;
        # for a full-size batch it selects every weight, as before.
        token_wA = self.token_wA[:batch]
        token_wV = self.token_wV[:batch]

        # Tokenization
        wa = rearrange(token_wA, 'b h w -> b w h')  # (b, c, L)
        A = torch.einsum('bij,bjk->bik', x, wa)     # (b, h*w, L)
        A = rearrange(A, 'b h w -> b w h')          # (b, L, h*w)
        A = A.softmax(dim=-1)                       # weights over positions

        VV = torch.einsum('bij,bjk->bik', x, token_wV)  # (b, h*w, dim)
        T = torch.einsum('bij,bjk->bik', A, VV)         # (b, L, dim)

        cls_tokens = self.cls_token.expand(img.shape[0], -1, -1)
        x = torch.cat((cls_tokens, T), dim=1)
        x += self.pos_embedding
        x = self.dropout(x)
        x = self.transformer(x, mask)
        x = self.to_cls_token(x[:, 0])
        x = self.nn1(x)

        return x


BATCH_SIZE_TRAIN = 100
BATCH_SIZE_TEST = 100

# Raw string: "\P", "\S" and "\C" are not valid escape sequences, so the
# plain literal only worked by accident (and warns on newer Python versions).
DL_PATH = r"C:\Pytorch\Spyder\CIFAR10_data" # Use your own path
# CIFAR10: 60000 32x32 color images in 10 classes, with 6000 images per class

# Training pipeline: random augmentation followed by normalisation.
# `interpolation=` replaces the `resample=` argument, which torchvision
# deprecated in 0.12 and removed in 0.14.
# NOTE(review): the mean/std values look like the usual ImageNet statistics
# rather than CIFAR-10's; confirm this is intended.
transform = torchvision.transforms.Compose(
     [torchvision.transforms.RandomHorizontalFlip(),
     torchvision.transforms.RandomRotation(
         10, interpolation=torchvision.transforms.InterpolationMode.BILINEAR),
     torchvision.transforms.RandomAffine(8, translate=(.15,.15)),
     torchvision.transforms.ToTensor(),
     torchvision.transforms.Normalize((0.485, 0.456, 0.406), (0.229, 0.224, 0.225))])

# Evaluation pipeline: no random augmentation, so the reported test loss and
# accuracy are deterministic and measured on the unmodified images.
test_transform = torchvision.transforms.Compose(
     [torchvision.transforms.ToTensor(),
     torchvision.transforms.Normalize((0.485, 0.456, 0.406), (0.229, 0.224, 0.225))])


train_dataset = torchvision.datasets.CIFAR10(DL_PATH, train=True,
                                        download=True, transform=transform)

test_dataset = torchvision.datasets.CIFAR10(DL_PATH, train=False,
                                       download=True, transform=test_transform)

train_loader = torch.utils.data.DataLoader(train_dataset, batch_size=BATCH_SIZE_TRAIN,
                                          shuffle=True)

test_loader = torch.utils.data.DataLoader(test_dataset, batch_size=BATCH_SIZE_TEST,
                                         shuffle=False)

def train(model, optimizer, data_loader, loss_history):
    """Run one training epoch over ``data_loader``.

    The model is put in training mode and updated once per batch using the
    negative log-likelihood of its log-softmax output. Every 100th batch
    (starting with the first) the progress and loss are printed and the loss
    value is appended to ``loss_history``.
    """
    dataset_size = len(data_loader.dataset)
    model.train()

    for step, (inputs, labels) in enumerate(data_loader):
        optimizer.zero_grad()
        log_probs = F.log_softmax(model(inputs), dim=1)
        batch_loss = F.nll_loss(log_probs, labels)
        batch_loss.backward()
        optimizer.step()

        if step % 100 != 0:
            continue

        loss_value = batch_loss.item()
        seen = step * len(inputs)
        progress = 100 * step / len(data_loader)
        print(f'[{seen:5}/{dataset_size:5} ({progress:3.0f}%)]  Loss: {loss_value:6.4f}')
        loss_history.append(loss_value)
            
def evaluate(model, data_loader, loss_history):
    """Evaluate ``model`` on ``data_loader`` without tracking gradients.

    The model is put in eval mode. The summed negative log-likelihood is
    divided by the dataset size, appended to ``loss_history`` and printed
    together with the number and percentage of correct predictions.
    """
    model.eval()

    dataset_size = len(data_loader.dataset)
    n_correct = 0
    loss_sum = 0

    with torch.no_grad():
        for inputs, labels in data_loader:
            log_probs = F.log_softmax(model(inputs), dim=1)
            loss_sum += F.nll_loss(log_probs, labels, reduction='sum').item()
            predictions = torch.max(log_probs, dim=1)[1]
            n_correct += predictions.eq(labels).sum()

    mean_loss = loss_sum / dataset_size
    loss_history.append(mean_loss)
    accuracy_pct = 100.0 * n_correct / dataset_size
    print(f'\nAverage test loss: {mean_loss:.4f}'
          f'  Accuracy:{n_correct:5}/{dataset_size:5}'
          f' ({accuracy_pct:4.2f}%)\n')
    
# model = ViT()
# model.load_state_dict(torch.load(PATH))
# model.eval()

  "The parameter 'resample' is deprecated since 0.12 and will be removed 0.14. "


Downloading https://www.cs.toronto.edu/~kriz/cifar-10-python.tar.gz to C:\Pytorch\Spyder\CIFAR10_data/cifar-10-python.tar.gz


  0%|          | 0/170498071 [00:00<?, ?it/s]

Extracting C:\Pytorch\Spyder\CIFAR10_data/cifar-10-python.tar.gz to C:\Pytorch\Spyder\CIFAR10_data
Files already downloaded and verified


In [None]:
# Print the first item of the training dataset as the dataset returns it.
print(train_loader.dataset[0])

(tensor([[[-2.1179, -2.1179, -2.1179,  ...,  0.4166,  0.5878,  0.5878],
         [-2.1179, -2.1179, -2.1179,  ...,  0.0912,  0.1254,  0.1597],
         [-2.1179, -2.1179, -2.1179,  ...,  0.0741,  0.1254,  0.2796],
         ...,
         [-2.1179, -2.1179, -2.1179,  ...,  0.2111, -1.1932, -1.3130],
         [-2.1179, -2.1179, -2.1179,  ..., -2.1179, -2.1179, -2.1179],
         [-2.1179, -2.1179, -2.1179,  ..., -2.1179, -2.1179, -2.1179]],

        [[-2.0357, -2.0357, -2.0357,  ..., -0.0749,  0.1352,  0.1877],
         [-2.0357, -2.0357, -2.0357,  ..., -0.4951, -0.4426, -0.3901],
         [-2.0357, -2.0357, -2.0357,  ..., -0.4776, -0.4251, -0.2325],
         ...,
         [-2.0357, -2.0357, -2.0357,  ..., -0.1275, -1.4930, -1.5105],
         [-2.0357, -2.0357, -2.0357,  ..., -2.0357, -2.0357, -2.0357],
         [-2.0357, -2.0357, -2.0357,  ..., -2.0357, -2.0357, -2.0357]],

        [[-1.8044, -1.8044, -1.8044,  ..., -0.4624, -0.2010, -0.0790],
         [-1.8044, -1.8044, -1.8044,  ..., -

## Results with test and train batch size of 100 each and 3 epochs

In [None]:
# Train a fresh ViTResNet with Adam for N_EPOCHS epochs, evaluating on the
# test loader after each epoch, then save the weights.
N_EPOCHS = 3


model = ViTResNet(BasicBlock, [3, 3, 3])
optimizer = torch.optim.Adam(model.parameters(), lr=0.003)

# Alternative optimizer / schedule, not used in this run:
#optimizer = torch.optim.SGD(model.parameters(), lr=learning_rate,momentum=.9,weight_decay=1e-4)
#lr_scheduler = torch.optim.lr_scheduler.MultiStepLR(optimizer=optimizer, milestones=[35,48],gamma = 0.1)

train_loss_history, test_loss_history = [], []
for epoch in range(1, N_EPOCHS + 1):
    print('Epoch:', epoch)
    start_time = time.time()
    train(model, optimizer, train_loader, train_loss_history)
    print('Execution time:', '{:5.2f}'.format(time.time() - start_time), 'seconds')
    evaluate(model, test_loader, test_loss_history)

print('Execution time')

# Raw string: "\V" is not a valid escape sequence in a plain literal.
PATH = r".\ViTRes.pt" # Use your own path
torch.save(model.state_dict(), PATH)

Epoch: 1
Execution time: 593.38 seconds

Average test loss: 1.9171  Accuracy: 2231/10000 (22.31%)

Epoch: 2
Execution time: 618.46 seconds

Average test loss: 1.8000  Accuracy: 2745/10000 (27.45%)

Epoch: 3
Execution time: 544.75 seconds

Average test loss: 1.6489  Accuracy: 3483/10000 (34.83%)

Execution time


## Results with test and train batch size of 100 each and 5 epochs

In [None]:
# Train a fresh ViTResNet with Adam for N_EPOCHS epochs, evaluating on the
# test loader after each epoch, then save the weights.
N_EPOCHS = 5


model = ViTResNet(BasicBlock, [3, 3, 3])
optimizer = torch.optim.Adam(model.parameters(), lr=0.003)

# Alternative optimizer / schedule, not used in this run:
#optimizer = torch.optim.SGD(model.parameters(), lr=learning_rate,momentum=.9,weight_decay=1e-4)
#lr_scheduler = torch.optim.lr_scheduler.MultiStepLR(optimizer=optimizer, milestones=[35,48],gamma = 0.1)

train_loss_history, test_loss_history = [], []
for epoch in range(1, N_EPOCHS + 1):
    print('Epoch:', epoch)
    start_time = time.time()
    train(model, optimizer, train_loader, train_loss_history)
    print('Execution time:', '{:5.2f}'.format(time.time() - start_time), 'seconds')
    evaluate(model, test_loader, test_loss_history)

print('Execution time')

# Raw string: "\V" is not a valid escape sequence in a plain literal.
PATH = r".\ViTRes.pt" # Use your own path
torch.save(model.state_dict(), PATH)

Epoch: 1
Execution time: 543.65 seconds

Average test loss: 1.9778  Accuracy: 2197/10000 (21.97%)

Epoch: 2
Execution time: 568.23 seconds

Average test loss: 1.8378  Accuracy: 2392/10000 (23.92%)

Epoch: 3
Execution time: 546.65 seconds

Average test loss: 1.6089  Accuracy: 3792/10000 (37.92%)

Epoch: 4
Execution time: 518.73 seconds

Average test loss: 1.3643  Accuracy: 4923/10000 (49.23%)

Epoch: 5
Execution time: 523.29 seconds

Average test loss: 1.2939  Accuracy: 5417/10000 (54.17%)

Execution time


## Results with test and train batch size of 100 each and 64 tokens

Token efficiency ablation

In [None]:
# Token-count ablation: train a fresh ViTResNet with Adam for N_EPOCHS
# epochs, evaluating after each epoch, then save the weights.
N_EPOCHS = 5


# num_tokens is passed explicitly so the run matches this section's heading
# (64 tokens) instead of depending on the current class default.
model = ViTResNet(BasicBlock, [3, 3, 3], num_tokens=64)
optimizer = torch.optim.Adam(model.parameters(), lr=0.003)

# Alternative optimizer / schedule, not used in this run:
#optimizer = torch.optim.SGD(model.parameters(), lr=learning_rate,momentum=.9,weight_decay=1e-4)
#lr_scheduler = torch.optim.lr_scheduler.MultiStepLR(optimizer=optimizer, milestones=[35,48],gamma = 0.1)

train_loss_history, test_loss_history = [], []
for epoch in range(1, N_EPOCHS + 1):
    print('Epoch:', epoch)
    start_time = time.time()
    train(model, optimizer, train_loader, train_loss_history)
    print('Execution time:', '{:5.2f}'.format(time.time() - start_time), 'seconds')
    evaluate(model, test_loader, test_loss_history)

print('Execution time')

# Raw string: "\V" is not a valid escape sequence in a plain literal.
PATH = r".\ViTRes.pt" # Use your own path
torch.save(model.state_dict(), PATH)

Epoch: 1
Execution time: 1161.41 seconds

Average test loss: 1.9177  Accuracy: 2279/10000 (22.79%)

Epoch: 2
Execution time: 1160.20 seconds

Average test loss: 1.7286  Accuracy: 2852/10000 (28.52%)

Epoch: 3
Execution time: 1170.30 seconds

Average test loss: 1.5955  Accuracy: 3759/10000 (37.59%)

Epoch: 4
Execution time: 1175.93 seconds

Average test loss: 1.3232  Accuracy: 5167/10000 (51.67%)

Epoch: 5
Execution time: 1164.55 seconds

Average test loss: 1.3099  Accuracy: 5449/10000 (54.49%)

Execution time


## Results with test and train batch size of 100 each and 32 tokens

In [None]:
# Token-count ablation: train a fresh ViTResNet with Adam for N_EPOCHS
# epochs, evaluating after each epoch, then save the weights.
N_EPOCHS = 5


# num_tokens is passed explicitly so the run matches this section's heading
# (32 tokens) instead of depending on the current class default.
model = ViTResNet(BasicBlock, [3, 3, 3], num_tokens=32)
optimizer = torch.optim.Adam(model.parameters(), lr=0.003)

# Alternative optimizer / schedule, not used in this run:
#optimizer = torch.optim.SGD(model.parameters(), lr=learning_rate,momentum=.9,weight_decay=1e-4)
#lr_scheduler = torch.optim.lr_scheduler.MultiStepLR(optimizer=optimizer, milestones=[35,48],gamma = 0.1)

train_loss_history, test_loss_history = [], []
for epoch in range(1, N_EPOCHS + 1):
    print('Epoch:', epoch)
    start_time = time.time()
    train(model, optimizer, train_loader, train_loss_history)
    print('Execution time:', '{:5.2f}'.format(time.time() - start_time), 'seconds')
    evaluate(model, test_loader, test_loss_history)

print('Execution time')

# Raw string: "\V" is not a valid escape sequence in a plain literal.
PATH = r".\ViTRes.pt" # Use your own path
torch.save(model.state_dict(), PATH)

Epoch: 1
Execution time: 844.61 seconds

Average test loss: 1.9624  Accuracy: 2282/10000 (22.82%)

Epoch: 2
Execution time: 839.99 seconds

Average test loss: 1.7757  Accuracy: 2833/10000 (28.33%)

Epoch: 3
Execution time: 849.35 seconds

Average test loss: 1.5853  Accuracy: 3839/10000 (38.39%)

Epoch: 4
Execution time: 856.90 seconds

Average test loss: 1.4930  Accuracy: 4544/10000 (45.44%)

Epoch: 5
Execution time: 857.22 seconds

Average test loss: 1.3826  Accuracy: 5161/10000 (51.61%)

Execution time


## Results with test and train batch size of 100 each and 16 tokens

In [None]:
# Token-count ablation: train a fresh ViTResNet with Adam for N_EPOCHS
# epochs, evaluating after each epoch, then save the weights.
N_EPOCHS = 5


# num_tokens is passed explicitly so the run matches this section's heading
# (16 tokens) instead of depending on the current class default.
model = ViTResNet(BasicBlock, [3, 3, 3], num_tokens=16)
optimizer = torch.optim.Adam(model.parameters(), lr=0.003)

# Alternative optimizer / schedule, not used in this run:
#optimizer = torch.optim.SGD(model.parameters(), lr=learning_rate,momentum=.9,weight_decay=1e-4)
#lr_scheduler = torch.optim.lr_scheduler.MultiStepLR(optimizer=optimizer, milestones=[35,48],gamma = 0.1)

train_loss_history, test_loss_history = [], []
for epoch in range(1, N_EPOCHS + 1):
    print('Epoch:', epoch)
    start_time = time.time()
    train(model, optimizer, train_loader, train_loss_history)
    print('Execution time:', '{:5.2f}'.format(time.time() - start_time), 'seconds')
    evaluate(model, test_loader, test_loss_history)

print('Execution time')

# Raw string: "\V" is not a valid escape sequence in a plain literal.
PATH = r".\ViTRes.pt" # Use your own path
torch.save(model.state_dict(), PATH)

Epoch: 1
Execution time: 627.75 seconds

Average test loss: 1.9484  Accuracy: 2220/10000 (22.20%)

Epoch: 2
Execution time: 633.89 seconds

Average test loss: 1.7614  Accuracy: 2954/10000 (29.54%)

Epoch: 3
Execution time: 637.28 seconds

Average test loss: 1.6007  Accuracy: 3757/10000 (37.57%)

Epoch: 4
Execution time: 631.74 seconds

Average test loss: 1.5612  Accuracy: 4496/10000 (44.96%)

Epoch: 5
Execution time: 635.17 seconds

Average test loss: 1.4095  Accuracy: 5145/10000 (51.45%)

Execution time


## Results with test and train batch size of 100 each and 20 classes

In [None]:
# Output-width ablation: train a fresh ViTResNet with Adam for N_EPOCHS
# epochs, evaluating after each epoch, then save the weights.
N_EPOCHS = 5


# num_classes is passed explicitly so the run matches this section's heading
# (20 classes) instead of depending on the current class default.
# NOTE(review): the token count used for the recorded output is not visible
# here; presumably the class default at the time. Confirm before comparing.
model = ViTResNet(BasicBlock, [3, 3, 3], num_classes=20)
optimizer = torch.optim.Adam(model.parameters(), lr=0.003)

# Alternative optimizer / schedule, not used in this run:
#optimizer = torch.optim.SGD(model.parameters(), lr=learning_rate,momentum=.9,weight_decay=1e-4)
#lr_scheduler = torch.optim.lr_scheduler.MultiStepLR(optimizer=optimizer, milestones=[35,48],gamma = 0.1)

train_loss_history, test_loss_history = [], []
for epoch in range(1, N_EPOCHS + 1):
    print('Epoch:', epoch)
    start_time = time.time()
    train(model, optimizer, train_loader, train_loss_history)
    print('Execution time:', '{:5.2f}'.format(time.time() - start_time), 'seconds')
    evaluate(model, test_loader, test_loss_history)

print('Execution time')

# Raw string: "\V" is not a valid escape sequence in a plain literal.
PATH = r".\ViTRes.pt" # Use your own path
torch.save(model.state_dict(), PATH)

Epoch: 1
Execution time: 571.51 seconds

Average test loss: 2.0283  Accuracy: 2002/10000 (20.02%)

Epoch: 2
Execution time: 509.81 seconds

Average test loss: 1.8256  Accuracy: 2649/10000 (26.49%)

Epoch: 3
Execution time: 509.74 seconds

Average test loss: 1.5594  Accuracy: 3839/10000 (38.39%)

Epoch: 4
Execution time: 503.05 seconds

Average test loss: 1.4430  Accuracy: 4634/10000 (46.34%)

Epoch: 5
Execution time: 654.09 seconds

Average test loss: 1.2646  Accuracy: 5609/10000 (56.09%)

Execution time


## Results with test and train batch size of 100 each and 50 classes

In [None]:
# Output-width ablation: train a fresh ViTResNet with Adam for N_EPOCHS
# epochs, evaluating after each epoch, then save the weights.
N_EPOCHS = 5


# num_classes is passed explicitly so the run matches this section's heading
# (50 classes) instead of depending on the current class default.
# NOTE(review): the token count used for the recorded output is not visible
# here; presumably the class default at the time. Confirm before comparing.
model = ViTResNet(BasicBlock, [3, 3, 3], num_classes=50)
optimizer = torch.optim.Adam(model.parameters(), lr=0.003)

# Alternative optimizer / schedule, not used in this run:
#optimizer = torch.optim.SGD(model.parameters(), lr=learning_rate,momentum=.9,weight_decay=1e-4)
#lr_scheduler = torch.optim.lr_scheduler.MultiStepLR(optimizer=optimizer, milestones=[35,48],gamma = 0.1)

train_loss_history, test_loss_history = [], []
for epoch in range(1, N_EPOCHS + 1):
    print('Epoch:', epoch)
    start_time = time.time()
    train(model, optimizer, train_loader, train_loss_history)
    print('Execution time:', '{:5.2f}'.format(time.time() - start_time), 'seconds')
    evaluate(model, test_loader, test_loss_history)

print('Execution time')

# Raw string: "\V" is not a valid escape sequence in a plain literal.
PATH = r".\ViTRes.pt" # Use your own path
torch.save(model.state_dict(), PATH)

Epoch: 1
Execution time: 584.48 seconds

Average test loss: 1.9692  Accuracy: 2253/10000 (22.53%)

Epoch: 2
Execution time: 564.22 seconds

Average test loss: 1.9300  Accuracy: 2597/10000 (25.97%)

Epoch: 3
Execution time: 533.62 seconds

Average test loss: 1.6460  Accuracy: 3632/10000 (36.32%)

Epoch: 4
Execution time: 528.12 seconds

Average test loss: 1.4219  Accuracy: 4494/10000 (44.94%)

Epoch: 5
Execution time: 511.63 seconds

Average test loss: 1.3865  Accuracy: 4955/10000 (49.55%)

Execution time


## Results with test and train batch size of 100 each and momentum of 0.9 with weight decay of 1e-4

In [None]:
# Train a fresh ViTResNet with SGD (momentum 0.9, weight decay 1e-4) for
# N_EPOCHS epochs, evaluating after each epoch, then save the weights.
N_EPOCHS = 5


model = ViTResNet(BasicBlock, [3, 3, 3])
# Adam configuration used by the other runs, disabled here:
#optimizer = torch.optim.Adam(model.parameters(), lr=0.003)

optimizer = torch.optim.SGD(model.parameters(), lr=0.001,momentum=.9,weight_decay=1e-4)
#lr_scheduler = torch.optim.lr_scheduler.MultiStepLR(optimizer=optimizer, milestones=[35,48],gamma = 0.1)

train_loss_history, test_loss_history = [], []
for epoch in range(1, N_EPOCHS + 1):
    print('Epoch:', epoch)
    start_time = time.time()
    train(model, optimizer, train_loader, train_loss_history)
    print('Execution time:', '{:5.2f}'.format(time.time() - start_time), 'seconds')
    evaluate(model, test_loader, test_loss_history)

print('Execution time')

# Raw string: "\V" is not a valid escape sequence in a plain literal.
PATH = r".\ViTRes.pt" # Use your own path
torch.save(model.state_dict(), PATH)

Epoch: 1
Execution time: 585.30 seconds

Average test loss: 2.0975  Accuracy: 1970/10000 (19.70%)

Epoch: 2
Execution time: 583.12 seconds

Average test loss: 1.9872  Accuracy: 2001/10000 (20.01%)

Epoch: 3
Execution time: 576.41 seconds

Average test loss: 1.9416  Accuracy: 2188/10000 (21.88%)

Epoch: 4
Execution time: 570.81 seconds

Average test loss: 1.8914  Accuracy: 2347/10000 (23.47%)

Epoch: 5
Execution time: 595.73 seconds

Average test loss: 1.8840  Accuracy: 2271/10000 (22.71%)

Execution time


## GPU and memory specs

In [None]:
!df -h

Filesystem      Size  Used Avail Use% Mounted on
overlay         108G   39G   70G  36% /
tmpfs            64M     0   64M   0% /dev
shm             5.8G     0  5.8G   0% /dev/shm
/dev/root       2.0G  1.2G  817M  59% /sbin/docker-init
tmpfs           6.4G   32K  6.4G   1% /var/colab
/dev/sda1        81G   43G   39G  53% /etc/hosts
tmpfs           6.4G     0  6.4G   0% /proc/acpi
tmpfs           6.4G     0  6.4G   0% /proc/scsi
tmpfs           6.4G     0  6.4G   0% /sys/firmware


In [None]:
!cat /proc/cpuinfo


processor	: 0
vendor_id	: GenuineIntel
cpu family	: 6
model		: 79
model name	: Intel(R) Xeon(R) CPU @ 2.20GHz
stepping	: 0
microcode	: 0x1
cpu MHz		: 2200.162
cache size	: 56320 KB
physical id	: 0
siblings	: 2
core id		: 0
cpu cores	: 1
apicid		: 0
initial apicid	: 0
fpu		: yes
fpu_exception	: yes
cpuid level	: 13
wp		: yes
flags		: fpu vme de pse tsc msr pae mce cx8 apic sep mtrr pge mca cmov pat pse36 clflush mmx fxsr sse sse2 ss ht syscall nx pdpe1gb rdtscp lm constant_tsc rep_good nopl xtopology nonstop_tsc cpuid tsc_known_freq pni pclmulqdq ssse3 fma cx16 pcid sse4_1 sse4_2 x2apic movbe popcnt aes xsave avx f16c rdrand hypervisor lahf_lm abm 3dnowprefetch invpcid_single ssbd ibrs ibpb stibp fsgsbase tsc_adjust bmi1 hle avx2 smep bmi2 erms invpcid rtm rdseed adx smap xsaveopt arat md_clear arch_capabilities
bugs		: cpu_meltdown spectre_v1 spectre_v2 spec_store_bypass l1tf mds swapgs taa
bogomips	: 4400.32
clflush size	: 64
cache_alignment	: 64
address sizes	: 46 bits physical, 48 b

In [None]:
!cat /proc/meminfo

MemTotal:       13302920 kB
MemFree:        10713556 kB
MemAvailable:   12380396 kB
Buffers:          105612 kB
Cached:          1683840 kB
SwapCached:            0 kB
Active:          1081128 kB
Inactive:        1325924 kB
Active(anon):     540068 kB
Inactive(anon):      448 kB
Active(file):     541060 kB
Inactive(file):  1325476 kB
Unevictable:           0 kB
Mlocked:               0 kB
SwapTotal:             0 kB
SwapFree:              0 kB
Dirty:               768 kB
Writeback:             0 kB
AnonPages:        617660 kB
Mapped:           228764 kB
Shmem:              1176 kB
KReclaimable:      83560 kB
Slab:             124528 kB
SReclaimable:      83560 kB
SUnreclaim:        40968 kB
KernelStack:        4752 kB
PageTables:         9268 kB
NFS_Unstable:          0 kB
Bounce:                0 kB
WritebackTmp:          0 kB
CommitLimit:     6651460 kB
Committed_AS:    3065924 kB
VmallocTotal:   34359738367 kB
VmallocUsed:        7104 kB
VmallocChunk:          0 kB
Percpu:          