## Define hyperparameters

In [1]:

# Dataset options
DATASET = 'CIFAR100'  # Options: 'CIFAR10' or 'CIFAR100'
# DATASET = 'CIFAR10'

# Number of classes options
NUM_CLASSES = 20     # Set the number of classes
# NUM_CLASSES = 10   # For example, if using CIFAR-10, set to 10

# Number of attention heads options
NUM_HEADS = 8        # Options: 8, 4, 2, etc.
# NUM_HEADS = 4
# NUM_HEADS = 2

In [2]:
def validate_hyperparameters(dataset_name, num_classes, num_heads):
    """
    Validates the hyperparameters for dataset, number of classes, and number of attention heads.

    Args:
        dataset_name (str): The name of the dataset ('CIFAR10' or 'CIFAR100').
        num_classes (int): The number of classes.
        num_heads (int): The number of attention heads.

    Raises:
        ValueError: If any hyperparameter is invalid.
    """
    valid_datasets = ['CIFAR10', 'CIFAR100']
    if dataset_name not in valid_datasets:
        raise ValueError(f"Invalid DATASET value: {dataset_name}. Choose from {valid_datasets}.")

    if dataset_name == 'CIFAR10' and num_classes != 10:
        raise ValueError(f"For {dataset_name}, NUM_CLASSES must be 10. Current value: {num_classes}.")
    elif dataset_name == 'CIFAR100' and num_classes not in [20, 100]:
        raise ValueError(f"For {dataset_name}, NUM_CLASSES must be 20 or 100. Current value: {num_classes}.")

    valid_heads = [8, 4, 2]
    if num_heads not in valid_heads:
        raise ValueError(f"Invalid NUM_HEADS value: {num_heads}. Choose from {valid_heads}.")

In [3]:

# Validate hyperparameters
validate_hyperparameters(DATASET, NUM_CLASSES, NUM_HEADS)


## Initial Setup

In [4]:
# -*- coding: utf-8 -*-
'''

Train CIFAR10 with PyTorch and Vision Transformers!
written by @kentaroy47, @arutema47
source : https://github.com/kentaroy47/vision-transformers-cifar10

'''

from __future__ import print_function

import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
import torch.backends.cudnn as cudnn
import numpy as np

import torchvision
import torchvision.transforms as transforms

import os
import argparse
import pandas as pd
import csv
import time




### Helper functions

#### Saving loading

In [5]:
import torch
import os

def save_model_state(model, epoch, loss, accuracy, checkpoint_dir='checkpoints', log_file='training_log.txt'):
    # Create checkpoint directory if it doesn't exist
    os.makedirs(checkpoint_dir, exist_ok=True)
    
    # Save model state
    model_checkpoint_path = os.path.join(checkpoint_dir, f'model_epoch_{epoch}.pth')
    torch.save(model.state_dict(), model_checkpoint_path)
    print(f'Model state saved at epoch {epoch}')
    
    # Log accuracy and loss
    log_file_path = os.path.join(checkpoint_dir, log_file)
    
    with open(log_file_path, 'a') as f:
        f.write(f'Epoch {epoch}: Accuracy = {accuracy:.4f}, Loss = {loss:.4f}\n')
    
    print(f'Logged epoch {epoch} - Accuracy: {accuracy:.4f}, Loss: {loss:.4f}')

    
import torch
import os

def load_model_state(model, epoch = 90, checkpoint_dir='checkpoints'):
    model_checkpoint_path = os.path.join(checkpoint_dir, f'model_epoch_{epoch}.pth')
    model.load_state_dict(torch.load(model_checkpoint_path))   
    print(f'Model state loaded from epoch {epoch}')
    return epoch



In [6]:
import os

directory = 'checkpoints'

if os.path.isdir(directory):
    print("Directory exists")
    items = os.listdir(directory)
    for item in items:
        print(item)
else:
    print("Directory does not exist")
    
#     if os.path.isdir(directory):
    


Directory does not exist


#### Remapping labels function

In [7]:
def remap_labels(labels, num_classes_old, num_classes_new):
    """
    Adjusts the labels from an old class structure to a new one.

    Args:
        labels (torch.Tensor or list): Original labels to be adjusted.
        num_classes_old (int): The number of classes in the original dataset.
        num_classes_new (int): The number of classes in the new dataset.

    Returns:
        torch.Tensor or list: The labels adjusted to the new class structure.
    """
    # Check that the number of old classes is divisible by the number of new classes
    assert num_classes_old % num_classes_new == 0, "The number of old classes must be divisible by the number of new classes."

    # Compute the factor to convert old labels to new labels
    factor = num_classes_old // num_classes_new

    # Remap each label
    if isinstance(labels, torch.Tensor):
        # If labels are a tensor, apply the remapping to each label and return a tensor
        remapped_labels = torch.tensor([label.item() // factor for label in labels])
    else:
        # If labels are a list, apply the remapping to each label and return a list
        remapped_labels = [label // factor for label in labels]
    
    return remapped_labels


In [8]:
from torch.utils.data import Dataset
class CustomDataset(Dataset):
    def __init__(self, dataset, num_classes_old, num_classes_new):
        self.dataset = dataset
        self.num_classes_old = num_classes_old
        self.num_classes_new = num_classes_new

    def __len__(self):
        return len(self.dataset)

    def __getitem__(self, index):
        image, label = self.dataset[index]
        remapped_label = remap_labels(torch.tensor([label]), self.num_classes_old, self.num_classes_new).item()
        return image, remapped_label

In [9]:
run_number = 1
base_dir = "results/runs"

# Define the new run directory

os.makedirs("results", exist_ok=True)
os.makedirs("results/runs", exist_ok=True)

In [10]:
torch.manual_seed(42)

<torch._C.Generator at 0x7ec8117d0470>

In [11]:
# # setup for a read only personal access token
# # note : token expires 19 aug 2025
# token = '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'
# token_user = 'Asterisk07'
# repo_host = 'Asterisk07'
# repo_name = 'BTP-Transformer-explainability'

# url = f'https://{token_user}:{token}@github.com/{repo_host}/{repo_name}/'
# !git clone {url}

# !mv {repo_name}/models .
# !rm -rf BTP-Transformer-explainability # delete a file

In [12]:
!ls

results


In [13]:
# !rm -rf models

In [14]:
# !npm install -g github-files-fetcher

In [15]:
# !fetcher --url=https://github.com/kentaroy47/vision-transformers-cifar10/tree/main/models
# !fetcher --url=https://https://github.com/Asterisk07/BTP-Transformer-explainability/main/models


In [16]:
2

2

In [17]:

import os

# Check if 'utils.py' exists in the current directory
if os.path.exists('utils.py'):
    print("utils.py exists in the current directory.")
else:
    print("utils.py does not exist in the current directory.")
    !wget https://raw.githubusercontent.com/kentaroy47/vision-transformers-cifar10/main/utils.py
    print("utils.py fetched")



utils.py does not exist in the current directory.
--2024-09-08 12:31:21--  https://raw.githubusercontent.com/kentaroy47/vision-transformers-cifar10/main/utils.py
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.111.133, 185.199.109.133, 185.199.108.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.111.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 3501 (3.4K) [text/plain]
Saving to: 'utils.py'


2024-09-08 12:31:21 (68.9 MB/s) - 'utils.py' saved [3501/3501]

utils.py fetched


In [18]:
from utils import progress_bar

stty: 'standard input': Inappropriate ioctl for device


In [19]:
progress_bar

<function utils.progress_bar(current, total, msg=None)>

In [20]:


# from randomaug import RandAugment
from torchvision.transforms import RandAugment



In [21]:
!pip install einops

Collecting einops
  Downloading einops-0.8.0-py3-none-any.whl.metadata (12 kB)
Downloading einops-0.8.0-py3-none-any.whl (43 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m43.2/43.2 kB[0m [31m671.6 kB/s[0m eta [36m0:00:00[0m[36m0:00:01[0m
[?25hInstalling collected packages: einops
Successfully installed einops-0.8.0


In [22]:
# from models import *
# from models.vit import ViT
# from models.convmixer import ConvMixer

In [23]:

import json

In [24]:

qkv_titles = ['q','k','v']

In [25]:
# https://github.com/lucidrains/vit-pytorch/blob/main/vit_pytorch/vit.py
# VIT.py
import torch
from torch import nn

from einops import rearrange, repeat
from einops.layers.torch import Rearrange
import numpy as np
# helpers

def pair(t):
    return t if isinstance(t, tuple) else (t, t)

# classes

class PreNorm(nn.Module):
    def __init__(self, dim, fn):
        super().__init__()
        self.norm = nn.LayerNorm(dim)
        self.fn = fn
    def forward(self, x, **kwargs):
        return self.fn(self.norm(x), **kwargs)

class FeedForward(nn.Module):
    def __init__(self, dim, hidden_dim, dropout = 0.):
        super().__init__()
        self.net = nn.Sequential(
            nn.Linear(dim, hidden_dim),
            nn.GELU(),
            nn.Dropout(dropout),
            nn.Linear(hidden_dim, dim),
            nn.Dropout(dropout)
        )
    def forward(self, x,save_flag=False, run_dir = None,img_idx = None):
        out =  self.net(x)
        if(save_flag==True):
                file_path = os.path.join(run_dir, 'ff_out.npy')
                # np.save(file_path, out)
                np.save(file_path, out[img_idx].detach().cpu().numpy())
        return out

class Attention(nn.Module):
    def __init__(self, dim, heads = 8, dim_head = 64, dropout = 0.):
        super().__init__()

        inner_dim = dim_head *  heads
        # print("attention : dim = ", dim, "| inner_dim = ",inner_dim,"| dim_head = ", dim_head, "| heads = ",heads  )
        project_out = not (heads == 1 and dim_head == dim)

        self.heads = heads
        self.scale = dim_head ** -0.5

        self.attend = nn.Softmax(dim = -1)
        self.to_qkv = nn.Linear(dim, inner_dim * 3, bias = False)

        self.to_out = nn.Sequential(
            nn.Linear(inner_dim, dim),
            nn.Dropout(dropout)
        ) if project_out else nn.Identity()

    def forward(self, x,save_flag=False, run_dir = None,img_idx = None):
        qkv = self.to_qkv(x).chunk(3, dim = -1)
        q, k, v = map(lambda t: rearrange(t, 'b n (h d) -> b h n d', h = self.heads), qkv)



        dots = torch.matmul(q, k.transpose(-1, -2)) * self.scale

        attn = self.attend(dots)

        out = torch.matmul(attn, v)
        if(save_flag==True):

                # Convert each tensor in `qkv` to a numpy array and save it
#                 qkv=attention.to_qkv
                for i, tensor in enumerate((q,k,v)):
                    np_array = tensor[img_idx].detach().cpu().numpy()  # Convert to numpy
                    # np.save(f'qkv_{i}.npy', np_array)  # Save each as a .npy file
                    file_path = os.path.join(run_dir, f'{qkv_titles[i]}.npy')
                    np.save(file_path, np_array)
                file_path = os.path.join(run_dir, 'att_out')
                np.save(file_path, out[img_idx].detach().cpu().numpy())
                file_path = os.path.join(run_dir, 'att_score')
                np.save(file_path,attn[img_idx].detach().cpu().numpy())
        out = rearrange(out, 'b h n d -> b n (h d)')
        # return self.to_out(out),q,k,v
        return self.to_out(out)

class Transformer(nn.Module):
    def __init__(self, dim, depth, heads, dim_head, mlp_dim, dropout=0.):
        super().__init__()
        # print("transformer : dim = ", dim, "| dim_head = ", dim_head, "| heads = ",heads  )

        self.layers = nn.ModuleList([])
#         self.saved_values = {'logits': [], 'queries': [], 'keys': [], 'values': []}  # To store the values
        # self.saved_values = list()  # To store th
        for _ in range(depth):
            self.layers.append(nn.ModuleList([
                PreNorm(dim, Attention(dim, heads=heads, dim_head=dim_head, dropout=dropout)),
                PreNorm(dim, FeedForward(dim, mlp_dim, dropout=dropout))
            ]))

    def forward(self, x,save_flag=False, run_dir = None, img_idx = None):
        for i, (attn, ff) in enumerate(self.layers):
            # Unpack the output from the Attention layer
            #
            # print("passed trans direcetory ", run_dir, " and saving ",save_flag)
            if save_flag:
              layer_dir = os.path.join(run_dir,  f"layer {i:02}")
              os.makedirs(layer_dir, exist_ok=True)
              # print("passed trans layer direcetory ", layer_dir)
            else:
              layer_dir = None
            attn_out = attn(x,save_flag=save_flag, run_dir = layer_dir, img_idx = img_idx)



            # Save the query, key, value, and logits (output) for this layer
            # self.saved_values.append(q.cpu().detach().numpy())
            # self.saved_values.append(k.cpu().detach().numpy())
            # self.saved_values.append(v.cpu().detach().numpy())

            # Combine the attention output with the original x
            x = attn_out + x
            # self.saved_values.append(x.cpu().detach().numpy())  # Save logits
            # print("i : ",i)
            # Apply the feedforward network
#             x = ff(x) + x

            x = ff(x,save_flag=save_flag, run_dir = layer_dir, img_idx = img_idx) + x

        return x


class ViT(nn.Module):
    def __init__(self, *, image_size, patch_size, num_classes, dim, depth, heads, mlp_dim, pool = 'cls', channels = 3, dim_head = 64, dropout = 0., emb_dropout = 0.):
        super().__init__()
        image_height, image_width = pair(image_size)
        patch_height, patch_width = pair(patch_size)

        # print("vit : dim = ", dim, "| dim_head = ", dim_head, "| heads = ",heads , " | mlp = ",mlp_dim )

        assert image_height % patch_height == 0 and image_width % patch_width == 0, 'Image dimensions must be divisible by the patch size.'

        num_patches = (image_height // patch_height) * (image_width // patch_width)
        patch_dim = channels * patch_height * patch_width
        assert pool in {'cls', 'mean'}, 'pool type must be either cls (cls token) or mean (mean pooling)'

        self.to_patch_embedding = nn.Sequential(
            Rearrange('b c (h p1) (w p2) -> b (h w) (p1 p2 c)', p1 = patch_height, p2 = patch_width),
            nn.Linear(patch_dim, dim),
        )

        self.pos_embedding = nn.Parameter(torch.randn(1, num_patches + 1, dim))
        self.cls_token = nn.Parameter(torch.randn(1, 1, dim))
        self.dropout = nn.Dropout(emb_dropout)

        self.transformer = Transformer(dim, depth, heads, dim_head, mlp_dim, dropout)

        self.pool = pool
        self.to_latent = nn.Identity()

        self.mlp_head = nn.Sequential(
            nn.LayerNorm(dim),
            nn.Linear(dim, num_classes)
        )

    def forward(self, img, save_flag=False, run_dir = None,img_idx = None):
        # if (save_flag):
          # print("\n\treached here 3")
        x = self.to_patch_embedding(img)
        b, n, _ = x.shape

        cls_tokens = repeat(self.cls_token, '() n d -> b n d', b = b)
        x = torch.cat((cls_tokens, x), dim=1)
        x += self.pos_embedding[:, :(n + 1)]
        x = self.dropout(x)

        x = self.transformer(x,save_flag, run_dir, img_idx)
#         if(save_flag==True):
#                 # Convert each tensor in `qkv` to a numpy array and save it
#                 qkv=attention.to_qkv
#                 for i, tensor in enumerate(qkv):
#                     np_array = tensor.detach().cpu().numpy()  # Convert to numpy
#                     np.save(f'qkv_{i}.npy', np_array)  # Save each as a .npy file


        x = x.mean(dim = 1) if self.pool == 'mean' else x[:, 0]

        x = self.to_latent(x)
        return self.mlp_head(x)

In [26]:
import argparse
import sys

# Define your arguments here
def parse_args():
    # parsers
    parser = argparse.ArgumentParser(description='PyTorch CIFAR10 Training')
    parser.add_argument('--lr', default=1e-4, type=float, help='learning rate') # resnets.. 1e-3, Vit..1e-4
    parser.add_argument('--opt', default="adam")
    parser.add_argument('--resume', '-r', action='store_true', help='resume from checkpoint')
    parser.add_argument('--noaug', action='store_false', help='disable use randomaug')
    parser.add_argument('--noamp', action='store_true', help='disable mixed precision training. for older pytorch versions')
    parser.add_argument('--nowandb', action='store_true', help='disable wandb')
    parser.add_argument('--mixup', action='store_true', help='add mixup augumentations')
    parser.add_argument('--net', default='vit')
    parser.add_argument('--dp', action='store_true', help='use data parallel')
    parser.add_argument('--bs', default='512')
    parser.add_argument('--size', default="32")
    parser.add_argument('--n_epochs', type=int, default='200')
    parser.add_argument('--patch', default='4', type=int, help="patch for ViT")
    parser.add_argument('--dimhead', default="512", type=int)
    parser.add_argument('--convkernel', default='8', type=int, help="parameter for convmixer")

    return parser.parse_args()




In [27]:
command = 'python train_cifar10.py --n_epochs 500 --lr 0.0005'
command.split()[1:]

['train_cifar10.py', '--n_epochs', '500', '--lr', '0.0005']

In [28]:
# Simulate command-line arguments
# sys.argv = ['your_script.py', '--lr', '0.2', '--opt', 'adam', '--net', 'vit', '--bs', '64','--dimhead','256']
sys.argv = command.split()[1:]

args = parse_args()



In [29]:
# !pip3 install torch torchvision --index-url https://download.pytorch.org/whl/cu118 --upgrade --force-reinstall

In [30]:
# (2.0.1+cu117)
# Requirement already satisfied: torchvision in /opt/conda/lib/python3.10/site-packages (0.15.2+cu117)

In [31]:
# !pip show torchvision


In [32]:
2

2

In [33]:
# !pip show torch
# #

In [34]:
import torchvision
torchvision.__version__

'0.19.0'

In [35]:
import torch
torch.__version__

'2.4.0'

In [36]:
!pip install wandb



In [37]:

# take in args
usewandb = ~args.nowandb
usewandb = False
if usewandb:
    import wandb
    watermark = "{}_lr{}".format(args.net, args.lr)
    wandb.init(project="cifar10-challange",
            name=watermark)
    wandb.config.update(args)

bs = int(args.bs)
imsize = int(args.size)

use_amp = not args.noamp
aug = args.noaug

device = 'cuda' if torch.cuda.is_available() else 'cpu'
best_acc = 0  # best test accuracy
start_epoch = 0  # start from epoch 0 or last checkpoint epoch

# Data
print('==> Preparing data..')
if args.net=="vit_timm":
    size = 384
else:
    size = imsize

transform_train = transforms.Compose([
    transforms.RandomCrop(32, padding=4),
    transforms.Resize(size),
    transforms.RandomHorizontalFlip(),
    transforms.ToTensor(),
    transforms.Normalize((0.4914, 0.4822, 0.4465), (0.2023, 0.1994, 0.2010)),
])

transform_test = transforms.Compose([
    transforms.Resize(size),
    transforms.ToTensor(),
    transforms.Normalize((0.4914, 0.4822, 0.4465), (0.2023, 0.1994, 0.2010)),
])

# Add RandAugment with N, M(hyperparameter)
if aug:
    N = 2; M = 14;
    transform_train.transforms.insert(0, RandAugment(N, M))
    
print("ready to fetch data")

==> Preparing data..
ready to fetch data


### loading cifar 100

In [38]:
bs

512

In [39]:
if DATASET == 'CIFAR10':
    datasetname = 'cifar10-python'
    datasetcode = 'pankrzysiu/cifar10-python'
elif DATASET == 'CIFAR100':
    datasetname = 'cifar100'
    datasetcode = 'fedesoriano/cifar100'
!kaggle datasets download -d {datasetcode}

Dataset URL: https://www.kaggle.com/datasets/fedesoriano/cifar100
License(s): copyright-authors
Downloading cifar100.zip to /kaggle/working
 95%|███████████████████████████████████████  | 153M/161M [00:00<00:00, 274MB/s]
100%|█████████████████████████████████████████| 161M/161M [00:00<00:00, 282MB/s]


In [40]:
# !rm -rf dataset

In [41]:
import zipfile
import os

# Unzip the dataset
with zipfile.ZipFile(f'{datasetname}.zip', 'r') as zip_ref:
    zip_ref.extractall('/kaggle/working/dataset')


In [42]:
class CIFAR10Custom(Dataset):
    def __init__(self, data, labels, transform=None):
        self.data = data
        self.labels = labels
        self.transform = transform

        # CIFAR-10 images have 3 channels (RGB) and size 32x32
        self.img_channels = 3
        self.img_height = 32
        self.img_width = 32

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        img = self.data[idx]
        # Reshape the flattened image (3072) into (3, 32, 32) format
        img = img.reshape(self.img_channels, self.img_height, self.img_width)

        # Convert to PIL Image format (HxWxC)
        img = Image.fromarray(img.transpose(1, 2, 0), 'RGB')

        label = self.labels[idx]

        if self.transform:
            img = self.transform(img)

        return img, label


def unpickle(file):
    with open(file, 'rb') as fo:
        dict = pickle.load(fo, encoding='bytes')
    return dict


def load_cifar10_data(data_path):
    """ Load all CIFAR-10 data batches and return combined data and labels """
    train_data = []
    train_labels = []

    # CIFAR-10 contains 5 training batches
    for i in range(1, 6):
        batch = unpickle(f"{data_path}/data_batch_{i}")
        train_data.append(batch[b'data'])
        train_labels.extend(batch[b'labels'])

    # Convert to numpy arrays
    train_data = np.concatenate(train_data, axis=0)
    train_labels = np.array(train_labels)

    # Load test data
    test_batch = unpickle(f"{data_path}/test_batch")
    test_data = test_batch[b'data']
    test_labels = np.array(test_batch[b'labels'])

    return (train_data, train_labels), (test_data, test_labels)





In [44]:
from torchvision import transforms
from PIL import Image
import numpy as np
import torch
from torch.utils.data import Dataset, DataLoader

class CIFAR100Custom(Dataset):
    def __init__(self, data, labels, transform=None):
        self.data = data
        self.labels = labels
        self.transform = transform

        # Determine image dimensions from the first sample
        sample_img = self.data[0]  # Get the first image sample
        self.img_channels = 3  # CIFAR-100 images are RGB

        # The image is flattened, so we need to infer the correct shape.
        self.img_height, self.img_width = self._get_image_dimensions(sample_img)

    def _get_image_dimensions(self, img_flat):
        # Assuming CIFAR-100 with known number of channels (3)
        num_pixels = img_flat.size // self.img_channels
        side = int(np.sqrt(num_pixels))  # CIFAR-100 images are square (32x32)
        return side, side

    def _reshape_image(self, img_flat):
        # Try reshaping based on typical formats CxHxW or HxWxC
        try:
            img = img_flat.reshape(self.img_channels, self.img_height, self.img_width)  # CxHxW
        except ValueError:
            img = img_flat.reshape(self.img_height, self.img_width, self.img_channels)  # HxWxC
        return img

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        img = self.data[idx]
        img = self._reshape_image(img)

        # Handle case where image is in HxWxC (common in numpy arrays)
        if img.shape[0] == self.img_height:  # HxWxC format
            img = img.transpose(2, 0, 1)  # Convert to CxHxW

        # Convert numpy array to PIL image for transformations
        img = Image.fromarray(img.transpose(1, 2, 0), 'RGB')  # Convert CxHxW to HxWxC for PIL

        label = self.labels[idx]

        if self.transform:
            img = self.transform(img)

        return img, label


In [45]:
import pickle

In [46]:
if DATASET == 'CIFAR10':
    
    # Load CIFAR-10 data
    data_path = '/kaggle/working/dataset/cifar-10-batches-py'  # Update path as necessary
    (train_data, train_labels), (test_data, test_labels) = load_cifar10_data(data_path)

    # Prepare datasets
    trainset = CIFAR10Custom(data=train_data, labels=train_labels, transform=transform_train)
    testset = CIFAR10Custom(data=test_data, labels=test_labels, transform=transform_test)

    
elif DATASET == 'CIFAR100':
    

    # Prepare dataset
    data_pre_path = '/kaggle/working/dataset'
    metadata_path = data_pre_path + '/meta'
    data_train_path = data_pre_path + '/train'
    data_test_path = data_pre_path + '/test'

    def unpickle(file):
        import pickle
        with open(file, 'rb') as fo:
            data_dict = pickle.load(fo, encoding='bytes')
        return data_dict

    metadata = unpickle(metadata_path)
    data_train_dict = unpickle(data_train_path)
    data_test_dict = unpickle(data_test_path)

    data_train = data_train_dict[b'data']
    label_train = np.array(data_train_dict[b'coarse_labels'])
    data_test = data_test_dict[b'data']
    label_test = np.array(data_test_dict[b'coarse_labels'])

    # Prepare datasets
    trainset = CIFAR100Custom(data=data_train, labels=label_train, transform=transform_train)
    testset = CIFAR100Custom(data=data_test, labels=label_test, transform=transform_test)

# DataLoader
batch_size = bs  # Adjust batch size as needed
num_workers = 4  # Adjust number of workers as needed

trainloader = DataLoader(trainset, batch_size=batch_size, shuffle=True, num_workers=num_workers)
testloader = DataLoader(testset, batch_size=batch_size, shuffle=False, num_workers=num_workers)

print("DataLoader prepared.")


DataLoader prepared.


In [48]:
# r1 is None

In [49]:
i1, l1  = None, None

In [50]:
for images, labels in trainloader:
    # Get the first image and label from the batch
    print(1)
    i1 = images
    l1 = labels
    break

1


In [51]:
i1.shape

torch.Size([512, 3, 32, 32])

In [52]:
# import torch
# import matplotlib.pyplot as plt

# # Assuming you have a tensor of shape [3, 32, 32]
# # Create a random tensor for demonstration purposes
# image_tensor = i1[15]

# # Convert the tensor from [C, H, W] to [H, W, C] for displaying
# image_np = image_tensor.permute(1, 2, 0).numpy()

# # Plot the image
# plt.imshow(image_np)
# plt.axis('off')  # Turn off the axis labels
# plt.show()


In [53]:
# # Prepare dataset
# print("Loading dataset from kaggle/input")
# trainset = CIFAR100Custom(root_dir='/kaggle/input/cifar100/train', transform=transform_train)
# testset = CIFAR100Custom(root_dir='/kaggle/input/cifar100/test', transform=transform_test)

# print("Loading dataset")
# trainloader = DataLoader(trainset, batch_size=bs, shuffle=True, num_workers=NUM_WORKERS)
# testloader = DataLoader(testset, batch_size=bs, shuffle=False, num_workers=NUM_WORKERS)

In [54]:
# len(trainset)

In [55]:
# NUM_WORKERS = 4
# cifar100_path = '/kaggle/input/cifar100'
# # Prepare dataset

# # trainset = torchvision.datasets.CIFAR10(root='./data', train=True, download=True, transform=transform_train)

# # testset = torchvision.datasets.CIFAR10(root='./data', train=False, download=True, transform=transform_test)
# import torchvision
# import torch
# import torch.nn as nnn



# # Prepare dataset based on hyperparameter
# print("downloading dataset")
# if DATASET == 'CIFAR10':
#     trainset = torchvision.datasets.CIFAR10(root='./data', train=True, download=True, transform=transform_train)
#     print("downloaded dataset")
#     testset = torchvision.datasets.CIFAR10(root='./data', train=False, download=True, transform=transform_test)
    
# elif DATASET == 'CIFAR100':
#     if NUM_CLASSES%20 != 0 :
#         raise ValueError("Invalid value of NUM_CLASSES specified. Choose 20 or 100")
#     trainset = torchvision.datasets.CIFAR100(root='./data', train=True, download=True, transform=transform_train)
#     print("downloaded dataset")
#     testset = torchvision.datasets.CIFAR100(root='./data', train=False, download=True, transform=transform_test)
# else:
#     raise ValueError("Invalid dataset specified. Choose 'CIFAR10' or 'CIFAR100'.")
    
# print("loading dataset")
# trainloader = torch.utils.data.DataLoader(trainset, batch_size=bs, shuffle=True, num_workers=NUM_WORKERS)
# testloader = torch.utils.data.DataLoader(testset, batch_size=bs, shuffle=False, num_workers=NUM_WORKERS)

In [56]:
# use only this token :
# f439c9e9cdf4ff7e3d47e80d4588628783d8bafe #aster

### Remapping labels if needed

In [57]:
# if NUM_CLASSES == 20:
#     # Create custom dataset class to remap labels


#     # Create custom datasets with remapped labels
#     trainset = CustomDataset(trainset, num_classes_old=100, num_classes_new=NUM_CLASSES)
#     testset = CustomDataset(testset, num_classes_old=100, num_classes_new=NUM_CLASSES)


In [58]:
# trainloader = torch.utils.data.DataLoader(trainset, batch_size=bs, shuffle=True, num_workers=NUM_WORKERS)
# testloader = torch.utils.data.DataLoader(testset, batch_size=bs, shuffle=False, num_workers=NUM_WORKERS)

In [59]:
# For Multi-GPU
if 'cuda' in device:
    print(device)
    if args.dp:
        print("using data parallel")
        net = torch.nn.DataParallel(net) # make parallel
        cudnn.benchmark = True


cuda


In [60]:
!rm -rf results

In [61]:

def get_vit():
    return ViT(
    image_size = size,
    patch_size = args.patch,
    # num_classes = 10,
    num_classes = NUM_CLASSES,
    dim = int(args.dimhead),
    depth = 6,
    # heads = 8,
    heads = NUM_HEADS,
    # mlp_dim = 512,
    mlp_dim = 256,
    dropout = 0.1,
    emb_dropout = 0.1,
    )

In [62]:


classes = ('plane', 'car', 'bird', 'cat', 'deer', 'dog', 'frog', 'horse', 'ship', 'truck')

# Model factory..
print('==> Building model..')
# net = VGG('VGG19')
if args.net=="vit":
    # ViT for cifar10
    net = get_vit()



if args.resume:
    # Load checkpoint.
    print('==> Resuming from checkpoint..')
    assert os.path.isdir('checkpoint'), 'Error: no checkpoint directory found!'
    checkpoint = torch.load('./checkpoint/{}-ckpt.t7'.format(args.net))
    net.load_state_dict(checkpoint['net'])
    best_acc = checkpoint['acc']
    start_epoch = checkpoint['epoch']

==> Building model..


In [63]:
from tqdm import tqdm

In [64]:

len(trainloader)

98

In [65]:

# trainloader[0]

In [66]:
# MAX_EPOCHS = 90
MAX_EPOCHS = 200

In [67]:
import numpy as np

# Loss is CE
criterion = nn.CrossEntropyLoss()

torch.manual_seed(42)
net = get_vit()

if args.opt == "adam":
    optimizer = optim.Adam(net.parameters(), lr=args.lr)
elif args.opt == "sgd":
    optimizer = optim.SGD(net.parameters(), lr=args.lr)

# use cosine scheduling
scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(optimizer, args.n_epochs)

##### Training
scaler = torch.amp.GradScaler('cuda',enabled=use_amp)
def train(epoch,save_flag, run_dir = None, img_idx = None):
    
    net.train()
    train_loss = 0
    correct = 0
    total = 0

    # img_factor = len(trainloader) // img_save_count
    # run_dir = os.path.join(run_dir,  {epoch:02}")

    # data_save=list()
    # main_list=list()
    for batch_idx, (inputs, targets) in enumerate(trainloader):
        inputs, targets = inputs.to(device), targets.to(device)
        # Train with amp
        with torch.amp.autocast('cuda',enabled=use_amp):
            # if(save_flag==True and batch_idx%img_factor==0):

            if(save_flag==True and batch_idx==0):
                batch_dir = os.path.join(run_dir, f'batch {batch_idx}')
                os.makedirs(batch_dir, exist_ok=True)
                # np.save(file_path, np_array)
                # print("\n\tpassed ",batch_dir, type(batch_dir))

                outputs = net(inputs, True, batch_dir, img_idx)
                # outputs = net(inputs, False, 12)
                #here can pass in net(inputs,image_saveflag=1) so it will save the image to disk by making changes in model.
            else:
                outputs = net(inputs)
            loss = criterion(outputs, targets)



        scaler.scale(loss).backward()
        scaler.step(optimizer)
        scaler.update()
        optimizer.zero_grad()

        train_loss += loss.item()
        _, predicted = outputs.max(1)
        total += targets.size(0)
        correct += predicted.eq(targets).sum().item()

    # data_save.append(net.transformer.saved_values)

#         progress_bar(batch_idx, len(trainloader), 'Loss: %.3f | Acc: %.3f%% (%d/%d)'
#             % (train_loss/(batch_idx+1), 100.*correct/total, correct, total))
#     return train_loss/(batch_idx+1),net.transformer.saved_values
    return train_loss/(batch_idx+1)
##### Validation
def test(epoch):
    global best_acc
    net.eval()
    test_loss = 0
    correct = 0
    total = 0
    with torch.no_grad():
        for batch_idx, (inputs, targets) in enumerate(testloader):
            inputs, targets = inputs.to(device), targets.to(device)
            outputs = net(inputs)
            loss = criterion(outputs, targets)

            test_loss += loss.item()
            _, predicted = outputs.max(1)
            total += targets.size(0)
            correct += predicted.eq(targets).sum().item()

#             progress_bar(batch_idx, len(testloader), 'Loss: %.3f | Acc: %.3f%% (%d/%d)'
#                 % (test_loss/(batch_idx+1), 100.*correct/total, correct, total))

    # Save checkpoint.
    acc = 100.*correct/total
#     if acc > best_acc:
#         print('Saving..')
#         state = {"model": net.state_dict(),
#               "optimizer": optimizer.state_dict(),
#               "scaler": scaler.state_dict()}
#         if not os.path.isdir('checkpoint'):
#             os.mkdir('checkpoint')
#         torch.save(state, './checkpoint/'+args.net+'-{}-ckpt.t7'.format(args.patch))
#         best_acc = acc

#     os.makedirs("log", exist_ok=True)

    os.makedirs("results", exist_ok=True)
    os.makedirs("results/log", exist_ok=True)
    content = f'Epoch {epoch}, lr: {optimizer.param_groups[0]["lr"]:.7f}, val loss: {test_loss:.5f}, acc: {(acc):.5f}'
    print(content)
#     with open(f'log/log_{args.net}_patch{args.patch}.txt', 'a') as appender:
#         appender.write(content + "\n")
    return test_loss, acc

list_loss = []
list_acc = []

if usewandb:
    wandb.watch(net)

    

# save_epochs-=1
batch_size = int(args.bs)
# max_epochs = args.n_epochs




if device == 'cuda':
  net.cuda()
main_list=list()
data_save=list()
n_param=5

run_dir = os.path.join(base_dir, f"run {run_number:02}")
os.makedirs(run_dir, exist_ok=True)
print("Run number ",run_number)






import shutil
from IPython.display import FileLink

# Specify the directory you want to compress
directory_name = run_dir
zip_filename = f'{run_dir}.zip'




run_number += 1

max_epochs = MAX_EPOCHS

# take_epoch_factor = 
img_save_count = 50 #IMP

img_idx = torch.randperm(batch_size)[:img_save_count]
img_idx= img_idx.sort()[0]
print("chosen images are of batch 0 and numbers : ",[x.item() for x in list(img_idx)])
file_path = os.path.join(run_dir, 'img_idx.npy')
np.save(file_path, img_idx.detach().cpu().numpy())



# save_epochs =  #IMP
# epoch_factor = max_epochs  // save_epochs #IMP

epoch_factor = 20 #IMP
patience_param=500
patience_counter=0


print(f"Saving results every {epoch_factor} epochs ")




print("Training started")
for i in tqdm(range(start_epoch, max_epochs), desc="Training"):
    epoch = i+1
    start = time.time()
    
#     if(epoch%epoch_factor==0 or epoch == 1 or epoch == max_epochs):
    if False:
      # Define the new run directory
        
        epoch_dir = os.path.join(run_dir, f"epoch {epoch:02}")
        # print("\n\tpassed into trainloss",run_dir)
        trainloss = train(epoch,True, run_dir = epoch_dir, img_idx = img_idx)
        print("saved epoch")
        # Compress the directory into a zip file, overwriting if it already exists
        shutil.make_archive(zip_filename.replace('.zip', ''), 'zip', directory_name)

#         print(f"Directory '{directory_name}' has been zipped as '{zip_filename}'.")
        print("Click here to download run  : ")
        display(FileLink(zip_filename))
    
    else:
        trainloss = train(epoch,False)



#     if(epoch%n_param!=0 or epoch==0):
#         data_save.append(saved_data)
#     else:
#         data_save.append(saved_data)
#         main_list.append(data_save)
#         data_save=list()
#     val_loss, acc = test(epoch)

    scheduler.step() # step cosine scheduling


    '''
    # Early stopping logic
    best_loss = float('inf')  # Initialize best loss to a high value
    val_loss, acc = test(epoch)
    
    if val_loss < best_loss:
        best_loss = val_loss
        patience_counter = 0  # Reset patience counter if loss improves

    else:
        patience_counter += 1  # Increment patience counter if no improvement

    # If patience limit is exceeded, stop training
    if patience_counter >= patience_param:
        save_model_state(net, epoch , val_loss, acc)
        print(f'Early stopping at epoch {epoch} due to no improvement')

        break  ## we break out of training loop after saving the model
     '''
    val_loss, acc = test(epoch)
    if (epoch%epoch_factor==0 or epoch == 1 or epoch == max_epochs and MODEL_SAVE_FLAG):
        
        
#         val_loss, acc = test(epoch)
        save_model_state(net, epoch , val_loss, acc)
        

#     list_loss.append(val_loss)
#     list_acc.append(acc)

    # Log training..
#     if usewandb:
#         wandb.log({'epoch': epoch, 'train_loss': trainloss, 'val_loss': val_loss, "val_acc": acc, "lr": optimizer.param_groups[0]["lr"],
#         "epoch_time": time.time()-start})

#     # Write out csv..
#     with open(f'log/log_{args.net}_patch{args.patch}.csv', 'w') as f:
#         writer = csv.writer(f, lineterminator='\n')
#         writer.writerow(list_loss)
#         writer.writerow(list_acc)
# #     print(list_loss)
    print()
    
# writeout wandb
if usewandb:
    wandb.save("wandb_{}.h5".format(args.net))


Run number  1
chosen images are of batch 0 and numbers :  [7, 31, 33, 38, 45, 57, 58, 59, 60, 98, 118, 126, 131, 135, 139, 141, 142, 143, 147, 155, 162, 184, 209, 219, 233, 245, 252, 280, 286, 296, 310, 327, 349, 351, 357, 365, 368, 399, 411, 422, 424, 425, 431, 442, 452, 457, 463, 481, 482, 502]
Saving results every 20 epochs 
Training started


Training:   0%|          | 1/200 [00:30<1:40:23, 30.27s/it]

Epoch 1, lr: 0.0005000, val loss: 46.50945, acc: 27.61000
Model state saved at epoch 1
Logged epoch 1 - Accuracy: 27.6100, Loss: 46.5094



Training:   1%|          | 2/200 [00:56<1:31:05, 27.60s/it]




Training:   2%|▏         | 3/200 [01:22<1:28:50, 27.06s/it]




Training:   2%|▏         | 4/200 [01:48<1:27:31, 26.79s/it]




Training:   2%|▎         | 5/200 [02:14<1:26:10, 26.51s/it]




Training:   3%|▎         | 6/200 [02:40<1:25:19, 26.39s/it]




Training:   4%|▎         | 7/200 [03:07<1:24:43, 26.34s/it]




Training:   4%|▍         | 8/200 [03:33<1:23:59, 26.25s/it]




Training:   4%|▍         | 9/200 [03:59<1:23:29, 26.23s/it]




Training:   5%|▌         | 10/200 [04:25<1:23:11, 26.27s/it]




Training:   6%|▌         | 11/200 [04:52<1:22:46, 26.28s/it]




Training:   6%|▌         | 12/200 [05:18<1:22:18, 26.27s/it]




Training:   6%|▋         | 13/200 [05:44<1:21:49, 26.25s/it]




Training:   7%|▋         | 14/200 [06:10<1:21:13, 26.20s/it]




Training:   8%|▊         | 15/200 [06:37<1:21:04, 26.29s/it]




Training:   8%|▊         | 16/200 [07:03<1:20:39, 26.30s/it]




Training:   8%|▊         | 17/200 [07:29<1:20:09, 26.28s/it]




Training:   9%|▉         | 18/200 [07:55<1:19:42, 26.28s/it]




Training:  10%|▉         | 19/200 [08:22<1:19:07, 26.23s/it]




Training:  10%|█         | 20/200 [08:52<1:22:51, 27.62s/it]

Epoch 20, lr: 0.0004980, val loss: 30.43110, acc: 52.60000
Model state saved at epoch 20
Logged epoch 20 - Accuracy: 52.6000, Loss: 30.4311



Training:  10%|█         | 21/200 [09:19<1:21:05, 27.18s/it]




Training:  11%|█         | 22/200 [09:45<1:19:52, 26.92s/it]




Training:  12%|█▏        | 23/200 [10:11<1:18:48, 26.72s/it]




Training:  12%|█▏        | 24/200 [10:37<1:17:56, 26.57s/it]




Training:  12%|█▎        | 25/200 [11:04<1:17:12, 26.47s/it]




Training:  13%|█▎        | 26/200 [11:30<1:16:30, 26.38s/it]




Training:  14%|█▎        | 27/200 [11:56<1:16:02, 26.37s/it]




Training:  14%|█▍        | 28/200 [12:22<1:15:31, 26.34s/it]




Training:  14%|█▍        | 29/200 [12:49<1:15:01, 26.32s/it]




Training:  15%|█▌        | 30/200 [13:15<1:14:30, 26.30s/it]




Training:  16%|█▌        | 31/200 [13:41<1:13:59, 26.27s/it]




Training:  16%|█▌        | 32/200 [14:07<1:13:33, 26.27s/it]




Training:  16%|█▋        | 33/200 [14:34<1:13:12, 26.30s/it]




Training:  17%|█▋        | 34/200 [15:00<1:12:39, 26.26s/it]




Training:  18%|█▊        | 35/200 [15:26<1:12:10, 26.24s/it]




Training:  18%|█▊        | 36/200 [15:52<1:11:46, 26.26s/it]




Training:  18%|█▊        | 37/200 [16:19<1:11:14, 26.23s/it]




Training:  19%|█▉        | 38/200 [16:45<1:10:52, 26.25s/it]




Training:  20%|█▉        | 39/200 [17:11<1:10:23, 26.23s/it]




Training:  20%|██        | 40/200 [17:42<1:13:42, 27.64s/it]

Epoch 40, lr: 0.0004921, val loss: 23.92619, acc: 62.25000
Model state saved at epoch 40
Logged epoch 40 - Accuracy: 62.2500, Loss: 23.9262



Training:  20%|██        | 41/200 [18:08<1:12:04, 27.20s/it]




Training:  21%|██        | 42/200 [18:34<1:10:49, 26.90s/it]




Training:  22%|██▏       | 43/200 [19:01<1:09:47, 26.67s/it]




Training:  22%|██▏       | 44/200 [19:27<1:08:56, 26.52s/it]




Training:  22%|██▎       | 45/200 [19:53<1:08:17, 26.43s/it]




Training:  23%|██▎       | 46/200 [20:19<1:07:39, 26.36s/it]




Training:  24%|██▎       | 47/200 [20:45<1:07:08, 26.33s/it]




Training:  24%|██▍       | 48/200 [21:11<1:06:32, 26.27s/it]




Training:  24%|██▍       | 49/200 [21:38<1:06:13, 26.32s/it]




Training:  25%|██▌       | 50/200 [22:04<1:05:47, 26.31s/it]




Training:  26%|██▌       | 51/200 [22:30<1:05:17, 26.29s/it]




Training:  26%|██▌       | 52/200 [22:57<1:04:46, 26.26s/it]




Training:  26%|██▋       | 53/200 [23:23<1:04:14, 26.22s/it]




Training:  27%|██▋       | 54/200 [23:49<1:03:47, 26.21s/it]




Training:  28%|██▊       | 55/200 [24:15<1:03:19, 26.20s/it]




Training:  28%|██▊       | 56/200 [24:41<1:02:50, 26.18s/it]




Training:  28%|██▊       | 57/200 [25:08<1:02:29, 26.22s/it]




Training:  29%|██▉       | 58/200 [25:34<1:02:04, 26.23s/it]




Training:  30%|██▉       | 59/200 [26:00<1:01:35, 26.21s/it]




Training:  30%|███       | 60/200 [26:31<1:04:38, 27.70s/it]

Epoch 60, lr: 0.0004824, val loss: 20.96637, acc: 67.77000
Model state saved at epoch 60
Logged epoch 60 - Accuracy: 67.7700, Loss: 20.9664



Training:  30%|███       | 61/200 [26:58<1:03:14, 27.30s/it]




Training:  31%|███       | 62/200 [27:24<1:02:04, 26.99s/it]




Training:  32%|███▏      | 63/200 [27:50<1:01:04, 26.75s/it]




Training:  32%|███▏      | 64/200 [28:16<1:00:15, 26.59s/it]




Training:  32%|███▎      | 65/200 [28:42<59:30, 26.44s/it]  




Training:  33%|███▎      | 66/200 [29:09<58:55, 26.38s/it]




Training:  34%|███▎      | 67/200 [29:35<58:21, 26.32s/it]




Training:  34%|███▍      | 68/200 [30:01<57:59, 26.36s/it]




Training:  34%|███▍      | 69/200 [30:27<57:27, 26.32s/it]




Training:  35%|███▌      | 70/200 [30:54<56:55, 26.28s/it]




Training:  36%|███▌      | 71/200 [31:20<56:22, 26.22s/it]




Training:  36%|███▌      | 72/200 [31:46<55:57, 26.23s/it]




Training:  36%|███▋      | 73/200 [32:12<55:28, 26.21s/it]




Training:  37%|███▋      | 74/200 [32:38<55:05, 26.24s/it]




Training:  38%|███▊      | 75/200 [33:05<54:37, 26.22s/it]




Training:  38%|███▊      | 76/200 [33:31<54:10, 26.21s/it]




Training:  38%|███▊      | 77/200 [33:57<53:55, 26.31s/it]




Training:  39%|███▉      | 78/200 [34:24<53:28, 26.30s/it]




Training:  40%|███▉      | 79/200 [34:50<52:58, 26.27s/it]




Training:  40%|████      | 80/200 [35:21<55:21, 27.68s/it]

Epoch 80, lr: 0.0004691, val loss: 19.65438, acc: 70.18000
Model state saved at epoch 80
Logged epoch 80 - Accuracy: 70.1800, Loss: 19.6544



Training:  40%|████      | 81/200 [35:47<54:01, 27.24s/it]




Training:  41%|████      | 82/200 [36:13<52:57, 26.93s/it]




Training:  42%|████▏     | 83/200 [36:39<52:05, 26.71s/it]




Training:  42%|████▏     | 84/200 [37:06<51:23, 26.58s/it]




Training:  42%|████▎     | 85/200 [37:32<50:44, 26.47s/it]




Training:  43%|████▎     | 86/200 [37:58<50:09, 26.40s/it]




Training:  44%|████▎     | 87/200 [38:24<49:37, 26.35s/it]




Training:  44%|████▍     | 88/200 [38:51<49:04, 26.29s/it]




Training:  44%|████▍     | 89/200 [39:17<48:37, 26.29s/it]




Training:  45%|████▌     | 90/200 [39:43<48:08, 26.26s/it]




Training:  46%|████▌     | 91/200 [40:09<47:37, 26.22s/it]




Training:  46%|████▌     | 92/200 [40:35<47:11, 26.22s/it]




Training:  46%|████▋     | 93/200 [41:01<46:41, 26.18s/it]




Training:  47%|████▋     | 94/200 [41:28<46:17, 26.21s/it]




Training:  48%|████▊     | 95/200 [41:54<45:52, 26.21s/it]




Training:  48%|████▊     | 96/200 [42:20<45:24, 26.20s/it]




Training:  48%|████▊     | 97/200 [42:46<44:59, 26.21s/it]




Training:  49%|████▉     | 98/200 [43:12<44:31, 26.19s/it]




Training:  50%|████▉     | 99/200 [43:39<44:03, 26.17s/it]




Training:  50%|█████     | 100/200 [44:10<46:02, 27.63s/it]

Epoch 100, lr: 0.0004523, val loss: 20.00005, acc: 71.48000
Model state saved at epoch 100
Logged epoch 100 - Accuracy: 71.4800, Loss: 20.0001



Training:  50%|█████     | 101/200 [44:36<44:55, 27.23s/it]




Training:  51%|█████     | 102/200 [45:02<43:56, 26.91s/it]




Training:  52%|█████▏    | 103/200 [45:28<43:08, 26.69s/it]




Training:  52%|█████▏    | 104/200 [45:55<42:31, 26.58s/it]




Training:  52%|█████▎    | 105/200 [46:21<41:51, 26.44s/it]




Training:  53%|█████▎    | 106/200 [46:47<41:21, 26.40s/it]




Training:  54%|█████▎    | 107/200 [47:13<40:49, 26.34s/it]




Training:  54%|█████▍    | 108/200 [47:39<40:17, 26.27s/it]




Training:  55%|█████▍    | 109/200 [48:06<39:50, 26.27s/it]




Training:  55%|█████▌    | 110/200 [48:32<39:20, 26.23s/it]




Training:  56%|█████▌    | 111/200 [48:58<38:54, 26.23s/it]




Training:  56%|█████▌    | 112/200 [49:24<38:25, 26.20s/it]




Training:  56%|█████▋    | 113/200 [49:50<38:00, 26.21s/it]




Training:  57%|█████▋    | 114/200 [50:17<37:33, 26.20s/it]




Training:  57%|█████▊    | 115/200 [50:43<37:07, 26.21s/it]




Training:  58%|█████▊    | 116/200 [51:09<36:39, 26.18s/it]




Training:  58%|█████▊    | 117/200 [51:35<36:15, 26.21s/it]




Training:  59%|█████▉    | 118/200 [52:01<35:47, 26.19s/it]




Training:  60%|█████▉    | 119/200 [52:27<35:20, 26.18s/it]




Training:  60%|██████    | 120/200 [52:58<36:49, 27.62s/it]

Epoch 120, lr: 0.0004322, val loss: 20.60483, acc: 71.60000
Model state saved at epoch 120
Logged epoch 120 - Accuracy: 71.6000, Loss: 20.6048



Training:  60%|██████    | 121/200 [53:24<35:45, 27.16s/it]




Training:  61%|██████    | 122/200 [53:51<34:52, 26.83s/it]




Training:  62%|██████▏   | 123/200 [54:17<34:12, 26.65s/it]




Training:  62%|██████▏   | 124/200 [54:43<33:35, 26.52s/it]




Training:  62%|██████▎   | 125/200 [55:09<33:00, 26.41s/it]




Training:  63%|██████▎   | 126/200 [55:35<32:30, 26.35s/it]




Training:  64%|██████▎   | 127/200 [56:01<31:58, 26.28s/it]




Training:  64%|██████▍   | 128/200 [56:28<31:37, 26.35s/it]




Training:  64%|██████▍   | 129/200 [56:54<31:08, 26.31s/it]




Training:  65%|██████▌   | 130/200 [57:20<30:39, 26.27s/it]




Training:  66%|██████▌   | 131/200 [57:47<30:11, 26.26s/it]




Training:  66%|██████▌   | 132/200 [58:13<29:44, 26.24s/it]




Training:  66%|██████▋   | 133/200 [58:39<29:15, 26.20s/it]




Training:  67%|██████▋   | 134/200 [59:05<28:52, 26.25s/it]




Training:  68%|██████▊   | 135/200 [59:32<28:26, 26.25s/it]




Training:  68%|██████▊   | 136/200 [59:58<27:59, 26.24s/it]




Training:  68%|██████▊   | 137/200 [1:00:24<27:31, 26.21s/it]




Training:  69%|██████▉   | 138/200 [1:00:50<27:04, 26.20s/it]




Training:  70%|██████▉   | 139/200 [1:01:16<26:40, 26.23s/it]




Training:  70%|███████   | 140/200 [1:01:47<27:37, 27.63s/it]

Epoch 140, lr: 0.0004094, val loss: 21.88950, acc: 72.16000
Model state saved at epoch 140
Logged epoch 140 - Accuracy: 72.1600, Loss: 21.8895



Training:  70%|███████   | 141/200 [1:02:14<26:45, 27.21s/it]




Training:  71%|███████   | 142/200 [1:02:40<25:59, 26.89s/it]




Training:  72%|███████▏  | 143/200 [1:03:06<25:20, 26.67s/it]




Training:  72%|███████▏  | 144/200 [1:03:32<24:46, 26.54s/it]




Training:  72%|███████▎  | 145/200 [1:03:58<24:17, 26.51s/it]




Training:  73%|███████▎  | 146/200 [1:04:25<23:45, 26.39s/it]




Training:  74%|███████▎  | 147/200 [1:04:51<23:17, 26.37s/it]




Training:  74%|███████▍  | 148/200 [1:05:17<22:47, 26.29s/it]




Training:  74%|███████▍  | 149/200 [1:05:43<22:19, 26.26s/it]




Training:  75%|███████▌  | 150/200 [1:06:09<21:50, 26.20s/it]




Training:  76%|███████▌  | 151/200 [1:06:36<21:24, 26.21s/it]




Training:  76%|███████▌  | 152/200 [1:07:02<20:57, 26.19s/it]




Training:  76%|███████▋  | 153/200 [1:07:28<20:30, 26.19s/it]




Training:  77%|███████▋  | 154/200 [1:07:54<20:05, 26.20s/it]




Training:  78%|███████▊  | 155/200 [1:08:20<19:37, 26.16s/it]




Training:  78%|███████▊  | 156/200 [1:08:47<19:14, 26.23s/it]




Training:  78%|███████▊  | 157/200 [1:09:13<18:48, 26.23s/it]




Training:  79%|███████▉  | 158/200 [1:09:39<18:20, 26.21s/it]




Training:  80%|███████▉  | 159/200 [1:10:05<17:54, 26.20s/it]




Training:  80%|████████  | 160/200 [1:10:36<18:23, 27.59s/it]

Epoch 160, lr: 0.0003840, val loss: 22.02450, acc: 72.81000
Model state saved at epoch 160
Logged epoch 160 - Accuracy: 72.8100, Loss: 22.0245



Training:  80%|████████  | 161/200 [1:11:02<17:37, 27.13s/it]




Training:  81%|████████  | 162/200 [1:11:28<17:02, 26.89s/it]




Training:  82%|████████▏ | 163/200 [1:11:55<16:27, 26.68s/it]




Training:  82%|████████▏ | 164/200 [1:12:21<15:54, 26.51s/it]




Training:  82%|████████▎ | 165/200 [1:12:47<15:23, 26.38s/it]




Training:  83%|████████▎ | 166/200 [1:13:13<14:54, 26.31s/it]




Training:  84%|████████▎ | 167/200 [1:13:39<14:26, 26.25s/it]




Training:  84%|████████▍ | 168/200 [1:14:05<13:59, 26.25s/it]




Training:  84%|████████▍ | 169/200 [1:14:31<13:33, 26.23s/it]




Training:  85%|████████▌ | 170/200 [1:14:58<13:06, 26.22s/it]




Training:  86%|████████▌ | 171/200 [1:15:24<12:40, 26.21s/it]




Training:  86%|████████▌ | 172/200 [1:15:50<12:12, 26.17s/it]




Training:  86%|████████▋ | 173/200 [1:16:16<11:49, 26.29s/it]




Training:  87%|████████▋ | 174/200 [1:16:43<11:22, 26.25s/it]




Training:  88%|████████▊ | 175/200 [1:17:09<10:55, 26.23s/it]




Training:  88%|████████▊ | 176/200 [1:17:35<10:28, 26.20s/it]




Training:  88%|████████▊ | 177/200 [1:18:01<10:01, 26.17s/it]




Training:  89%|████████▉ | 178/200 [1:18:27<09:35, 26.16s/it]




Training:  90%|████████▉ | 179/200 [1:18:53<09:09, 26.18s/it]




Training:  90%|█████████ | 180/200 [1:19:24<09:12, 27.62s/it]

Epoch 180, lr: 0.0003564, val loss: 24.14222, acc: 72.07000
Model state saved at epoch 180
Logged epoch 180 - Accuracy: 72.0700, Loss: 24.1422



Training:  90%|█████████ | 180/200 [1:19:33<08:50, 26.52s/it]


KeyboardInterrupt: 

In [68]:

import shutil
from IPython.display import FileLink
zip_filename = 'checkpoints.zip'
directory_name = 'checkpoints'
shutil.make_archive(zip_filename.replace('.zip', ''), 'zip', directory_name)

#         print(f"Directory '{directory_name}' has been zipped as '{zip_filename}'.")
print("Click here to download checkpoints  : ")
display(FileLink(zip_filename))

Click here to download checkpoints  : 


In [69]:
import os

# Specify the path to your file
file_path = 'checkpoints.zip'

# Get the size of the file in bytes
file_size = os.path.getsize(file_path)

print(f"The size of the file is : {(file_size/(2**20)):.0f} MB")


The size of the file is : 281 MB


In [71]:
# !rm -rf checkpoints.zip

In [None]:
raise ZeroDivisionError

In [None]:
!cd data
!ls


In [None]:
!ls

In [None]:
# !rm -rf results
# !rm -rf log


In [None]:
# 

In [None]:
# img_idx.detach().numpy()

In [None]:
# import shutil
# from IPython.display import FileLink

# # Specify the directory you want to compress
# directory_name = 'log'
# zip_filename = 'log.zip'

# # Compress the directory into a zip file, overwriting if it already exists
# shutil.make_archive(zip_filename.replace('.zip', ''), 'zip', directory_name)

# # Optionally generate and display a download link
# print(f"Directory '{directory_name}' has been zipped as '{zip_filename}'.")
# FileLink(zip_filename)


In [None]:

net

In [None]:

/content/results/runs/run 04/epoch 00/batch 0/layer 01

In [None]:
!ls

In [None]:
cd 

In [None]:

import os

# Define the directory path you want to check
directory_path = r'results/runs/run 03/epoch 00/batch 0/layer 01/'

# Check if the directory exists
if os.path.isdir(directory_path):
    print(f"The directory '{directory_path}' exists.")
else:
    print(f"The directory '{directory_path}' does not exist.")


In [None]:
file_path = r'results/runs/run 03/epoch 00/batch 0/layer 01/01_attention_out.npy'

# Load the NumPy array from the file
data = np.load(file_path)

In [None]:
data.shape

In [None]:

data.shape
# shape : batch x head x X x Y