In [None]:
!pip install tqdm
!pip install scikit-learn

Collecting scikit-learn
  Downloading scikit_learn-1.5.0-cp39-cp39-win_amd64.whl.metadata (11 kB)
Collecting scipy>=1.6.0 (from scikit-learn)
  Downloading scipy-1.13.1-cp39-cp39-win_amd64.whl.metadata (60 kB)
     ---------------------------------------- 0.0/60.6 kB ? eta -:--:--
     ---------------------------------------- 0.0/60.6 kB ? eta -:--:--
     ------ --------------------------------- 10.2/60.6 kB ? eta -:--:--
     ------------------------- ------------ 41.0/60.6 kB 393.8 kB/s eta 0:00:01
     -------------------------------------- 60.6/60.6 kB 402.7 kB/s eta 0:00:00
Collecting joblib>=1.2.0 (from scikit-learn)
  Using cached joblib-1.4.2-py3-none-any.whl.metadata (5.4 kB)
Collecting threadpoolctl>=3.1.0 (from scikit-learn)
  Using cached threadpoolctl-3.5.0-py3-none-any.whl.metadata (13 kB)
Downloading scikit_learn-1.5.0-cp39-cp39-win_amd64.whl (11.0 MB)
   ---------------------------------------- 0.0/11.0 MB ? eta -:--:--
   ---------------------------------------- 0.1/1

In [None]:
# importing libraries
from PIL import Image
import torch
from torch import nn
import os
from torch.utils.data import Dataset, DataLoader
from PIL import Image
import numpy as np
from torch.optim import Adam
from torch.nn import CrossEntropyLoss
from tqdm import trange, tqdm
from torchvision import models, transforms
import torchvision.transforms.functional as TF


In [None]:
# dataloader with features also output
import os
import torch
import torchvision.transforms as transforms
from PIL import Image
from torchvision.models import resnet50

class oct_data(Dataset):
    def __init__(self, root_dir, transform=None):
        self.root_dir = root_dir
        self.transform = transform
        self.data = []
        self.labels = []
        self.max = 300
        self.features = []

        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

        # Load ResNet model for feature extraction
        resnet = resnet50(pretrained=True)
        self.resnet = torch.nn.Sequential(*(list(resnet.children())[:-1])).to(self.device)

        # resnet = ResNet50(weights='imagenet', include_top=False, input_shape=(224,224,3))
        # resnet.trainable = False
        # x = GlobalAveragePooling2D()(resnet.output)
        # self.resnet = Model(inputs = resnet.input, outputs=x)
        # -----------

        classes = sorted(os.listdir(root_dir))

        for class_idx, folder in enumerate(classes):
            class_dir = os.path.join(root_dir, folder)
            for p in os.listdir(class_dir):
                path = os.path.join(class_dir, p)
                self.data.append(path)
                self.labels.append(class_idx)
                # self.max = max(len(os.listdir(path)), self.max)

        # Preprocess transforms for images
        self.preprocess = transforms.Compose([
            transforms.Resize((224, 224)),
            transforms.ToTensor(),
            transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
        ])

        # Precompute and cache features during initialization
        self.compute_features()

    def compute_features(self):
        for data_path in self.data:
            features = []
            for path in os.listdir(data_path):
                image_path = os.path.join(data_path, path)
                img = Image.open(image_path)
                img = img.convert('RGB')
                img = self.preprocess(img)
                # print(img.shape)
                with torch.no_grad():
                    img = img.unsqueeze(0).to(self.device)  # Add batch dimension
                    feature = self.resnet(img)
                features.append(feature)
                # print(feature.shape)
            # Pad or truncate features to a fixed length
            # features = self.pad(features)
            self.features.append(features)

    def pad(self, features):
        max_length = self.max
        while len(features) < max_length:
            features.extend(features[::-1] + features[:])
        return features[:max_length]

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        label = self.labels[idx]
        original_images = []

        for path in os.listdir(self.data[idx]):
            image_path = os.path.join(self.data[idx], path)
            img = Image.open(image_path)
            original_images.append(img)

        original_images = [transforms.ToTensor()(img) for img in original_images]


        original_images = self.pad(original_images)
        features = self.pad(self.features[idx])

        out = torch.stack(features)
        out = torch.squeeze(out, dim=-1)
        out = torch.squeeze(out, dim=-1) # removing the last 2 dimensions of the features

        return torch.stack(original_images), out, label


In [None]:
def patchify(images, n_patches):
    b, n, c, h, w = images.shape

    assert h == w, "Patchify method is implemented for square images only"

    patches = torch.zeros(b, n, n_patches ** 2, h * w * c // n_patches ** 2)
    patch_size = h // n_patches

    for idx, data_instance in enumerate(images):
      for image_idx, image in enumerate(data_instance):
        for i in range(n_patches):
            for j in range(n_patches):
                patch = image[:, i * patch_size: (i + 1) * patch_size, j * patch_size: (j + 1) * patch_size]
                patches[idx, image_idx, i * n_patches + j] = patch.flatten()
    return patches

def get_positional_embeddings(sequence_length, d):
    result = torch.ones(sequence_length, d)
    for i in range(sequence_length):
        for j in range(d):
            result[i][j] = np.sin(i / (10000 ** (j / d))) if j % 2 == 0 else np.cos(i / (10000 ** ((j - 1) / d)))
    return result


In [None]:

class SelfAttention(nn.Module):
  def __init__(self, d, n_heads=2):
    super(SelfAttention, self).__init__()
    self.d = d
    self.n_heads = n_heads

    assert d % n_heads == 0

    d_head = int(d / n_heads)

    self.q_mappings = nn.ModuleList([nn.Linear(d_head, d_head) for _ in range(n_heads)])
    self.k_mappings = nn.ModuleList([nn.Linear(d_head, d_head) for _ in range(n_heads)])
    self.v_mappings = nn.ModuleList([nn.Linear(d_head, d_head) for _ in range(n_heads)])

    self.d_head = d_head
    self.softmax = nn.Softmax(dim=-1)


  # understand this once? is this correct?
  def forward(self, query, key_value):
        """
        query and key_value have shape (batch_size, seq_length, token_dim)
        The token_dim must be divisible by n_heads
        """

        result = []
        for q, kv in zip(query, key_value):
            seq_result = []
            for head in range(self.n_heads):
                q_mapping = self.q_mappings[head]
                k_mapping = self.k_mappings[head]
                v_mapping = self.v_mappings[head]

                q_1 = q[:, head * self.d_head: (head + 1) * self.d_head]
                kv_1 = kv[:, head * self.d_head : (head + 1) * self.d_head]


                q1, k1, v1 = q_mapping(q_1), k_mapping(kv_1), v_mapping(kv_1)

                attention = self.softmax(q1 @ k1.T / (self.d_head ** 0.5))
                seq_result.append(attention @ v1)
            result.append(torch.hstack(seq_result))
        return torch.cat([torch.unsqueeze(r, dim=0) for r in result])

In [None]:

class AttentionBlock(nn.Module):
  def __init__(self, hidden_d, n_heads, mlp_ratio=4, dropout=0.4):
    super(AttentionBlock, self).__init__()
    self.hidden_d = hidden_d
    self.n_heads = n_heads

    self.norm1 = nn.LayerNorm(hidden_d)
    self.mhsa = SelfAttention(hidden_d, n_heads)
    self.norm2 = nn.LayerNorm(hidden_d)
    self.mlp = nn.Sequential(
        nn.Linear(hidden_d, mlp_ratio * hidden_d),
        nn.GELU(),
        nn.Linear(mlp_ratio * hidden_d, hidden_d),
        nn.Dropout(dropout)
    )

  def forward(self, query, key_value = None):
    if key_value == None:
      key_value = query

    out = query + self.mhsa(self.norm1(query), self.norm1(key_value))
    out = out + self.mlp(self.norm2(out))
    return out

In [None]:
class MyViT(nn.Module): # change input shape and perform transformation inside the block
  def __init__(self, device=torch.device('cpu'), nchw1=(300, 1, 300, 300), nchw2=(300, 1, 2048),
               n_patches=15, hidden_d = 256,
               n_blocks = 1, n_heads = 4, out_d = 3, dropout=0.2):
    # Super constructor
    super(MyViT, self).__init__()

    # Attributes
    self.nchw1 = nchw1 # (N, C, H, W)
    self.nchw2 = nchw2 # (N, C, D)

    self.n_patches = n_patches
    self.hidden_d = hidden_d
    self.device = device

    # part 1 ------------------------------
    assert nchw1[2] % n_patches == 0, "Input shape not entirely divisible by number of patches"
    assert nchw1[3] % n_patches == 0, "Input shape not entirely divisible by number of patches"

    self.patch_size = (nchw1[2] / n_patches, nchw1[3] / n_patches)

    # 1) Linear mapper
    self.input_d1 = int(nchw1[1] * self.patch_size[0] * self.patch_size[1])
    self.linear_mapper1 = nn.Linear(self.input_d1, self.hidden_d)

    self.input_d2 = int(nchw2[1] * nchw2[2])
    self.linear_mapper2 = nn.Linear(self.input_d2, self.hidden_d)

    # 2) Learnable classifiation token
    self.class_token1 = nn.Parameter(torch.rand(1, n_patches**2, self.hidden_d)) # adding a class across the number of images dimension
    self.class_token2 = nn.Parameter(torch.rand(1, self.nchw2[-2],self.hidden_d))

    # 3) Positional embedding
    self.pos_embed = nn.Parameter(torch.tensor(get_positional_embeddings(self.n_patches ** 2, self.hidden_d).clone().detach()))
    # self.pos_embed = nn.Parameter(torch.tensor(get_positional_embeddings(self.n_patches ** 2 + 1, self.hidden_d).clone().detach()))
    self.pos_embed.requires_grad = False

    # 4 transformer
    self.blocks = nn.ModuleList([AttentionBlock(hidden_d, n_heads) for _ in range(n_blocks)])


    # # 5 classification block
    self.mlp = nn.Sequential(
        nn.Linear(self.hidden_d, out_d),
        nn.Dropout(dropout),
        nn.Softmax(dim=-1)
        # nn.Sigmoid()
    )

    self.apply(self._init_weights)
  def _init_weights(self, module):
    if isinstance(module, nn.Linear):
        nn.init.xavier_uniform_(module.weight)
        if module.bias is not None:
            nn.init.constant_(module.bias, 0)
    elif isinstance(module, nn.LayerNorm):
        nn.init.constant_(module.bias, 0)
        nn.init.constant_(module.weight, 1.0)


  def forward(self, data1, data2):

    # part 1 -------- start
    # print(data1.shape, data2.shape)
    b1, n1, c1, h1, w1 = data1.shape
    b2, n2, c2, d2 = data2.shape

    patches = patchify(data1, self.n_patches).to(self.device)
    tokens1 = self.linear_mapper1(patches)

    tokens2 = self.linear_mapper2(data2)

    tokens2 = tokens2.view(b2, n2*c2, 512)


    # Adding classification token to the tokens

    tokens2 = torch.cat((self.class_token2.expand(b2, 1, -1), tokens2), dim=1)

    # tokens2 = torch.stack([torch.vstack((self.class_token2, tokens2[i])) for i in range(len(tokens2))])

    # print(tokens1.shape, tokens2.shape)
    # for batch in range(b):
    # Adding positional embedding
    pos_embed = self.pos_embed.repeat(b1, n1, 1, 1)
    tokens1 = tokens1 + pos_embed

    # tokens1 = torch.cat((self.class_token1.expand(b1, 1, tokens1.shape[2], 512), tokens1), dim=1)
    tokens1 = torch.stack([torch.vstack((self.class_token1, tokens1[i])) for i in range(len(tokens1))])


    # pos_embed = self.pos_embed.repeat(b2, n2+1, 1, 1)
    # tokens2 = tokens2 + pos_embed

    b_1, n_1, p_1, d_1 = tokens1.shape
    # b_2, n_2, c_2, d_2 = tokens2.shape




    for block in self.blocks:
      tokens2 = block(tokens2)

    print("self attention 2 done")
    new = []

    for i in range(b_1):
      out_ = tokens1[i]
      for block in self.blocks:
        out_ = block(out_)

      new.append(out_)

    print("self attention 1 done")
    tokens1 = torch.stack(new)
    # print(tokens1.shape)

    tokens1 = tokens1.view(b_1, n_1*p_1, d_1)




    # print(tokens1.shape, tokens2.shape)
    # cross attention


    for block in self.blocks:
      out_new = block(tokens2, tokens1)

    print('cross attention 1 done')
    # print(out_new.shape)

    # again self attention
    for block in self.blocks:
      out_new = block(out_new)

    # print(out_new.shape)
    print('self attention 3 done')

    out_new = out_new[:,0]

    return self.mlp(out_new)



In [None]:
# model = MyViT((300, 1, 300, 300), n_patches=15, n_blocks=2, hidden_d=8, n_heads=2, out_d=3)
# model(data)
torch.cuda.empty_cache()
from torch.optim.lr_scheduler import StepLR

In [None]:
from sklearn.model_selection import train_test_split
import torch.nn.functional as F
#root_dir = "C:/Users/SCAAI_Vaibhv/Desktop/arya/rcnn transformer/data-20240615T041314Z-001/data"
#dataset = oct_data(root_dir)


#train_data, test_data = train_test_split(dataset, test_size=0.3, random_state=42)

all_labels = []
for _, _, label in dataset:
    all_labels.append(label)
all_labels = torch.tensor(all_labels)

dataloader = DataLoader(train_data, batch_size=1, shuffle=True)
test_loader = DataLoader(test_data, batch_size=1, shuffle=True)

#_, _, labels = dataset
class_counts = torch.bincount(all_labels)
print(len(class_counts))
class_weights = 1. / class_counts.float()

# If you have 3 classes, for example:
class_weights = class_weights / class_weights.sum()
print(class_weights)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
criterion = nn.CrossEntropyLoss(weight=class_weights.to(device))
#model = MyViT(device, n_patches=15, n_blocks=2, hidden_d=512, n_heads=4, out_d=3).to(device)
#model.load_state_dict(torch.load('model_5.pt'))
train_losses = []
train_accuracies = []
#
def train(dataloader, test_loader, model):



    N_EPOCHS = 100
    LR = 0.00089
    optimizer = Adam(model.parameters(), lr=LR, weight_decay=1e-5)
    scheduler = StepLR(optimizer, step_size=4, gamma=0.1)
    #criterion = nn.CrossEntropyLoss()




    for epoch in trange(N_EPOCHS, desc="Training"):
        train_loss = 0.0
        correct = 0
        total = 0
        model.train()
        for batch in tqdm(dataloader, desc=f"Epoch {epoch + 1} in training", leave=False):
            x1, x2, y = batch
            x1, x2, y = x1.to(device), x2.to(device), y.to(device)

            optimizer.zero_grad()
            y_hat = model(x1, x2)

            print(y_hat, y)

            loss = criterion(y_hat, y)
            train_loss += loss.item()
            loss.backward()

            torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)

            optimizer.step()

            _, predicted = torch.max(y_hat.data, 1)
            total += y.size(0)
            correct += (predicted == y).sum().item()

        train_loss /= len(dataloader)
        train_accuracy = correct / total
        train_losses.append(train_loss)
        train_accuracies.append(train_accuracy)


        scheduler.step()
        if (epoch+1)%5 == 0:
            torch.save(model.state_dict(), f"model_{epoch+1}.pt")

    torch.save(model.state_dict(), f"model_100.pt")


if __name__ == "__main__":
    train(dataloader, test_loader, model)

3
tensor([0.2564, 0.3590, 0.3846])


Training:   0%|                                                                                | 0/100 [00:00<?, ?it/s]
Epoch 1 in training:   0%|                                                                      | 0/70 [00:00<?, ?it/s][A

self attention 2 done
self attention 1 done
cross attention 1 done
self attention 3 done
tensor([[0., 0., 1.]], device='cuda:0', grad_fn=<SoftmaxBackward0>) tensor([0], device='cuda:0')



Epoch 1 in training:   1%|▉                                                             | 1/70 [00:16<18:34, 16.16s/it][A

self attention 2 done
self attention 1 done
cross attention 1 done
self attention 3 done
tensor([[0., 0., 1.]], device='cuda:0', grad_fn=<SoftmaxBackward0>) tensor([2], device='cuda:0')



Epoch 1 in training:   3%|█▊                                                            | 2/70 [00:32<18:18, 16.15s/it][A

self attention 2 done
self attention 1 done
cross attention 1 done
self attention 3 done
tensor([[0.0000e+00, 3.2931e-43, 1.0000e+00]], device='cuda:0',
       grad_fn=<SoftmaxBackward0>) tensor([2], device='cuda:0')



Epoch 1 in training:   4%|██▋                                                           | 3/70 [00:47<17:20, 15.54s/it][A

self attention 2 done
self attention 1 done
cross attention 1 done
self attention 3 done
tensor([[0.5000, 0.0000, 0.5000]], device='cuda:0', grad_fn=<SoftmaxBackward0>) tensor([1], device='cuda:0')



Epoch 1 in training:   6%|███▌                                                          | 4/70 [01:02<16:56, 15.41s/it][A

self attention 2 done
self attention 1 done
cross attention 1 done
self attention 3 done
tensor([[0., 0., 1.]], device='cuda:0', grad_fn=<SoftmaxBackward0>) tensor([0], device='cuda:0')



Epoch 1 in training:   7%|████▍                                                         | 5/70 [01:17<16:28, 15.21s/it][A

self attention 2 done
self attention 1 done
cross attention 1 done
self attention 3 done
tensor([[2.0039e-43, 2.1193e-20, 1.0000e+00]], device='cuda:0',
       grad_fn=<SoftmaxBackward0>) tensor([0], device='cuda:0')



Epoch 1 in training:   9%|█████▎                                                        | 6/70 [01:32<16:14, 15.22s/it][A

self attention 2 done
self attention 1 done
cross attention 1 done
self attention 3 done
tensor([[3.0391e-22, 1.7082e-13, 1.0000e+00]], device='cuda:0',
       grad_fn=<SoftmaxBackward0>) tensor([1], device='cuda:0')



Epoch 1 in training:  10%|██████▏                                                       | 7/70 [01:47<15:49, 15.07s/it][A

self attention 2 done
self attention 1 done
cross attention 1 done
self attention 3 done
tensor([[1.2089e-40, 1.8559e-33, 1.0000e+00]], device='cuda:0',
       grad_fn=<SoftmaxBackward0>) tensor([1], device='cuda:0')



Epoch 1 in training:  11%|███████                                                       | 8/70 [02:02<15:32, 15.04s/it][A

self attention 2 done
self attention 1 done
cross attention 1 done
self attention 3 done
tensor([[4.3548e-30, 2.5149e-24, 1.0000e+00]], device='cuda:0',
       grad_fn=<SoftmaxBackward0>) tensor([2], device='cuda:0')



Epoch 1 in training:  13%|███████▉                                                      | 9/70 [02:17<15:17, 15.05s/it][A

self attention 2 done
self attention 1 done
cross attention 1 done
self attention 3 done
tensor([[2.6883e-08, 1.1489e-06, 1.0000e+00]], device='cuda:0',
       grad_fn=<SoftmaxBackward0>) tensor([0], device='cuda:0')



Epoch 1 in training:  14%|████████▋                                                    | 10/70 [02:32<15:15, 15.26s/it][A

self attention 2 done
self attention 1 done
cross attention 1 done
self attention 3 done
tensor([[5.2506e-13, 5.7884e-18, 1.0000e+00]], device='cuda:0',
       grad_fn=<SoftmaxBackward0>) tensor([1], device='cuda:0')



Epoch 1 in training:  16%|█████████▌                                                   | 11/70 [02:49<15:23, 15.65s/it][A

self attention 2 done
self attention 1 done
cross attention 1 done
self attention 3 done
tensor([[7.2680e-08, 5.0000e-01, 5.0000e-01]], device='cuda:0',
       grad_fn=<SoftmaxBackward0>) tensor([0], device='cuda:0')



Epoch 1 in training:  17%|██████████▍                                                  | 12/70 [03:06<15:28, 16.01s/it][A

self attention 2 done
self attention 1 done
cross attention 1 done
self attention 3 done
tensor([[8.1828e-05, 9.9992e-01, 1.3115e-15]], device='cuda:0',
       grad_fn=<SoftmaxBackward0>) tensor([2], device='cuda:0')



Epoch 1 in training:  19%|███████████▎                                                 | 13/70 [03:21<15:06, 15.90s/it][A

self attention 2 done
self attention 1 done
cross attention 1 done
self attention 3 done
tensor([[1.0000e+00, 1.5152e-15, 3.7588e-23]], device='cuda:0',
       grad_fn=<SoftmaxBackward0>) tensor([1], device='cuda:0')



Epoch 1 in training:  20%|████████████▏                                                | 14/70 [03:37<14:43, 15.78s/it][A

self attention 2 done
self attention 1 done
cross attention 1 done
self attention 3 done
tensor([[1.0000e+00, 4.4187e-18, 4.3060e-28]], device='cuda:0',
       grad_fn=<SoftmaxBackward0>) tensor([2], device='cuda:0')



Epoch 1 in training:  21%|█████████████                                                | 15/70 [03:52<14:13, 15.53s/it][A

self attention 2 done
self attention 1 done
cross attention 1 done
self attention 3 done
tensor([[1.0000e+00, 7.3801e-20, 5.7988e-33]], device='cuda:0',
       grad_fn=<SoftmaxBackward0>) tensor([1], device='cuda:0')



Epoch 1 in training:  23%|█████████████▉                                               | 16/70 [04:07<13:57, 15.52s/it][A

self attention 2 done
self attention 1 done
cross attention 1 done
self attention 3 done
tensor([[1.0000e+00, 1.2480e-21, 2.0131e-33]], device='cuda:0',
       grad_fn=<SoftmaxBackward0>) tensor([2], device='cuda:0')



Epoch 1 in training:  24%|██████████████▊                                              | 17/70 [04:22<13:30, 15.30s/it][A

self attention 2 done
self attention 1 done
cross attention 1 done
self attention 3 done
tensor([[1.0000e+00, 1.6253e-22, 3.9033e-34]], device='cuda:0',
       grad_fn=<SoftmaxBackward0>) tensor([1], device='cuda:0')



Epoch 1 in training:  26%|███████████████▋                                             | 18/70 [04:37<13:08, 15.17s/it][A

self attention 2 done
self attention 1 done
cross attention 1 done
self attention 3 done
tensor([[1.0000e+00, 8.2212e-24, 2.9978e-32]], device='cuda:0',
       grad_fn=<SoftmaxBackward0>) tensor([1], device='cuda:0')



Epoch 1 in training:  27%|████████████████▌                                            | 19/70 [04:52<12:47, 15.04s/it][A

self attention 2 done
self attention 1 done
cross attention 1 done
self attention 3 done
tensor([[5.0000e-01, 5.0000e-01, 8.4999e-08]], device='cuda:0',
       grad_fn=<SoftmaxBackward0>) tensor([2], device='cuda:0')



Epoch 1 in training:  29%|█████████████████▍                                           | 20/70 [05:07<12:27, 14.95s/it][A

self attention 2 done
self attention 1 done
cross attention 1 done
self attention 3 done
tensor([[1.0000e+00, 1.9665e-17, 8.9023e-26]], device='cuda:0',
       grad_fn=<SoftmaxBackward0>) tensor([1], device='cuda:0')



Epoch 1 in training:  30%|██████████████████▎                                          | 21/70 [05:22<12:13, 14.97s/it][A

self attention 2 done
self attention 1 done
cross attention 1 done
self attention 3 done
tensor([[1.0000e+00, 3.6159e-14, 1.5514e-20]], device='cuda:0',
       grad_fn=<SoftmaxBackward0>) tensor([1], device='cuda:0')



Epoch 1 in training:  31%|███████████████████▏                                         | 22/70 [05:36<11:53, 14.87s/it][A

self attention 2 done
self attention 1 done
cross attention 1 done
self attention 3 done
tensor([[1.0000e+00, 2.5165e-11, 2.6135e-16]], device='cuda:0',
       grad_fn=<SoftmaxBackward0>) tensor([2], device='cuda:0')



Epoch 1 in training:  33%|████████████████████                                         | 23/70 [05:51<11:35, 14.79s/it][A

self attention 2 done
self attention 1 done
cross attention 1 done
self attention 3 done
tensor([[0.4873, 0.4873, 0.0254]], device='cuda:0', grad_fn=<SoftmaxBackward0>) tensor([0], device='cuda:0')



Epoch 1 in training:  34%|████████████████████▉                                        | 24/70 [06:06<11:19, 14.78s/it][A

self attention 2 done
self attention 1 done
cross attention 1 done
self attention 3 done
tensor([[1.0000e+00, 1.6773e-17, 1.0251e-25]], device='cuda:0',
       grad_fn=<SoftmaxBackward0>) tensor([0], device='cuda:0')



Epoch 1 in training:  36%|█████████████████████▊                                       | 25/70 [06:20<10:59, 14.66s/it][A

self attention 2 done
self attention 1 done
cross attention 1 done
self attention 3 done
tensor([[1.0000e+00, 2.1939e-24, 3.7268e-35]], device='cuda:0',
       grad_fn=<SoftmaxBackward0>) tensor([2], device='cuda:0')



Epoch 1 in training:  37%|██████████████████████▋                                      | 26/70 [06:35<10:49, 14.76s/it][A

self attention 2 done
self attention 1 done
cross attention 1 done
self attention 3 done
tensor([[1.0000e+00, 2.1102e-26, 2.6469e-40]], device='cuda:0',
       grad_fn=<SoftmaxBackward0>) tensor([1], device='cuda:0')



Epoch 1 in training:  39%|███████████████████████▌                                     | 27/70 [06:50<10:41, 14.92s/it][A

self attention 2 done
self attention 1 done
cross attention 1 done
self attention 3 done
tensor([[1.0000e+00, 4.9214e-27, 1.1280e-42]], device='cuda:0',
       grad_fn=<SoftmaxBackward0>) tensor([1], device='cuda:0')



Epoch 1 in training:  40%|████████████████████████▍                                    | 28/70 [07:05<10:23, 14.85s/it][A

self attention 2 done
self attention 1 done
cross attention 1 done
self attention 3 done
tensor([[1.0000e+00, 1.8300e-29, 0.0000e+00]], device='cuda:0',
       grad_fn=<SoftmaxBackward0>) tensor([0], device='cuda:0')



Epoch 1 in training:  41%|█████████████████████████▎                                   | 29/70 [07:19<10:04, 14.76s/it][A

self attention 2 done
self attention 1 done
cross attention 1 done
self attention 3 done
tensor([[1.0000e+00, 3.8081e-29, 0.0000e+00]], device='cuda:0',
       grad_fn=<SoftmaxBackward0>) tensor([0], device='cuda:0')



Epoch 1 in training:  43%|██████████████████████████▏                                  | 30/70 [07:34<09:52, 14.82s/it][A

self attention 2 done
self attention 1 done
cross attention 1 done
self attention 3 done
tensor([[1.0000e+00, 6.0077e-29, 0.0000e+00]], device='cuda:0',
       grad_fn=<SoftmaxBackward0>) tensor([0], device='cuda:0')



Epoch 1 in training:  44%|███████████████████████████                                  | 31/70 [07:49<09:34, 14.74s/it][A

self attention 2 done
self attention 1 done
cross attention 1 done
self attention 3 done
tensor([[1.0000e+00, 1.0435e-29, 0.0000e+00]], device='cuda:0',
       grad_fn=<SoftmaxBackward0>) tensor([1], device='cuda:0')



Epoch 1 in training:  46%|███████████████████████████▉                                 | 32/70 [08:03<09:09, 14.45s/it][A

self attention 2 done
self attention 1 done
cross attention 1 done
self attention 3 done
tensor([[1.0000e+00, 3.4373e-31, 0.0000e+00]], device='cuda:0',
       grad_fn=<SoftmaxBackward0>) tensor([0], device='cuda:0')



Epoch 1 in training:  47%|████████████████████████████▊                                | 33/70 [08:18<08:59, 14.57s/it][A

self attention 2 done
self attention 1 done
cross attention 1 done
self attention 3 done
tensor([[1.0000e+00, 5.7106e-33, 0.0000e+00]], device='cuda:0',
       grad_fn=<SoftmaxBackward0>) tensor([1], device='cuda:0')



Epoch 1 in training:  49%|█████████████████████████████▋                               | 34/70 [08:33<08:51, 14.76s/it][A

self attention 2 done
self attention 1 done
cross attention 1 done
self attention 3 done
tensor([[1.0000e+00, 4.4047e-34, 0.0000e+00]], device='cuda:0',
       grad_fn=<SoftmaxBackward0>) tensor([0], device='cuda:0')



Epoch 1 in training:  50%|██████████████████████████████▌                              | 35/70 [08:48<08:37, 14.79s/it][A

self attention 2 done
self attention 1 done
cross attention 1 done
self attention 3 done
tensor([[1.0000e+00, 2.2375e-31, 2.2375e-31]], device='cuda:0',
       grad_fn=<SoftmaxBackward0>) tensor([0], device='cuda:0')



Epoch 1 in training:  51%|███████████████████████████████▎                             | 36/70 [09:02<08:19, 14.70s/it][A

self attention 2 done
self attention 1 done
cross attention 1 done
self attention 3 done
tensor([[1.0000e+00, 7.2261e-34, 6.1808e-32]], device='cuda:0',
       grad_fn=<SoftmaxBackward0>) tensor([0], device='cuda:0')



Epoch 1 in training:  53%|████████████████████████████████▏                            | 37/70 [09:17<08:09, 14.83s/it][A

self attention 2 done
self attention 1 done
cross attention 1 done
self attention 3 done
tensor([[1.0000e+00, 2.2344e-35, 0.0000e+00]], device='cuda:0',
       grad_fn=<SoftmaxBackward0>) tensor([1], device='cuda:0')



Epoch 1 in training:  54%|█████████████████████████████████                            | 38/70 [09:32<07:50, 14.70s/it][A

self attention 2 done
self attention 1 done
cross attention 1 done
self attention 3 done
tensor([[5.0000e-01, 5.0000e-01, 1.9302e-26]], device='cuda:0',
       grad_fn=<SoftmaxBackward0>) tensor([0], device='cuda:0')



Epoch 1 in training:  56%|█████████████████████████████████▉                           | 39/70 [09:47<07:38, 14.77s/it][A

self attention 2 done
self attention 1 done
cross attention 1 done
self attention 3 done
tensor([[1.0000e+00, 9.1036e-35, 0.0000e+00]], device='cuda:0',
       grad_fn=<SoftmaxBackward0>) tensor([0], device='cuda:0')



Epoch 1 in training:  57%|██████████████████████████████████▊                          | 40/70 [10:02<07:24, 14.82s/it][A

self attention 2 done
self attention 1 done
cross attention 1 done
self attention 3 done
tensor([[1.0000e+00, 6.7133e-31, 0.0000e+00]], device='cuda:0',
       grad_fn=<SoftmaxBackward0>) tensor([2], device='cuda:0')



Epoch 1 in training:  59%|███████████████████████████████████▋                         | 41/70 [10:16<07:03, 14.61s/it][A

self attention 2 done
self attention 1 done
cross attention 1 done
self attention 3 done
tensor([[1.0000e+00, 3.8269e-38, 0.0000e+00]], device='cuda:0',
       grad_fn=<SoftmaxBackward0>) tensor([2], device='cuda:0')



Epoch 1 in training:  60%|████████████████████████████████████▌                        | 42/70 [10:31<06:54, 14.80s/it][A

self attention 2 done
self attention 1 done
cross attention 1 done
self attention 3 done
tensor([[1.0000e+00, 6.6075e-36, 1.3478e-32]], device='cuda:0',
       grad_fn=<SoftmaxBackward0>) tensor([0], device='cuda:0')



Epoch 1 in training:  61%|█████████████████████████████████████▍                       | 43/70 [10:46<06:38, 14.74s/it][A

self attention 2 done
self attention 1 done
cross attention 1 done
self attention 3 done
tensor([[1.0000e+00, 1.1105e-35, 0.0000e+00]], device='cuda:0',
       grad_fn=<SoftmaxBackward0>) tensor([1], device='cuda:0')



Epoch 1 in training:  63%|██████████████████████████████████████▎                      | 44/70 [11:01<06:28, 14.93s/it][A

self attention 2 done
self attention 1 done
cross attention 1 done
self attention 3 done
tensor([[1.0000e+00, 8.6501e-34, 4.4702e-30]], device='cuda:0',
       grad_fn=<SoftmaxBackward0>) tensor([1], device='cuda:0')



Epoch 1 in training:  64%|███████████████████████████████████████▏                     | 45/70 [11:15<06:10, 14.83s/it][A

self attention 2 done
self attention 1 done
cross attention 1 done
self attention 3 done
tensor([[1.0000e+00, 6.4437e-34, 0.0000e+00]], device='cuda:0',
       grad_fn=<SoftmaxBackward0>) tensor([0], device='cuda:0')



Epoch 1 in training:  66%|████████████████████████████████████████                     | 46/70 [11:30<05:52, 14.69s/it][A

self attention 2 done
self attention 1 done
cross attention 1 done
self attention 3 done
tensor([[9.9641e-01, 3.5951e-03, 3.1798e-29]], device='cuda:0',
       grad_fn=<SoftmaxBackward0>) tensor([0], device='cuda:0')



Epoch 1 in training:  67%|████████████████████████████████████████▉                    | 47/70 [11:44<05:37, 14.67s/it][A

self attention 2 done
self attention 1 done
cross attention 1 done
self attention 3 done
tensor([[1.0000e+00, 1.5515e-37, 3.2871e-31]], device='cuda:0',
       grad_fn=<SoftmaxBackward0>) tensor([2], device='cuda:0')



Epoch 1 in training:  69%|█████████████████████████████████████████▊                   | 48/70 [12:00<05:26, 14.83s/it][A

self attention 2 done
self attention 1 done
cross attention 1 done
self attention 3 done
tensor([[1.0000e+00, 3.1687e-33, 0.0000e+00]], device='cuda:0',
       grad_fn=<SoftmaxBackward0>) tensor([0], device='cuda:0')



Epoch 1 in training:  70%|██████████████████████████████████████████▋                  | 49/70 [12:15<05:12, 14.89s/it][A

self attention 2 done
self attention 1 done
cross attention 1 done
self attention 3 done
tensor([[1.0000e+00, 8.4494e-39, 0.0000e+00]], device='cuda:0',
       grad_fn=<SoftmaxBackward0>) tensor([0], device='cuda:0')



Epoch 1 in training:  71%|███████████████████████████████████████████▌                 | 50/70 [12:28<04:50, 14.54s/it][A

self attention 2 done
self attention 1 done
cross attention 1 done
self attention 3 done
tensor([[1.0000e+00, 2.7049e-40, 4.4890e-23]], device='cuda:0',
       grad_fn=<SoftmaxBackward0>) tensor([1], device='cuda:0')



Epoch 1 in training:  73%|████████████████████████████████████████████▍                | 51/70 [12:43<04:34, 14.43s/it][A

self attention 2 done
self attention 1 done
cross attention 1 done
self attention 3 done
tensor([[1.0000e+00, 5.8706e-41, 3.1165e-21]], device='cuda:0',
       grad_fn=<SoftmaxBackward0>) tensor([2], device='cuda:0')



Epoch 1 in training:  74%|█████████████████████████████████████████████▎               | 52/70 [12:57<04:19, 14.41s/it][A

self attention 2 done
self attention 1 done
cross attention 1 done
self attention 3 done
tensor([[1.0000e+00, 1.7393e-22, 1.3929e-42]], device='cuda:0',
       grad_fn=<SoftmaxBackward0>) tensor([0], device='cuda:0')



Epoch 1 in training:  76%|██████████████████████████████████████████████▏              | 53/70 [13:12<04:06, 14.50s/it][A

self attention 2 done
self attention 1 done
cross attention 1 done
self attention 3 done
tensor([[1., 0., 0.]], device='cuda:0', grad_fn=<SoftmaxBackward0>) tensor([1], device='cuda:0')



Epoch 1 in training:  77%|███████████████████████████████████████████████              | 54/70 [13:26<03:52, 14.51s/it][A

self attention 2 done
self attention 1 done
cross attention 1 done
self attention 3 done
tensor([[1.0000e+00, 1.3300e-28, 0.0000e+00]], device='cuda:0',
       grad_fn=<SoftmaxBackward0>) tensor([0], device='cuda:0')



Epoch 1 in training:  79%|███████████████████████████████████████████████▉             | 55/70 [13:41<03:38, 14.58s/it][A

self attention 2 done
self attention 1 done
cross attention 1 done
self attention 3 done
tensor([[1.0000e+00, 1.0864e-15, 1.0864e-15]], device='cuda:0',
       grad_fn=<SoftmaxBackward0>) tensor([1], device='cuda:0')



Epoch 1 in training:  80%|████████████████████████████████████████████████▊            | 56/70 [13:55<03:23, 14.54s/it][A

self attention 2 done
self attention 1 done
cross attention 1 done
self attention 3 done
tensor([[1.0000e+00, 5.5216e-23, 0.0000e+00]], device='cuda:0',
       grad_fn=<SoftmaxBackward0>) tensor([2], device='cuda:0')



Epoch 1 in training:  81%|█████████████████████████████████████████████████▋           | 57/70 [14:10<03:07, 14.46s/it][A

self attention 2 done
self attention 1 done
cross attention 1 done
self attention 3 done
tensor([[1., 0., 0.]], device='cuda:0', grad_fn=<SoftmaxBackward0>) tensor([0], device='cuda:0')



Epoch 1 in training:  83%|██████████████████████████████████████████████████▌          | 58/70 [14:25<02:55, 14.65s/it][A

self attention 2 done
self attention 1 done
cross attention 1 done
self attention 3 done
tensor([[1.0000e+00, 1.4013e-45, 0.0000e+00]], device='cuda:0',
       grad_fn=<SoftmaxBackward0>) tensor([1], device='cuda:0')



Epoch 1 in training:  84%|███████████████████████████████████████████████████▍         | 59/70 [14:41<02:47, 15.26s/it][A

self attention 2 done
self attention 1 done
cross attention 1 done
self attention 3 done
tensor([[1., 0., 0.]], device='cuda:0', grad_fn=<SoftmaxBackward0>) tensor([2], device='cuda:0')



Epoch 1 in training:  86%|████████████████████████████████████████████████████▎        | 60/70 [14:56<02:31, 15.12s/it][A

self attention 2 done
self attention 1 done
cross attention 1 done
self attention 3 done
tensor([[1.0000e+00, 4.9043e-17, 0.0000e+00]], device='cuda:0',
       grad_fn=<SoftmaxBackward0>) tensor([0], device='cuda:0')



Epoch 1 in training:  87%|█████████████████████████████████████████████████████▏       | 61/70 [15:12<02:17, 15.30s/it][A

self attention 2 done
self attention 1 done
cross attention 1 done
self attention 3 done
tensor([[1.0000e+00, 1.3661e-38, 0.0000e+00]], device='cuda:0',
       grad_fn=<SoftmaxBackward0>) tensor([2], device='cuda:0')



Epoch 1 in training:  89%|██████████████████████████████████████████████████████       | 62/70 [15:28<02:05, 15.63s/it][A

self attention 2 done
self attention 1 done
cross attention 1 done
self attention 3 done
tensor([[1., 0., 0.]], device='cuda:0', grad_fn=<SoftmaxBackward0>) tensor([1], device='cuda:0')



Epoch 1 in training:  90%|██████████████████████████████████████████████████████▉      | 63/70 [15:44<01:49, 15.60s/it][A

self attention 2 done
self attention 1 done
cross attention 1 done
self attention 3 done
tensor([[1., 0., 0.]], device='cuda:0', grad_fn=<SoftmaxBackward0>) tensor([2], device='cuda:0')



Epoch 1 in training:  91%|███████████████████████████████████████████████████████▊     | 64/70 [15:59<01:32, 15.34s/it][A

self attention 2 done
self attention 1 done
cross attention 1 done
self attention 3 done
tensor([[1., 0., 0.]], device='cuda:0', grad_fn=<SoftmaxBackward0>) tensor([2], device='cuda:0')



Epoch 1 in training:  93%|████████████████████████████████████████████████████████▋    | 65/70 [16:13<01:15, 15.13s/it][A

self attention 2 done
self attention 1 done
cross attention 1 done
self attention 3 done
tensor([[1., 0., 0.]], device='cuda:0', grad_fn=<SoftmaxBackward0>) tensor([1], device='cuda:0')



Epoch 1 in training:  94%|█████████████████████████████████████████████████████████▌   | 66/70 [16:28<00:59, 14.94s/it][A

self attention 2 done
self attention 1 done
cross attention 1 done
self attention 3 done
tensor([[1., 0., 0.]], device='cuda:0', grad_fn=<SoftmaxBackward0>) tensor([0], device='cuda:0')



Epoch 1 in training:  96%|██████████████████████████████████████████████████████████▍  | 67/70 [16:43<00:45, 15.16s/it][A

self attention 2 done
self attention 1 done
cross attention 1 done
self attention 3 done
tensor([[1., 0., 0.]], device='cuda:0', grad_fn=<SoftmaxBackward0>) tensor([0], device='cuda:0')



Epoch 1 in training:  97%|███████████████████████████████████████████████████████████▎ | 68/70 [16:58<00:30, 15.11s/it][A

self attention 2 done
self attention 1 done
cross attention 1 done
self attention 3 done
tensor([[1.0000e+00, 1.1487e-14, 0.0000e+00]], device='cuda:0',
       grad_fn=<SoftmaxBackward0>) tensor([2], device='cuda:0')



Epoch 1 in training:  99%|████████████████████████████████████████████████████████████▏| 69/70 [17:13<00:14, 14.95s/it][A

self attention 2 done
self attention 1 done
cross attention 1 done
self attention 3 done
tensor([[1., 0., 0.]], device='cuda:0', grad_fn=<SoftmaxBackward0>) tensor([0], device='cuda:0')



Epoch 1 in training: 100%|█████████████████████████████████████████████████████████████| 70/70 [17:28<00:00, 14.98s/it][A
Training:   1%|▋                                                                  | 1/100 [17:28<28:50:07, 1048.56s/it][A
Epoch 2 in training:   0%|                                                                      | 0/70 [00:00<?, ?it/s][A

self attention 2 done
self attention 1 done
cross attention 1 done
self attention 3 done
tensor([[1.0000e+00, 9.1365e-43, 0.0000e+00]], device='cuda:0',
       grad_fn=<SoftmaxBackward0>) tensor([1], device='cuda:0')



Epoch 2 in training:   1%|▉                                                             | 1/70 [00:16<18:43, 16.28s/it][A

self attention 2 done
self attention 1 done
cross attention 1 done
self attention 3 done
tensor([[5.0000e-01, 2.9399e-42, 5.0000e-01]], device='cuda:0',
       grad_fn=<SoftmaxBackward0>) tensor([0], device='cuda:0')



Epoch 2 in training:   3%|█▊                                                            | 2/70 [00:33<19:16, 17.01s/it][A

self attention 2 done
self attention 1 done
cross attention 1 done
self attention 3 done
tensor([[1.0000e+00, 6.3155e-20, 0.0000e+00]], device='cuda:0',
       grad_fn=<SoftmaxBackward0>) tensor([1], device='cuda:0')



Epoch 2 in training:   4%|██▋                                                           | 3/70 [00:51<19:09, 17.16s/it][A

self attention 2 done
self attention 1 done
cross attention 1 done
self attention 3 done
tensor([[1.0000e+00, 3.2604e-14, 0.0000e+00]], device='cuda:0',
       grad_fn=<SoftmaxBackward0>) tensor([0], device='cuda:0')



Epoch 2 in training:   6%|███▌                                                          | 4/70 [01:07<18:40, 16.97s/it][A

self attention 2 done
self attention 1 done
cross attention 1 done
self attention 3 done
tensor([[1.0000e+00, 0.0000e+00, 1.2373e-20]], device='cuda:0',
       grad_fn=<SoftmaxBackward0>) tensor([1], device='cuda:0')



Epoch 2 in training:   7%|████▍                                                         | 5/70 [01:23<17:53, 16.52s/it][A

self attention 2 done
self attention 1 done
cross attention 1 done
self attention 3 done
tensor([[1.0000e+00, 6.3165e-13, 0.0000e+00]], device='cuda:0',
       grad_fn=<SoftmaxBackward0>) tensor([1], device='cuda:0')



Epoch 2 in training:   9%|█████▎                                                        | 6/70 [01:39<17:27, 16.37s/it][A

self attention 2 done
self attention 1 done
cross attention 1 done
self attention 3 done
tensor([[1.0000e+00, 4.0078e-16, 0.0000e+00]], device='cuda:0',
       grad_fn=<SoftmaxBackward0>) tensor([0], device='cuda:0')



Epoch 2 in training:  10%|██████▏                                                       | 7/70 [01:55<16:56, 16.13s/it][A

self attention 2 done
self attention 1 done
cross attention 1 done
self attention 3 done
tensor([[1.0000e+00, 0.0000e+00, 8.2013e-16]], device='cuda:0',
       grad_fn=<SoftmaxBackward0>) tensor([1], device='cuda:0')



Epoch 2 in training:  11%|███████                                                       | 8/70 [02:11<16:38, 16.10s/it][A

self attention 2 done
self attention 1 done
cross attention 1 done
self attention 3 done
tensor([[1., 0., 0.]], device='cuda:0', grad_fn=<SoftmaxBackward0>) tensor([1], device='cuda:0')



Epoch 2 in training:  13%|███████▉                                                      | 9/70 [02:27<16:26, 16.17s/it][A

self attention 2 done
self attention 1 done
cross attention 1 done
self attention 3 done
tensor([[1., 0., 0.]], device='cuda:0', grad_fn=<SoftmaxBackward0>) tensor([1], device='cuda:0')



Epoch 2 in training:  14%|████████▋                                                    | 10/70 [02:43<16:06, 16.11s/it][A

self attention 2 done
self attention 1 done
cross attention 1 done
self attention 3 done
tensor([[1., 0., 0.]], device='cuda:0', grad_fn=<SoftmaxBackward0>) tensor([0], device='cuda:0')



Epoch 2 in training:  16%|█████████▌                                                   | 11/70 [02:59<15:51, 16.13s/it][A

self attention 2 done
self attention 1 done
cross attention 1 done
self attention 3 done
tensor([[1., 0., 0.]], device='cuda:0', grad_fn=<SoftmaxBackward0>) tensor([0], device='cuda:0')



Epoch 2 in training:  17%|██████████▍                                                  | 12/70 [03:15<15:28, 16.01s/it][A

self attention 2 done
self attention 1 done
cross attention 1 done
self attention 3 done
tensor([[1.0000e+00, 6.0062e-21, 0.0000e+00]], device='cuda:0',
       grad_fn=<SoftmaxBackward0>) tensor([0], device='cuda:0')



Epoch 2 in training:  19%|███████████▎                                                 | 13/70 [03:31<15:14, 16.05s/it][A

self attention 2 done
self attention 1 done
cross attention 1 done
self attention 3 done
tensor([[1., 0., 0.]], device='cuda:0', grad_fn=<SoftmaxBackward0>) tensor([2], device='cuda:0')



Epoch 2 in training:  20%|████████████▏                                                | 14/70 [03:46<14:45, 15.81s/it][A

self attention 2 done
self attention 1 done
cross attention 1 done
self attention 3 done
tensor([[1.0000e+00, 2.0844e-12, 0.0000e+00]], device='cuda:0',
       grad_fn=<SoftmaxBackward0>) tensor([1], device='cuda:0')



Epoch 2 in training:  21%|█████████████                                                | 15/70 [04:02<14:31, 15.84s/it][A

self attention 2 done
self attention 1 done
cross attention 1 done
self attention 3 done
tensor([[1., 0., 0.]], device='cuda:0', grad_fn=<SoftmaxBackward0>) tensor([2], device='cuda:0')



Epoch 2 in training:  23%|█████████████▉                                               | 16/70 [04:18<14:19, 15.91s/it][A

self attention 2 done
self attention 1 done
cross attention 1 done
self attention 3 done
tensor([[1., 0., 0.]], device='cuda:0', grad_fn=<SoftmaxBackward0>) tensor([1], device='cuda:0')



Epoch 2 in training:  24%|██████████████▊                                              | 17/70 [04:35<14:10, 16.06s/it][A

self attention 2 done
self attention 1 done
cross attention 1 done
self attention 3 done
tensor([[1.0000e+00, 4.8240e-18, 0.0000e+00]], device='cuda:0',
       grad_fn=<SoftmaxBackward0>) tensor([2], device='cuda:0')



Epoch 2 in training:  26%|███████████████▋                                             | 18/70 [04:51<13:59, 16.14s/it][A

self attention 2 done
self attention 1 done
cross attention 1 done
self attention 3 done
tensor([[1.0000e+00, 0.0000e+00, 2.8797e-23]], device='cuda:0',
       grad_fn=<SoftmaxBackward0>) tensor([0], device='cuda:0')



Epoch 2 in training:  27%|████████████████▌                                            | 19/70 [05:08<13:51, 16.31s/it][A

self attention 2 done
self attention 1 done
cross attention 1 done
self attention 3 done
tensor([[1., 0., 0.]], device='cuda:0', grad_fn=<SoftmaxBackward0>) tensor([1], device='cuda:0')



Epoch 2 in training:  29%|█████████████████▍                                           | 20/70 [05:24<13:39, 16.38s/it][A

self attention 2 done
self attention 1 done
cross attention 1 done
self attention 3 done
tensor([[1.0000e+00, 0.0000e+00, 4.4323e-16]], device='cuda:0',
       grad_fn=<SoftmaxBackward0>) tensor([0], device='cuda:0')



Epoch 2 in training:  30%|██████████████████▎                                          | 21/70 [05:40<13:16, 16.26s/it][A

self attention 2 done
self attention 1 done
cross attention 1 done
self attention 3 done
tensor([[1., 0., 0.]], device='cuda:0', grad_fn=<SoftmaxBackward0>) tensor([0], device='cuda:0')



Epoch 2 in training:  31%|███████████████████▏                                         | 22/70 [05:57<12:59, 16.24s/it][A

self attention 2 done
self attention 1 done
cross attention 1 done
self attention 3 done
tensor([[1., 0., 0.]], device='cuda:0', grad_fn=<SoftmaxBackward0>) tensor([1], device='cuda:0')



Epoch 2 in training:  33%|████████████████████                                         | 23/70 [06:13<12:51, 16.42s/it][A

self attention 2 done
self attention 1 done
cross attention 1 done
self attention 3 done
tensor([[1.0000e+00, 0.0000e+00, 6.2691e-19]], device='cuda:0',
       grad_fn=<SoftmaxBackward0>) tensor([1], device='cuda:0')



Epoch 2 in training:  34%|████████████████████▉                                        | 24/70 [06:30<12:38, 16.49s/it][A

self attention 2 done
self attention 1 done
cross attention 1 done
self attention 3 done
tensor([[1.0000e+00, 0.0000e+00, 5.2462e-13]], device='cuda:0',
       grad_fn=<SoftmaxBackward0>) tensor([0], device='cuda:0')



Epoch 2 in training:  36%|█████████████████████▊                                       | 25/70 [06:46<12:17, 16.39s/it][A

self attention 2 done
self attention 1 done
cross attention 1 done
self attention 3 done
tensor([[0.5000, 0.5000, 0.0000]], device='cuda:0', grad_fn=<SoftmaxBackward0>) tensor([2], device='cuda:0')



Epoch 2 in training:  37%|██████████████████████▋                                      | 26/70 [07:02<11:53, 16.21s/it][A

self attention 2 done
self attention 1 done
cross attention 1 done
self attention 3 done
tensor([[1., 0., 0.]], device='cuda:0', grad_fn=<SoftmaxBackward0>) tensor([1], device='cuda:0')



Epoch 2 in training:  39%|███████████████████████▌                                     | 27/70 [07:19<11:41, 16.31s/it][A

self attention 2 done
self attention 1 done
cross attention 1 done
self attention 3 done
tensor([[1.0000e+00, 0.0000e+00, 4.3140e-15]], device='cuda:0',
       grad_fn=<SoftmaxBackward0>) tensor([1], device='cuda:0')



Epoch 2 in training:  40%|████████████████████████▍                                    | 28/70 [07:35<11:24, 16.29s/it][A

self attention 2 done
self attention 1 done
cross attention 1 done
self attention 3 done
tensor([[0.5000, 0.5000, 0.0000]], device='cuda:0', grad_fn=<SoftmaxBackward0>) tensor([0], device='cuda:0')



Epoch 2 in training:  41%|█████████████████████████▎                                   | 29/70 [07:51<11:10, 16.34s/it][A

self attention 2 done
self attention 1 done
cross attention 1 done
self attention 3 done
tensor([[1.0000e+00, 1.1210e-44, 0.0000e+00]], device='cuda:0',
       grad_fn=<SoftmaxBackward0>) tensor([1], device='cuda:0')



Epoch 2 in training:  43%|██████████████████████████▏                                  | 30/70 [08:07<10:48, 16.20s/it][A

self attention 2 done
self attention 1 done
cross attention 1 done
self attention 3 done
tensor([[1., 0., 0.]], device='cuda:0', grad_fn=<SoftmaxBackward0>) tensor([1], device='cuda:0')



Epoch 2 in training:  44%|███████████████████████████                                  | 31/70 [08:23<10:22, 15.97s/it][A

self attention 2 done
self attention 1 done
cross attention 1 done
self attention 3 done
tensor([[1.0000e+00, 0.0000e+00, 9.7435e-14]], device='cuda:0',
       grad_fn=<SoftmaxBackward0>) tensor([2], device='cuda:0')



Epoch 2 in training:  46%|███████████████████████████▉                                 | 32/70 [08:38<09:58, 15.74s/it][A

self attention 2 done
self attention 1 done
cross attention 1 done
self attention 3 done
tensor([[1., 0., 0.]], device='cuda:0', grad_fn=<SoftmaxBackward0>) tensor([1], device='cuda:0')



Epoch 2 in training:  47%|████████████████████████████▊                                | 33/70 [08:53<09:34, 15.53s/it][A

self attention 2 done
self attention 1 done
cross attention 1 done
self attention 3 done
tensor([[1.0000e+00, 4.7778e-19, 0.0000e+00]], device='cuda:0',
       grad_fn=<SoftmaxBackward0>) tensor([0], device='cuda:0')



Epoch 2 in training:  49%|█████████████████████████████▋                               | 34/70 [09:08<09:14, 15.40s/it][A

self attention 2 done
self attention 1 done
cross attention 1 done
self attention 3 done
tensor([[1., 0., 0.]], device='cuda:0', grad_fn=<SoftmaxBackward0>) tensor([2], device='cuda:0')



Epoch 2 in training:  50%|██████████████████████████████▌                              | 35/70 [09:23<08:55, 15.31s/it][A

self attention 2 done
self attention 1 done
cross attention 1 done
self attention 3 done
tensor([[1., 0., 0.]], device='cuda:0', grad_fn=<SoftmaxBackward0>) tensor([0], device='cuda:0')



Epoch 2 in training:  51%|███████████████████████████████▎                             | 36/70 [09:39<08:43, 15.38s/it][A

self attention 2 done
self attention 1 done
cross attention 1 done
self attention 3 done
tensor([[1., 0., 0.]], device='cuda:0', grad_fn=<SoftmaxBackward0>) tensor([1], device='cuda:0')



Epoch 2 in training:  53%|████████████████████████████████▏                            | 37/70 [09:54<08:28, 15.41s/it][A

self attention 2 done
self attention 1 done
cross attention 1 done
self attention 3 done
tensor([[1., 0., 0.]], device='cuda:0', grad_fn=<SoftmaxBackward0>) tensor([0], device='cuda:0')



Epoch 2 in training:  54%|█████████████████████████████████                            | 38/70 [10:10<08:20, 15.64s/it][A

self attention 2 done
self attention 1 done
cross attention 1 done
self attention 3 done
tensor([[1.0000e+00, 0.0000e+00, 4.3619e-13]], device='cuda:0',
       grad_fn=<SoftmaxBackward0>) tensor([2], device='cuda:0')



Epoch 2 in training:  56%|█████████████████████████████████▉                           | 39/70 [10:26<08:11, 15.84s/it][A

self attention 2 done
self attention 1 done
cross attention 1 done
self attention 3 done
tensor([[1.0000e+00, 4.8031e-14, 0.0000e+00]], device='cuda:0',
       grad_fn=<SoftmaxBackward0>) tensor([0], device='cuda:0')



Epoch 2 in training:  57%|██████████████████████████████████▊                          | 40/70 [10:42<07:54, 15.80s/it][A

self attention 2 done
self attention 1 done
cross attention 1 done
self attention 3 done
tensor([[1.0000e+00, 4.7299e-38, 0.0000e+00]], device='cuda:0',
       grad_fn=<SoftmaxBackward0>) tensor([2], device='cuda:0')



Epoch 2 in training:  59%|███████████████████████████████████▋                         | 41/70 [10:58<07:39, 15.85s/it][A

self attention 2 done
self attention 1 done
cross attention 1 done
self attention 3 done
tensor([[1., 0., 0.]], device='cuda:0', grad_fn=<SoftmaxBackward0>) tensor([2], device='cuda:0')



Epoch 2 in training:  60%|████████████████████████████████████▌                        | 42/70 [11:14<07:24, 15.89s/it][A

self attention 2 done
self attention 1 done
cross attention 1 done
self attention 3 done
tensor([[1.0000e+00, 2.8026e-45, 0.0000e+00]], device='cuda:0',
       grad_fn=<SoftmaxBackward0>) tensor([2], device='cuda:0')



Epoch 2 in training:  61%|█████████████████████████████████████▍                       | 43/70 [11:30<07:11, 15.99s/it][A

self attention 2 done
self attention 1 done
cross attention 1 done
self attention 3 done
tensor([[1., 0., 0.]], device='cuda:0', grad_fn=<SoftmaxBackward0>) tensor([0], device='cuda:0')



Epoch 2 in training:  63%|██████████████████████████████████████▎                      | 44/70 [11:47<06:58, 16.10s/it][A

self attention 2 done
self attention 1 done
cross attention 1 done
self attention 3 done
tensor([[5.0000e-01, 5.2132e-36, 5.0000e-01]], device='cuda:0',
       grad_fn=<SoftmaxBackward0>) tensor([0], device='cuda:0')



Epoch 2 in training:  64%|███████████████████████████████████████▏                     | 45/70 [12:04<06:52, 16.50s/it][A

self attention 2 done
self attention 1 done
cross attention 1 done
self attention 3 done
tensor([[1.0000e+00, 0.0000e+00, 4.7384e-12]], device='cuda:0',
       grad_fn=<SoftmaxBackward0>) tensor([0], device='cuda:0')



Epoch 2 in training:  66%|████████████████████████████████████████                     | 46/70 [12:20<06:33, 16.38s/it][A

self attention 2 done
self attention 1 done
cross attention 1 done
self attention 3 done
tensor([[1.0000e+00, 1.8751e-37, 4.2039e-45]], device='cuda:0',
       grad_fn=<SoftmaxBackward0>) tensor([0], device='cuda:0')



Epoch 2 in training:  67%|████████████████████████████████████████▉                    | 47/70 [12:35<06:08, 16.03s/it][A

self attention 2 done
self attention 1 done
cross attention 1 done
self attention 3 done
tensor([[1.0000e+00, 0.0000e+00, 8.2749e-15]], device='cuda:0',
       grad_fn=<SoftmaxBackward0>) tensor([2], device='cuda:0')



Epoch 2 in training:  69%|█████████████████████████████████████████▊                   | 48/70 [12:50<05:45, 15.71s/it][A

self attention 2 done
self attention 1 done
cross attention 1 done
self attention 3 done
tensor([[1.0000e+00, 2.6567e-14, 2.6567e-14]], device='cuda:0',
       grad_fn=<SoftmaxBackward0>) tensor([0], device='cuda:0')



Epoch 2 in training:  70%|██████████████████████████████████████████▋                  | 49/70 [13:06<05:30, 15.72s/it][A

self attention 2 done
self attention 1 done
cross attention 1 done
self attention 3 done
tensor([[1., 0., 0.]], device='cuda:0', grad_fn=<SoftmaxBackward0>) tensor([0], device='cuda:0')



Epoch 2 in training:  71%|███████████████████████████████████████████▌                 | 50/70 [13:24<05:27, 16.36s/it][A

self attention 2 done
self attention 1 done
cross attention 1 done
self attention 3 done
tensor([[1.0000e+00, 1.4013e-44, 0.0000e+00]], device='cuda:0',
       grad_fn=<SoftmaxBackward0>) tensor([0], device='cuda:0')



Epoch 2 in training:  73%|████████████████████████████████████████████▍                | 51/70 [13:41<05:16, 16.64s/it][A

self attention 2 done
self attention 1 done
cross attention 1 done
self attention 3 done
tensor([[1., 0., 0.]], device='cuda:0', grad_fn=<SoftmaxBackward0>) tensor([0], device='cuda:0')



Epoch 2 in training:  74%|█████████████████████████████████████████████▎               | 52/70 [13:58<04:58, 16.58s/it][A

self attention 2 done
self attention 1 done
cross attention 1 done
self attention 3 done
tensor([[1., 0., 0.]], device='cuda:0', grad_fn=<SoftmaxBackward0>) tensor([2], device='cuda:0')



Epoch 2 in training:  76%|██████████████████████████████████████████████▏              | 53/70 [14:14<04:39, 16.42s/it][A

self attention 2 done
self attention 1 done
cross attention 1 done
self attention 3 done
tensor([[1.0000e+00, 7.7629e-15, 0.0000e+00]], device='cuda:0',
       grad_fn=<SoftmaxBackward0>) tensor([2], device='cuda:0')



Epoch 2 in training:  77%|███████████████████████████████████████████████              | 54/70 [14:30<04:23, 16.45s/it][A

self attention 2 done
self attention 1 done
cross attention 1 done
self attention 3 done
tensor([[1.0000e+00, 2.0910e-14, 2.0910e-14]], device='cuda:0',
       grad_fn=<SoftmaxBackward0>) tensor([2], device='cuda:0')



Epoch 2 in training:  79%|███████████████████████████████████████████████▉             | 55/70 [14:47<04:08, 16.59s/it][A

self attention 2 done
self attention 1 done
cross attention 1 done
self attention 3 done
tensor([[1.0000e+00, 1.4013e-45, 3.5949e-14]], device='cuda:0',
       grad_fn=<SoftmaxBackward0>) tensor([2], device='cuda:0')



Epoch 2 in training:  80%|████████████████████████████████████████████████▊            | 56/70 [15:03<03:50, 16.43s/it][A

self attention 2 done
self attention 1 done
cross attention 1 done
self attention 3 done
tensor([[1.0000e+00, 6.8656e-33, 7.4769e-41]], device='cuda:0',
       grad_fn=<SoftmaxBackward0>) tensor([0], device='cuda:0')



Epoch 2 in training:  81%|█████████████████████████████████████████████████▋           | 57/70 [15:19<03:29, 16.09s/it][A

self attention 2 done
self attention 1 done
cross attention 1 done
self attention 3 done
tensor([[1.0000e+00, 5.9275e-43, 0.0000e+00]], device='cuda:0',
       grad_fn=<SoftmaxBackward0>) tensor([0], device='cuda:0')



Epoch 2 in training:  83%|██████████████████████████████████████████████████▌          | 58/70 [15:33<03:07, 15.60s/it][A

self attention 2 done
self attention 1 done
cross attention 1 done
self attention 3 done
tensor([[1.0000e+00, 1.1499e-34, 6.0621e-40]], device='cuda:0',
       grad_fn=<SoftmaxBackward0>) tensor([1], device='cuda:0')



Epoch 2 in training:  84%|███████████████████████████████████████████████████▍         | 59/70 [15:48<02:49, 15.43s/it][A

self attention 2 done
self attention 1 done
cross attention 1 done
self attention 3 done
tensor([[1., 0., 0.]], device='cuda:0', grad_fn=<SoftmaxBackward0>) tensor([1], device='cuda:0')



Epoch 2 in training:  86%|████████████████████████████████████████████████████▎        | 60/70 [16:03<02:31, 15.19s/it][A

self attention 2 done
self attention 1 done
cross attention 1 done
self attention 3 done
tensor([[1.0000e+00, 2.5371e-32, 1.9172e-37]], device='cuda:0',
       grad_fn=<SoftmaxBackward0>) tensor([2], device='cuda:0')



Epoch 2 in training:  87%|█████████████████████████████████████████████████████▏       | 61/70 [16:18<02:15, 15.10s/it][A

self attention 2 done
self attention 1 done
cross attention 1 done
self attention 3 done
tensor([[1., 0., 0.]], device='cuda:0', grad_fn=<SoftmaxBackward0>) tensor([0], device='cuda:0')



Epoch 2 in training:  89%|██████████████████████████████████████████████████████       | 62/70 [16:32<02:00, 15.01s/it][A

self attention 2 done
self attention 1 done
cross attention 1 done
self attention 3 done
tensor([[1., 0., 0.]], device='cuda:0', grad_fn=<SoftmaxBackward0>) tensor([1], device='cuda:0')



Epoch 2 in training:  90%|██████████████████████████████████████████████████████▉      | 63/70 [16:47<01:44, 14.98s/it][A

self attention 2 done
self attention 1 done
cross attention 1 done
self attention 3 done
tensor([[1., 0., 0.]], device='cuda:0', grad_fn=<SoftmaxBackward0>) tensor([2], device='cuda:0')



Epoch 2 in training:  91%|███████████████████████████████████████████████████████▊     | 64/70 [17:02<01:29, 14.97s/it][A

self attention 2 done
self attention 1 done
cross attention 1 done
self attention 3 done
tensor([[1., 0., 0.]], device='cuda:0', grad_fn=<SoftmaxBackward0>) tensor([0], device='cuda:0')



Epoch 2 in training:  93%|████████████████████████████████████████████████████████▋    | 65/70 [17:17<01:14, 14.89s/it][A

self attention 2 done
self attention 1 done
cross attention 1 done
self attention 3 done
tensor([[1.0000e+00, 7.7890e-18, 0.0000e+00]], device='cuda:0',
       grad_fn=<SoftmaxBackward0>) tensor([1], device='cuda:0')



Epoch 2 in training:  94%|█████████████████████████████████████████████████████████▌   | 66/70 [17:32<00:59, 14.91s/it][A

self attention 2 done
self attention 1 done
cross attention 1 done
self attention 3 done
tensor([[1.0000e+00, 2.8802e-12, 0.0000e+00]], device='cuda:0',
       grad_fn=<SoftmaxBackward0>) tensor([2], device='cuda:0')



Epoch 2 in training:  96%|██████████████████████████████████████████████████████████▍  | 67/70 [17:47<00:44, 14.98s/it][A

self attention 2 done
self attention 1 done
cross attention 1 done
self attention 3 done
tensor([[1.0000e+00, 4.3833e-42, 0.0000e+00]], device='cuda:0',
       grad_fn=<SoftmaxBackward0>) tensor([1], device='cuda:0')



Epoch 2 in training:  97%|███████████████████████████████████████████████████████████▎ | 68/70 [18:02<00:29, 14.91s/it][A

self attention 2 done
self attention 1 done
cross attention 1 done
self attention 3 done
tensor([[1.0000e+00, 1.4013e-45, 0.0000e+00]], device='cuda:0',
       grad_fn=<SoftmaxBackward0>) tensor([2], device='cuda:0')



Epoch 2 in training:  99%|████████████████████████████████████████████████████████████▏| 69/70 [18:17<00:14, 14.99s/it][A

self attention 2 done
self attention 1 done
cross attention 1 done
self attention 3 done
tensor([[1., 0., 0.]], device='cuda:0', grad_fn=<SoftmaxBackward0>) tensor([0], device='cuda:0')



Epoch 2 in training: 100%|█████████████████████████████████████████████████████████████| 70/70 [18:32<00:00, 15.05s/it][A
Training:   2%|█▎                                                                 | 2/100 [36:01<29:34:14, 1086.27s/it][A
Epoch 3 in training:   0%|                                                                      | 0/70 [00:00<?, ?it/s][A

self attention 2 done
self attention 1 done
cross attention 1 done
self attention 3 done
tensor([[1.0000e+00, 1.4013e-45, 0.0000e+00]], device='cuda:0',
       grad_fn=<SoftmaxBackward0>) tensor([2], device='cuda:0')



Epoch 3 in training:   1%|▉                                                             | 1/70 [00:16<18:31, 16.11s/it][A

self attention 2 done
self attention 1 done
cross attention 1 done
self attention 3 done
tensor([[1.0000e+00, 3.2827e-30, 1.8170e-38]], device='cuda:0',
       grad_fn=<SoftmaxBackward0>) tensor([0], device='cuda:0')



Epoch 3 in training:   3%|█▊                                                            | 2/70 [00:31<17:45, 15.67s/it][A

self attention 2 done
self attention 1 done
cross attention 1 done
self attention 3 done
tensor([[1., 0., 0.]], device='cuda:0', grad_fn=<SoftmaxBackward0>) tensor([1], device='cuda:0')



Epoch 3 in training:   4%|██▋                                                           | 3/70 [00:46<17:07, 15.34s/it][A

self attention 2 done
self attention 1 done
cross attention 1 done
self attention 3 done
tensor([[1., 0., 0.]], device='cuda:0', grad_fn=<SoftmaxBackward0>) tensor([0], device='cuda:0')



Epoch 3 in training:   6%|███▌                                                          | 4/70 [01:01<16:52, 15.35s/it][A

self attention 2 done
self attention 1 done
cross attention 1 done
self attention 3 done
tensor([[1.0000e+00, 6.4852e-42, 0.0000e+00]], device='cuda:0',
       grad_fn=<SoftmaxBackward0>) tensor([2], device='cuda:0')



Epoch 3 in training:   7%|████▍                                                         | 5/70 [01:16<16:17, 15.03s/it][A

self attention 2 done
self attention 1 done
cross attention 1 done
self attention 3 done
tensor([[1.0000e+00, 1.4013e-45, 0.0000e+00]], device='cuda:0',
       grad_fn=<SoftmaxBackward0>) tensor([1], device='cuda:0')



Epoch 3 in training:   9%|█████▎                                                        | 6/70 [01:31<16:00, 15.02s/it][A

self attention 2 done
self attention 1 done
cross attention 1 done
self attention 3 done
tensor([[1.0000e+00, 8.5172e-33, 5.2789e-40]], device='cuda:0',
       grad_fn=<SoftmaxBackward0>) tensor([2], device='cuda:0')



Epoch 3 in training:  10%|██████▏                                                       | 7/70 [01:46<15:42, 14.96s/it][A

self attention 2 done
self attention 1 done
cross attention 1 done
self attention 3 done
tensor([[1.0000e+00, 7.0065e-45, 1.0065e-11]], device='cuda:0',
       grad_fn=<SoftmaxBackward0>) tensor([0], device='cuda:0')



Epoch 3 in training:  11%|███████                                                       | 8/70 [02:01<15:38, 15.14s/it][A

self attention 2 done
self attention 1 done
cross attention 1 done
self attention 3 done
tensor([[1.0000e+00, 6.6197e-42, 7.3768e-13]], device='cuda:0',
       grad_fn=<SoftmaxBackward0>) tensor([0], device='cuda:0')



Epoch 3 in training:  13%|███████▉                                                      | 9/70 [02:16<15:14, 14.99s/it][A

self attention 2 done
self attention 1 done
cross attention 1 done
self attention 3 done
tensor([[1., 0., 0.]], device='cuda:0', grad_fn=<SoftmaxBackward0>) tensor([1], device='cuda:0')



Epoch 3 in training:  14%|████████▋                                                    | 10/70 [02:31<14:58, 14.98s/it][A

self attention 2 done
self attention 1 done
cross attention 1 done
self attention 3 done
tensor([[1.0000e+00, 9.8091e-45, 0.0000e+00]], device='cuda:0',
       grad_fn=<SoftmaxBackward0>) tensor([0], device='cuda:0')



Epoch 3 in training:  16%|█████████▌                                                   | 11/70 [02:46<14:45, 15.01s/it][A

self attention 2 done
self attention 1 done
cross attention 1 done
self attention 3 done
tensor([[1., 0., 0.]], device='cuda:0', grad_fn=<SoftmaxBackward0>) tensor([0], device='cuda:0')



Epoch 3 in training:  17%|██████████▍                                                  | 12/70 [03:01<14:33, 15.05s/it][A

self attention 2 done
self attention 1 done
cross attention 1 done
self attention 3 done
tensor([[1.0000e+00, 1.4084e-12, 1.4084e-12]], device='cuda:0',
       grad_fn=<SoftmaxBackward0>) tensor([1], device='cuda:0')



Epoch 3 in training:  19%|███████████▎                                                 | 13/70 [03:16<14:21, 15.12s/it][A

self attention 2 done
self attention 1 done
cross attention 1 done
self attention 3 done
tensor([[1.0000e+00, 2.5801e-14, 0.0000e+00]], device='cuda:0',
       grad_fn=<SoftmaxBackward0>) tensor([1], device='cuda:0')



Epoch 3 in training:  20%|████████████▏                                                | 14/70 [03:32<14:15, 15.28s/it][A

self attention 2 done
self attention 1 done
cross attention 1 done
self attention 3 done
tensor([[1.0000e+00, 2.2799e-12, 2.2799e-12]], device='cuda:0',
       grad_fn=<SoftmaxBackward0>) tensor([1], device='cuda:0')



Epoch 3 in training:  21%|█████████████                                                | 15/70 [03:48<14:06, 15.38s/it][A

self attention 2 done
self attention 1 done
cross attention 1 done
self attention 3 done
tensor([[1.0000e+00, 2.8026e-45, 0.0000e+00]], device='cuda:0',
       grad_fn=<SoftmaxBackward0>) tensor([0], device='cuda:0')



Epoch 3 in training:  23%|█████████████▉                                               | 16/70 [04:03<13:53, 15.44s/it][A

self attention 2 done
self attention 1 done
cross attention 1 done
self attention 3 done
tensor([[1.0000e+00, 3.8021e-30, 2.1613e-36]], device='cuda:0',
       grad_fn=<SoftmaxBackward0>) tensor([2], device='cuda:0')



Epoch 3 in training:  24%|██████████████▊                                              | 17/70 [04:19<13:40, 15.48s/it][A

self attention 2 done
self attention 1 done
cross attention 1 done
self attention 3 done
tensor([[1., 0., 0.]], device='cuda:0', grad_fn=<SoftmaxBackward0>) tensor([0], device='cuda:0')



Epoch 3 in training:  26%|███████████████▋                                             | 18/70 [04:34<13:26, 15.51s/it][A

self attention 2 done
self attention 1 done
cross attention 1 done
self attention 3 done
tensor([[1.0000e+00, 1.9618e-44, 0.0000e+00]], device='cuda:0',
       grad_fn=<SoftmaxBackward0>) tensor([1], device='cuda:0')



Epoch 3 in training:  27%|████████████████▌                                            | 19/70 [04:49<12:58, 15.27s/it][A

self attention 2 done
self attention 1 done
cross attention 1 done
self attention 3 done
tensor([[1.0000e+00, 6.3899e-42, 0.0000e+00]], device='cuda:0',
       grad_fn=<SoftmaxBackward0>) tensor([2], device='cuda:0')



Epoch 3 in training:  29%|█████████████████▍                                           | 20/70 [05:04<12:39, 15.19s/it][A

self attention 2 done
self attention 1 done
cross attention 1 done
self attention 3 done
tensor([[1.0000e+00, 1.6062e-41, 0.0000e+00]], device='cuda:0',
       grad_fn=<SoftmaxBackward0>) tensor([2], device='cuda:0')



Epoch 3 in training:  30%|██████████████████▎                                          | 21/70 [05:19<12:22, 15.16s/it][A

self attention 2 done
self attention 1 done
cross attention 1 done
self attention 3 done
tensor([[1.0000e+00, 7.2761e-41, 0.0000e+00]], device='cuda:0',
       grad_fn=<SoftmaxBackward0>) tensor([1], device='cuda:0')



Epoch 3 in training:  31%|███████████████████▏                                         | 22/70 [05:34<12:03, 15.08s/it][A

self attention 2 done
self attention 1 done
cross attention 1 done
self attention 3 done
tensor([[5.0000e-01, 6.1511e-31, 5.0000e-01]], device='cuda:0',
       grad_fn=<SoftmaxBackward0>) tensor([1], device='cuda:0')



Epoch 3 in training:  33%|████████████████████                                         | 23/70 [05:49<11:53, 15.17s/it][A

self attention 2 done
self attention 1 done
cross attention 1 done
self attention 3 done
tensor([[1.0000e+00, 1.4674e-15, 1.4674e-15]], device='cuda:0',
       grad_fn=<SoftmaxBackward0>) tensor([1], device='cuda:0')



Epoch 3 in training:  34%|████████████████████▉                                        | 24/70 [06:04<11:34, 15.11s/it][A

self attention 2 done
self attention 1 done
cross attention 1 done
self attention 3 done
tensor([[1.0000e+00, 1.0375e-28, 2.3394e-34]], device='cuda:0',
       grad_fn=<SoftmaxBackward0>) tensor([0], device='cuda:0')



Epoch 3 in training:  36%|█████████████████████▊                                       | 25/70 [06:19<11:20, 15.13s/it][A

self attention 2 done
self attention 1 done
cross attention 1 done
self attention 3 done
tensor([[1.0000e+00, 4.0472e-40, 0.0000e+00]], device='cuda:0',
       grad_fn=<SoftmaxBackward0>) tensor([2], device='cuda:0')



Epoch 3 in training:  37%|██████████████████████▋                                      | 26/70 [06:34<11:04, 15.09s/it][A

self attention 2 done
self attention 1 done
cross attention 1 done
self attention 3 done
tensor([[1.0000e+00, 4.1099e-27, 1.0583e-34]], device='cuda:0',
       grad_fn=<SoftmaxBackward0>) tensor([0], device='cuda:0')



Epoch 3 in training:  39%|███████████████████████▌                                     | 27/70 [06:50<10:49, 15.09s/it][A

self attention 2 done
self attention 1 done
cross attention 1 done
self attention 3 done
tensor([[1.0000e+00, 4.3574e-26, 3.8733e-31]], device='cuda:0',
       grad_fn=<SoftmaxBackward0>) tensor([1], device='cuda:0')



Epoch 3 in training:  40%|████████████████████████▍                                    | 28/70 [07:05<10:35, 15.12s/it][A

self attention 2 done
self attention 1 done
cross attention 1 done
self attention 3 done
tensor([[5.0000e-01, 5.0000e-01, 3.7145e-36]], device='cuda:0',
       grad_fn=<SoftmaxBackward0>) tensor([2], device='cuda:0')



Epoch 3 in training:  41%|█████████████████████████▎                                   | 29/70 [07:20<10:21, 15.16s/it][A

self attention 2 done
self attention 1 done
cross attention 1 done
self attention 3 done
tensor([[1.0000e+00, 2.2086e-12, 1.4013e-45]], device='cuda:0',
       grad_fn=<SoftmaxBackward0>) tensor([1], device='cuda:0')



Epoch 3 in training:  43%|██████████████████████████▏                                  | 30/70 [07:35<10:07, 15.19s/it][A

self attention 2 done
self attention 1 done
cross attention 1 done
self attention 3 done
tensor([[1.0000e+00, 9.5860e-27, 1.5076e-35]], device='cuda:0',
       grad_fn=<SoftmaxBackward0>) tensor([0], device='cuda:0')



Epoch 3 in training:  44%|███████████████████████████                                  | 31/70 [07:51<09:54, 15.25s/it][A

self attention 2 done
self attention 1 done
cross attention 1 done
self attention 3 done
tensor([[5.0000e-01, 5.0000e-01, 1.8106e-38]], device='cuda:0',
       grad_fn=<SoftmaxBackward0>) tensor([0], device='cuda:0')



Epoch 3 in training:  46%|███████████████████████████▉                                 | 32/70 [08:05<09:34, 15.12s/it][A

self attention 2 done
self attention 1 done
cross attention 1 done
self attention 3 done
tensor([[1.0000e+00, 3.0547e-15, 3.0547e-15]], device='cuda:0',
       grad_fn=<SoftmaxBackward0>) tensor([0], device='cuda:0')



Epoch 3 in training:  47%|████████████████████████████▊                                | 33/70 [08:20<09:16, 15.03s/it][A

self attention 2 done
self attention 1 done
cross attention 1 done
self attention 3 done
tensor([[1.0000e+00, 1.3237e-26, 1.0867e-33]], device='cuda:0',
       grad_fn=<SoftmaxBackward0>) tensor([2], device='cuda:0')



Epoch 3 in training:  49%|█████████████████████████████▋                               | 34/70 [08:36<09:05, 15.15s/it][A

self attention 2 done
self attention 1 done
cross attention 1 done
self attention 3 done
tensor([[1.0000e+00, 3.6420e-12, 3.6420e-12]], device='cuda:0',
       grad_fn=<SoftmaxBackward0>) tensor([1], device='cuda:0')



Epoch 3 in training:  50%|██████████████████████████████▌                              | 35/70 [08:51<08:49, 15.12s/it][A

self attention 2 done
self attention 1 done
cross attention 1 done
self attention 3 done
tensor([[1.0000e+00, 4.5395e-14, 0.0000e+00]], device='cuda:0',
       grad_fn=<SoftmaxBackward0>) tensor([0], device='cuda:0')



Epoch 3 in training:  51%|███████████████████████████████▎                             | 36/70 [09:06<08:33, 15.10s/it][A

self attention 2 done
self attention 1 done
cross attention 1 done
self attention 3 done
tensor([[5.0000e-01, 3.0207e-28, 5.0000e-01]], device='cuda:0',
       grad_fn=<SoftmaxBackward0>) tensor([1], device='cuda:0')



Epoch 3 in training:  53%|████████████████████████████████▏                            | 37/70 [09:21<08:17, 15.09s/it][A

self attention 2 done
self attention 1 done
cross attention 1 done
self attention 3 done
tensor([[1.0000e+00, 4.7717e-37, 8.8868e-41]], device='cuda:0',
       grad_fn=<SoftmaxBackward0>) tensor([0], device='cuda:0')



Epoch 3 in training:  54%|█████████████████████████████████                            | 38/70 [09:36<08:04, 15.13s/it][A

self attention 2 done
self attention 1 done
cross attention 1 done
self attention 3 done
tensor([[1.0000e+00, 6.2125e-27, 9.0182e-35]], device='cuda:0',
       grad_fn=<SoftmaxBackward0>) tensor([0], device='cuda:0')



Epoch 3 in training:  56%|█████████████████████████████████▉                           | 39/70 [09:51<07:46, 15.05s/it][A

self attention 2 done
self attention 1 done
cross attention 1 done
self attention 3 done
tensor([[1.0000e+00, 1.0972e-34, 1.9274e-40]], device='cuda:0',
       grad_fn=<SoftmaxBackward0>) tensor([1], device='cuda:0')



Epoch 3 in training:  57%|██████████████████████████████████▊                          | 40/70 [10:06<07:31, 15.06s/it][A

self attention 2 done
self attention 1 done
cross attention 1 done
self attention 3 done
tensor([[1.0000e+00, 1.1319e-38, 1.2612e-44]], device='cuda:0',
       grad_fn=<SoftmaxBackward0>) tensor([2], device='cuda:0')



Epoch 3 in training:  59%|███████████████████████████████████▋                         | 41/70 [10:21<07:18, 15.12s/it][A

self attention 2 done
self attention 1 done
cross attention 1 done
self attention 3 done
tensor([[1.0000e+00, 2.3839e-38, 7.3697e-13]], device='cuda:0',
       grad_fn=<SoftmaxBackward0>) tensor([1], device='cuda:0')



Epoch 3 in training:  60%|████████████████████████████████████▌                        | 42/70 [10:36<07:00, 15.03s/it][A

self attention 2 done
self attention 1 done
cross attention 1 done
self attention 3 done
tensor([[0.3333, 0.3333, 0.3333]], device='cuda:0', grad_fn=<SoftmaxBackward0>) tensor([0], device='cuda:0')



Epoch 3 in training:  61%|█████████████████████████████████████▍                       | 43/70 [10:51<06:44, 15.00s/it][A

self attention 2 done
self attention 1 done
cross attention 1 done
self attention 3 done
tensor([[1.0000e+00, 2.7938e-25, 5.6476e-31]], device='cuda:0',
       grad_fn=<SoftmaxBackward0>) tensor([1], device='cuda:0')



Epoch 3 in training:  63%|██████████████████████████████████████▎                      | 44/70 [11:07<06:33, 15.14s/it][A

self attention 2 done
self attention 1 done
cross attention 1 done
self attention 3 done
tensor([[1.0000e+00, 4.0084e-11, 4.2039e-44]], device='cuda:0',
       grad_fn=<SoftmaxBackward0>) tensor([2], device='cuda:0')



Epoch 3 in training:  64%|███████████████████████████████████████▏                     | 45/70 [11:22<06:19, 15.17s/it][A

self attention 2 done
self attention 1 done
cross attention 1 done
self attention 3 done
tensor([[1.0000e+00, 2.3965e-40, 0.0000e+00]], device='cuda:0',
       grad_fn=<SoftmaxBackward0>) tensor([0], device='cuda:0')



Epoch 3 in training:  66%|████████████████████████████████████████                     | 46/70 [11:37<06:04, 15.18s/it][A

self attention 2 done
self attention 1 done
cross attention 1 done
self attention 3 done
tensor([[1.0000e+00, 7.4480e-41, 2.4578e-15]], device='cuda:0',
       grad_fn=<SoftmaxBackward0>) tensor([2], device='cuda:0')



Epoch 3 in training:  67%|████████████████████████████████████████▉                    | 47/70 [11:52<05:46, 15.06s/it][A

self attention 2 done
self attention 1 done
cross attention 1 done
self attention 3 done
tensor([[1.0000e+00, 3.3882e-38, 1.4013e-45]], device='cuda:0',
       grad_fn=<SoftmaxBackward0>) tensor([1], device='cuda:0')



Epoch 3 in training:  69%|█████████████████████████████████████████▊                   | 48/70 [12:07<05:32, 15.12s/it][A

self attention 2 done
self attention 1 done
cross attention 1 done
self attention 3 done
tensor([[1.0000e+00, 1.4943e-12, 9.2906e-43]], device='cuda:0',
       grad_fn=<SoftmaxBackward0>) tensor([1], device='cuda:0')



Epoch 3 in training:  70%|██████████████████████████████████████████▋                  | 49/70 [12:22<05:14, 14.99s/it][A

self attention 2 done
self attention 1 done
cross attention 1 done
self attention 3 done
tensor([[1.0000e+00, 1.8492e-26, 1.5543e-32]], device='cuda:0',
       grad_fn=<SoftmaxBackward0>) tensor([2], device='cuda:0')



Epoch 3 in training:  71%|███████████████████████████████████████████▌                 | 50/70 [12:37<04:59, 14.96s/it][A

self attention 2 done
self attention 1 done
cross attention 1 done
self attention 3 done
tensor([[1.0000e+00, 5.3446e-39, 0.0000e+00]], device='cuda:0',
       grad_fn=<SoftmaxBackward0>) tensor([0], device='cuda:0')



Epoch 3 in training:  73%|████████████████████████████████████████████▍                | 51/70 [12:52<04:44, 14.99s/it][A

self attention 2 done
self attention 1 done
cross attention 1 done
self attention 3 done
tensor([[1.0000e+00, 5.6301e-38, 7.0065e-45]], device='cuda:0',
       grad_fn=<SoftmaxBackward0>) tensor([2], device='cuda:0')



Epoch 3 in training:  74%|█████████████████████████████████████████████▎               | 52/70 [13:07<04:30, 15.01s/it][A

self attention 2 done
self attention 1 done
cross attention 1 done
self attention 3 done
tensor([[1.0000e+00, 2.6479e-36, 2.6639e-42]], device='cuda:0',
       grad_fn=<SoftmaxBackward0>) tensor([0], device='cuda:0')



Epoch 3 in training:  76%|██████████████████████████████████████████████▏              | 53/70 [13:22<04:14, 14.95s/it][A

self attention 2 done
self attention 1 done
cross attention 1 done
self attention 3 done
tensor([[1.0000e+00, 1.9697e-36, 8.3798e-43]], device='cuda:0',
       grad_fn=<SoftmaxBackward0>) tensor([0], device='cuda:0')



Epoch 3 in training:  77%|███████████████████████████████████████████████              | 54/70 [13:37<04:00, 15.01s/it][A

self attention 2 done
self attention 1 done
cross attention 1 done
self attention 3 done
tensor([[1.0000e+00, 1.1627e-37, 3.7835e-44]], device='cuda:0',
       grad_fn=<SoftmaxBackward0>) tensor([1], device='cuda:0')



Epoch 3 in training:  79%|███████████████████████████████████████████████▉             | 55/70 [13:52<03:45, 15.02s/it][A

self attention 2 done
self attention 1 done
cross attention 1 done
self attention 3 done
tensor([[1.0000e+00, 4.1427e-38, 3.9236e-44]], device='cuda:0',
       grad_fn=<SoftmaxBackward0>) tensor([1], device='cuda:0')



Epoch 3 in training:  80%|████████████████████████████████████████████████▊            | 56/70 [14:06<03:28, 14.91s/it][A

self attention 2 done
self attention 1 done
cross attention 1 done
self attention 3 done
tensor([[1.0000e+00, 7.5530e-37, 3.9937e-43]], device='cuda:0',
       grad_fn=<SoftmaxBackward0>) tensor([2], device='cuda:0')



Epoch 3 in training:  81%|█████████████████████████████████████████████████▋           | 57/70 [14:21<03:14, 14.96s/it][A

self attention 2 done
self attention 1 done
cross attention 1 done
self attention 3 done
tensor([[1.0000e+00, 3.3168e-39, 0.0000e+00]], device='cuda:0',
       grad_fn=<SoftmaxBackward0>) tensor([1], device='cuda:0')



Epoch 3 in training:  83%|██████████████████████████████████████████████████▌          | 58/70 [14:36<02:57, 14.83s/it][A

self attention 2 done
self attention 1 done
cross attention 1 done
self attention 3 done
tensor([[1.0000e+00, 7.7813e-24, 1.1296e-30]], device='cuda:0',
       grad_fn=<SoftmaxBackward0>) tensor([2], device='cuda:0')



Epoch 3 in training:  84%|███████████████████████████████████████████████████▍         | 59/70 [14:51<02:42, 14.81s/it][A

self attention 2 done
self attention 1 done
cross attention 1 done
self attention 3 done
tensor([[1.0000e+00, 7.0411e-24, 1.6052e-29]], device='cuda:0',
       grad_fn=<SoftmaxBackward0>) tensor([1], device='cuda:0')



Epoch 3 in training:  86%|████████████████████████████████████████████████████▎        | 60/70 [15:06<02:28, 14.82s/it][A

self attention 2 done
self attention 1 done
cross attention 1 done
self attention 3 done
tensor([[1.0000e+00, 5.8029e-36, 5.5707e-41]], device='cuda:0',
       grad_fn=<SoftmaxBackward0>) tensor([0], device='cuda:0')



Epoch 3 in training:  87%|█████████████████████████████████████████████████████▏       | 61/70 [15:20<02:13, 14.85s/it][A

self attention 2 done
self attention 1 done
cross attention 1 done
self attention 3 done
tensor([[1.0000e+00, 2.0276e-36, 1.6816e-44]], device='cuda:0',
       grad_fn=<SoftmaxBackward0>) tensor([2], device='cuda:0')



Epoch 3 in training:  89%|██████████████████████████████████████████████████████       | 62/70 [15:35<01:58, 14.80s/it][A

self attention 2 done
self attention 1 done
cross attention 1 done
self attention 3 done
tensor([[1.0000e+00, 4.9551e-13, 1.0403e-41]], device='cuda:0',
       grad_fn=<SoftmaxBackward0>) tensor([2], device='cuda:0')



Epoch 3 in training:  90%|██████████████████████████████████████████████████████▉      | 63/70 [15:50<01:43, 14.77s/it][A

self attention 2 done
self attention 1 done
cross attention 1 done
self attention 3 done
tensor([[1.0000e+00, 4.3527e-16, 4.3527e-16]], device='cuda:0',
       grad_fn=<SoftmaxBackward0>) tensor([0], device='cuda:0')



Epoch 3 in training:  91%|███████████████████████████████████████████████████████▊     | 64/70 [16:04<01:27, 14.62s/it][A

self attention 2 done
self attention 1 done
cross attention 1 done
self attention 3 done
tensor([[1.0000e+00, 2.8977e-36, 7.2410e-14]], device='cuda:0',
       grad_fn=<SoftmaxBackward0>) tensor([0], device='cuda:0')



Epoch 3 in training:  93%|████████████████████████████████████████████████████████▋    | 65/70 [16:19<01:13, 14.61s/it][A

self attention 2 done
self attention 1 done
cross attention 1 done
self attention 3 done
tensor([[1.0000e+00, 1.4790e-32, 1.7180e-38]], device='cuda:0',
       grad_fn=<SoftmaxBackward0>) tensor([0], device='cuda:0')



Epoch 3 in training:  94%|█████████████████████████████████████████████████████████▌   | 66/70 [16:33<00:58, 14.66s/it][A

self attention 2 done
self attention 1 done
cross attention 1 done
self attention 3 done
tensor([[5.0000e-01, 5.0000e-01, 2.3996e-30]], device='cuda:0',
       grad_fn=<SoftmaxBackward0>) tensor([0], device='cuda:0')



Epoch 3 in training:  96%|██████████████████████████████████████████████████████████▍  | 67/70 [16:48<00:44, 14.70s/it][A

self attention 2 done
self attention 1 done
cross attention 1 done
self attention 3 done
tensor([[1.0000e+00, 2.5731e-32, 2.1805e-10]], device='cuda:0',
       grad_fn=<SoftmaxBackward0>) tensor([0], device='cuda:0')



Epoch 3 in training:  97%|███████████████████████████████████████████████████████████▎ | 68/70 [17:04<00:29, 14.91s/it][A

self attention 2 done
self attention 1 done
cross attention 1 done
self attention 3 done
tensor([[1.0000e+00, 1.2836e-12, 1.2836e-12]], device='cuda:0',
       grad_fn=<SoftmaxBackward0>) tensor([2], device='cuda:0')



Epoch 3 in training:  99%|████████████████████████████████████████████████████████████▏| 69/70 [17:18<00:14, 14.79s/it][A

self attention 2 done
self attention 1 done
cross attention 1 done
self attention 3 done
tensor([[1.0000e+00, 4.5033e-35, 1.5809e-41]], device='cuda:0',
       grad_fn=<SoftmaxBackward0>) tensor([0], device='cuda:0')



Epoch 3 in training: 100%|█████████████████████████████████████████████████████████████| 70/70 [17:33<00:00, 14.72s/it][A
Training:   3%|██                                                                 | 3/100 [53:34<28:51:45, 1071.19s/it][A
Epoch 4 in training:   0%|                                                                      | 0/70 [00:00<?, ?it/s][A

self attention 2 done
self attention 1 done
cross attention 1 done
self attention 3 done
tensor([[1.0000e+00, 9.4521e-37, 6.0256e-44]], device='cuda:0',
       grad_fn=<SoftmaxBackward0>) tensor([0], device='cuda:0')



Epoch 4 in training:   1%|▉                                                             | 1/70 [00:15<18:06, 15.75s/it][A

self attention 2 done
self attention 1 done
cross attention 1 done
self attention 3 done
tensor([[1.0000e+00, 2.2222e-13, 6.2890e-42]], device='cuda:0',
       grad_fn=<SoftmaxBackward0>) tensor([2], device='cuda:0')



Epoch 4 in training:   3%|█▊                                                            | 2/70 [00:30<17:10, 15.15s/it][A

self attention 2 done
self attention 1 done
cross attention 1 done
self attention 3 done
tensor([[1.0000e+00, 3.5211e-21, 6.8102e-29]], device='cuda:0',
       grad_fn=<SoftmaxBackward0>) tensor([2], device='cuda:0')



Epoch 4 in training:   4%|██▋                                                           | 3/70 [00:45<16:49, 15.06s/it][A

self attention 2 done
self attention 1 done
cross attention 1 done
self attention 3 done
tensor([[1.0000e+00, 2.4926e-36, 3.1459e-42]], device='cuda:0',
       grad_fn=<SoftmaxBackward0>) tensor([1], device='cuda:0')



Epoch 4 in training:   6%|███▌                                                          | 4/70 [01:00<16:29, 15.00s/it][A

self attention 2 done
self attention 1 done
cross attention 1 done
self attention 3 done
tensor([[1.0000e+00, 4.4965e-34, 3.5430e-40]], device='cuda:0',
       grad_fn=<SoftmaxBackward0>) tensor([0], device='cuda:0')



Epoch 4 in training:   7%|████▍                                                         | 5/70 [01:15<16:23, 15.13s/it][A

self attention 2 done
self attention 1 done
cross attention 1 done
self attention 3 done
tensor([[1.0000e+00, 1.4602e-37, 1.4013e-45]], device='cuda:0',
       grad_fn=<SoftmaxBackward0>) tensor([1], device='cuda:0')



Epoch 4 in training:   9%|█████▎                                                        | 6/70 [01:30<16:04, 15.07s/it][A

self attention 2 done
self attention 1 done
cross attention 1 done
self attention 3 done
tensor([[1.0000e+00, 1.4272e-36, 4.0638e-44]], device='cuda:0',
       grad_fn=<SoftmaxBackward0>) tensor([0], device='cuda:0')



Epoch 4 in training:  10%|██████▏                                                       | 7/70 [01:45<15:37, 14.88s/it][A

self attention 2 done
self attention 1 done
cross attention 1 done
self attention 3 done
tensor([[1.0000e+00, 2.0511e-30, 2.4017e-36]], device='cuda:0',
       grad_fn=<SoftmaxBackward0>) tensor([0], device='cuda:0')



Epoch 4 in training:  11%|███████                                                       | 8/70 [01:59<15:17, 14.80s/it][A

self attention 2 done
self attention 1 done
cross attention 1 done
self attention 3 done
tensor([[1.0000e+00, 8.4961e-32, 1.0163e-36]], device='cuda:0',
       grad_fn=<SoftmaxBackward0>) tensor([2], device='cuda:0')



Epoch 4 in training:  13%|███████▉                                                      | 9/70 [02:14<15:00, 14.76s/it][A

self attention 2 done
self attention 1 done
cross attention 1 done
self attention 3 done
tensor([[1.0000e+00, 4.7780e-32, 3.3525e-37]], device='cuda:0',
       grad_fn=<SoftmaxBackward0>) tensor([0], device='cuda:0')



Epoch 4 in training:  14%|████████▋                                                    | 10/70 [02:29<14:49, 14.82s/it][A

self attention 2 done
self attention 1 done
cross attention 1 done
self attention 3 done
tensor([[1.0000e+00, 3.1847e-35, 2.5703e-41]], device='cuda:0',
       grad_fn=<SoftmaxBackward0>) tensor([1], device='cuda:0')



Epoch 4 in training:  16%|█████████▌                                                   | 11/70 [02:44<14:36, 14.86s/it][A

self attention 2 done
self attention 1 done
cross attention 1 done
self attention 3 done
tensor([[1.0000e+00, 4.3781e-36, 2.4088e-14]], device='cuda:0',
       grad_fn=<SoftmaxBackward0>) tensor([1], device='cuda:0')



Epoch 4 in training:  17%|██████████▍                                                  | 12/70 [02:58<14:17, 14.78s/it][A

self attention 2 done
self attention 1 done
cross attention 1 done
self attention 3 done
tensor([[1.0000e+00, 2.6786e-33, 5.8304e-37]], device='cuda:0',
       grad_fn=<SoftmaxBackward0>) tensor([2], device='cuda:0')



Epoch 4 in training:  19%|███████████▎                                                 | 13/70 [03:13<14:03, 14.80s/it][A

self attention 2 done
self attention 1 done
cross attention 1 done
self attention 3 done
tensor([[5.0000e-01, 5.7834e-21, 5.0000e-01]], device='cuda:0',
       grad_fn=<SoftmaxBackward0>) tensor([0], device='cuda:0')



Epoch 4 in training:  20%|████████████▏                                                | 14/70 [03:29<13:56, 14.93s/it][A

self attention 2 done
self attention 1 done
cross attention 1 done
self attention 3 done
tensor([[1.0000e+00, 1.9451e-31, 2.4128e-36]], device='cuda:0',
       grad_fn=<SoftmaxBackward0>) tensor([1], device='cuda:0')



Epoch 4 in training:  21%|█████████████                                                | 15/70 [03:43<13:38, 14.87s/it][A

self attention 2 done
self attention 1 done
cross attention 1 done
self attention 3 done
tensor([[1.0000e+00, 2.8540e-34, 3.2734e-41]], device='cuda:0',
       grad_fn=<SoftmaxBackward0>) tensor([0], device='cuda:0')



Epoch 4 in training:  23%|█████████████▉                                               | 16/70 [03:58<13:22, 14.86s/it][A

self attention 2 done
self attention 1 done
cross attention 1 done
self attention 3 done
tensor([[1.0000e+00, 9.9235e-33, 7.2719e-40]], device='cuda:0',
       grad_fn=<SoftmaxBackward0>) tensor([2], device='cuda:0')



Epoch 4 in training:  24%|██████████████▊                                              | 17/70 [04:13<13:03, 14.78s/it][A

self attention 2 done
self attention 1 done
cross attention 1 done
self attention 3 done
tensor([[1.0000e+00, 1.0503e-33, 9.5061e-40]], device='cuda:0',
       grad_fn=<SoftmaxBackward0>) tensor([0], device='cuda:0')



Epoch 4 in training:  26%|███████████████▋                                             | 18/70 [04:27<12:45, 14.72s/it][A

self attention 2 done
self attention 1 done
cross attention 1 done
self attention 3 done
tensor([[5.0000e-01, 5.0000e-01, 2.2950e-27]], device='cuda:0',
       grad_fn=<SoftmaxBackward0>) tensor([2], device='cuda:0')



Epoch 4 in training:  27%|████████████████▌                                            | 19/70 [04:42<12:30, 14.72s/it][A

self attention 2 done
self attention 1 done
cross attention 1 done
self attention 3 done
tensor([[1.0000e+00, 7.3936e-35, 1.8483e-42]], device='cuda:0',
       grad_fn=<SoftmaxBackward0>) tensor([0], device='cuda:0')



Epoch 4 in training:  29%|█████████████████▍                                           | 20/70 [04:57<12:23, 14.87s/it][A

self attention 2 done
self attention 1 done
cross attention 1 done
self attention 3 done
tensor([[1.0000e+00, 6.9199e-21, 9.3020e-27]], device='cuda:0',
       grad_fn=<SoftmaxBackward0>) tensor([2], device='cuda:0')



Epoch 4 in training:  30%|██████████████████▎                                          | 21/70 [05:12<12:05, 14.80s/it][A

self attention 2 done
self attention 1 done
cross attention 1 done
self attention 3 done
tensor([[1.0000e+00, 2.7271e-14, 5.0517e-40]], device='cuda:0',
       grad_fn=<SoftmaxBackward0>) tensor([2], device='cuda:0')



Epoch 4 in training:  31%|███████████████████▏                                         | 22/70 [05:27<12:00, 15.01s/it][A

self attention 2 done
self attention 1 done
cross attention 1 done
self attention 3 done
tensor([[1.0000e+00, 1.9188e-21, 2.8848e-27]], device='cuda:0',
       grad_fn=<SoftmaxBackward0>) tensor([1], device='cuda:0')



Epoch 4 in training:  33%|████████████████████                                         | 23/70 [05:42<11:41, 14.92s/it][A

self attention 2 done
self attention 1 done
cross attention 1 done
self attention 3 done
tensor([[1.0000e+00, 1.5993e-29, 2.8046e-10]], device='cuda:0',
       grad_fn=<SoftmaxBackward0>) tensor([0], device='cuda:0')



Epoch 4 in training:  34%|████████████████████▉                                        | 24/70 [05:57<11:27, 14.95s/it][A

self attention 2 done
self attention 1 done
cross attention 1 done
self attention 3 done
tensor([[1.0000e+00, 1.6791e-12, 9.0805e-36]], device='cuda:0',
       grad_fn=<SoftmaxBackward0>) tensor([0], device='cuda:0')



Epoch 4 in training:  36%|█████████████████████▊                                       | 25/70 [06:12<11:08, 14.86s/it][A

self attention 2 done
self attention 1 done
cross attention 1 done
self attention 3 done
tensor([[1.0000e+00, 4.4898e-33, 3.4440e-40]], device='cuda:0',
       grad_fn=<SoftmaxBackward0>) tensor([0], device='cuda:0')



Epoch 4 in training:  37%|██████████████████████▋                                      | 26/70 [06:27<10:57, 14.95s/it][A

self attention 2 done
self attention 1 done
cross attention 1 done
self attention 3 done
tensor([[1.0000e+00, 1.2785e-31, 1.5153e-37]], device='cuda:0',
       grad_fn=<SoftmaxBackward0>) tensor([1], device='cuda:0')



Epoch 4 in training:  39%|███████████████████████▌                                     | 27/70 [06:42<10:48, 15.08s/it][A

self attention 2 done
self attention 1 done
cross attention 1 done
self attention 3 done
tensor([[1.0000e+00, 5.3466e-34, 2.3820e-40]], device='cuda:0',
       grad_fn=<SoftmaxBackward0>) tensor([1], device='cuda:0')



Epoch 4 in training:  40%|████████████████████████▍                                    | 28/70 [06:57<10:24, 14.87s/it][A

self attention 2 done
self attention 1 done
cross attention 1 done
self attention 3 done
tensor([[1.0000e+00, 2.0494e-28, 1.0720e-32]], device='cuda:0',
       grad_fn=<SoftmaxBackward0>) tensor([1], device='cuda:0')



Epoch 4 in training:  41%|█████████████████████████▎                                   | 29/70 [07:11<10:05, 14.78s/it][A

self attention 2 done
self attention 1 done
cross attention 1 done
self attention 3 done
tensor([[1.0000e+00, 7.9859e-12, 2.9820e-39]], device='cuda:0',
       grad_fn=<SoftmaxBackward0>) tensor([0], device='cuda:0')



Epoch 4 in training:  43%|██████████████████████████▏                                  | 30/70 [07:26<09:51, 14.80s/it][A

self attention 2 done
self attention 1 done
cross attention 1 done
self attention 3 done
tensor([[1.0000e+00, 1.4807e-32, 1.0206e-38]], device='cuda:0',
       grad_fn=<SoftmaxBackward0>) tensor([0], device='cuda:0')



Epoch 4 in training:  44%|███████████████████████████                                  | 31/70 [07:43<09:58, 15.35s/it][A

self attention 2 done
self attention 1 done
cross attention 1 done
self attention 3 done
tensor([[1.0000e+00, 2.3778e-31, 1.6651e-39]], device='cuda:0',
       grad_fn=<SoftmaxBackward0>) tensor([1], device='cuda:0')



Epoch 4 in training:  46%|███████████████████████████▉                                 | 32/70 [08:00<10:06, 15.96s/it][A

self attention 2 done
self attention 1 done
cross attention 1 done
self attention 3 done
tensor([[1.0000e+00, 9.9155e-32, 2.7242e-38]], device='cuda:0',
       grad_fn=<SoftmaxBackward0>) tensor([0], device='cuda:0')



Epoch 4 in training:  47%|████████████████████████████▊                                | 33/70 [08:17<10:01, 16.27s/it][A

self attention 2 done
self attention 1 done
cross attention 1 done
self attention 3 done
tensor([[1.0000e+00, 1.6926e-32, 1.5752e-40]], device='cuda:0',
       grad_fn=<SoftmaxBackward0>) tensor([1], device='cuda:0')



Epoch 4 in training:  49%|█████████████████████████████▋                               | 34/70 [08:33<09:42, 16.19s/it][A

self attention 2 done
self attention 1 done
cross attention 1 done
self attention 3 done
tensor([[1.0000e+00, 4.7384e-30, 1.7936e-11]], device='cuda:0',
       grad_fn=<SoftmaxBackward0>) tensor([0], device='cuda:0')



Epoch 4 in training:  50%|██████████████████████████████▌                              | 35/70 [08:48<09:15, 15.87s/it][A

self attention 2 done
self attention 1 done
cross attention 1 done
self attention 3 done
tensor([[1.0000e+00, 2.3946e-30, 1.3364e-37]], device='cuda:0',
       grad_fn=<SoftmaxBackward0>) tensor([1], device='cuda:0')



Epoch 4 in training:  51%|███████████████████████████████▎                             | 36/70 [09:03<08:46, 15.49s/it][A

self attention 2 done
self attention 1 done
cross attention 1 done
self attention 3 done
tensor([[1.0000e+00, 1.8336e-12, 7.3231e-39]], device='cuda:0',
       grad_fn=<SoftmaxBackward0>) tensor([0], device='cuda:0')



Epoch 4 in training:  53%|████████████████████████████████▏                            | 37/70 [09:18<08:31, 15.49s/it][A

self attention 2 done
self attention 1 done
cross attention 1 done
self attention 3 done
tensor([[5.0000e-01, 5.0000e-01, 2.0060e-26]], device='cuda:0',
       grad_fn=<SoftmaxBackward0>) tensor([1], device='cuda:0')



Epoch 4 in training:  54%|█████████████████████████████████                            | 38/70 [09:35<08:27, 15.86s/it][A
Training:   3%|█▉                                                               | 3/100 [1:03:14<34:04:50, 1264.85s/it][A


KeyboardInterrupt: 

In [None]:
torch.save(model.state_dict(), f"model_3.pt")
model.eval()
val_loss = 0.0
correct = 0
total = 0
with torch.no_grad():
    for inputs1, inputs2, labels in test_loader:
        inputs1, inputs2, labels = inputs1.to(device), inputs2.to(device), labels.to(device)
        outputs = model(inputs1, inputs2)
        loss = criterion(outputs, labels)
        val_loss += loss.item()

        _, predicted = torch.max(outputs, 1)
        total += labels.size(0)
        correct += (predicted == labels).sum().item()

val_loss /= len(test_loader)
val_accuracy = correct / total
val_losses.append(val_loss)
val_accuracies.append(val_accuracy)

print(f"Epoch {epoch + 1}/{N_EPOCHS} loss: {train_loss:.2f} , test_loss: {val_loss} , accuracy: {train_accuracy:.2f} , test_accuracy: {val_accuracy}")


IndentationError: unexpected indent (3938787756.py, line 3)

In [None]:
# # Test loop
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
#model = MyViT(device, n_patches=15, n_blocks=2, hidden_d=512, n_heads=4, out_d=3).to(device)
#model.load_state_dict(torch.load('model_100.pt'))
#criterion = CrossEntropyLoss()

with torch.no_grad():
    correct, total = 0, 0
    test_loss = 0.0
    for batch in tqdm(test_loader, desc="Testing"):
        x1, x2, y = batch
        x1, x2, y = x1.to(device), x2.to(device), y.to(device)
        y_hat = model(x1, x2)
        loss = criterion(y_hat, y)
        test_loss += loss.detach().cpu().item() / len(test_loader)

        correct += torch.sum(torch.argmax(y_hat, dim=1) == y).detach().cpu().item()
        total += len(x1)
    print(f"Test loss: {test_loss:.2f}")
    print(f"Test accuracy: {correct / total * 100:.2f}%")


Testing:   0%|                                                                                  | 0/30 [00:00<?, ?it/s]

self attention 2 done


Testing:   3%|██▍                                                                       | 1/30 [00:12<06:08, 12.71s/it]

self attention 1 done
cross attention 1 done
self attention 3 done
self attention 2 done


Testing:   7%|████▉                                                                     | 2/30 [00:24<05:37, 12.05s/it]

self attention 1 done
cross attention 1 done
self attention 3 done
self attention 2 done


Testing:  10%|███████▍                                                                  | 3/30 [00:35<05:18, 11.80s/it]

self attention 1 done
cross attention 1 done
self attention 3 done
self attention 2 done


Testing:  13%|█████████▊                                                                | 4/30 [00:47<05:08, 11.88s/it]

self attention 1 done
cross attention 1 done
self attention 3 done
self attention 2 done


Testing:  17%|████████████▎                                                             | 5/30 [01:00<05:05, 12.24s/it]

self attention 1 done
cross attention 1 done
self attention 3 done
self attention 2 done


Testing:  20%|██████████████▊                                                           | 6/30 [01:12<04:50, 12.12s/it]

self attention 1 done
cross attention 1 done
self attention 3 done
self attention 2 done


Testing:  23%|█████████████████▎                                                        | 7/30 [01:23<04:31, 11.81s/it]

self attention 1 done
cross attention 1 done
self attention 3 done
self attention 2 done


Testing:  27%|███████████████████▋                                                      | 8/30 [01:34<04:12, 11.49s/it]

self attention 1 done
cross attention 1 done
self attention 3 done
self attention 2 done


Testing:  30%|██████████████████████▏                                                   | 9/30 [01:45<03:55, 11.21s/it]

self attention 1 done
cross attention 1 done
self attention 3 done
self attention 2 done


Testing:  33%|████████████████████████▎                                                | 10/30 [01:56<03:44, 11.22s/it]

self attention 1 done
cross attention 1 done
self attention 3 done
self attention 2 done


Testing:  37%|██████████████████████████▊                                              | 11/30 [02:07<03:35, 11.34s/it]

self attention 1 done
cross attention 1 done
self attention 3 done
self attention 2 done


Testing:  40%|█████████████████████████████▏                                           | 12/30 [02:18<03:21, 11.18s/it]

self attention 1 done
cross attention 1 done
self attention 3 done
self attention 2 done


Testing:  43%|███████████████████████████████▋                                         | 13/30 [02:29<03:06, 11.00s/it]

self attention 1 done
cross attention 1 done
self attention 3 done
self attention 2 done


Testing:  47%|██████████████████████████████████                                       | 14/30 [02:40<02:55, 10.97s/it]

self attention 1 done
cross attention 1 done
self attention 3 done
self attention 2 done


Testing:  50%|████████████████████████████████████▌                                    | 15/30 [02:51<02:46, 11.12s/it]

self attention 1 done
cross attention 1 done
self attention 3 done
self attention 2 done


Testing:  53%|██████████████████████████████████████▉                                  | 16/30 [03:05<02:45, 11.86s/it]

self attention 1 done
cross attention 1 done
self attention 3 done
self attention 2 done


Testing:  57%|█████████████████████████████████████████▎                               | 17/30 [03:18<02:38, 12.19s/it]

self attention 1 done
cross attention 1 done
self attention 3 done
self attention 2 done


Testing:  60%|███████████████████████████████████████████▊                             | 18/30 [03:30<02:25, 12.15s/it]

self attention 1 done
cross attention 1 done
self attention 3 done
self attention 2 done


Testing:  63%|██████████████████████████████████████████████▏                          | 19/30 [03:42<02:14, 12.20s/it]

self attention 1 done
cross attention 1 done
self attention 3 done
self attention 2 done


Testing:  67%|████████████████████████████████████████████████▋                        | 20/30 [03:55<02:02, 12.25s/it]

self attention 1 done
cross attention 1 done
self attention 3 done
self attention 2 done


Testing:  70%|███████████████████████████████████████████████████                      | 21/30 [04:07<01:51, 12.40s/it]

self attention 1 done
cross attention 1 done
self attention 3 done
self attention 2 done


Testing:  73%|█████████████████████████████████████████████████████▌                   | 22/30 [04:20<01:40, 12.52s/it]

self attention 1 done
cross attention 1 done
self attention 3 done
self attention 2 done


Testing:  77%|███████████████████████████████████████████████████████▉                 | 23/30 [04:32<01:27, 12.44s/it]

self attention 1 done
cross attention 1 done
self attention 3 done
self attention 2 done


Testing:  80%|██████████████████████████████████████████████████████████▍              | 24/30 [04:45<01:14, 12.49s/it]

self attention 1 done
cross attention 1 done
self attention 3 done
self attention 2 done


Testing:  83%|████████████████████████████████████████████████████████████▊            | 25/30 [04:57<01:02, 12.47s/it]

self attention 1 done
cross attention 1 done
self attention 3 done
self attention 2 done


Testing:  87%|███████████████████████████████████████████████████████████████▎         | 26/30 [05:10<00:49, 12.37s/it]

self attention 1 done
cross attention 1 done
self attention 3 done
self attention 2 done


Testing:  90%|█████████████████████████████████████████████████████████████████▋       | 27/30 [05:22<00:36, 12.33s/it]

self attention 1 done
cross attention 1 done
self attention 3 done
self attention 2 done


Testing:  93%|████████████████████████████████████████████████████████████████████▏    | 28/30 [05:34<00:24, 12.28s/it]

self attention 1 done
cross attention 1 done
self attention 3 done
self attention 2 done


Testing:  97%|██████████████████████████████████████████████████████████████████████▌  | 29/30 [05:47<00:12, 12.44s/it]

self attention 1 done
cross attention 1 done
self attention 3 done
self attention 2 done


Testing: 100%|█████████████████████████████████████████████████████████████████████████| 30/30 [05:59<00:00, 11.99s/it]

self attention 1 done
cross attention 1 done
self attention 3 done
Test loss: 1.06
Test accuracy: 46.67%





In [None]:
# Initialize variables for TP, FP, TN, FN for each class
TP = [0, 0, 0]
FP = [0, 0, 0]
TN = [0, 0, 0]
FN = [0, 0, 0]

# Iterate through the test set again
outputs = []
with torch.no_grad():
    for batch in tqdm(test_loader, desc="Calculating metrics"):
        x1, x2, y = batch
        x1, x2, y = x1.to(device), x2.to(device), y.to(device)
        y_hat = model(x1, x2)
        outputs.append(y_hat)

        # Calculate predictions
        predictions = torch.argmax(y_hat, dim=1)

        # Update TP, FP, TN, FN counts for each class
        for i in range(3):
            TP[i] += ((predictions == i) & (y == i)).sum().item()
            FP[i] += ((predictions == i) & (y != i)).sum().item()
            TN[i] += ((predictions != i) & (y != i)).sum().item()
            FN[i] += ((predictions != i) & (y == i)).sum().item()

# Calculate metrics for each class
precision = []
recall = []
sensitivity = []
specificity = []
f1_scores = []

# Calculate F1 score for each class


for i in range(3):
    precision.append(TP[i] / (TP[i] + FP[i]))
    recall.append(TP[i] / (TP[i] + FN[i]))
    sensitivity.append(recall[i])  # sensitivity is the same as recall
    specificity.append(TN[i] / (TN[i] + FP[i]))

for i in range(3):
    if precision[i] + recall[i] == 0:
        f1_scores.append(0.0)
    else:
        f1_scores.append(2 * (precision[i] * recall[i]) / (precision[i] + recall[i]))
# Print metrics for each class
for i in range(3):
    print(f"Class {i}:")
    print(f"  Precision: {precision[i]:.2f}")
    print(f"  Recall: {recall[i]:.2f}")
    print(f"  Sensitivity: {sensitivity[i]:.2f}")
    print(f"  Specificity: {specificity[i]:.2f}")
    print(f"  F1-score: {f1_scores[i]:.2f}")


Calculating metrics:   0%|                                                                      | 0/30 [00:00<?, ?it/s]

self attention 2 done


Calculating metrics:   3%|██                                                            | 1/30 [00:12<06:14, 12.90s/it]

self attention 1 done
cross attention 1 done
self attention 3 done
self attention 2 done


Calculating metrics:   7%|████▏                                                         | 2/30 [00:25<06:01, 12.92s/it]

self attention 1 done
cross attention 1 done
self attention 3 done
self attention 2 done


Calculating metrics:  10%|██████▏                                                       | 3/30 [00:38<05:42, 12.67s/it]

self attention 1 done
cross attention 1 done
self attention 3 done
self attention 2 done


Calculating metrics:  13%|████████▎                                                     | 4/30 [00:49<05:16, 12.19s/it]

self attention 1 done
cross attention 1 done
self attention 3 done
self attention 2 done


Calculating metrics:  17%|██████████▎                                                   | 5/30 [01:00<04:51, 11.68s/it]

self attention 1 done
cross attention 1 done
self attention 3 done
self attention 2 done


Calculating metrics:  20%|████████████▍                                                 | 6/30 [01:11<04:35, 11.46s/it]

self attention 1 done
cross attention 1 done
self attention 3 done
self attention 2 done


Calculating metrics:  23%|██████████████▍                                               | 7/30 [01:22<04:19, 11.29s/it]

self attention 1 done
cross attention 1 done
self attention 3 done
self attention 2 done


Calculating metrics:  27%|████████████████▌                                             | 8/30 [01:33<04:07, 11.23s/it]

self attention 1 done
cross attention 1 done
self attention 3 done
self attention 2 done


Calculating metrics:  30%|██████████████████▌                                           | 9/30 [01:44<03:54, 11.17s/it]

self attention 1 done
cross attention 1 done
self attention 3 done
self attention 2 done


Calculating metrics:  33%|████████████████████▎                                        | 10/30 [01:55<03:43, 11.16s/it]

self attention 1 done
cross attention 1 done
self attention 3 done
self attention 2 done


Calculating metrics:  37%|██████████████████████▎                                      | 11/30 [02:07<03:32, 11.20s/it]

self attention 1 done
cross attention 1 done
self attention 3 done
self attention 2 done


Calculating metrics:  40%|████████████████████████▍                                    | 12/30 [02:17<03:18, 11.01s/it]

self attention 1 done
cross attention 1 done
self attention 3 done
self attention 2 done


Calculating metrics:  43%|██████████████████████████▍                                  | 13/30 [02:28<03:08, 11.10s/it]

self attention 1 done
cross attention 1 done
self attention 3 done
self attention 2 done


Calculating metrics:  47%|████████████████████████████▍                                | 14/30 [02:39<02:56, 11.03s/it]

self attention 1 done
cross attention 1 done
self attention 3 done
self attention 2 done


Calculating metrics:  50%|██████████████████████████████▌                              | 15/30 [02:51<02:47, 11.17s/it]

self attention 1 done
cross attention 1 done
self attention 3 done
self attention 2 done


Calculating metrics:  53%|████████████████████████████████▌                            | 16/30 [03:02<02:36, 11.21s/it]

self attention 1 done
cross attention 1 done
self attention 3 done
self attention 2 done


Calculating metrics:  57%|██████████████████████████████████▌                          | 17/30 [03:13<02:24, 11.11s/it]

self attention 1 done
cross attention 1 done
self attention 3 done
self attention 2 done


Calculating metrics:  60%|████████████████████████████████████▌                        | 18/30 [03:23<02:11, 10.93s/it]

self attention 1 done
cross attention 1 done
self attention 3 done
self attention 2 done


Calculating metrics:  63%|██████████████████████████████████████▋                      | 19/30 [03:34<02:00, 10.92s/it]

self attention 1 done
cross attention 1 done
self attention 3 done
self attention 2 done


Calculating metrics:  67%|████████████████████████████████████████▋                    | 20/30 [03:46<01:51, 11.14s/it]

self attention 1 done
cross attention 1 done
self attention 3 done
self attention 2 done


Calculating metrics:  70%|██████████████████████████████████████████▋                  | 21/30 [03:57<01:40, 11.16s/it]

self attention 1 done
cross attention 1 done
self attention 3 done
self attention 2 done


Calculating metrics:  73%|████████████████████████████████████████████▋                | 22/30 [04:09<01:32, 11.50s/it]

self attention 1 done
cross attention 1 done
self attention 3 done
self attention 2 done


Calculating metrics:  77%|██████████████████████████████████████████████▊              | 23/30 [04:22<01:22, 11.81s/it]

self attention 1 done
cross attention 1 done
self attention 3 done
self attention 2 done


Calculating metrics:  80%|████████████████████████████████████████████████▊            | 24/30 [04:34<01:11, 11.87s/it]

self attention 1 done
cross attention 1 done
self attention 3 done
self attention 2 done


Calculating metrics:  83%|██████████████████████████████████████████████████▊          | 25/30 [04:46<00:59, 11.96s/it]

self attention 1 done
cross attention 1 done
self attention 3 done
self attention 2 done


Calculating metrics:  87%|████████████████████████████████████████████████████▊        | 26/30 [04:58<00:47, 11.90s/it]

self attention 1 done
cross attention 1 done
self attention 3 done
self attention 2 done


Calculating metrics:  90%|██████████████████████████████████████████████████████▉      | 27/30 [05:10<00:35, 11.84s/it]

self attention 1 done
cross attention 1 done
self attention 3 done
self attention 2 done


Calculating metrics:  93%|████████████████████████████████████████████████████████▉    | 28/30 [05:22<00:24, 12.02s/it]

self attention 1 done
cross attention 1 done
self attention 3 done
self attention 2 done


Calculating metrics:  97%|██████████████████████████████████████████████████████████▉  | 29/30 [05:34<00:11, 11.94s/it]

self attention 1 done
cross attention 1 done
self attention 3 done
self attention 2 done


Calculating metrics: 100%|█████████████████████████████████████████████████████████████| 30/30 [05:45<00:00, 11.52s/it]

self attention 1 done
cross attention 1 done
self attention 3 done





ZeroDivisionError: division by zero

In [None]:
# def train(dataloader, test_loader):


#     # Defining model and training options
#     device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

#     model = MyViT(device, n_patches=15, n_blocks=1, hidden_d=512, n_heads=2, out_d=3).to(device)
#     # model.load_state_dict(torch.load('model_100.pt'))
#     N_EPOCHS = 100
#     LR = 0.001

#     # Training loop
#         # Training loop
#     optimizer = Adam(model.parameters(), lr=LR)

#     scheduler = StepLR(optimizer, step_size=20, gamma=0.1)

#     criterion = CrossEntropyLoss()
#     # criterion = nn.BCELoss()



#     train_losses = []


#     for epoch in trange(N_EPOCHS, desc="Training"):
#         train_loss = 0.0
#         for batch in tqdm(
#             dataloader, desc=f"Epoch {epoch + 1} in training", leave=False
#         ):
#             x1, x2, y = batch
#             x1, x2, y = x1.to(device), x2.to(device), y.to(device)  # Move tensors to GPU
#             # for i i

#             optimizer.zero_grad()

#             y_hat = model(x1, x2)

#             # y_one_hot = F.one_hot(y, num_classes=3).float()

#             print(y_hat, y)

#             # return y_hat
#             loss = criterion(y_hat, y)

#             train_loss += loss.detach().cpu().item() / len(dataloader)

#             # _, predicted = torch.max(y_hat.data, 1)


#             loss.backward()
#             optimizer.step()

#         print(f"Epoch {epoch + 1}/{N_EPOCHS} loss: {train_loss:.2f}")
#         train_losses.append(train_loss)

#         scheduler.step()
#         if (epoch + 1) % 10 == 0:
#             torch.save(model.state_dict(), f"model_10.pt")

#         model.eval()
#         val_loss = 0
#         correct = 0
#         total = 0
#         with torch.no_grad():
#             for inputs1, inputs2, labels in test_loader:
#                 inputs1, inputs2, labels = inputs1.to(device), inputs2.to(device), labels.to(device)
#                 outputs = model(inputs1, inputs2)
#                 loss = criterion(outputs, labels)
#                 val_loss += loss.item()
#                 _, predicted = torch.max(outputs, 1)
#                 total += labels.size(0)
#                 correct += (predicted == labels).sum().item()

In [None]:
outputs

In [None]:
from sklearn.metrics import confusion_matrix, precision_score, recall_score

# Inside your main function or after the test loop
# Assuming you have already defined the test_dataloader and the model is trained

model.eval()  # Set the model to evaluation mode
y_true = []
y_pred = []

with torch.no_grad():
    for batch in tqdm(test_dataloader, desc="Testing"):
        x1, x2, y = batch
        x1, x2, y = x1.to(device), x2.to(device), y.to(device)
        y_hat = model(x1, x2)
        predicted_labels = torch.argmax(y_hat, dim=1)

        y_true.extend(y.cpu().numpy())
        y_pred.extend(predicted_labels.cpu().numpy())

# Calculate precision and recall
precision = precision_score(y_true, y_pred, average='macro')
recall = recall_score(y_true, y_pred, average='macro')

print(f'Precision: {precision}')
print(f'Recall: {recall}')

# Calculate confusion matrix
conf_matrix = confusion_matrix(y_true, y_pred)
print('Confusion Matrix:')
print(conf_matrix)

# Fine tuning resnet 50 on the dataset

In [None]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# dataloader with features also output
import os
import torchvision.transforms as transforms
from torchvision.models import resnet50
# importing libraries
import torch
from torch import nn
from torch.utils.data import Dataset, DataLoader
from PIL import Image
import numpy as np
from torch.optim import Adam
from torch.nn import CrossEntropyLoss
from tqdm import trange, tqdm
from torchvision import models, transforms
import torchvision.transforms.functional as TF


class oct_data(Dataset):
    def __init__(self, root_dir, transform=None):
        self.root_dir = root_dir
        self.transform = transform
        self.data = []
        self.labels = []

        classes = sorted(os.listdir(root_dir))

        for class_idx, folder in enumerate(classes):
            class_dir = os.path.join(root_dir, folder)
            for p in os.listdir(class_dir):
                path = os.path.join(class_dir, p)
                self.data.append(path)
                self.labels.append(class_idx)
                # self.max = max(len(os.listdir(path)), self.max)

        # Preprocess transforms for images
        self.preprocess = transforms.Compose([
            transforms.Resize((224, 224)),
            transforms.ToTensor(),
            transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
        ])

        # Precompute and cache features during initialization

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        label = self.labels[idx]
        original_images = []
        data_path = self.data[idx]

        for path in os.listdir(data_path):
            image_path = os.path.join(data_path, path)
            img = Image.open(image_path).convert('RGB')
            img = self.preprocess(img)
            original_images.append(img)

        # original_images = [transforms.ToTensor()(img) for img in original_images]



        return original_images, label


In [None]:
dataset = oct_data('/content/drive/MyDrive/datasets/OCT/data')

from sklearn.model_selection import train_test_split
train_data, test_data = train_test_split(dataset, test_size=0.2, random_state=42)

t1 = DataLoader(train_data, batch_size=4, shuffle=True, num_workers=4)
t2 = DataLoader(test_data, batch_size=4, shuffle=True, num_workers=4)

In [None]:
num_classes = 3

model = resnet50(pretrained=True)
num_features = model.fc.in_features
model.fc = nn.Linear(num_features, num_classes)

In [None]:
criterion = nn.CrossEntropyLoss()
optimizer = optim.SGD(model.parameters(), lr=0.001, momentum=0.9)

device = torch.device("cuda" if torch.cuda.is_available() else 'cpu')
model = model.to(device)

In [None]:
# Define the number of epochs
num_epochs = 10

# Train the model
for epoch in range(num_epochs):
    # Train the model on the training set
    model.train()
    train_loss = 0.0
    for i, (inputs, labels) in enumerate(train_loader):
        # Move the data to the device
        # inputs = inputs.to(device)
        labels = labels.to(device)

        # Zero the parameter gradients
        optimizer.zero_grad()

        # Forward + backward + optimize
        batch_outputs = []
        for batch in inputs:
          for images in batch:
            images = torch.stack(images).to(device)  # Stack list of images into a tensor
            outputs = model(images)
            # Aggregate outputs, e.g., by averaging
            # aggregated_output = outputs.mean(dim=0)
            # batch_outputs.append(aggregated_output)
            loss = criterion(outputs, labels)
            loss.backward()


        # batch_outputs = torch.stack(batch_outputs)

        # loss = criterion(batch_outputs, labels)
        # loss.backward()
        optimizer.step()

        # Update the training loss
        train_loss += loss.item() * inputs.size(0)

    # Evaluate the model on the test set
    model.eval()
    test_loss = 0.0
    test_acc = 0.0
    with torch.no_grad():
        for inputs_list, labels in test_loader:
            labels = labels.to(device)
            batch_outputs = []
            for images in inputs_list:
                images = torch.stack(images).to(device)
                outputs = model(images)
                aggregated_output = outputs.mean(dim=0)
                batch_outputs.append(aggregated_output)

            batch_outputs = torch.stack(batch_outputs)
            loss = criterion(batch_outputs, labels)
            test_loss += loss.item() * len(labels)
            _, preds = torch.max(batch_outputs, 1)
            test_acc += torch.sum(preds == labels.data)

    train_loss /= len(train_data)
    test_loss /= len(test_data)
    test_acc = test_acc.double() / len(test_data)
    print(f"Epoch [{epoch + 1}/{num_epochs}] Train Loss: {train_loss:.4f} Test Loss: {test_loss:.4f} Test Acc: {test_acc:.4f}")