Below code comes from https://github.com/jnyborg/timematch

Trial Output

In [1]:
import torch
import numpy as np
import torch.nn as nn
import copy
# from torch.nn import TransformerEncoder, TransformerEncoderLayer
import math
import json
from torchvision.transforms import transforms
from utils import RandomSamplePixels, Normalize, ToTensor, PixelSetData,\
    split_dict_train_test, pad_sequences_collate_fn
from collections import Counter
from torch.utils.data import DataLoader

In [2]:
class LinearLayer(nn.Module):
    def __init__(self, in_dim, out_dim):
        super().__init__()
        self.linear = nn.Linear(in_dim, out_dim, bias=False)
        self.norm = nn.BatchNorm1d(out_dim)
        self.activation = nn.ReLU()

    def forward(self, x):

        # x = x.permute(0, 2, 1)

        # x = (B, C) or (B, S, C)
        x = self.linear(x)  # linear expect channels last
        if x.dim() == 3:  
            # BatchNorm1d expects channels first, move to (B, C, S)
            x = self.norm(x.transpose(1, 2)).transpose(1, 2)
        else:  # (B, C)
            x = self.norm(x)
        return self.activation(x)

# pixels_tmp = pixel_dataset[1901]['pixels'].permute(0, 2, 1)
pixels_tmp = torch.randn(52, 10, 32).permute(0, 2, 1)
print("input", pixels_tmp.shape)

ll = LinearLayer(10, 64)

print(ll(pixels_tmp).shape)

input torch.Size([52, 32, 10])
torch.Size([52, 32, 64])


Δοκιμή του LinearLayer με τυχαία διανύσματα όμως χωρίς μάσκες

In [3]:
# mlp1_dim=[10, 32, 64]
mlp1_dim=[10, 32]
mlp2_dim=[64, 128]

layers = []
for i in range(len(mlp1_dim) - 1):
    layers.append(LinearLayer(mlp1_dim[i], mlp1_dim[i + 1]))
mlp1 = nn.Sequential(*layers)
print(mlp1)

layers = []
for i in range(len(mlp2_dim) - 1):
    layers.append(LinearLayer(mlp2_dim[i], mlp2_dim[i + 1]))
mlp2 = nn.Sequential(*layers)
print(mlp2)

# out = pixel_dataset[1901]['pixels'].unsqueeze(0)
out = torch.randn(52, 10, 32).unsqueeze(0)
print(out.shape) # torch.Size([1, 52, 10, 32])

batch, temp = out.shape[:2]
# print(batch, temp) # 1 52

out = out.view(batch * temp, *out.shape[2:]).transpose(1, 2)  # (B*T, S, C)
print(out.shape) # torch.Size([52, 32, 10])

out = mlp1(out).transpose(1, 2)
print(out.shape) # torch.Size([52, 32, 32])

out = torch.cat(
            [out.mean(dim=-1), out.std(dim=-1)],
            dim=1)
print(out.shape) # torch.Size([52, 64])

out = mlp2(out)
print(out.shape) # torch.Size([52, 128])

out = out.view(batch, temp, -1)
print(out.shape) # torch.Size([1, 52, 128])

Sequential(
  (0): LinearLayer(
    (linear): Linear(in_features=10, out_features=32, bias=False)
    (norm): BatchNorm1d(32, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (activation): ReLU()
  )
)
Sequential(
  (0): LinearLayer(
    (linear): Linear(in_features=64, out_features=128, bias=False)
    (norm): BatchNorm1d(128, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (activation): ReLU()
  )
)
torch.Size([1, 52, 10, 32])
torch.Size([52, 32, 10])
torch.Size([52, 32, 32])
torch.Size([52, 64])
torch.Size([52, 128])
torch.Size([1, 52, 128])


In [4]:
def masked_mean(x, mask):
    print("input of masked_mean unique", x.unique())
    print("input of masked_mean shape", x.shape)
    print("mask of masked_mean shape", mask.shape)
    print("mask of masked_mean uniques", mask.unique())

    # input of masked_mean shape torch.Size([416, 32, 32])
    # mask of masked_mean shape torch.Size([416, 32])
    
    out = x.permute((1, 0, 2)) # torch.Size([32, 416, 32])
    
    out = out * mask # torch.Size([32, 416, 32])
    # here every pixel gets to be zero
    print("out = out * mask shape", out.shape)
    print("uniques of out = out * mask", out.unique())
    
    out = out.sum(dim=-1) / mask.sum(dim=-1) # torch.Size([32, 416])
    print("out = out.sum(dim=-1) / mask.sum(dim=-1) shape", out.shape)
    print("uniques here", out.unique())
    
    out = out.permute((1, 0))
    print("out of masked_mean unique", out.unique())
    print()
    return out

def masked_std(x, mask):
    print("input of masked_std unique", x.unique())
    m = masked_mean(x, mask)

    out = x.permute((2, 0, 1))
    out = out - m
    out = out.permute((2, 1, 0))

    out = out * mask
    d = mask.sum(dim=-1)
    d[d == 1] = 2

    out = (out ** 2).sum(dim=-1) / (d - 1)
    out = torch.sqrt(out + 10e-32) # To ensure differentiability
    out = out.permute(1, 0)
    print("out of masked_std unique", out.unique())
    print()
    return out

pooling_methods = {
    "mean": masked_mean,
    "std": masked_std,
}

Οι extra χωρικές πληροφορίες δε χρειάζονται

In [5]:
class PixelSetEncoder(nn.Module):
    def __init__(
        self,
        input_dim,
        mlp1=[10, 32, 64],
        pooling="mean_std",
        mlp2=[64, 128],
    ):
        """
        Pixel-set encoder.
        Args:
            input_dim (int): Number of channels of the input tensors
            mlp1 (list):  Dimensions of the successive feature spaces of MLP1
            pooling (str): Pixel-embedding pooling strategy, can be chosen in ('mean','std','max,'min')
                or any underscore-separated combination thereof.
            mlp2 (list): Dimensions of the successive feature spaces of MLP2
            with_extra (bool): Whether additional pre-computed features are passed between the two MLPs
            extra_size (int, optional): Number of channels of the additional features, if any.
        """

        super(PixelSetEncoder, self).__init__()

        self.input_dim = input_dim
        self.mlp1_dim = copy.deepcopy(mlp1)
        self.mlp2_dim = copy.deepcopy(mlp2)
        self.pooling = pooling

        self.output_dim = (
            input_dim * len(pooling.split("_"))
            if len(self.mlp2_dim) == 0
            else self.mlp2_dim[-1]
        )

        # inter_dim = self.mlp1_dim[-1] * len(pooling.split("_"))
        # if self.with_extra:
        #     inter_dim += self.extra_size
        # assert input_dim == mlp1[0]
        # assert inter_dim == mlp2[0]

        # Feature extraction
        layers = []
        for i in range(len(self.mlp1_dim) - 1):
            layers.append(LinearLayer(self.mlp1_dim[i], self.mlp1_dim[i + 1]))
        self.mlp1 = nn.Sequential(*layers)

        # MLP after pooling
        layers = []
        for i in range(len(self.mlp2_dim) - 1):
            layers.append(LinearLayer(self.mlp2_dim[i], self.mlp2_dim[i + 1]))
        self.mlp2 = nn.Sequential(*layers)

    def forward(self, pixels,
                mask
                ):
        """
        The input of the PSE is a tuple of tensors as yielded by the PixelSetData class:
          (Pixel-Set, Pixel-Mask) or ((Pixel-Set, Pixel-Mask), Extra-features)
        Pixel-Set : Batch_size x (Sequence length) x Channel x Number of pixels
        Pixel-Mask : Batch_size x (Sequence length) x Number of pixels
        Extra-features : Batch_size x (Sequence length) x Number of features

        If the input tensors have a temporal dimension, it will be combined with the batch dimension so that the
        complete sequences are processed at once. Then the temporal dimension is separated back to produce a tensor of
        shape Batch_size x Sequence length x Embedding dimension
        """
        out = pixels

        batch, temp = out.shape[:2]

        out = out.view(batch * temp, *out.shape[2:]).transpose(1, 2)  # (B*T, S, C)
        
        mask = mask.view(batch * temp, -1)

        out = self.mlp1(out).transpose(1, 2)

        print("in pse out mask uniques", mask.unique())
        print("in pse out mask uniques type", type(mask.unique()))
        
        # if not torch.equal(mask.unique(), torch.tensor([False])):
        out = torch.cat(
            [pooling_methods[n](out, mask) for n in self.pooling.split("_")], dim=1
        )
        # else:
        #     out = torch.cat(
        #         [out.mean(dim=-1), out.std(dim=-1)],
        #         dim=1
        #     )

        out = self.mlp2(out)
        out = out.view(batch, temp, -1)
        return out

Είσοδος και έξοδος PSE πριν προστεθεί η μάσκα

In [6]:
# # input = pixel_dataset[1911]['pixels'].unsqueeze(0)
# input = torch.randn(52, 10, 32).unsqueeze(0)
# print(input.shape)
# out = pixel_set_encoder(input)
# print(out.shape) # torch.Size([1, 52, 128])

Δημιουργία Dataset και Dataloaders για τη δοκιμή του PSE

In [7]:
# labels_200 will contain the labels with more than 200 occurrences
f_labels = open(r"Exercise4\timematch_data\denmark\32VNH\2017\meta\labels_cleaned.json")
labels_200 = json.load(f_labels)

print(len(labels_200))
print(len(set(labels_200.values())))

count = 0
for lab in labels_200:
    print(lab, labels_200[lab])
    count += 1
    if count == 7:
        break

labels_200_counter = Counter(labels_200.values())

print()

for lab in labels_200_counter:
    print(lab, labels_200_counter[lab])

4255
7
0 corn
1 corn
2 corn
3 corn
4 corn
5 spring_barley
6 corn

corn 275
spring_barley 1141
meadow 1013
winter_wheat 856
winter_rapeseed 301
winter_barley 352
winter_rye 317


Οι παρακάτω αντιστοιχίσεις είναι διαφορετικές από του dataset_creation αλλά δεν πειράζει

In [8]:
class_to_idx = {cls: idx for idx, cls in enumerate(labels_200_counter)}
for key, val in class_to_idx.items():
    print(key, val)

print()

train_labels, val_labels = split_dict_train_test(labels_200, test_size=0.2)

print(len(labels_200))
print(len(train_labels))
print(len(val_labels))

corn 0
spring_barley 1
meadow 2
winter_wheat 3
winter_rapeseed 4
winter_barley 5
winter_rye 6

4255
3400
855


In [9]:
train_transform = transforms.Compose([RandomSamplePixels(32), Normalize(), ToTensor()])

test_transform = transforms.Compose([Normalize(), ToTensor()])

train_dataset = PixelSetData("Exercise4/timematch_data/denmark/32VNH/2017",
                             class_to_idx,
                             train_labels,
                             train_transform)
print(len(train_dataset))
print()
val_dataset = PixelSetData("Exercise4/timematch_data/denmark/32VNH/2017",
                             class_to_idx,
                             val_labels,
                             test_transform)
print(len(val_dataset))

('Exercise4/timematch_data/denmark/32VNH/2017\\data\\0.zarr', '0', 'corn')
('Exercise4/timematch_data/denmark/32VNH/2017\\data\\1.zarr', '1', 'corn')
('Exercise4/timematch_data/denmark/32VNH/2017\\data\\10.zarr', '10', 'corn')
('Exercise4/timematch_data/denmark/32VNH/2017\\data\\100.zarr', '100', 'winter_rapeseed')
('Exercise4/timematch_data/denmark/32VNH/2017\\data\\1000.zarr', '1000', 'spring_barley')
('Exercise4/timematch_data/denmark/32VNH/2017\\data\\1001.zarr', '1001', 'winter_rye')
3400

('Exercise4/timematch_data/denmark/32VNH/2017\\data\\1003.zarr', '1003', 'winter_barley')
('Exercise4/timematch_data/denmark/32VNH/2017\\data\\1009.zarr', '1009', 'winter_barley')
('Exercise4/timematch_data/denmark/32VNH/2017\\data\\1014.zarr', '1014', 'meadow')
('Exercise4/timematch_data/denmark/32VNH/2017\\data\\1017.zarr', '1017', 'meadow')
('Exercise4/timematch_data/denmark/32VNH/2017\\data\\1019.zarr', '1019', 'meadow')
('Exercise4/timematch_data/denmark/32VNH/2017\\data\\1020.zarr', '1020'

In [10]:
train_dloader = DataLoader(
    train_dataset,
    batch_size=8,
    shuffle=True,
    collate_fn=pad_sequences_collate_fn,
    # num_workers=4,
)
val_dloader = DataLoader(
    val_dataset,
    batch_size=8,
    shuffle=True,
    collate_fn=pad_sequences_collate_fn,
)

print(len(train_dloader))
print(len(val_dloader))

for batch in train_dloader:
    print(batch[0].shape)
    print(batch[1].shape)
    print(batch[1])
    print(batch[2].shape)
    print(np.unique(batch[2]))
    print()
    break

for batch in val_dloader:
    print(batch[0].shape)
    print(batch[1].shape)
    print(batch[1])
    print(batch[2].shape)
    print(np.unique(batch[2]))
    print()
    break

425
107
torch.Size([8, 52, 10, 32])
torch.Size([8])
tensor([2, 1, 5, 1, 1, 5, 2, 3])
torch.Size([8, 52, 32])
[False]

torch.Size([8, 52, 10, 2264])
torch.Size([8])
tensor([1, 1, 1, 3, 4, 6, 4, 2])
torch.Size([8, 52, 2264])
[False  True]



In [11]:
pixel_set_encoder = PixelSetEncoder(10,
                                    mlp1=[10, 32],
                                    pooling="mean_std",
                                    mlp2=[64, 128])
# print(pixel_set_encoder)
print()

for batch in train_dloader:
    input_tensor = batch[0]
    labels = batch[1]
    mask = batch[2]

    print(input_tensor.shape)
    print(labels.shape)
    print(labels)
    print(mask.shape)
    print()

    break

out = pixel_set_encoder(input_tensor, mask)

print(out.shape)


torch.Size([8, 52, 10, 32])
torch.Size([8])
tensor([6, 1, 2, 4, 2, 6, 3, 2])
torch.Size([8, 52, 32])

in pse out mask uniques tensor([False])
in pse out mask uniques type <class 'torch.Tensor'>
input of masked_mean unique tensor([0.0000e+00, 2.3799e-06, 3.4309e-06,  ..., 5.2826e+00, 5.2985e+00,
        5.3041e+00], grad_fn=<Unique2Backward0>)
input of masked_mean shape torch.Size([416, 32, 32])
mask of masked_mean shape torch.Size([416, 32])
mask of masked_mean uniques tensor([False])
out = out * mask shape torch.Size([32, 416, 32])
uniques of out = out * mask tensor([0.], grad_fn=<Unique2Backward0>)
out = out.sum(dim=-1) / mask.sum(dim=-1) shape torch.Size([32, 416])
uniques here tensor([nan, nan, nan,  ..., nan, nan, nan], grad_fn=<Unique2Backward0>)
out of masked_mean unique tensor([nan, nan, nan,  ..., nan, nan, nan], grad_fn=<Unique2Backward0>)

input of masked_std unique tensor([0.0000e+00, 2.3799e-06, 3.4309e-06,  ..., 5.2826e+00, 5.2985e+00,
        5.3041e+00], grad_fn=<Uniqu

In [12]:
print(np.unique(out.detach().numpy()))
print(out)

[nan]
tensor([[[nan, nan, nan,  ..., nan, nan, nan],
         [nan, nan, nan,  ..., nan, nan, nan],
         [nan, nan, nan,  ..., nan, nan, nan],
         ...,
         [nan, nan, nan,  ..., nan, nan, nan],
         [nan, nan, nan,  ..., nan, nan, nan],
         [nan, nan, nan,  ..., nan, nan, nan]],

        [[nan, nan, nan,  ..., nan, nan, nan],
         [nan, nan, nan,  ..., nan, nan, nan],
         [nan, nan, nan,  ..., nan, nan, nan],
         ...,
         [nan, nan, nan,  ..., nan, nan, nan],
         [nan, nan, nan,  ..., nan, nan, nan],
         [nan, nan, nan,  ..., nan, nan, nan]],

        [[nan, nan, nan,  ..., nan, nan, nan],
         [nan, nan, nan,  ..., nan, nan, nan],
         [nan, nan, nan,  ..., nan, nan, nan],
         ...,
         [nan, nan, nan,  ..., nan, nan, nan],
         [nan, nan, nan,  ..., nan, nan, nan],
         [nan, nan, nan,  ..., nan, nan, nan]],

        ...,

        [[nan, nan, nan,  ..., nan, nan, nan],
         [nan, nan, nan,  ..., nan, nan

Without collate_fn=pad_sequences_collate_fn

In [13]:
train_dloader = DataLoader(
    train_dataset,
    batch_size=8,
    shuffle=True,
    # collate_fn=pad_sequences_collate_fn,
    # num_workers=4,
)

print(len(train_dloader))

for batch in train_dloader:
    input_tensor = batch["pixels"]
    labels = batch["label_idx"]
    mask = batch["valid_pixels"]

    print(input_tensor.shape)
    print(labels.shape)
    print(labels)
    print(mask.shape)
    print()

    break

out = pixel_set_encoder(input_tensor, mask)

print(out.shape)

425


torch.Size([8, 52, 10, 32])
torch.Size([8])
tensor([5, 1, 1, 1, 3, 1, 2, 5])
torch.Size([8, 52, 32])

in pse out mask uniques tensor([1.])
in pse out mask uniques type <class 'torch.Tensor'>
input of masked_mean unique tensor([0.0000e+00, 4.8964e-06, 2.7331e-05,  ..., 5.0042e+00, 5.0644e+00,
        5.4251e+00], grad_fn=<Unique2Backward0>)
input of masked_mean shape torch.Size([416, 32, 32])
mask of masked_mean shape torch.Size([416, 32])
mask of masked_mean uniques tensor([1.])
out = out * mask shape torch.Size([32, 416, 32])
uniques of out = out * mask tensor([0.0000e+00, 4.8964e-06, 2.7331e-05,  ..., 5.0042e+00, 5.0644e+00,
        5.4251e+00], grad_fn=<Unique2Backward0>)
out = out.sum(dim=-1) / mask.sum(dim=-1) shape torch.Size([32, 416])
uniques here tensor([0.0000e+00, 3.1566e-06, 2.2610e-05,  ..., 3.6695e+00, 3.6740e+00,
        3.9598e+00], grad_fn=<Unique2Backward0>)
out of masked_mean unique tensor([0.0000e+00, 3.1566e-06, 2.2610e-05,  ..., 3.6695e+00, 3.6740e+00,
        3.9

In [14]:
print(np.unique(out.detach().numpy()))
print(out)

[0.0000000e+00 5.2034855e-05 7.1287155e-05 ... 5.2809896e+00 5.4021277e+00
 5.7293777e+00]
tensor([[[0.0000, 0.0000, 0.0000,  ..., 1.2681, 0.4686, 0.0000],
         [0.0000, 0.0000, 0.0000,  ..., 0.5516, 0.7058, 0.0000],
         [2.9054, 0.5577, 3.4015,  ..., 0.0000, 0.0000, 3.1386],
         ...,
         [1.3263, 0.6960, 2.2402,  ..., 0.0000, 0.0000, 2.0096],
         [0.4801, 0.7719, 0.7552,  ..., 0.0000, 0.2659, 0.9189],
         [0.0000, 0.1298, 0.0000,  ..., 1.2440, 0.3603, 0.0000]],

        [[0.0000, 0.0000, 0.0000,  ..., 1.1416, 0.7593, 0.0000],
         [1.4487, 0.0000, 1.1900,  ..., 0.0000, 0.0000, 1.2956],
         [0.0000, 0.0000, 0.0000,  ..., 0.5114, 0.5966, 0.0000],
         ...,
         [0.0000, 0.0000, 0.0000,  ..., 1.1089, 0.6143, 0.0000],
         [0.4578, 0.4223, 0.6176,  ..., 0.0000, 0.0000, 0.7925],
         [0.0000, 0.0000, 0.0000,  ..., 0.6478, 0.9148, 0.0000]],

        [[0.0000, 0.0000, 0.0000,  ..., 1.2068, 0.5327, 0.0000],
         [0.0000, 0.0000, 0.5610

In [15]:
# val_dloader = DataLoader(
#     val_dataset,
#     batch_size=8,
#     shuffle=True,
#     # collate_fn=pad_sequences_collate_fn,
#     # num_workers=4,
# )

# print(len(val_dloader))

# for batch in val_dloader:
#     input_tensor = batch["pixels"]
#     labels = batch["label_idx"]
#     mask = batch["valid_pixels"]

#     print(input_tensor.shape)
#     print(labels.shape)
#     print(labels)
#     print(mask.shape)
#     print()

#     break

# out = pixel_set_encoder(input_tensor, mask)

# print(out.shape)

In [16]:
# aaa

## Transformer Part

In [17]:
# can also check Tsironis

class PositionalEncoding(nn.Module):
    def __init__(self,
                 d_model: int,
                #  dropout: float = 0.1,
                #  max_len: int = 5000
                 max_len: int = 52
                 ):
        super(PositionalEncoding, self).__init__()
        
        pe = torch.zeros(max_len, d_model)
        position = torch.arange(0, max_len, dtype=torch.float).unsqueeze(1)
        div_term = torch.exp(
            torch.arange(0, d_model, 2).float() * (-math.log(10000.0) / d_model)
            )
        
        pe[:, 0::2] = torch.sin(position * div_term)
        pe[:, 1::2] = torch.cos(position * div_term)
        
        pe = pe.unsqueeze(0).transpose(0, 1)
        self.register_buffer('pe', pe)
    
    def forward(self, x):
        x = x + self.pe[:x.size(0), :]
        return x


pe = PositionalEncoding(128)

input = torch.randn(1, 52, 128)
out = pe(input)

print(out.shape)

torch.Size([1, 52, 128])


In [18]:
# class TimeSeriesTransformer(nn.Module):
#     def __init__(self,
#                  feature_size,
#                  num_classes: int = 7,
#                  num_layers=3, num_heads=8,
#                 #  hidden_dim=512, dropout=0.1
#                  ):
#         super(TimeSeriesTransformer, self).__init__()
        
#         self.pos_encoder = PositionalEncoding(feature_size)
#         self.encoder_layers = TransformerEncoderLayer(
#             d_model=feature_size, nhead=num_heads,
#             # dim_feedforward=hidden_dim,
#             # dropout=dropout,
#             batch_first=True
#             )
#         self.transformer_encoder = TransformerEncoder(self.encoder_layers, num_layers)
        
#         self.classification_token = nn.Parameter(torch.zeros(1, 1, feature_size),
#                                                  requires_grad=True
#                                                  )
#         self.fc = nn.Linear(feature_size, num_classes)  # Adjust the output dimension as needed
        
#     def forward(self,
#                 x: torch.Tensor,
#                 ) -> torch.Tensor:
#         """
#         Arguments:
#             x: Tensor, shape ``[batch_size, seq_len, embedding_dim]``
#         """
#         batch_size, seq_len, feature_size = x.size()
        
#         x = self.pos_encoder(x)  # BS x T x d_model

#         # Add classification token
#         cls_tokens = self.classification_token.expand(batch_size, -1, -1)

#         x = torch.cat((cls_tokens, x), dim=1)  # [batch_size, seq_len + 1, feature_size]
        
#         # # Add positional encoding and permute for Transformer [seq_len + 1, batch_size, feature_size]
#         # x = self.pos_encoder(x.permute(1, 0, 2))

#         x = self.transformer_encoder(x)  # Pass through the transformer encoder
#         # x = x[0, :, :]  # Extract the classification token output

#         # from Tsironis
#         # x = x[:, 0, :]  # BS x d_model
        
#         x = self.fc(x)  # Final classification layer
#         # return x.squeeze()  # Adjust based on the output needs
#         return x

class TimeSeriesTransformer(nn.Module):
    def __init__(self,
                #  num_classes: int,
                 d_model: int = 128):
        super().__init__()

        self.encoder = nn.TransformerEncoder(
            encoder_layer=nn.TransformerEncoderLayer(
                d_model=d_model, nhead=8, batch_first=True
            ),
            num_layers=3,
        )

        self.cls_tkn = nn.Parameter(torch.rand(1, 1, d_model), requires_grad=True)

        self.pos_emb = PositionalEncoding(d_model)

        # self.classifier = nn.Sequential(
        #     nn.Linear(d_model, 64),
        #     nn.ReLU(),
        #     nn.Linear(64, num_classes),
        # )

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """
        Arguments:
            x: Tensor, shape [batch_size, seq_len, embedding_dim]
        """
        # BS x T x 512
        x = self.pos_emb(x)  # BS x T x d_model

        cls_tkn = self.cls_tkn.expand(x.shape[0], -1, -1)  # BS x 1 x d_model
        x = torch.cat([cls_tkn, x], dim=1)  # BS x (T+1) x d_model

        x = self.encoder(x)  # BS x (T+1) x d_model

        x = x[:, 0, :]  # BS x d_model
        
        # return self.classifier(x)
        return x


# time_series_transformer = TimeSeriesTransformer(feature_size=128)
time_series_transformer = TimeSeriesTransformer(
    # num_classes=7,
    d_model=128)
# print(time_series_transformer)

input = torch.randn(1, 52, 128)
out = time_series_transformer(input)
print("Output shape: ", out.shape)

Output shape:  torch.Size([1, 128])


In [19]:
class SimpleMLP(nn.Module):
    def __init__(self,
                 input_dim,
                 output_dim: int = 7):
        super(SimpleMLP, self).__init__()
        self.fc = nn.Sequential(
            nn.Linear(input_dim, 64),
            nn.ReLU(),
            nn.Linear(64, output_dim),
        )
    
    def forward(self, x):
        x = self.fc(x)
        return x

simple_mlp = SimpleMLP(input_dim=128, output_dim=7)
# print(simple_mlp)

# input = torch.randn(1, 128)
input = out
out = simple_mlp(input)

print(out.shape)

print(out)

torch.Size([1, 7])
tensor([[-0.2005,  0.1235, -0.0537, -0.1899, -0.0502,  0.0364,  0.1819]],
       grad_fn=<AddmmBackward0>)


In [20]:
# class CompleteModel(nn.Module):
#     def __init__(self, pixel_encoder, transformer_encoder):
#         super(CompleteModel, self).__init__()
#         self.pixel_encoder = pixel_encoder
#         self.transformer_encoder = transformer_encoder
    
#     def forward(self, x):
#         x = self.pixel_encoder(x)  # Shape: [batch_size, seq_len, feature_size]
#         x = self.transformer_encoder(x)  # Shape: [batch_size]
#         return x

# input_tensor = torch.randn(1, 52, 10, 32)  # Example input tensor

# # pixel_encoder = PixelSetEncoder(input_dim=32, hidden_dim=64, output_dim=128)
pixel_encoder = PixelSetEncoder(10,
                                mlp1=[10, 32],
                                pooling="mean_std",
                                mlp2=[64, 128])
# transformer_encoder = TimeSeriesTransformer(feature_size=128)
transformer_encoder = TimeSeriesTransformer(d_model=128)

simple_mlp = SimpleMLP(input_dim=128, output_dim=7)

# model = CompleteModel(pixel_encoder, transformer_encoder)

# output = model(input_tensor)
# print(output.shape)  # Adjust based on the final classification layer output dimension

# print(output)

In [22]:
train_dloader = DataLoader(
    train_dataset,
    batch_size=8,
    shuffle=True,
    collate_fn=pad_sequences_collate_fn,
    # num_workers=4,
)

for batch in train_dloader:
    input_tensor = batch[0]
    labels = batch[1]
    mask = batch[2]

    print(input_tensor.shape)
    print(labels.shape)
    print(labels)
    print(mask.shape)
    print()

    break

out = pixel_encoder(input_tensor, mask)

print(out.shape)

out = transformer_encoder(out)

print(out.shape)

# print(out)

out = simple_mlp(out)

print(out.shape)

print(out)

torch.Size([8, 52, 10, 32])
torch.Size([8])
tensor([3, 1, 3, 2, 1, 1, 3, 3])
torch.Size([8, 52, 32])

in pse out mask uniques tensor([False])
in pse out mask uniques type <class 'torch.Tensor'>
input of masked_mean unique tensor([0.0000e+00, 1.1681e-06, 7.1629e-06,  ..., 5.2140e+00, 5.2468e+00,
        5.3344e+00], grad_fn=<Unique2Backward0>)
input of masked_mean shape torch.Size([416, 32, 32])
mask of masked_mean shape torch.Size([416, 32])
mask of masked_mean uniques tensor([False])
out = out * mask shape torch.Size([32, 416, 32])
uniques of out = out * mask tensor([0.], grad_fn=<Unique2Backward0>)
out = out.sum(dim=-1) / mask.sum(dim=-1) shape torch.Size([32, 416])
uniques here tensor([nan, nan, nan,  ..., nan, nan, nan], grad_fn=<Unique2Backward0>)
out of masked_mean unique tensor([nan, nan, nan,  ..., nan, nan, nan], grad_fn=<Unique2Backward0>)

input of masked_std unique tensor([0.0000e+00, 1.1681e-06, 7.1629e-06,  ..., 5.2140e+00, 5.2468e+00,
        5.3344e+00], grad_fn=<Unique

In [23]:
for batch in val_dloader:
    input_tensor = batch[0]
    labels = batch[1]
    mask = batch[2]

    print(input_tensor.shape)
    print(labels.shape)
    print(labels)
    print(mask.shape)
    print()

    break

out = pixel_encoder(input_tensor, mask)

print(out.shape)

out = transformer_encoder(out)

print(out.shape)

# print(out)

out = simple_mlp(out)

print(out.shape)

print(out)

torch.Size([8, 52, 10, 2830])
torch.Size([8])
tensor([6, 4, 4, 6, 3, 2, 2, 3])
torch.Size([8, 52, 2830])

in pse out mask uniques tensor([False,  True])
in pse out mask uniques type <class 'torch.Tensor'>
input of masked_mean unique tensor([0.0000e+00, 1.3078e-07, 5.4485e-07,  ..., 1.0073e+01, 1.0092e+01,
        1.0109e+01], grad_fn=<Unique2Backward0>)
input of masked_mean shape torch.Size([416, 32, 2830])
mask of masked_mean shape torch.Size([416, 2830])
mask of masked_mean uniques tensor([False,  True])
out = out * mask shape torch.Size([32, 416, 2830])
uniques of out = out * mask tensor([0.0000, 0.0134, 0.0611, 0.2291, 0.2377, 0.2890, 0.3258, 0.4007, 0.4104,
        0.4353, 0.4621, 0.4664, 0.4667, 0.4691, 0.4722, 0.4752, 0.4775],
       grad_fn=<Unique2Backward0>)
out = out.sum(dim=-1) / mask.sum(dim=-1) shape torch.Size([32, 416])
uniques here tensor([0.0000, 0.0134, 0.0134,  ...,    nan,    nan,    nan],
       grad_fn=<Unique2Backward0>)
out of masked_mean unique tensor([0.0000,