In [1]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.nn import TransformerEncoder, TransformerEncoderLayer
from torch.nn.utils.rnn import pad_sequence
import pickle
from torch.nn.functional import pad, one_hot

from torch.utils.data import Dataset
from torch.utils.data import DataLoader
from matplotlib import pyplot as plt

In [2]:
class MyDataset(Dataset):
    def __init__(self, features, labels):
        self.features = features
        self.labels = labels

    def __len__(self):
        return len(self.features)

    def __getitem__(self, idx):
        # Return both the feature and the corresponding label
        return self.features[idx], self.labels[idx]
    
def create_padding_mask(tensor_input):
    mask = (tensor_input.sum(dim=2) != 0).float()
    return mask
    
train_data_name = "Train_Data_100"

In [3]:
with open(train_data_name, 'rb') as file:
    train_data_set = pickle.load(file)

In [4]:
trun_data = train_data_set["System_EWS"]
trun_data.shape

(68,)

In [5]:
trun_data = train_data_set["System_EWS"]
seq_trun = [torch.from_numpy(run[:,:6]).float() for run in trun_data]

seq_padded_2 = []
max_length = 650
for run in seq_trun:
    pad_amount = max_length - run.shape[0]
    run_padded_2 = pad(run, (0, 0, pad_amount, 0))
    seq_padded_2.append(run_padded_2)

tensor_input_2 = torch.stack(seq_padded_2)
tensor_input_2.shape

torch.Size([68, 650, 6])

In [6]:
labels = [torch.tensor(label) for label in train_data_set["null"]]
labels_t = torch.stack(labels)
labels_oh = one_hot(labels_t).float()

seq = [torch.from_numpy(run).float() for run in train_data_set["System_EWS"]]
seq_padded = []
max_length = 650
for run in seq:
    pad_amount = max_length - run.shape[0]
    run_padded = pad(run, (0, 0, pad_amount, 0))
    seq_padded.append(run_padded)

tensor_input = torch.stack(seq_padded)



In [7]:
# Create dataset and dataloader
dataset = MyDataset(tensor_input_2, labels_oh)

batch_size = 17
dataloader = DataLoader(dataset, batch_size=batch_size, shuffle=True)

# Example: iterate over batches of (features, labels)
for batch_data, batch_labels in dataloader:
    print("Batch Features:", batch_data.shape, batch_data.dtype)
    print("Batch Labels:", batch_labels.shape, batch_labels.dtype)

Batch Features: torch.Size([17, 650, 6]) torch.float32
Batch Labels: torch.Size([17, 2]) torch.float32
Batch Features: torch.Size([17, 650, 6]) torch.float32
Batch Labels: torch.Size([17, 2]) torch.float32
Batch Features: torch.Size([17, 650, 6]) torch.float32
Batch Labels: torch.Size([17, 2]) torch.float32
Batch Features: torch.Size([17, 650, 6]) torch.float32
Batch Labels: torch.Size([17, 2]) torch.float32


In [8]:
# Define the Transformer model for classification with padding and masking
class TransformerClassifier(nn.Module):
    def __init__(self, input_dim, d_model, num_heads, num_layers, num_classes, dropout=0.15 ):

        super(TransformerClassifier, self).__init__()
        self.model_dim = d_model
        self.batch_norm = nn.BatchNorm1d(d_model)
        self.projection = nn.Linear(input_dim, d_model)
        self.pos_encoder = PositionalEncoding(d_model, dropout)
        encoder_layers = TransformerEncoderLayer(d_model=d_model, nhead = num_heads, dropout = dropout, dim_feedforward=d_model,batch_first=True)
        self.transformer_encoder = TransformerEncoder(encoder_layers, num_layers, enable_nested_tensor = True)
        self.classification_head = nn.Linear(d_model, num_classes)
        self.apply(self._init_weights)

    def _init_weights(self, module):
        if isinstance(module, nn.Linear):
            # Xavier Initialization for Linear layers
            nn.init.xavier_uniform_(module.weight)
            if module.bias is not None:
                nn.init.zeros_(module.bias)
        
        elif isinstance(module, nn.TransformerEncoderLayer):
            # Kaiming initialization for Transformer encoder layers
            nn.init.kaiming_uniform_(module.self_attn.in_proj_weight, nonlinearity='relu')
            nn.init.zeros_(module.self_attn.in_proj_bias)
            nn.init.kaiming_uniform_(module.linear1.weight, nonlinearity='relu')
            nn.init.zeros_(module.linear1.bias)
            nn.init.kaiming_uniform_(module.linear2.weight, nonlinearity='relu')
            nn.init.zeros_(module.linear2.bias)

    def forward(self, src): # can add src_key_padding_mask

        src = self.projection(src) #* torch.sqrt(torch.tensor(self.model_dim, dtype=torch.float32))

        # Apply batch normalization
        # src = src.permute(0, 2, 1)  
        # src = self.batch_norm(src)
        # src = src.permute(0, 2, 1)

        src = self.pos_encoder(src)
        #src = src.permute(1,0,2)
        transformer_output = self.transformer_encoder(src,is_causal = False, src_key_padding_mask = create_padding_mask(src) ) # can add src_key_padding_mask
        # transformer_output = transformer_output.permute(1,0,2)
        # Aggregate the output of the transformer (e.g., using mean pooling or just the first token)
        pooled_output = transformer_output.mean(dim=1)
        
        # Pass through classification layer
        output = self.classification_head(pooled_output)
        return output

class PositionalEncoding(nn.Module):
    def __init__(self, d_model, dropout, max_len=650):
        super(PositionalEncoding, self).__init__()
        self.dropout = nn.Dropout(p=dropout)

        pe = torch.zeros(max_len, d_model)
        position = torch.arange(0, max_len, dtype=torch.float).unsqueeze(1)
        div_term = torch.exp(torch.arange(0, d_model, 2).float() * (-torch.log(torch.tensor(10000.0)) / d_model))
        pe[:, 0::2] = torch.sin(position * div_term)
        pe[:, 1::2] = torch.cos(position * div_term)
        pe = pe.unsqueeze(0)
        self.register_buffer('pe', pe)

    def forward(self, x):
        x = x + self.pe[:,:x.size(1)].requires_grad_(False)
        
        return self.dropout(x)



In [9]:
for batch_data, batch_labels in dataloader:
    # Forward pass with masking
    src_key_padding_mask = create_padding_mask(batch_data)
    print(f"mask shape: {src_key_padding_mask.shape}")  # Output will have shape: [batch_size, num_classes]
    print(f"data shape:{batch_data.shape}")
    print(f"label shape:{batch_labels.shape}")


mask shape: torch.Size([17, 650])
data shape:torch.Size([17, 650, 6])
label shape:torch.Size([17, 2])
mask shape: torch.Size([17, 650])
data shape:torch.Size([17, 650, 6])
label shape:torch.Size([17, 2])
mask shape: torch.Size([17, 650])
data shape:torch.Size([17, 650, 6])
label shape:torch.Size([17, 2])
mask shape: torch.Size([17, 650])
data shape:torch.Size([17, 650, 6])
label shape:torch.Size([17, 2])


In [10]:
# Example usage with variable-length sequences:
dim_feedforward = 64
input_dim = 6  # Number of features
d_model = 64  # Transformer model dimension
num_heads = 4   # Number of attention heads
num_layers = 6  # Number of transformer layers
num_classes = 2  # Number of classes for classification
dropout = 0.15   # Dropout rate

model = TransformerClassifier(input_dim, d_model, num_heads, num_layers, num_classes, dropout)
optimizer = optim.Adam(model.parameters(), lr=0.001)
criterion = nn.CrossEntropyLoss()

In [11]:
for batch_data, batch_labels in dataloader:
    # Forward pass with masking
    output = model(batch_data)
    print(f"output shape: {output.shape},output: {output}")  # Output will have shape: [batch_size, num_classes]
    print(f"labels: {batch_labels}")
    # Example of using CrossEntropyLoss for training
    loss = criterion(output, batch_labels)
    print(f"Loss: {loss.item()}")
    optimizer.zero_grad()
    
    loss.backward()
    optimizer.step()
    

output shape: torch.Size([17, 2]),output: tensor([[ 0.2085,  0.0593],
        [ 0.0629, -0.1134],
        [ 0.3149,  0.1496],
        [ 0.2541,  0.1566],
        [ 0.0247, -0.0236],
        [ 0.3004,  0.2610],
        [ 0.2490,  0.2747],
        [ 0.1579,  0.2281],
        [ 0.2304,  0.0931],
        [ 0.3020,  0.3344],
        [ 0.3237,  0.2039],
        [ 0.1554, -0.0019],
        [ 0.2619,  0.1886],
        [ 0.0538, -0.0641],
        [ 0.3109,  0.2577],
        [-0.1859, -0.2020],
        [ 0.0816, -0.1800]], grad_fn=<AddmmBackward0>)
labels: tensor([[0., 1.],
        [0., 1.],
        [1., 0.],
        [0., 1.],
        [0., 1.],
        [0., 1.],
        [0., 1.],
        [0., 1.],
        [1., 0.],
        [0., 1.],
        [1., 0.],
        [1., 0.],
        [0., 1.],
        [0., 1.],
        [0., 1.],
        [0., 1.],
        [0., 1.]])
Loss: 0.7045390605926514
output shape: torch.Size([17, 2]),output: tensor([[-0.2760,  1.9111],
        [-0.3116,  2.1287],
        [-0.3292,