# Generate Modbus TCP Packets Using a Transformer Model

## [Data Link](https://drive.google.com/file/d/1L3QYDOLdWSPRy6nCSM2-Dl_A9mDKkwU-/view?usp=sharing)

### Connect To Drive

In [1]:
# Mount Google Drive at /content/drive (Colab only); later cells read the
# dataset from, and save checkpoints to, paths under this mount point.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


### Install Libraries

In [None]:
!pip install torchmetrics

Collecting torchmetrics
  Downloading torchmetrics-1.4.0.post0-py3-none-any.whl (868 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m868.8/868.8 kB[0m [31m10.2 MB/s[0m eta [36m0:00:00[0m
Collecting lightning-utilities>=0.8.0 (from torchmetrics)
  Downloading lightning_utilities-0.11.2-py3-none-any.whl (26 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.1.105 (from torch>=1.10.0->torchmetrics)
  Using cached nvidia_cuda_nvrtc_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (23.7 MB)
Collecting nvidia-cuda-runtime-cu12==12.1.105 (from torch>=1.10.0->torchmetrics)
  Using cached nvidia_cuda_runtime_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (823 kB)
Collecting nvidia-cuda-cupti-cu12==12.1.105 (from torch>=1.10.0->torchmetrics)
  Using cached nvidia_cuda_cupti_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (14.1 MB)
Collecting nvidia-cudnn-cu12==8.9.2.26 (from torch>=1.10.0->torchmetrics)
  Using cached nvidia_cudnn_cu12-8.9.2.26-py3-none-manylinux1_x86_64.whl (731.7 MB)
Co

### Import Libraries

In [None]:
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
from torch.utils.data import Dataset
import math
import torchtext.datasets as datasets
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader, random_split
from torch.optim.lr_scheduler import LambdaLR

import warnings
from tqdm import tqdm
import os
from pathlib import Path

import torchmetrics
from torch.utils.tensorboard import SummaryWriter



### Read & Explore Data

In [None]:
# Load the packet dataset from the mounted Drive and preview its first rows.
df = pd.read_csv('/content/drive/MyDrive/Datasets/data_final.csv')
df.head()

In [None]:
#df = df[:1000]

In [None]:
# Summarize the columns: dtype and non-null count per column.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 609934 entries, 0 to 609933
Data columns (total 19 columns):
 #   Column  Non-Null Count   Dtype 
---  ------  --------------   ----- 
 0   0       609934 non-null  object
 1   1       609934 non-null  object
 2   2       609934 non-null  object
 3   3       609934 non-null  object
 4   4       609934 non-null  object
 5   5       609934 non-null  object
 6   6       609934 non-null  object
 7   7       609934 non-null  object
 8   8       609934 non-null  object
 9   9       609934 non-null  object
 10  10      609934 non-null  object
 11  11      609934 non-null  object
 12  12      152741 non-null  object
 13  13      152741 non-null  object
 14  14      76203 non-null   object
 15  15      76203 non-null   object
 16  16      76203 non-null   object
 17  17      76203 non-null   object
 18  18      76203 non-null   object
dtypes: object(19)
memory usage: 88.4+ MB


### Define Dataloader

In [None]:
class CustomDataset(Dataset):
    """Turn rows of integer tokens into encoder/decoder training tensors.

    Each row is split at position 8: the first 8 tokens are the encoder
    input (used as-is, without markers or padding) and the remaining tokens
    are the decoder target. Token ids 256, 257 and 258 serve as the
    start-of-sequence, end-of-sequence and padding markers.
    """

    def __init__(self, df, seq_len):
        """Store the indexable row collection and the fixed decoder length."""
        super().__init__()
        self.seq_len = seq_len
        self.df = df
        self.sos_token = torch.tensor([256], dtype=torch.int64)
        self.eos_token = torch.tensor([257], dtype=torch.int64)
        self.pad_token = torch.tensor([258], dtype=torch.int64)

    def __len__(self):
        return len(self.df)

    def __getitem__(self, idx):
        """Build the tensors for row ``idx``.

        Returns a dict with ``encoder_input``, ``decoder_input``,
        ``encoder_mask``, ``decoder_mask`` and ``label``.

        Raises:
            ValueError: if the row does not fit into ``seq_len``.
        """
        row = self.df[idx]
        enc_tokens, dec_tokens = row[:8], row[8:]

        # Room left once the markers are accounted for: two slots are
        # reserved on the encoder side, one on the decoder side.
        enc_pad_count = self.seq_len - len(enc_tokens) - 2
        dec_pad_count = self.seq_len - len(dec_tokens) - 1
        if min(enc_pad_count, dec_pad_count) < 0:
            raise ValueError("Sentence is too long")

        # The encoder input is the raw token prefix.
        encoder_input = torch.tensor(enc_tokens, dtype=torch.int64)

        dec_body = torch.tensor(dec_tokens, dtype=torch.int64)
        padding = self.pad_token.repeat(dec_pad_count)

        # Decoder input: <sos> + tokens + padding.
        decoder_input = torch.cat((self.sos_token, dec_body, padding), dim=0)
        # Label: tokens + <eos> + padding (decoder input shifted by one).
        label = torch.cat((dec_body, self.eos_token, padding), dim=0)

        # Sanity check: both decoder-side tensors are exactly seq_len long.
        assert decoder_input.size(0) == self.seq_len
        assert label.size(0) == self.seq_len

        encoder_mask = (encoder_input != self.pad_token)[None, None, :].int()  # (1, 1, enc_len)
        decoder_pad_mask = (decoder_input != self.pad_token)[None, :].int()  # (1, seq_len)

        return {
            "encoder_input": encoder_input,  # (enc_len)
            "decoder_input": decoder_input,  # (seq_len)
            "encoder_mask": encoder_mask,
            # Padding mask combined with the causal mask: (1, seq_len, seq_len)
            "decoder_mask": decoder_pad_mask & causal_mask(decoder_input.size(0)),
            "label": label,  # (seq_len)
        }

def causal_mask(size):
    """Return a (1, size, size) boolean mask that is True on and below the diagonal.

    Position ``i`` may therefore attend only to positions ``<= i``.
    """
    strictly_upper = torch.ones((1, size, size), dtype=torch.int).triu(diagonal=1)
    return strictly_upper.eq(0)

### Define Transformer model


<img src='https://drive.google.com/uc?id=1N-GpbPpBJd4ZQ-YeK2C-bPLUvkrq4gpr' height=600px width=400px>

In [None]:
class LayerNormalization(nn.Module):
    """Normalize the last dimension, then apply a learnable scale and shift."""

    def __init__(self, features: int, eps:float=10**-6) -> None:
        super().__init__()
        self.eps = eps  # added to the std so the division never hits zero
        self.alpha = nn.Parameter(torch.ones(features))  # learnable scale
        self.bias = nn.Parameter(torch.zeros(features))  # learnable shift

    def forward(self, x):
        # Statistics are taken over the last axis; keepdim lets them
        # broadcast back against x.
        centered = x - x.mean(dim=-1, keepdim=True)
        spread = x.std(dim=-1, keepdim=True) + self.eps
        return self.alpha * centered / spread + self.bias

class FeedForwardBlock(nn.Module):
    """Position-wise feed-forward network: Linear -> ReLU -> Dropout -> Linear."""

    def __init__(self, d_model: int, d_ff: int, dropout: float) -> None:
        super().__init__()
        self.linear_1 = nn.Linear(d_model, d_ff)  # expansion (w1, b1)
        self.dropout = nn.Dropout(dropout)
        self.linear_2 = nn.Linear(d_ff, d_model)  # projection back (w2, b2)

    def forward(self, x):
        # (..., d_model) -> (..., d_ff) -> (..., d_model)
        hidden = torch.relu(self.linear_1(x))
        hidden = self.dropout(hidden)
        return self.linear_2(hidden)

class InputEmbeddings(nn.Module):
    """Token embedding lookup scaled by sqrt(d_model)."""

    def __init__(self, d_model: int, vocab_size: int) -> None:
        super().__init__()
        self.d_model = d_model
        self.vocab_size = vocab_size
        self.embedding = nn.Embedding(vocab_size, d_model)

    def forward(self, x):
        # Token ids -> vectors of width d_model, multiplied by sqrt(d_model).
        scale = math.sqrt(self.d_model)
        vectors = self.embedding(x)
        return vectors * scale

class PositionalEncoding(nn.Module):
    """Add fixed sinusoidal position information to embeddings, then apply dropout.

    The table is computed once for ``seq_len`` positions and stored as the
    non-trainable buffer ``pe`` of shape (1, seq_len, d_model):

        pe[pos, 2i]     = sin(pos / 10000 ** (2i / d_model))
        pe[pos, 2i + 1] = cos(pos / 10000 ** (2i / d_model))

    Both even and odd ``d_model`` are supported.
    """

    def __init__(self, d_model: int, seq_len: int, dropout: float) -> None:
        super().__init__()
        self.d_model = d_model
        self.seq_len = seq_len
        self.dropout = nn.Dropout(dropout)
        # Table of shape (seq_len, d_model)
        pe = torch.zeros(seq_len, d_model)
        # Column vector of positions: (seq_len, 1)
        position = torch.arange(0, seq_len, dtype=torch.float).unsqueeze(1)
        # One inverse frequency per even column: ceil(d_model / 2) entries,
        # computed in log space as 1 / 10000 ** (2i / d_model).
        div_term = torch.exp(torch.arange(0, d_model, 2).float() * (-math.log(10000.0) / d_model))
        # Sine on the even columns
        pe[:, 0::2] = torch.sin(position * div_term)
        # Cosine on the odd columns. For odd d_model there is one fewer odd
        # column than even ones, so the surplus frequency is dropped.
        pe[:, 1::2] = torch.cos(position * div_term[: d_model // 2])
        # Leading batch dimension: (1, seq_len, d_model)
        pe = pe.unsqueeze(0)
        # A buffer moves with the module and is saved in the state dict, but
        # is not a trainable parameter.
        self.register_buffer('pe', pe)

    def forward(self, x):
        """Add the first ``x.shape[1]`` position rows to ``x`` (batch, len, d_model)."""
        # Buffers never require grad, so the slice can be added directly.
        x = x + self.pe[:, :x.shape[1], :]
        return self.dropout(x)

class ResidualConnection(nn.Module):
    """Pre-norm residual wrapper: ``x + dropout(sublayer(norm(x)))``."""

    def __init__(self, features: int, dropout: float) -> None:
        super().__init__()
        self.dropout = nn.Dropout(dropout)
        self.norm = LayerNormalization(features)

    def forward(self, x, sublayer):
        # Normalize first, run the wrapped sublayer, then add the skip path.
        normalized = self.norm(x)
        branch = self.dropout(sublayer(normalized))
        return x + branch

class MultiHeadAttentionBlock(nn.Module):
    """Multi-head scaled dot-product attention with bias-free projections."""

    def __init__(self, d_model: int, h: int, dropout: float) -> None:
        super().__init__()
        self.d_model = d_model  # embedding width
        self.h = h  # number of heads
        # Each head gets an equal slice of the embedding.
        assert d_model % h == 0, "d_model is not divisible by h"

        self.d_k = d_model // h  # per-head width
        self.w_q = nn.Linear(d_model, d_model, bias=False)  # query projection
        self.w_k = nn.Linear(d_model, d_model, bias=False)  # key projection
        self.w_v = nn.Linear(d_model, d_model, bias=False)  # value projection
        self.w_o = nn.Linear(d_model, d_model, bias=False)  # output projection
        self.dropout = nn.Dropout(dropout)

    @staticmethod
    def attention(query, key, value, mask, dropout: nn.Dropout):
        """Compute softmax(Q K^T / sqrt(d_k)) V.

        Returns the attended values together with the attention weights
        (after dropout, when a dropout module is given) so they can be
        inspected or visualized.
        """
        head_dim = query.shape[-1]
        # (batch, h, q_len, d_k) x (batch, h, d_k, k_len) -> (batch, h, q_len, k_len)
        scores = torch.matmul(query, key.transpose(-2, -1)) / math.sqrt(head_dim)
        if mask is not None:
            # Positions with mask == 0 get a very large negative score, so
            # softmax gives them (almost) zero weight.
            scores = scores.masked_fill(mask == 0, -1e9)
        weights = torch.softmax(scores, dim=-1)
        if dropout is not None:
            weights = dropout(weights)
        # (batch, h, q_len, k_len) x (batch, h, k_len, d_k) -> (batch, h, q_len, d_k)
        return torch.matmul(weights, value), weights

    def _split_heads(self, projected):
        """(batch, len, d_model) -> (batch, h, len, d_k)."""
        batch, length = projected.shape[0], projected.shape[1]
        return projected.view(batch, length, self.h, self.d_k).transpose(1, 2)

    def forward(self, q, k, v, mask):
        # Project the inputs, then split each projection into heads.
        query = self._split_heads(self.w_q(q))
        key = self._split_heads(self.w_k(k))
        value = self._split_heads(self.w_v(v))

        # The attention weights are kept on the module for later inspection.
        x, self.attention_scores = MultiHeadAttentionBlock.attention(query, key, value, mask, self.dropout)

        # Merge the heads back: (batch, h, len, d_k) -> (batch, len, d_model)
        batch = x.shape[0]
        merged = x.transpose(1, 2).contiguous().view(batch, -1, self.h * self.d_k)

        # Final output projection keeps the (batch, len, d_model) shape.
        return self.w_o(merged)

class EncoderBlock(nn.Module):
    """One encoder layer: self-attention then feed-forward, each in a residual wrapper."""

    def __init__(self, features: int, self_attention_block: MultiHeadAttentionBlock, feed_forward_block: FeedForwardBlock, dropout: float) -> None:
        super().__init__()
        self.self_attention_block = self_attention_block
        self.feed_forward_block = feed_forward_block
        # Index 0 wraps the attention sublayer, index 1 the feed-forward one.
        self.residual_connections = nn.ModuleList([ResidualConnection(features, dropout) for _ in range(2)])

    def forward(self, x, src_mask):
        def self_attend(normed):
            # Query, key and value all come from the same sequence.
            return self.self_attention_block(normed, normed, normed, src_mask)

        attention_residual, feed_forward_residual = self.residual_connections
        x = attention_residual(x, self_attend)
        return feed_forward_residual(x, self.feed_forward_block)

class Encoder(nn.Module):
    """Stack of encoder layers followed by a final layer normalization."""

    def __init__(self, features: int, layers: nn.ModuleList) -> None:
        super().__init__()
        self.layers = layers
        self.norm = LayerNormalization(features)

    def forward(self, x, mask):
        # Feed the output of each layer into the next one, in order.
        hidden = x
        for block in self.layers:
            hidden = block(hidden, mask)
        return self.norm(hidden)

class DecoderBlock(nn.Module):
    """One decoder layer: masked self-attention, cross-attention, feed-forward."""

    def __init__(self, features: int, self_attention_block: MultiHeadAttentionBlock, cross_attention_block: MultiHeadAttentionBlock, feed_forward_block: FeedForwardBlock, dropout: float) -> None:
        super().__init__()
        self.self_attention_block = self_attention_block
        self.cross_attention_block = cross_attention_block
        self.feed_forward_block = feed_forward_block
        # One residual wrapper per sublayer, in the order they are applied.
        self.residual_connections = nn.ModuleList([ResidualConnection(features, dropout) for _ in range(3)])

    def forward(self, x, encoder_output, src_mask, tgt_mask):
        def masked_self_attention(normed):
            return self.self_attention_block(normed, normed, normed, tgt_mask)

        def encoder_decoder_attention(normed):
            # Queries come from the decoder; keys and values from the encoder.
            return self.cross_attention_block(normed, encoder_output, encoder_output, src_mask)

        self_residual, cross_residual, feed_forward_residual = self.residual_connections
        x = self_residual(x, masked_self_attention)
        x = cross_residual(x, encoder_decoder_attention)
        return feed_forward_residual(x, self.feed_forward_block)

class Decoder(nn.Module):
    """Stack of decoder layers followed by a final layer normalization."""

    def __init__(self, features: int, layers: nn.ModuleList) -> None:
        super().__init__()
        self.layers = layers
        self.norm = LayerNormalization(features)

    def forward(self, x, encoder_output, src_mask, tgt_mask):
        # Every layer sees the same encoder output and the same two masks.
        hidden = x
        for block in self.layers:
            hidden = block(hidden, encoder_output, src_mask, tgt_mask)
        return self.norm(hidden)

class ProjectionLayer(nn.Module):
    """Linear map from decoder features to one logit per vocabulary entry."""

    def __init__(self, d_model, vocab_size) -> None:
        super().__init__()
        self.proj = nn.Linear(d_model, vocab_size)

    def forward(self, x) -> torch.Tensor:
        """Return raw logits: (batch, seq_len, d_model) --> (batch, seq_len, vocab_size).

        No softmax is applied here.
        """
        return self.proj(x)

class Transformer(nn.Module):
    """Encoder-decoder model exposing separate encode / decode / project steps.

    The three steps are kept apart so that inference can encode the source
    once and then call ``decode`` and ``project`` repeatedly.
    """

    def __init__(self, encoder: Encoder, decoder: Decoder, src_embed: InputEmbeddings, tgt_embed: InputEmbeddings, src_pos: PositionalEncoding, tgt_pos: PositionalEncoding, projection_layer: ProjectionLayer) -> None:
        super().__init__()
        self.encoder = encoder
        self.decoder = decoder
        self.src_embed = src_embed
        self.tgt_embed = tgt_embed
        self.src_pos = src_pos
        self.tgt_pos = tgt_pos
        self.projection_layer = projection_layer

    def encode(self, src, src_mask):
        """Embed the source tokens, add positions and run the encoder stack."""
        embedded = self.src_pos(self.src_embed(src))  # (batch, src_len, d_model)
        return self.encoder(embedded, src_mask)

    def decode(self, encoder_output: torch.Tensor, src_mask: torch.Tensor, tgt: torch.Tensor, tgt_mask: torch.Tensor):
        """Embed the target tokens, add positions and run the decoder stack."""
        embedded = self.tgt_pos(self.tgt_embed(tgt))  # (batch, tgt_len, d_model)
        return self.decoder(embedded, encoder_output, src_mask, tgt_mask)

    def project(self, x):
        """Map decoder features to vocabulary logits."""
        return self.projection_layer(x)

def build_transformer(src_vocab_size: int, tgt_vocab_size: int, src_seq_len: int, tgt_seq_len: int, d_model: int=512, N: int=6, h: int=8, dropout: float=0.1, d_ff: int=2048) -> Transformer:
    """Assemble a Transformer with ``N`` encoder and ``N`` decoder layers.

    Every parameter with more than one dimension is re-initialized with
    Xavier-uniform before the model is returned.
    """
    # Token embeddings and positional encodings for both sides.
    src_embed = InputEmbeddings(d_model, src_vocab_size)
    tgt_embed = InputEmbeddings(d_model, tgt_vocab_size)
    src_pos = PositionalEncoding(d_model, src_seq_len, dropout)
    tgt_pos = PositionalEncoding(d_model, tgt_seq_len, dropout)

    # N encoder layers: self-attention + feed-forward each.
    encoder_layers = [
        EncoderBlock(
            d_model,
            MultiHeadAttentionBlock(d_model, h, dropout),
            FeedForwardBlock(d_model, d_ff, dropout),
            dropout,
        )
        for _ in range(N)
    ]

    # N decoder layers: self-attention + cross-attention + feed-forward each.
    decoder_layers = [
        DecoderBlock(
            d_model,
            MultiHeadAttentionBlock(d_model, h, dropout),
            MultiHeadAttentionBlock(d_model, h, dropout),
            FeedForwardBlock(d_model, d_ff, dropout),
            dropout,
        )
        for _ in range(N)
    ]

    # Wire the stacks, embeddings and the output projection together.
    transformer = Transformer(
        Encoder(d_model, nn.ModuleList(encoder_layers)),
        Decoder(d_model, nn.ModuleList(decoder_layers)),
        src_embed,
        tgt_embed,
        src_pos,
        tgt_pos,
        ProjectionLayer(d_model, tgt_vocab_size),
    )

    # Xavier-uniform for weight matrices; 1-D parameters keep their
    # existing initialization.
    for weight in filter(lambda p: p.dim() > 1, transformer.parameters()):
        nn.init.xavier_uniform_(weight)

    return transformer

### Define Load Dataset Utilities

In [None]:
def hex2dec(hex_str):
    """Parse an integer literal string such as ``'0x1f'`` into an int.

    Base 0 makes ``int`` infer the base from the literal's prefix, so
    ``0x`` (hex), ``0o`` (octal), ``0b`` (binary) and plain decimal strings
    are all accepted.
    """
    return int(hex_str, base=0)

def compine_df_in_one_row(df):
    """Convert a packet DataFrame into a list of variable-length token rows.

    For every row, strings starting with ``'0x'`` are parsed to integers via
    ``hex2dec``; every other value is kept as-is. Entries equal to ``-1``
    (the filler used for missing trailing bytes) are dropped.

    Rows are read with ``itertuples`` instead of ``DataFrame.applymap`` plus
    per-row ``iloc`` lookups: ``applymap`` is deprecated in recent pandas,
    and positional row lookups are very slow on large frames.

    Args:
        df: DataFrame whose cells are hex strings, numbers, or the ``-1`` filler.

    Returns:
        A list with one list of tokens per DataFrame row.
    """
    def _to_token(value):
        # Only '0x…' strings are converted; everything else passes through.
        if isinstance(value, str) and value.startswith('0x'):
            return hex2dec(value)
        return value

    data = []
    for record in df.itertuples(index=False, name=None):
        tokens = (_to_token(value) for value in record)
        data.append([token for token in tokens if token != -1])
    return data

In [None]:
def get_ds(df, seq_len, batch_size):
    """Build the training and validation DataLoaders from the raw DataFrame.

    Missing cells are filled with -1 and each row is flattened to a token
    list. The rows are then split at random: 99% for training, the
    remainder for validation. Validation uses a batch size of 1.

    Returns:
        ``(train_dataloader, val_dataloader, seq_len, seq_len)``.
    """
    rows = compine_df_in_one_row(df.fillna(-1))
    total = len(rows)

    # 99% of the rows go to training; the rest is held out.
    train_ds_size = int(0.99 * total)
    print(f"train size = {train_ds_size}")
    val_ds_size = total - train_ds_size
    train_rows, val_rows = random_split(rows, [train_ds_size, val_ds_size])

    train_dataloader = DataLoader(CustomDataset(train_rows, seq_len), batch_size=batch_size, shuffle=True)
    val_dataloader = DataLoader(CustomDataset(val_rows, seq_len), batch_size=1, shuffle=True)

    return train_dataloader, val_dataloader, seq_len, seq_len

### Transformer Training Process

In [None]:
def masked_accuracy(prediction, target, mask):
    """Return the fraction of positions with ``mask == 1`` where prediction equals target.

    Args:
        prediction: tensor of predicted token ids.
        target: tensor of expected token ids.
        mask: tensor that is 1/True at the positions to score.

    Returns:
        The accuracy over the selected positions; ``0`` when the prediction
        and target shapes differ, and ``0.0`` when the mask selects no
        position (instead of dividing by zero).
    """
    # Shapes that do not line up cannot be compared element-wise.
    if prediction.shape != target.shape:
        return 0

    valid = mask == 1
    total_valid_elements = valid.sum().item()
    if total_valid_elements == 0:
        # Nothing to score, e.g. a sequence made only of padding.
        return 0.0

    # Compare on the CPU so tensors living on different devices still work.
    correct_prediction = (prediction[valid].cpu() == target[valid].cpu()).sum().item()
    return correct_prediction / total_valid_elements

In [None]:
def accuracy(predictions, targets, masks):
    """Average ``masked_accuracy`` over parallel sequences of samples.

    Args:
        predictions: sequence of predicted-id tensors, one per sample.
        targets: sequence of expected-id tensors, one per sample.
        masks: sequence of masks marking the positions to score.

    Returns:
        The mean per-sample accuracy, or ``0.0`` when there are no samples
        (instead of dividing by zero).
    """
    if len(predictions) == 0:
        return 0.0
    # The accumulator is named ``total`` so the builtin ``sum`` is not shadowed.
    total = 0
    for prediction, target, mask in zip(predictions, targets, masks):
        total += masked_accuracy(prediction, target, mask)
    return total / len(predictions)

### Define Model Training Utilities

In [None]:
def get_model(src_len, tgt_len, seq_len, d_model):
    """Build a Transformer for the given vocabulary sizes and sequence length.

    ``src_len`` and ``tgt_len`` are the source and target vocabulary sizes;
    ``seq_len`` is used for both the source and target positional encodings.
    All other hyperparameters keep ``build_transformer``'s defaults.
    """
    return build_transformer(
        src_vocab_size=src_len,
        tgt_vocab_size=tgt_len,
        src_seq_len=seq_len,
        tgt_seq_len=seq_len,
        d_model=d_model,
    )

In [None]:
def greedy_decode(model, source, source_mask, max_len, device):
    """Generate tokens one at a time, always taking the highest-scoring one.

    Decoding starts from the start-of-sequence token (256) and stops once
    the end-of-sequence token (257) has been appended or the sequence holds
    ``max_len`` tokens. Only a single source sequence is supported.

    Returns:
        1-D tensor of generated token ids, including the leading SOS token.
    """
    sos_idx, eos_idx = 256, 257

    # The encoder output does not change between steps, so compute it once.
    memory = model.encode(source, source_mask)
    generated = torch.empty(1, 1).fill_(sos_idx).type_as(source).to(device)

    while generated.size(1) != max_len:
        # Causal mask sized to the tokens generated so far.
        step_mask = causal_mask(generated.size(1)).type_as(source_mask).to(device)
        hidden = model.decode(memory, source_mask, generated, step_mask)

        # Only the last position is needed to predict the next token.
        logits = model.project(hidden[:, -1])
        next_word = torch.max(logits, dim=1).indices
        next_token = torch.empty(1, 1).type_as(source).fill_(next_word.item()).to(device)
        generated = torch.cat([generated, next_token], dim=1)

        if next_word == eos_idx:
            break

    return generated.squeeze(0)

In [None]:
def run_validation(model, validation_ds, max_len, device, print_msg, global_step, writer, training_accuracy, num_examples=5):
    """Evaluate the model on the validation loader and report metrics.

    For every batch (batch size must be 1) the model is run twice: once with
    teacher forcing to collect per-token predictions for the accuracy, and
    once with greedy decoding to produce a generated sequence for the
    character/word error rates.

    Args:
        model: the Transformer to evaluate; it is switched to eval mode.
        validation_ds: DataLoader yielding single-sample batches.
        max_len: maximum length passed to ``greedy_decode``.
        device: device the batch tensors are moved to.
        print_msg: callable used to print the example source/target/prediction lines.
        global_step: step value attached to the TensorBoard scalars.
        writer: TensorBoard writer; CER/WER are only computed when it is truthy.
        training_accuracy: training accuracy of the epoch, printed alongside.
        num_examples: number of validation examples to print.
    """
    model.eval()
    pad = 258
    count = 0
    expected = []
    predicted = []
    targets = []
    predictions = []
    masks = []
    try:
        # get the console window width
        with os.popen('stty size', 'r') as console:
            _, console_width = console.read().split()
            console_width = int(console_width)
    except Exception:
        # If we can't get the console width, use 80 as default. Catching
        # Exception (not a bare except) lets KeyboardInterrupt through.
        console_width = 80

    with torch.no_grad():
        for batch in validation_ds:
            encoder_input = batch['encoder_input'].to(device) # (b, seq_len)
            decoder_input = batch['decoder_input'].to(device) # (B, seq_len)
            encoder_mask = batch['encoder_mask'].to(device) # (B, 1, 1, seq_len)
            decoder_mask = batch['decoder_mask'].to(device) # (B, 1, seq_len, seq_len)

            # Teacher-forced pass: run the tensors through the encoder, decoder and the projection layer
            encoder_output = model.encode(encoder_input, encoder_mask) # (B, seq_len, d_model)
            decoder_output = model.decode(encoder_output, encoder_mask, decoder_input, decoder_mask) # (B, seq_len, d_model)
            proj_output = model.project(decoder_output) # (B, seq_len, vocab_size)
            # ``+=`` with a tensor extends the list with one entry per sample.
            targets += batch['label']
            predictions += proj_output.argmax(dim=2)
            masks += batch['label'] != pad
            count += 1

            # check that the batch size is 1
            assert encoder_input.size(
                0) == 1, "Batch size must be 1 for validation"

            # Free-running pass: generate the sequence token by token.
            model_out = greedy_decode(model, encoder_input, encoder_mask, max_len, device)
            source = [str(int(x)) for x in batch["encoder_input"][0]]
            target = [str(int(x)) for x in batch["label"][0]]
            # Keep the label up to and including the end-of-sequence token.
            index = target.index("257")
            target = target[:index + 1]
            model_out = [str(int(x)) for x in model_out.detach().cpu().numpy()]
            # Drop the leading start-of-sequence token.
            model_out = model_out[1:]
            source_text = " ".join(source)
            target_text = " ".join(target)
            model_out_text = " ".join(model_out)

            expected.append(target_text)
            predicted.append(model_out_text)

            if count <= num_examples:
                # Print the source, target and model output
                print_msg('-'*console_width)
                print_msg(f"{f'SOURCE: ':>12}{source_text}")
                print_msg(f"{f'TARGET: ':>12}{target_text}")
                print_msg(f"{f'PREDICTED: ':>12}{model_out_text}")
                print_msg('-'*console_width)

    print(f'training accurcy: {training_accuracy}')
    print(f'validation accurcy: {accuracy(predictions, targets, masks)}')
    if writer:
        # Compute the char error rate
        metric = torchmetrics.CharErrorRate()
        cer = metric(predicted, expected)
        print(f"CER: {cer:.3f}")
        writer.add_scalar('validation cer', cer, global_step)
        writer.flush()

        # Compute the word error rate
        metric = torchmetrics.WordErrorRate()
        wer = metric(predicted, expected)
        print(f"WER: {wer:.3f}")
        writer.add_scalar('validation wer', wer, global_step)
        writer.flush()



In [None]:
def masked_loss(loss_fn, outputs, labels, pad_token=258):
    """Compute the loss averaged over non-padding tokens only.

    Args:
        loss_fn: criterion called as ``loss_fn(logits, targets)`` with
            logits of shape (N, vocab) and targets of shape (N,).
        outputs: model logits, (batch, seq_len, vocab_size).
        labels: target token ids, (batch, seq_len).
        pad_token: id of the padding token to exclude.

    Returns:
        A scalar loss tensor.

    If the criterion already reduces to a scalar (for example
    ``CrossEntropyLoss(ignore_index=pad_token)`` with its default mean
    reduction), that scalar is returned unchanged. Multiplying it by the
    mask and averaging again, as was done before, only scaled the loss by
    the fraction of non-padding tokens in the batch. If the criterion
    returns per-token losses (``reduction='none'``), padding positions are
    zeroed and the sum is divided by the number of valid tokens.
    """
    flat_labels = labels.reshape(-1)
    loss = loss_fn(outputs.reshape(-1, outputs.size(-1)), flat_labels)
    if loss.dim() == 0:
        # Already reduced by the criterion; padding is its responsibility.
        return loss
    mask = (flat_labels != pad_token).to(loss.dtype)
    # clamp avoids 0/0 when a batch consists of padding only.
    return (loss * mask).sum() / mask.sum().clamp(min=1)

In [None]:
def train_model(df, epochs, lr, seq_len, src_len, tgt_len, d_model, show_samples, batch_size=32):
    """Train the Transformer on ``df`` and checkpoint it after every epoch.

    Args:
        df: raw packet DataFrame, passed to ``get_ds``.
        epochs: number of training epochs.
        lr: learning rate for Adam.
        seq_len: decoder sequence length (also the maximum decoding length).
        src_len: source vocabulary size.
        tgt_len: target vocabulary size.
        d_model: embedding width of the model.
        show_samples: number of validation examples printed per epoch.
        batch_size: training batch size.

    Side effects:
        Writes TensorBoard logs to ``runs/train transformer`` and saves one
        checkpoint per epoch plus a final one under
        ``/content/drive/MyDrive/transformers``.
    """
    # Define the device. MPS is chosen only when it is actually available
    # at runtime, not merely compiled in.
    if torch.cuda.is_available():
        device = "cuda"
    elif torch.backends.mps.is_available():
        device = "mps"
    else:
        device = "cpu"
    print("Using device:", device)
    if device == 'cuda':
        # ``device`` is still a plain string here, so the CUDA device index
        # is queried explicitly rather than through ``device.index``.
        cuda_index = torch.cuda.current_device()
        print(f"Device name: {torch.cuda.get_device_name(cuda_index)}")
        print(f"Device memory: {torch.cuda.get_device_properties(cuda_index).total_memory / 1024 ** 3} GB")
    elif device == 'mps':
        print("Device name: <mps>")
    else:
        print("NOTE: If you have a GPU, consider using it for training.")
        print("      On a Windows machine with NVidia GPU, check this video: https://www.youtube.com/watch?v=GMSjDTU8Zlc")
        print("      On a Mac machine, run: pip3 install --pre torch torchvision torchaudio torchtext --index-url https://download.pytorch.org/whl/nightly/cpu")
    device = torch.device(device)

    # get_ds also returns the sequence length twice; it is not needed here.
    train_dataloader, val_dataloader, _, _ = get_ds(df, seq_len, batch_size)
    model = get_model(src_len, tgt_len, seq_len, d_model).to(device)
    # Tensorboard
    writer = SummaryWriter('runs/train transformer')

    optimizer = torch.optim.Adam(model.parameters(), lr=lr, eps=1e-9)

    initial_epoch = 0
    global_step = 0
    # Padding tokens (258) do not contribute to the loss.
    loss_fn = nn.CrossEntropyLoss(ignore_index=258, label_smoothing=0.1).to(device)

    def _checkpoint(epoch):
        # Everything needed to resume: weights, optimizer state and counters.
        return {
            'epoch': epoch,
            'model_state_dict': model.state_dict(),
            'optimizer_state_dict': optimizer.state_dict(),
            'global_step': global_step
        }

    for epoch in range(initial_epoch, epochs):
        pad = 258
        targets = []
        predictions = []
        masks = []
        torch.cuda.empty_cache()
        model.train()
        batch_iterator = tqdm(train_dataloader, desc=f"Processing Epoch {epoch:02d}")

        for batch in batch_iterator:

            encoder_input = batch['encoder_input'].to(device) # (b, seq_len)
            decoder_input = batch['decoder_input'].to(device) # (B, seq_len)
            encoder_mask = batch['encoder_mask'].to(device) # (B, 1, 1, seq_len)
            decoder_mask = batch['decoder_mask'].to(device) # (B, 1, seq_len, seq_len)

            # Run the tensors through the encoder, decoder and the projection layer
            encoder_output = model.encode(encoder_input, encoder_mask) # (B, seq_len, d_model)
            decoder_output = model.decode(encoder_output, encoder_mask, decoder_input, decoder_mask) # (B, seq_len, d_model)
            proj_output = model.project(decoder_output) # (B, seq_len, vocab_size)
            # Compare the output with the label
            label = batch['label'].to(device) # (B, seq_len)
            # Compute the padding-aware cross entropy
            loss = masked_loss(loss_fn, proj_output, label)
            batch_iterator.set_postfix({"loss": f"{loss.item():6.3f}"})

            # Log the loss
            writer.add_scalar('train loss', loss.item(), global_step)
            writer.flush()

            # Backpropagate the loss
            loss.backward()

            # Update the weights
            optimizer.step()
            optimizer.zero_grad(set_to_none=True)

            global_step += 1
            # Collect per-sample tensors for the epoch-level training accuracy.
            targets += batch['label']
            predictions += proj_output.argmax(dim=2)
            masks += batch['label'] != pad

        # Run validation at the end of every epoch
        training_accuracy = accuracy(predictions, targets, masks)

        run_validation(model, val_dataloader, seq_len, device, lambda msg: batch_iterator.write(msg), global_step, writer, training_accuracy, show_samples)
        os.makedirs("/content/drive/MyDrive/transformers/training", exist_ok=True)
        model_filename = f"/content/drive/MyDrive/transformers/training/model_at_epoch_{epoch+1}"
        torch.save(_checkpoint(epoch), model_filename)

    os.makedirs("/content/drive/MyDrive/transformers", exist_ok=True)
    model_filename = "/content/drive/MyDrive/transformers/final_model"
    torch.save(_checkpoint(epoch), model_filename)
    # Release the TensorBoard event file handle.
    writer.close()



### Set Model Hyperparameters

In [None]:
epochs = 20  # number of training epochs
lr = 0.0001  # learning rate passed to Adam
show_samples = 5 # Show samples of validation each epoch
seq_len = 21  # fixed decoder/label length; also sizes the positional encodings
src_len = 259  # source vocabulary size: byte values 0-255 plus the 256/257/258 special tokens
tgt_len = 259  # target vocabulary size (same token set)
d_model = 512  # embedding width of the model
batch_size = 32  # training batch size (validation always uses batch size 1)
import warnings
warnings.filterwarnings("ignore")  # silence all Python warnings from here on

In [None]:
#train_dataloader, val_dataloader, tokenizer_src, tokenizer_tgt = get_ds(df, seq_len, batch_size)

### Train The Model

In [None]:
train_model(df, epochs, lr, seq_len, src_len, tgt_len, d_model, show_samples, batch_size) # NOTE: re-running this cell retrains from scratch and overwrites the checkpoints saved under /content/drive/MyDrive/transformers

### Load The Trained Model

In [None]:
# Pick the device: CUDA first, then MPS when it is actually available at
# runtime (not merely compiled in), otherwise the CPU.
if torch.cuda.is_available():
    device = "cuda"
elif torch.backends.mps.is_available():
    device = "mps"
else:
    device = "cpu"
model = get_model(src_len, tgt_len, seq_len, d_model).to(device)
#model_filename = "/content/drive/MyDrive/transformers/final_model"

# Load the checkpoint tensors onto the CPU first; load_state_dict then copies
# them into the model's parameters on `device`.
model_filename = "/content/drive/MyDrive/transformers/training/model_at_epoch_20"
state = torch.load(model_filename, map_location=torch.device('cpu'))
model.load_state_dict(state['model_state_dict'])

<All keys matched successfully>

### Define Generation Utilities

In [None]:
def str_encoder(str_):
    """Parse a whitespace-separated string of hexadecimal values into ints.

    Each field is read in base 16, so the ``0x`` prefix is optional.
    """
    values = []
    for field in str_.split():
        values.append(int(field, 16))
    return values

def str_decoder(str_):
    """Format every integer of an iterable as a ``0x``-prefixed hex string."""
    return list(map(hex, str_))

In [None]:
def check_packet_start(str_):
    """Return True when the string holds exactly 8 whitespace-separated fields."""
    fields = str_.split()
    return len(fields) == 8

In [None]:
def generate_modbus_tcp_packet(model, packet_start, max_len, device):
    """Complete a packet from its first 8 bytes using greedy decoding.

    Args:
        model: trained Transformer; it is switched to eval mode.
        packet_start: string of 8 whitespace-separated hexadecimal byte values.
        max_len: maximum number of tokens to decode (including the SOS token).
        device: device on which the input tensors are created.

    Returns:
        The full packet as space-separated ``0x..`` values, or an
        explanatory message when ``packet_start`` is not 8 valid bytes.

    Only input-validation failures produce the "not valid" message. Errors
    raised while running the model propagate to the caller instead of being
    reported as a bad packet start.
    """
    invalid_msg = f"Packet start is not valid: {packet_start}, please enter packet start as 8 hexadecimal values separated with spaces."
    try:
        if not check_packet_start(packet_start):
            return invalid_msg
        start_tokens = str_encoder(packet_start)
    except (AttributeError, TypeError, ValueError):
        # Not a string, or a field that is not a hexadecimal number.
        return invalid_msg
    # Every field must be a single byte; larger ids are special tokens or
    # fall outside the embedding table.
    if any(not 0 <= token <= 255 for token in start_tokens):
        return invalid_msg

    encoder_input = torch.tensor(start_tokens).unsqueeze(0).to(device)
    # All 8 source positions are real tokens, so nothing is masked out.
    mask = torch.tensor([1] * 8).to(device)
    model.eval()
    with torch.no_grad():
        output = greedy_decode(model, encoder_input, mask, max_len, device)

    # Drop the leading SOS token. Strip the trailing token only when it is
    # the EOS marker (257): if decoding stopped at max_len, the last token
    # is a real byte.
    generated = output.detach().cpu().numpy()[1:]
    if len(generated) > 0 and generated[-1] == 257:
        generated = generated[:-1]
    return " ".join(str_decoder(start_tokens)) + " " + " ".join(str_decoder(generated))

### Generate Modbus TCP Packet

In [None]:
# Seed the generator with the 8 leading bytes and print the completed packet.
packet_start = "0x0 0x1 0x0 0x0 0x0 0x6 0x2 0x6" # MBAP header + function code (FC)
print(generate_modbus_tcp_packet(model, packet_start, seq_len, device))

0x0 0x1 0x0 0x0 0x0 0x6 0x2 0x6 0x0 0x24 0x0 0x0


In [None]:
# Leftover scratch cell: evaluates to the bound `str.join` method and has no effect.
" ".join

In [None]:
# Complete a packet for each of the first 20000 dataset rows, using the row's
# first 8 columns as the packet start, then write one packet per line.
generated_packets = []
for _, row in df.iloc[:20000, :8].iterrows():
    packet_start = " ".join(row)
    generated_packets.append(generate_modbus_tcp_packet(model, packet_start, seq_len, device))

with open("/content/drive/MyDrive/transformers/packets.txt", "w") as f:
    f.writelines(packet + "\n" for packet in generated_packets)

