In [1]:
import import_ipynb
import torch 
import torch.nn as nn
from EncoderBlock import Encoder, InputEmbeddingsLayer, PositionalEncodingLayer, EncoderBlock, FeedForwardBlock, MultiHeadAttentionBlock 
from DecoderBlock import Decoder, InputEmbeddingsLayer, PositionalEncodingLayer, DecoderBlock, FeedForwardBlock, MultiHeadAttentionBlock 

importing Jupyter notebook from EncoderBlock.ipynb
importing Jupyter notebook from DecoderBlock.ipynb


In [None]:
class LinearLayer(nn.Module):
    """Project model features onto a vocabulary and return log-probabilities.

    Args:
        d_model: Size of the last dimension of the incoming features.
        vocab_size: Size of the last dimension of the output (number of classes).
    """

    def __init__(self, d_model: int, vocab_size: int) -> None:
        super().__init__()
        self.Linear = nn.Linear(d_model, vocab_size)

    def forward(self, x):
        """Apply the linear projection followed by a log-softmax.

        Args:
            x: Tensor whose last dimension has size ``d_model``.

        Returns:
            Tensor with the same leading dimensions as ``x`` and a last
            dimension of size ``vocab_size``; exponentiating it yields values
            that sum to 1 along that last dimension.
        """
        # `torch.log_softmax` has no default for `dim`; omitting it raises a
        # TypeError. The projection writes the vocabulary scores on the last
        # axis, so that is the axis to normalise over.
        return torch.log_softmax(self.Linear(x), dim=-1)


In [None]:
class TransformerBlock(nn.Module):
    """Container that wires together the pieces of an encoder-decoder model.

    The module stores the sub-modules it is given and exposes three separate
    entry points (``encode``, ``decode``, ``linear``) rather than a single
    ``forward``.

    Args:
        encoder: Module called as ``encoder(embedded_source, source_mask)``.
        decoder: Module called as
            ``decoder(embedded_target, Encoder_output, mask, target_mask)``.
        source_embedding: Embedding layer applied to the source input.
        target_embedding: Embedding layer applied to the target input.
        source_position: Positional layer applied after ``source_embedding``.
        target_position: Positional layer applied after ``target_embedding``.
        Linear: Output layer applied by :meth:`linear`.
    """

    def __init__(
        self,
        encoder: Encoder,
        decoder: Decoder,
        source_embedding: InputEmbeddingsLayer,
        target_embedding: InputEmbeddingsLayer,
        source_position: PositionalEncodingLayer,
        target_position: PositionalEncodingLayer,
        Linear: LinearLayer,
    ) -> None:
        super().__init__()
        # Assignment order determines sub-module registration order (and thus
        # the order of `parameters()`), so it is kept as-is.
        self.encoder = encoder
        self.decoder = decoder
        self.source_embedding = source_embedding
        self.target_embedding = target_embedding
        self.source_position = source_position
        self.target_position = target_position
        self.Linear = Linear

    def encode(self, source_language, source_mask):
        """Embed the source input, add positions, and run the encoder."""
        embedded = self.source_embedding(source_language)
        positioned = self.source_position(embedded)
        return self.encoder(positioned, source_mask)

    def decode(self, Encoder_output, mask, target_language, target_mask):
        """Embed the target input, add positions, and run the decoder.

        ``Encoder_output`` and ``mask`` are forwarded to the decoder unchanged,
        in that order, followed by ``target_mask``.
        NOTE(review): ``mask`` is presumably the source-side mask used for
        cross-attention -- confirm against ``Decoder``.
        """
        embedded = self.target_embedding(target_language)
        positioned = self.target_position(embedded)
        return self.decoder(positioned, Encoder_output, mask, target_mask)

    def linear(self, x):
        """Apply the stored output layer ``self.Linear`` to ``x``."""
        return self.Linear(x)


def TransformerModel(source_vocab_size: int, target_vocab_size: int, source_sequence_length: int, target_sequence_length: int, d_model: int = 512, Layers: int = 6, heads: int = 8, dropout: float = 0.1, d_ff: int = 2048) -> TransformerBlock:
    """Assemble a ``TransformerBlock`` and Xavier-initialise its weights.

    Args:
        source_vocab_size: Vocabulary size given to the source embedding layer.
        target_vocab_size: Vocabulary size given to the target embedding layer
            and to the final ``LinearLayer``.
        source_sequence_length: Length given to the source positional layer.
        target_sequence_length: Length given to the target positional layer.
        d_model: Feature size shared by every sub-module.
        Layers: Number of encoder blocks and number of decoder blocks.
        heads: Head count given to every ``MultiHeadAttentionBlock``.
        dropout: Dropout value forwarded to the positional, attention,
            feed-forward, encoder-block and decoder-block constructors.
        d_ff: Hidden size given to every ``FeedForwardBlock``.

    Returns:
        The constructed ``TransformerBlock``; every parameter with more than
        one dimension has been re-initialised with Xavier-uniform values.
    """
    source_embedding = InputEmbeddingsLayer(d_model, source_vocab_size)
    source_position = PositionalEncodingLayer(d_model, source_sequence_length, dropout)

    target_embedding = InputEmbeddingsLayer(d_model, target_vocab_size)
    target_position = PositionalEncodingLayer(d_model, target_sequence_length, dropout)

    # Each encoder block: self-attention + feed-forward.
    encoder_blocks = [
        EncoderBlock(
            MultiHeadAttentionBlock(d_model, heads, dropout),
            FeedForwardBlock(d_model, d_ff, dropout),
            dropout,
        )
        for _ in range(Layers)
    ]

    # Each decoder block: self-attention + cross-attention + feed-forward.
    decoder_blocks = [
        DecoderBlock(
            MultiHeadAttentionBlock(d_model, heads, dropout),
            MultiHeadAttentionBlock(d_model, heads, dropout),
            FeedForwardBlock(d_model, d_ff, dropout),
            dropout,
        )
        for _ in range(Layers)
    ]

    encoder = Encoder(nn.ModuleList(encoder_blocks))
    decoder = Decoder(nn.ModuleList(decoder_blocks))

    output_layer = LinearLayer(d_model, target_vocab_size)

    model = TransformerBlock(
        encoder,
        decoder,
        source_embedding,
        target_embedding,
        source_position,
        target_position,
        output_layer,
    )

    # Only tensors with more than one dimension are re-initialised;
    # 1-D parameters keep the values their constructors gave them.
    # `xavier_uniform_` is the in-place initialiser; the un-suffixed
    # `xavier_uniform` is a deprecated alias that emits a warning.
    for param in model.parameters():
        if param.dim() > 1:
            nn.init.xavier_uniform_(param)

    return model
