# THIS IS AN IMPLEMENTATION OF THE TRANSFORMER ARCHITECTURE FOR THE EMAIL DETECTION PROJECT

In [1]:
# import libs
import pandas as pd

**1. DATA REFORMATTING**

In [2]:
# Load the dataset from final_df.csv; the path is relative to the current
# working directory, so the file must sit next to where the notebook runs.
final_df = pd.read_csv('final_df.csv')

# Preview the first rows, then show the (rows, columns) shape.
print(final_df.head())
print(final_df.shape)


FileNotFoundError: [Errno 2] No such file or directory: 'final_df.csv'

In [None]:
# Drop the rows that contain missing (NaN) values (pandas' default dropna behaviour).
final_df = final_df.dropna()

In [None]:
# Show the (rows, columns) shape of the current final_df.
print(final_df.shape)

(518920, 2)


**NOW BUILD A CLASS FOR TRANSFORMER**

In [None]:
# import torch
import torch
import torch.nn as nn

In [None]:
class InputEmbedding(nn.Module):
    """Look up token embeddings and scale them by ``sqrt(embed_size)``.

    Args:
        embed_size: Dimensionality of each embedding vector.
        vocab_size: Number of rows in the embedding table (distinct token ids).
    """

    def __init__(self, embed_size: int, vocab_size: int):
        super().__init__()
        self.embed_size = embed_size
        self.vocab_size = vocab_size
        self.embed = nn.Embedding(vocab_size, embed_size)

    def forward(self, x):
        """Return the scaled embeddings for the integer token ids in ``x``.

        The result has the shape of ``x`` with a trailing ``embed_size`` axis.
        """
        # ``torch.sqrt`` only accepts tensors, so the scale is computed on the
        # plain Python int instead.
        # The sqrt(embed_size) factor follows "Attention Is All You Need"
        # (section 3.4); it is commonly explained as keeping the embeddings on
        # a scale comparable to the positional encodings added afterwards.
        return self.embed(x) * (self.embed_size ** 0.5)

In [None]:
class PositionEncoding(nn.Module):
    """Add fixed sinusoidal position information to a batch of embeddings.

    The encoding table is precomputed once for ``max_len`` positions and stored
    as a non-trainable buffer named ``pe`` of shape ``(1, max_len, embed_size)``.

    Args:
        embed_size: Size of the last dimension of the inputs.
        max_len: Number of positions precomputed; inputs must not be longer.
        dropout: Dropout probability applied after adding the encoding.
    """

    def __init__(self, embed_size: int = 512, max_len: int = 500, dropout: float = 0.1):
        # Local import keeps this block self-contained (the file has no ``math``
        # or ``numpy`` import in scope).
        import math

        super().__init__()
        self.embed_size = embed_size
        self.max_len = max_len
        self.dropout = nn.Dropout(dropout)

        # Column vector of positions: (max_len, 1).
        pos = torch.arange(0, max_len, dtype=torch.float32).unsqueeze(1)
        # Inverse frequencies 10000^(-2i / embed_size), computed in log space.
        div_term = torch.exp(
            torch.arange(0, embed_size, 2).float() * -(math.log(10000.0) / embed_size)
        )
        angles = pos * div_term  # (max_len, ceil(embed_size / 2))

        pe = torch.zeros(max_len, embed_size)
        # Even feature indices get the sine, odd indices the cosine.
        pe[:, 0::2] = torch.sin(angles)
        # For an odd embed_size there is one fewer odd column than even column.
        pe[:, 1::2] = torch.cos(angles[:, : embed_size // 2])

        # A buffer follows the module across devices and into the state dict
        # without being treated as a trainable parameter.
        self.register_buffer('pe', pe.unsqueeze(0))

    def forward(self, x):
        """Return ``x`` plus the encoding of its first ``x.shape[1]`` positions.

        ``x`` is indexed as ``(batch, seq_len, embed_size)`` here; ``seq_len``
        must not exceed ``max_len``.
        """
        x = x + self.pe[:, : x.shape[1], :]  # buffer is constant, no gradient
        return self.dropout(x)  # regularisation against overfitting

In [None]:
# this class is crucial since we will use it many times
class LayerNormalizing(nn.Module):
    """Normalize the last dimension, then apply a learnable scalar gain and bias.

    Args:
        eps: Small constant added to the standard deviation to avoid dividing
            by zero.
    """

    def __init__(self, eps: float = 1e-6):
        super().__init__()
        self.eps = eps
        # Both parameters hold a single value that broadcasts over every
        # feature; use shape (embed_size,) instead for per-feature values.
        self.alpha = nn.Parameter(torch.ones(1))
        self.bias = nn.Parameter(torch.zeros(1))

    def forward(self, x):
        """Return ``alpha * (x - mean) / (std + eps) + bias`` over the last axis."""
        centered = x - x.mean(-1, keepdim=True)
        spread = x.std(-1, keepdim=True) + self.eps
        return self.alpha * (centered / spread) + self.bias

In [None]:
class FeedForward(nn.Module):  # one hidden layer feed forward network
    """Position-wise feed-forward network: Linear -> ReLU -> Dropout -> Linear.

    Args:
        embed_size: Size of the input and output feature dimension.
        ff_hidden_size: Width of the hidden layer.
        dropout: Dropout probability applied to the hidden activations.
    """

    # ``dropout`` needs a default because it follows a defaulted parameter;
    # without one the signature is a SyntaxError.
    def __init__(self, embed_size: int, ff_hidden_size: int = 2048, dropout: float = 0.1):
        super().__init__()
        self.embed_size = embed_size
        self.ff_hidden_size = ff_hidden_size
        self.dropout = nn.Dropout(dropout)

        self.fc1 = nn.Linear(embed_size, ff_hidden_size)
        self.fc2 = nn.Linear(ff_hidden_size, embed_size)

    def forward(self, x):
        """Apply the network to the last dimension of ``x`` and return the result."""
        hidden = torch.relu(self.fc1(x))
        return self.fc2(self.dropout(hidden))

In [None]:
class MultiHeadEncoder(nn.Module):
    """Multi-head scaled dot-product attention.

    Queries, keys and values are projected, split into ``n_heads`` heads of
    ``head_size = embed_size // n_heads`` features, attended independently,
    merged back together and passed through an output projection.

    Args:
        embed_size: Feature size of the inputs and of the output.
        n_heads: Number of attention heads; must divide ``embed_size``.
        dropout: Dropout probability applied to the attention weights.
    """

    # ``dropout`` needs a default because it follows a defaulted parameter;
    # without one the signature is a SyntaxError.
    def __init__(self, embed_size: int, n_heads: int = 128, dropout: float = 0.1):
        super().__init__()
        self.embed_size = embed_size
        self.n_heads = n_heads
        assert embed_size % n_heads == 0, "Embedding size must be divisible by number of heads"

        self.head_size = embed_size // n_heads
        self.W_q = nn.Linear(embed_size, embed_size)
        self.W_k = nn.Linear(embed_size, embed_size)
        self.W_v = nn.Linear(embed_size, embed_size)

        self.W_o = nn.Linear(embed_size, embed_size)
        self.dropout = nn.Dropout(dropout)

    def _split_heads(self, t):
        """Reshape (batch, seq, embed_size) into (batch, n_heads, seq, head_size)."""
        batch, seq_len = t.shape[0], t.shape[1]
        return t.view(batch, seq_len, self.n_heads, self.head_size).transpose(1, 2)

    def forward(self, Q, K, V, mask):
        """Attend from ``Q`` over ``K``/``V`` and return the projected result.

        Args:
            Q, K, V: Tensors indexed as (batch, seq, embed_size). ``K`` and
                ``V`` may have a different sequence length than ``Q``.
            mask: ``None``, or a tensor broadcastable to the attention scores
                (batch, n_heads, q_len, k_len); positions equal to 0 are hidden.

        Returns:
            Tensor of shape (batch, q_len, embed_size).
        """
        Q_p = self._split_heads(self.W_q(Q))
        K_p = self._split_heads(self.W_k(K))
        V_p = self._split_heads(self.W_v(V))

        # The attention weights are not needed by the caller here.
        x, _ = self.attention(Q_p, K_p, V_p, mask, self.dropout)

        # Merge the heads: (batch, heads, q_len, head) -> (batch, q_len, embed).
        # The sequence length is inferred with -1 because, before the
        # transpose, dimension 1 of ``x`` is the head count, not the length.
        batch = x.shape[0]
        x = x.transpose(1, 2).contiguous().view(batch, -1, self.embed_size)

        return self.W_o(x)

    @staticmethod
    def attention(Q, K, V, mask, dropout):
        """Compute scaled dot-product attention.

        Returns:
            ``(output, weights)`` where ``weights`` are the softmax-normalised
            scores (after dropout, when ``dropout`` is given).
        """
        scores = torch.matmul(Q, K.transpose(-2, -1)) / (Q.shape[-1] ** 0.5)
        # Hide masked interactions. ``masked_fill`` is out-of-place, so the
        # result has to be assigned back for the mask to take effect.
        if mask is not None:
            scores = scores.masked_fill(mask == 0, -1e9)

        # Each query row becomes a probability distribution over the keys.
        scores = scores.softmax(dim=-1)

        if dropout is not None:
            scores = dropout(scores)

        return torch.matmul(scores, V), scores


In [None]:
class ResConnection(nn.Module):
    """Residual wrapper: normalise, run a sublayer, apply dropout, add the input.

    Args:
        dropout: Dropout probability applied to the sublayer output.
    """

    def __init__(self, dropout):
        super().__init__()
        self.dropout = nn.Dropout(dropout)
        self.norm = LayerNormalizing()

    def forward(self, x, sublayer):
        """Return ``x + dropout(sublayer(norm(x)))``.

        ``sublayer`` is any callable taking the normalised tensor.
        """
        normalized = self.norm(x)
        branch = self.dropout(sublayer(normalized))
        return x + branch

In [None]:
# put things together in one block only
class EncoderBlock(nn.Module):
    """One encoder layer: self-attention then feed-forward, each in a residual wrapper.

    Args:
        self_attention_block: Module called as ``block(q, k, v, mask)``.
        feed_forward_block: Module called with a single tensor.
        dropout: Dropout probability handed to both residual wrappers.
    """

    def __init__(self, self_attention_block, feed_forward_block, dropout):
        super().__init__()
        self.self_attention_block = self_attention_block
        self.feed_forward_block = feed_forward_block
        # One residual wrapper per sublayer (attention, feed-forward).
        self.residual_connections = nn.ModuleList(
            ResConnection(dropout) for _ in range(2)
        )

    def forward(self, x, src_mask):
        """Run both sublayers on ``x``, masking attention with ``src_mask``."""

        def attend(normed):
            # Self-attention: query, key and value are the same tensor.
            return self.self_attention_block(normed, normed, normed, src_mask)

        attention_residual, ff_residual = self.residual_connections
        x = attention_residual(x, attend)
        return ff_residual(x, self.feed_forward_block)


In [None]:
# here we serialize the encoder blocks
class EncoderChains(nn.Module):
    """Run a sequence of encoder blocks and normalise the final output.

    Args:
        encoder_blocks: Blocks applied in order, each called as
            ``block(x, src_mask)``.
    """

    def __init__(self, encoder_blocks: nn.ModuleList):
        super().__init__()
        self.encoder_blocks = encoder_blocks
        self.norm = LayerNormalizing()

    def forward(self, x, src_mask):
        """Feed ``x`` through every block in turn, then through the final norm."""
        hidden = x
        for layer in self.encoder_blocks:
            hidden = layer(hidden, src_mask)

        # A closing normalisation is applied to the output of the last block.
        return self.norm(hidden)

In [None]:
class DecoderBlock(nn.Module):
    """One decoder layer: self-attention, cross-attention, then feed-forward.

    Each sublayer runs inside its own residual wrapper.

    Args:
        self_attention_block: Attention over the decoder input itself.
        cross_attention_block: Attention from the decoder input over the
            encoder output.
        feed_forward_block: Module called with a single tensor.
        dropout: Dropout probability handed to the three residual wrappers.
    """

    def __init__(self, self_attention_block: MultiHeadEncoder, cross_attention_block: MultiHeadEncoder, feed_forward_block, dropout):
        super().__init__()
        self.self_attention_block = self_attention_block
        self.cross_attention_block = cross_attention_block
        self.feed_forward_block = feed_forward_block
        # One residual wrapper per sublayer.
        self.residual_connections = nn.ModuleList(
            ResConnection(dropout) for _ in range(3)
        )

    def forward(self, x, encoder_output, src_mask, tgt_mask):
        """Run the three sublayers on ``x``.

        ``tgt_mask`` masks the decoder self-attention; ``src_mask`` masks the
        attention over ``encoder_output``.
        """

        def self_attend(normed):
            return self.self_attention_block(normed, normed, normed, tgt_mask)

        def cross_attend(normed):
            # Queries come from the decoder, keys and values from the encoder.
            return self.cross_attention_block(normed, encoder_output, encoder_output, src_mask)

        self_residual, cross_residual, ff_residual = self.residual_connections
        x = self_residual(x, self_attend)
        x = cross_residual(x, cross_attend)
        return ff_residual(x, self.feed_forward_block)

In [None]:
class DecoderChains(nn.Module):
    """Run a sequence of decoder blocks and normalise the final output.

    Args:
        decoder_blocks: Blocks applied in order, each called as
            ``block(x, encoder_output, src_mask, tgt_mask)``.
    """

    def __init__(self, decoder_blocks: nn.ModuleList):
        super().__init__()
        self.decoder_blocks = decoder_blocks
        self.norm = LayerNormalizing()

    def forward(self, x, encoder_output, src_mask, tgt_mask):
        """Feed ``x`` through every block in turn, then through the final norm."""
        hidden = x
        for layer in self.decoder_blocks:
            hidden = layer(hidden, encoder_output, src_mask, tgt_mask)

        return self.norm(hidden)

In [None]:
class Project_to_Vocab(nn.Module):
    """Project features onto the vocabulary and return log-probabilities.

    Args:
        embed_size: Size of the incoming feature dimension.
        vocab_size: Number of output classes (vocabulary entries).
    """

    def __init__(self, embed_size, vocab_size):
        super().__init__()
        self.proj = nn.Linear(embed_size, vocab_size)

    def forward(self, x):
        """Return the log-softmax over the last axis of the projected ``x``."""
        logits = self.proj(x)
        return logits.log_softmax(dim=-1)

In [None]:
class Transformer(nn.Module):

    def __init__(self, encoder, decoder, src_embed, tgt_embed, src_pos_encod, tgt_pos_encod, project_back):
        super().__init__()
        self.encoder = encoder
        self.decoder = decoder
        self.src_embed = src_embed
        self.tgt_embed = tgt_embed
        self.src_pos_encod = src_pos_encod
        self.tgt_pos_encod = tgt_pos_encod
        self.project_back = project_back