<a href="https://colab.research.google.com/github/Anvians/Deep_Learning/blob/main/Transformers_DL.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
pip install datasets

In [3]:
from transformers import BertTokenizer, BertForSequenceClassification, Trainer, TrainingArguments
from datasets import load_dataset
import torch
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
import torch.nn as nn

In [None]:
dataset = load_dataset("liar",trust_remote_code=True)

In [5]:
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# Tokenize the dataset
def tokenize_function(examples):
    return tokenizer(examples['statement'], padding="max_length", truncation=True)

In [6]:
train_dataset = dataset['train'].map(tokenize_function, batched=True, remove_columns=["statement"])
test_dataset = dataset['test'].map(tokenize_function, batched=True, remove_columns=["statement"])

# Building my own Transformer

In [10]:
device = 'cuda' if torch.cuda.is_available() else 'cpu'

## Encoding


**Positional Encoding**

In [11]:
class PositionalEncoding(nn.Module):
    def __init__(self, embedding_dim, max_seq_length):
        super().__init__()
        self.encoding = torch.zeros(max_seq_length, embedding_dim)
        position = torch.arange(0, max_seq_length, dtype=torch.float).unsqueeze(1)
        div_term = torch.exp(torch.arange(0, embedding_dim, 2).float() * (-torch.log(torch.tensor(10000.0)) / embedding_dim))

        self.encoding[:, 0::2] = torch.sin(position / div_term)
        self.encoding[:, 1::2] = torch.cos(position / div_term)
        self.encoding = self.encoding.unsqueeze(0)  # Shape: (1, max_seq_length, embedding_dim)

    def forward(self, x):
        seq_length = x.size(1)
        return x + self.encoding[:, :seq_length, :].to(x.device)

**Multihead Attention**

In [12]:
class MultiheadAttention(nn.Module):
    def __init__(self, embedding_dim, num_heads):
        super().__init__()
        self.num_heads = num_heads
        self.head_dim = embedding_dim // num_heads

        self.q_layer = nn.Linear(embedding_dim, embedding_dim)
        self.k_layer = nn.Linear(embedding_dim, embedding_dim)
        self.v_layer = nn.Linear(embedding_dim, embedding_dim)
        self.output_layer = nn.Linear(embedding_dim, embedding_dim)

        self.scale = torch.sqrt(torch.FloatTensor([self.head_dim])).to(torch.device("cuda" if torch.cuda.is_available() else "cpu"))

    def forward(self, q, k, v, mask=None):
        batch_size, seq_len, embedding_dim = q.size()

        Q = self.q_layer(q).view(batch_size, seq_len, self.num_heads, self.head_dim).permute(0, 2, 1, 3)
        K = self.k_layer(k).view(batch_size, seq_len, self.num_heads, self.head_dim).permute(0, 2, 1, 3)
        V = self.v_layer(v).view(batch_size, seq_len, self.num_heads, self.head_dim).permute(0, 2, 1, 3)

        atten_score = torch.matmul(Q, K.transpose(-2, -1)) / self.scale
        if mask is not None:
            atten_score = atten_score.masked_fill(mask == 0, -1e9)

        atten_weight = torch.softmax(atten_score, dim=-1)
        atten_output = torch.matmul(atten_weight, V)

        atten_output = atten_output.permute(0, 2, 1, 3).contiguous().view(batch_size, seq_len, embedding_dim)
        return self.output_layer(atten_output), atten_weight

**ADD and normalization**

In [13]:
class AddNorm(nn.Module):
    def __init__(self, embedding_dim):
        super().__init__()
        self.norm = nn.LayerNorm(embedding_dim)

    def forward(self, x, sublayer_output):
        return self.norm(x + sublayer_output)

**Feed Forward NN**

In [14]:
class FeedForward(nn.Module):
    def __init__(self, embedding_dim, hidden_dim):
        super().__init__()
        self.layer1 = nn.Linear(embedding_dim, hidden_dim)
        self.layer2 = nn.Linear(hidden_dim, embedding_dim)
        self.relu = nn.ReLU()

    def forward(self, x):
        return self.layer2(self.relu(self.layer1(x)))

In [15]:
class TransformerEncoderBlock(nn.Module):
    def __init__(self, embedding_dim=768, num_heads=12, ff_hidden_dim=3072, dropout=0.1):
        super().__init__()
        self.attention = MultiheadAttention(embedding_dim, num_heads)
        self.norm1 = nn.LayerNorm(embedding_dim)
        self.ffn = FeedForward(embedding_dim, ff_hidden_dim)
        self.norm2 = nn.LayerNorm(embedding_dim)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x, mask=None):
        # Multihead Attention + Residual Connection + Layer Norm
        attn_output, _ = self.attention(x, x, x, mask)
        x = self.norm1(x + self.dropout(attn_output))

        # Feed-Forward + Residual Connection + Layer Norm
        ffn_output = self.ffn(x)
        x = self.norm2(x + self.dropout(ffn_output))

        return x

In [16]:
class TransformerEncoder(nn.Module):
    def __init__(self, num_layers, vocab_size, embedding_dim, num_heads, hidden_dim, max_seq_length):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        self.positional_encoding = PositionalEncoding(embedding_dim, max_seq_length)
        self.encoder_layers = nn.ModuleList([TransformerEncoderBlock(embedding_dim, num_heads, hidden_dim) for _ in range(num_layers)])

    def forward(self, input_ids):
        x = self.embedding(input_ids)
        x = self.positional_encoding(x)
        for layer in self.encoder_layers:
            x = layer(x)
        return x


In [30]:
from torch.utils.data import DataLoader
num_layers = 6
embedding_dim = 768
num_heads = 12
hidden_dim = 3072
max_seq_length = 512
vocab_size = tokenizer.vocab_size

encoder = TransformerEncoder(num_layers, vocab_size, embedding_dim, num_heads, hidden_dim, max_seq_length).to(device)

batch_size = 8  # Adjust based on memory
train_dataloader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)

for batch in train_dataloader:
    input_ids = torch.stack([torch.tensor(ids, dtype=torch.long) for ids in batch['input_ids']]).to(device)
    output = encoder(input_ids)





  input_ids = torch.stack([torch.tensor(ids, dtype=torch.long) for ids in batch['input_ids']]).to(device)


##Decoding

In [24]:
class TransformerDecoderBlock(nn.Module):
    def __init__(self, embedding_dim, num_heads, ff_hidden_dim, dropout=0.1):
        super().__init__()

        self.self_attention = MultiheadAttention(embedding_dim, num_heads)  # Masked Self-Attention
        self.norm1 = nn.LayerNorm(embedding_dim)
        self.dropout1 = nn.Dropout(dropout)

        self.encoder_decoder_attention = MultiheadAttention(embedding_dim, num_heads)  # Encoder-Decoder Attention
        self.norm2 = nn.LayerNorm(embedding_dim)
        self.dropout2 = nn.Dropout(dropout)

        self.ffn = nn.Sequential(
            nn.Linear(embedding_dim, ff_hidden_dim),
            nn.ReLU(),
            nn.Linear(ff_hidden_dim, embedding_dim)
        )
        self.norm3 = nn.LayerNorm(embedding_dim)
        self.dropout3 = nn.Dropout(dropout)

    def forward(self, x, encoder_output, mask=None):
        # Masked Multi-Head Self-Attention (Decoder can’t see future tokens)
        _x, _ = self.self_attention(x, x, x, mask)
        x = self.norm1(x + self.dropout1(_x))  # Add & Norm

        # Encoder-Decoder Cross-Attention (Queries from decoder, Keys/Values from encoder)
        _x, _ = self.encoder_decoder_attention(x, encoder_output, encoder_output)
        x = self.norm2(x + self.dropout2(_x))  # Add & Norm

        # Feed-Forward Network
        _x = self.ffn(x)
        x = self.norm3(x + self.dropout3(_x))  # Add & Norm

        return x


In [28]:
# Define decoder block
decoder_block = TransformerDecoderBlock(embedding_dim=768, num_heads=8, ff_hidden_dim=2048).to(device)

# Dummy tensor for decoder input (assume batch_size=1, seq_len=10)
decoder_input = torch.rand(1, 10, 768).to(device)

# Assume encoder output is ready
encoder_output = torch.rand(1, 10, 768).to(device)

# Apply decoder block
decoder_output = decoder_block(decoder_input, encoder_output)

print("Decoder Output Shape:", decoder_output.shape)  # Expected: (batch_size, seq_len, embedding_dim)


Decoder Output Shape: torch.Size([1, 10, 768])


In [29]:
import torch
import torch.nn as nn

class Transformer(nn.Module):
    def __init__(self,
                 src_vocab_size,
                 tgt_vocab_size,
                 embedding_dim=512,
                 num_heads=8,
                 ff_hidden_dim=2048,
                 num_layers=6,
                 dropout=0.1):
        super().__init__()

        self.src_embedding = nn.Embedding(src_vocab_size, embedding_dim)
        self.tgt_embedding = nn.Embedding(tgt_vocab_size, embedding_dim)
        self.positional_encoding = PositionalEncoding(embedding_dim, dropout)

        # Stack 6 encoder blocks
        self.encoder_layers = nn.ModuleList([
            TransformerEncoderBlock(embedding_dim, num_heads, ff_hidden_dim, dropout)
            for _ in range(num_layers)
        ])

        # Stack 6 decoder blocks
        self.decoder_layers = nn.ModuleList([
            TransformerDecoderBlock(embedding_dim, num_heads, ff_hidden_dim, dropout)
            for _ in range(num_layers)
        ])

        self.final_linear = nn.Linear(embedding_dim, tgt_vocab_size)

    def encode(self, src, src_mask=None):
        src = self.src_embedding(src)
        src = self.positional_encoding(src)
        for layer in self.encoder_layers:
            src = layer(src, src_mask)
        return src  # Output of encoder

    def decode(self, tgt, encoder_output, tgt_mask=None):
        tgt = self.tgt_embedding(tgt)
        tgt = self.positional_encoding(tgt)
        for layer in self.decoder_layers:
            tgt = layer(tgt, encoder_output, tgt_mask)
        return tgt  # Output of decoder

    def forward(self, src, tgt, src_mask=None, tgt_mask=None):
        encoder_output = self.encode(src, src_mask)
        decoder_output = self.decode(tgt, encoder_output, tgt_mask)
        return self.final_linear(decoder_output)  # Final projection to vocabulary size
