<a href="https://colab.research.google.com/github/AzeemWaqarRao/Pytorch_Implementations/blob/main/TransformersImplementationPytorch.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# importing required libraries
import torch.nn as nn
import torch
import torch.nn.functional as F
import math,copy,re
import warnings
import random
from __future__ import unicode_literals, print_function, division
from io import open
import pandas as pd
import numpy as np
import unicodedata
import seaborn as sns
import torchtext
import matplotlib.pyplot as plt
from torch.utils.data import TensorDataset, DataLoader, RandomSampler

# Suppress every warning category for the remainder of the session.
warnings.simplefilter("ignore")
# Print the installed PyTorch version so the run is reproducible.
print(torch.__version__)

2.0.1+cu118


In [None]:
# helper functions

def unicodeToAscii(s):
    """Return *s* with combining marks removed.

    The string is decomposed to NFD form and every code point whose Unicode
    category is ``Mn`` (non-spacing mark) is dropped; all other characters
    are kept in their original order.
    """
    kept = []
    for ch in unicodedata.normalize('NFD', s):
        if unicodedata.category(ch) == 'Mn':
            continue
        kept.append(ch)
    return ''.join(kept)

# Lowercase, trim, and remove non-letter characters
def normalizeString(s):
    """Lower-case, strip accents and reduce *s* to letters plus ``!`` / ``?``.

    Steps, in order: lower-case and trim, remove diacritics, put a space in
    front of ``.``, ``!`` and ``?``, then collapse every run of characters
    outside ``a-zA-Z!?`` into a single space. Note that ``.`` is not in the
    kept set, so periods end up removed. The result is trimmed again.
    """
    cleaned = unicodeToAscii(s.lower().strip())
    substitutions = (
        (r"([.!?])", r" \1"),
        (r"[^a-zA-Z!?]+", r" "),
    )
    for pattern, replacement in substitutions:
        cleaned = re.sub(pattern, replacement, cleaned)
    return cleaned.strip()


# filtering sentence pairs by length and reading data from file
def filter_pairs(pairs, MAXLENGTH):
  """Keep only the pairs whose two sentences are short enough.

  A sentence's length is the number of pieces produced by splitting it on a
  single space. A pair is kept when both ``pair[0]`` and ``pair[1]`` have at
  most ``MAXLENGTH`` pieces. Returns a new list; the pair objects themselves
  are not copied.
  """
  def _fits(sentence):
    return len(sentence.split(' ')) <= MAXLENGTH

  return [pair for pair in pairs if _fits(pair[0]) and _fits(pair[1])]


def read_data(path,lang1,lang2,MAXLENGTH):
  """Load a tab-separated parallel corpus and build both vocabularies.

  Args:
    path: text file with one sentence pair per line, fields separated by
      a tab (field 0 is the input sentence, field 1 the output sentence).
    lang1: name given to the input-side ``Lang``.
    lang2: name given to the output-side ``Lang``.
    MAXLENGTH: maximum number of space-separated words a sentence may have
      for its pair to be returned.

  Returns:
    ``(input_lang, output_lang, pairs)``. The vocabularies are built from
    every normalized pair in the file, before length filtering; ``pairs``
    holds only the pairs that pass ``filter_pairs``.
  """
  # Decode explicitly as UTF-8: the pipeline strips accents afterwards, so the
  # file is expected to contain non-ASCII text, and the platform default
  # encoding is not guaranteed to read it correctly.
  with open(path, 'r', encoding='utf-8') as f:
    raw_lines = f.read().strip().split('\n')

  # normalizeString already lower-cases its argument.
  pairs = [[normalizeString(sent) for sent in line.split('\t')] for line in raw_lines]

  input_lang = Lang(lang1)
  output_lang = Lang(lang2)

  for pair in pairs:
    input_lang.addSentence(pair[0])
    output_lang.addSentence(pair[1])

  pairs = filter_pairs(pairs,MAXLENGTH)

  return input_lang, output_lang, pairs


def padd_seq(seq,length,place=0):
  """Right-pad the list *seq* with zeros, in place, up to *length* items.

  Sequences that already have ``length`` or more items are left untouched.
  The same list object is returned. ``place`` is accepted but not used.
  """
  missing = length - len(seq)
  if missing > 0:
    seq.extend([0] * missing)
  return seq

def sent_to_index(lines,input_lang,output_lang,MAXLENGTH):
  """Turn sentence pairs into two lists of zero-padded index sequences.

  For each pair, ``line[0]`` is encoded with ``input_lang.sent_to_index`` and
  ``line[1]`` with ``output_lang.sent_to_index``; each result is padded with
  ``padd_seq`` to ``MAXLENGTH + 1`` entries.

  Returns:
    ``(inputs, targets)`` — two lists aligned with ``lines``.
  """
  width = MAXLENGTH + 1
  encoded = [
      (
          padd_seq(input_lang.sent_to_index(line[0]), width),
          padd_seq(output_lang.sent_to_index(line[1]), width),
      )
      for line in lines
  ]
  inputs = [source_ids for source_ids, _ in encoded]
  targets = [target_ids for _, target_ids in encoded]
  return inputs, targets


# gives us a data loader with inputs and targets

def get_dataloader(inputs,outputs,batch_size,device):
  """Wrap index sequences in a ``DataLoader`` of ``(input, output)`` batches.

  Args:
    inputs: rectangular nested list of input token ids.
    outputs: rectangular nested list of output token ids, aligned with
      ``inputs``.
    batch_size: number of pairs per batch.
    device: device on which both tensors are created.

  Returns:
    A ``DataLoader`` over a ``TensorDataset`` of two ``torch.long`` tensors,
    iterated in the given order.
  """
  input_tensor = torch.tensor(inputs, dtype=torch.long, device=device)
  # Use the `outputs` argument. The previous body read a global called
  # `targets`, which ignored the caller's data and failed with NameError
  # whenever no such global existed.
  output_tensor = torch.tensor(outputs, dtype=torch.long, device=device)
  dataset = TensorDataset(input_tensor, output_tensor)
  return DataLoader(dataset, batch_size=batch_size)

In [None]:
# Reserved vocabulary indices for the start- and end-of-sentence markers.
SOS_token = 0
EOS_token = 1


class Lang:
    """Word <-> index vocabulary for one language.

    Indices 0 and 1 are pre-assigned to ``'SOS'`` and ``'EOS'``; every new
    word receives the next free index. ``word2count`` tracks how often each
    added word was seen (the two reserved markers have no count entry).
    """

    def __init__(self, name):
        self.name = name
        self.word2index = {'SOS': 0, 'EOS': 1}
        self.word2count = {}
        self.index2word = {0: "SOS", 1: "EOS"}
        # The two reserved markers are already in the tables.
        self.n_words = 2

    def addSentence(self, sentence):
        """Register every piece of *sentence* obtained by splitting on one space."""
        for token in sentence.split(' '):
            self.addWord(token)

    def addWord(self, word):
        """Count *word*, assigning it the next free index on first sight."""
        if word in self.word2index:
            self.word2count[word] += 1
            return
        new_index = self.n_words
        self.word2index[word] = new_index
        self.word2count[word] = 1
        self.index2word[new_index] = word
        self.n_words = new_index + 1

    def sent_to_index(self, sent):
        """Return the indices of the words in *sent* followed by ``EOS_token``.

        Raises ``KeyError`` for a word that was never added.
        """
        return [self.word2index[token] for token in sent.split(' ')] + [EOS_token]

In [None]:
class Embeddings(nn.Module):
  """
  Creates embeddings for input sequences
  (*) -> (*, H)
  where H is dimension size

  Thin wrapper around ``nn.Embedding`` with ``vocab_size`` rows of width
  ``encode_dim``; the lookup table is exposed as ``self.embed``.
  """
  def __init__(self, vocab_size, encode_dim):
    super().__init__()
    self.vocab_size = vocab_size
    self.encode_dim = encode_dim
    self.embed = nn.Embedding(num_embeddings=vocab_size, embedding_dim=encode_dim)

  def forward(self, x):
    """Look up the embedding vector of every index in *x*."""
    embedded = self.embed(x)
    return embedded

In [None]:
# class PositionalEncoding(nn.Module):
#   def __init__(self, seq_len, embed_dim):
#     super(PositionalEncoding, self).__init__()

#     self.embed_dim = embed_dim
#     pe = torch.zeros(seq_len, self.embed_dim)
#     for pos in range(seq_len):
#       for i in range(0, self.embed_dim, 2):
#         pe[pos,i] = math.sin(pos / (10000 ** ((2 * i)/self.embed_dim)))
#         pe[pos,i+1] = math.cos(pos / (10000 ** ((2 * (i + 1))/self.embed_dim)))

#     pe = pe.unsqueeze(0)
#     self.register_buffer('pe', pe)

#   def forward(self, x):
#     x = x * math.sqrt(self.embed_dim)
#     seq_len = x.size(1)
#     x_new = x[:,:-1] + torch.autograd.Variable(self.pe[:,:seq_len], requires_grad=False)
#     x = torch.cat([x_new,x[:,-1].unsqueeze(dim=1)],dim=1)
#     return x




In [None]:
class SelfAttention(nn.Module):
  """Multi-head scaled dot-product attention.

  The inputs are split into ``n_heads`` chunks of ``head_dim`` features. The
  same ``head_dim x head_dim`` key/query/value projections are applied to
  every head (they are not per-head matrices), followed by an output
  projection back to ``embed_dim``.

  Args:
    n_heads: number of attention heads.
    embed_dim: feature width of the inputs; must be divisible by ``n_heads``.

  Raises:
    ValueError: if ``embed_dim`` is not a multiple of ``n_heads``.
  """
  def __init__(self, n_heads, embed_dim):
    super(SelfAttention, self).__init__()
    if embed_dim % n_heads != 0:
      # The old int(embed_dim / n_heads) silently floored, which only blew
      # up later when reshaping inside forward().
      raise ValueError(
          f"embed_dim ({embed_dim}) must be divisible by n_heads ({n_heads})"
      )
    self.n_heads = n_heads
    self.embed_dim = embed_dim

    self.head_dim = self.embed_dim // self.n_heads

    self.k_mat = nn.Linear(self.head_dim, self.head_dim, bias = False)
    self.v_mat = nn.Linear(self.head_dim, self.head_dim, bias = False)
    self.q_mat = nn.Linear(self.head_dim, self.head_dim, bias = False)
    self.linear = nn.Linear(self.n_heads * self.head_dim, self.embed_dim)


  def forward(self, key, query, val, mask=None):
    """Attend from ``query`` positions over ``key`` / ``val`` positions.

    Args:
      key: ``(batch, kv_len, embed_dim)``.
      query: ``(batch, query_len, embed_dim)``.
      val: ``(batch, kv_len, embed_dim)``.
      mask: optional tensor broadcastable to
        ``(batch, n_heads, query_len, kv_len)``; positions where it equals 0
        receive no attention.

    Returns:
      Tensor of shape ``(batch, query_len, embed_dim)``.
    """
    # Shapes are kept in locals: forward() should not mutate module state.
    batch_size, kv_len = key.shape[0], key.shape[1]
    query_len = query.shape[1]

    # (batch, len, embed_dim) -> (batch, len, heads, head_dim)
    key = key.reshape(batch_size, kv_len, self.n_heads, self.head_dim)
    query = query.reshape(batch_size, query_len, self.n_heads, self.head_dim)
    val = val.reshape(batch_size, kv_len, self.n_heads, self.head_dim)

    # Project, then move heads in front of the sequence axis:
    # (batch, heads, len, head_dim)
    k = self.k_mat(key).permute(0,2,1,3)
    q = self.q_mat(query).permute(0,2,1,3)
    v = self.v_mat(val).permute(0,2,1,3)

    # (batch, heads, query_len, kv_len), scaled by sqrt(head_dim)
    scores = torch.matmul(q, k.transpose(-2, -1)) / math.sqrt(self.head_dim)

    if mask is not None:
      # A very negative score becomes a zero weight after the softmax.
      scores = scores.masked_fill(mask == 0, float("-1e20"))

    # Attention weights must be probabilities that sum to 1 over the keys.
    # The previous log_softmax produced non-positive log-probabilities and
    # used them as mixing weights.
    weights = F.softmax(scores, dim=-1)

    z = torch.matmul(weights, v)
    # (batch, heads, query_len, head_dim) -> (batch, query_len, embed_dim)
    z = z.permute(0,2,1,3).contiguous().view(
        batch_size, query_len, self.n_heads * self.head_dim
    )

    return self.linear(z)


In [None]:
class EncoderBlock(nn.Module):
  """One encoder layer: attention and a feed-forward net, each followed by
  residual add, LayerNorm and dropout (p=0.2).

  Args:
    n_heads: number of attention heads.
    embed_dim: feature width of the inputs and outputs.
    expansion: the feed-forward hidden layer is ``embed_dim * expansion`` wide.
  """
  def __init__(self, n_heads, embed_dim, expansion):
    super().__init__()
    self.n_heads = n_heads
    self.embed_dim = embed_dim
    hidden_dim = embed_dim * expansion

    # Sub-modules are created in the same order as before so that seeded
    # initialisation and state_dict key order do not change.
    self.attention = SelfAttention(n_heads, embed_dim)
    self.ffnn = nn.Sequential(
        nn.Linear(embed_dim, hidden_dim),
        nn.ReLU(),
        nn.Linear(hidden_dim, embed_dim),
    )

    self.norm1 = nn.LayerNorm(embed_dim)
    self.norm2 = nn.LayerNorm(embed_dim)

    self.dropout1 = nn.Dropout(0.2)
    self.dropout2 = nn.Dropout(0.2)

  def forward(self, key, query, val):
    """Run the layer; the attention residual is taken from ``val``."""
    # Attention sub-layer: residual + norm, then dropout.
    attended = self.attention(key, query, val)
    attended = self.dropout1(self.norm1(attended + val))

    # Feed-forward sub-layer: residual + norm, then dropout.
    transformed = self.ffnn(attended)
    return self.dropout2(self.norm2(transformed + attended))

class Encoder(nn.Module):
  """Token plus learned-position embeddings followed by a stack of
  ``EncoderBlock`` layers.

  Args:
    vocab_size: number of rows in the token embedding table.
    embed_dim: embedding / model width.
    seq_len: the position table holds ``seq_len + 1`` entries, so inputs may
      be at most ``seq_len + 1`` tokens long.
    n_heads: attention heads per block.
    expansion: feed-forward expansion factor per block.
    num_layers: number of stacked blocks.
  """
  def __init__(self, vocab_size, embed_dim, seq_len, n_heads = 8, expansion = 4, num_layers = 4):
    super(Encoder, self).__init__()

    self.num_layers = num_layers
    self.embedding = Embeddings(vocab_size,embed_dim)
    # Learned positional embedding, indexed by position id.
    self.poe = Embeddings(seq_len+1, embed_dim)

    self.layers = nn.ModuleList([EncoderBlock(n_heads,embed_dim,expansion) for _ in range(self.num_layers)])

  def forward(self, x):
    """Encode token ids ``x`` of shape ``(batch, seq_len)``.

    Returns:
      Tensor of shape ``(batch, seq_len, embed_dim)``.
    """
    batch_size, seq_len = x.shape
    # Create position ids on the input's own device. The previous code moved
    # them to a module-level `device` global, which tied this class to a
    # variable defined elsewhere in the notebook.
    positions = torch.arange(seq_len, device=x.device).expand(batch_size, seq_len)

    x = self.embedding(x) + self.poe(positions)
    for layer in self.layers:
      x = layer(x, x, x)

    return x

In [None]:
# encoder = Encoder(1100, 512, 10, 8, 4, 2)
# import numpy
# arr = np.random.randint(0,1100,32*10).reshape(32,10)
# arr = torch.tensor(arr)
# encoder(arr).shape

In [None]:
class DecoderBlock(nn.Module):
  """One decoder layer: masked self-attention, encoder-decoder attention and
  a feed-forward net, each followed by residual add, LayerNorm and dropout
  (p=0.2).

  Args:
    n_heads: number of attention heads.
    embed_dim: feature width of the inputs and outputs.
    expansion: the feed-forward hidden layer is ``embed_dim * expansion`` wide.
  """
  def __init__(self, n_heads, embed_dim, expansion):
    super(DecoderBlock, self).__init__()
    self.attention = SelfAttention(n_heads, embed_dim)
    self.ed_attention = SelfAttention(n_heads, embed_dim)
    self.ffnn = nn.Sequential(
        nn.Linear(embed_dim, embed_dim*expansion),
        nn.ReLU(),
        nn.Linear(embed_dim*expansion, embed_dim)
    )

    self.dropout1 = nn.Dropout(0.2)
    self.dropout2 = nn.Dropout(0.2)
    self.dropout3 = nn.Dropout(0.2)

    self.norm1 = nn.LayerNorm(embed_dim)
    self.norm2 = nn.LayerNorm(embed_dim)
    self.norm3 = nn.LayerNorm(embed_dim)

  def forward(self, key, query, val, mask):
    """Run the layer.

    Args:
      key: keys for the encoder-decoder attention.
      query: decoder-side sequence; also used as key, query and value of the
        masked self-attention.
      val: values for the encoder-decoder attention.
      mask: mask for the self-attention; positions where it equals 0 are
        hidden.

    Returns:
      Tensor with the same shape as ``query``.
    """
    # Masked self-attention over the decoder sequence.
    attention = self.attention(query, query, query, mask)
    attention_out = self.dropout1(self.norm1(attention + query))

    # Encoder-decoder attention. Pass the caller's `val`; it was previously
    # ignored and replaced by `key`.
    ed_attention = self.ed_attention(key, attention_out, val)
    ed_attention_norm = self.dropout2(self.norm2(ed_attention + attention_out))

    output = self.ffnn(ed_attention_norm)
    output_norm = self.dropout3(self.norm3(output + ed_attention_norm))
    return output_norm



class Decoder(nn.Module):
  """Token plus learned-position embeddings, a stack of ``DecoderBlock``
  layers and a linear projection onto the vocabulary.

  Args:
    embed_dim: embedding / model width.
    vocab_size: size of the output vocabulary.
    seq_len: the position table holds ``seq_len + 1`` entries, so inputs may
      be at most ``seq_len + 1`` tokens long.
    expansion: feed-forward expansion factor per block.
    num_layers: number of stacked blocks.
    n_heads: attention heads per block.
  """
  def __init__(self, embed_dim, vocab_size, seq_len, expansion=4, num_layers=2, n_heads=8):
    super(Decoder, self).__init__()

    # Learned positional embedding, indexed by position id.
    self.poe = Embeddings(seq_len+1, embed_dim)
    self.embedding = Embeddings(vocab_size, embed_dim)
    self.layers = nn.ModuleList([DecoderBlock(n_heads, embed_dim, expansion) for _ in range(num_layers)])
    self.fc = nn.Linear(embed_dim, vocab_size)
    self.dropout = nn.Dropout(0.2)

  def forward(self, encoder_output, x, mask):
    """Score the vocabulary at every position of ``x``.

    Args:
      encoder_output: encoder states used as keys and values of the
        encoder-decoder attention in every block.
      x: decoder input token ids of shape ``(batch, seq_len)``.
      mask: self-attention mask forwarded to every block.

    Returns:
      Unnormalized scores (logits) of shape ``(batch, seq_len, vocab_size)``.
      ``argmax`` / ``topk`` over the last axis select the same tokens as they
      would on softmax probabilities.
    """
    batch_size, seq_len = x.shape
    # Create position ids on the input's own device instead of relying on a
    # module-level `device` global.
    positions = torch.arange(seq_len, device=x.device).expand(batch_size, seq_len)

    x = self.embedding(x) + self.poe(positions)
    x = self.dropout(x)

    for layer in self.layers:
      x = layer(encoder_output, x, encoder_output, mask)

    # Return raw logits. The previous F.softmax(...) had no `dim`, which for
    # a 3-D tensor normalizes over axis 0 (the batch), and its result was
    # then passed to nn.CrossEntropyLoss, which applies log-softmax itself
    # and expects unnormalized scores.
    return self.fc(x)

In [None]:
# arr = torch.tensor(np.random.randint(0,1100,32*1).reshape(32,1), dtype=torch.long)
# enc = torch.tensor(np.random.randint(0,1100,32*10*512).reshape(32,10,512), dtype=torch.float)
# decoder = Decoder(512,1100,10,4,2,8)

In [None]:
# decoder(enc, arr,None).shape

In [None]:
class Transformer(nn.Module):
  """Encoder-decoder wrapper with a training forward pass and greedy
  decoding.

  Args:
    encoder: module called as ``encoder(src)``.
    decoder: module called as ``decoder(enc_out, tokens, mask)`` that returns
      per-position vocabulary scores of shape ``(batch, len, vocab)``.
    embed_dim: model width (stored only).
    seq_len: sequence length (stored only).
    device: stored as ``self.device``.
  """
  def __init__(self, encoder, decoder, embed_dim, seq_len, device):
    super(Transformer, self).__init__()
    self.seq_len = seq_len
    self.embed_dim = embed_dim
    self.encoder = encoder
    self.decoder = decoder
    self.device = device

  def forward(self,src,trg):
    """Score every position of ``trg`` given ``src`` under a causal mask.

    Returns whatever the decoder returns for ``(enc_out, trg, mask)``.
    """
    trg_mask = self.create_mask(trg)
    enc_out = self.encoder(src)
    output = self.decoder(enc_out, trg, trg_mask)

    return output


  def decode(self,src, trg, sos_token=0):
    """Greedy autoregressive decoding.

    Decoding starts from a single ``sos_token`` per batch row. At each step
    the decoder is run on the tokens generated so far, the scores of the
    last position are recorded, and their argmax is appended as the next
    input token.

    Args:
      src: source token ids of shape ``(batch, src_len)``.
      trg: kept for backward compatibility; its contents are not read, so
        no reference translation can leak into the prediction.
      sos_token: index of the start token (default 0).

    Returns:
      Tensor of shape ``(batch, src_len, vocab)`` with the decoder scores of
      each generated step. The decoder must accept inputs of up to
      ``src_len`` tokens.
    """
    enc_out = self.encoder(src)
    batch_size, num_steps = src.shape[0], src.shape[1]

    generated = torch.full(
        (batch_size, 1), sos_token, dtype=torch.long, device=src.device
    )
    step_scores = []
    for _ in range(num_steps):
      mask = self.create_mask(generated)
      out = self.decoder(enc_out, generated, mask)
      last = out[:, -1, :]
      step_scores.append(last.unsqueeze(1))
      # Feed the most likely token back in as the next decoder input.
      next_token = last.argmax(dim=-1, keepdim=True)
      generated = torch.cat([generated, next_token], dim=1)

    return torch.cat(step_scores, dim=1)


  def create_mask(self, trg):
    """Return a lower-triangular (causal) mask for ``trg``.

    The result has shape ``(batch, 1, trg_len, trg_len)``, holds 1 where
    attention is allowed and 0 elsewhere, and lives on ``trg``'s device.
    """
    batch_size, trg_len = trg.shape
    causal = torch.tril(torch.ones((trg_len, trg_len), device=trg.device))
    return causal.expand(batch_size, 1, trg_len, trg_len)

In [None]:
# t = Transformer(1100,1100,512,10,8,4,4)
# arr = torch.tensor(np.random.randint(0,1100,32*10).reshape(32,10), dtype=torch.long)
# words = t(arr, arr)
# words.shape

In [None]:
# item = t.decode(arr,arr)
# item.shape

In [None]:
# ---- hyper-parameters -------------------------------------------------------
hidden_size = 512  # passed as embed_dim to Encoder, Decoder and Transformer
lr = 0.0001  # learning rate handed to Adam below
epochs = 5  # epoch-count hyper-parameter
batch_size = 32
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
path = '/content/eng-fra.txt'  # tab-separated sentence pairs, read by read_data
MAXLENGTH = 10  # max words per sentence; sequences are padded to MAXLENGTH+1 ids
num_layers = 2
expansion = 4  # feed-forward width multiplier inside each block
n_heads = 8


# Vocabularies are built from every pair in the file; `lines` keeps only the
# pairs whose sentences have at most MAXLENGTH words.
input_lang, output_lang, lines = read_data(path,'English', 'French',MAXLENGTH)

# Note the different positional argument order of Encoder and Decoder.
encoder = Encoder(input_lang.n_words,hidden_size, MAXLENGTH,n_heads, expansion, num_layers).to(device)
decoder = Decoder(hidden_size, output_lang.n_words, MAXLENGTH, expansion, num_layers, n_heads).to(device)




# Convert the sentence pairs to padded index lists and batch them.
inputs, targets = sent_to_index(lines,input_lang, output_lang,MAXLENGTH)
data_loader = get_dataloader(inputs,targets,batch_size, device)
# _,_,data_loader = get_dataloader(batch_size)

# Used to average the summed batch losses per epoch.
num_batches = len(data_loader)

transformer = Transformer(encoder, decoder, hidden_size, MAXLENGTH, device)

# A single optimizer covers both encoder and decoder parameters, since both
# are sub-modules of `transformer`.
optimizer = torch.optim.Adam(transformer.parameters(), lr)


loss_fn = nn.CrossEntropyLoss()

In [None]:
# arr = torch.tensor(np.random.randint(0,1100,1*11).reshape(1,11), dtype=torch.long).to(device)


In [None]:
# transformer.decode(arr, arr).shape

In [None]:
# Make sure dropout is active while training.
transformer.train()

# Honour the `epochs` hyper-parameter instead of a hard-coded single epoch.
for epoch in range(epochs):
  print(f"Epoch {epoch+1} starting")
  total_loss = 0
  # `src_batch` rather than `input`, which would shadow the builtin.
  for src_batch, target in data_loader:
    # Teacher forcing: the decoder input is the target shifted right by one
    # with SOS in front, so position t is predicted from tokens before t.
    # Feeding `target` itself lets the causal mask expose the very token that
    # has to be predicted at each position.
    sos_column = torch.full_like(target[:, :1], SOS_token)
    decoder_input = torch.cat([sos_column, target[:, :-1]], dim=1)

    output = transformer(src_batch, decoder_input)

    loss = loss_fn(
            output.reshape(-1, output.size(-1)),
            target.reshape(-1)
        )
    # `optimizer` holds every parameter of the transformer; the separate
    # encoder_optimizer / decoder_optimizer names are not defined in this file.
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()

    total_loss += loss.item()

  print(f"Loss : {total_loss/num_batches}")
  print(f"Epoch {epoch+1} ended")




Epoch 1 starting
Loss : -0.03128795288554767
Epoch 1 ended


In [None]:
def evaluateRandomly(transformer, n=10):
    """Print *n* randomly chosen pairs together with the model's translation.

    For every sample three lines are printed: the source sentence prefixed
    with ``>``, the reference prefixed with ``=`` and the output of
    ``evaluate`` joined by spaces and prefixed with ``<``; a blank line
    follows. Pairs are drawn (with replacement) from the module-level
    ``lines``.
    """
    for _ in range(n):
        sample = random.choice(lines)
        print('>', sample[0])
        print('=', sample[1])
        predicted = ' '.join(evaluate(transformer, sample, input_lang, output_lang))
        print('<', predicted)
        print('')


In [None]:
def evaluate(transformer, pair, input_lang, output_lang):
    """Translate ``pair[0]`` with *transformer* and return the output words.

    Args:
        transformer: model exposing ``decode(src, trg)`` that returns scores
            of shape ``(1, steps, vocab)``.
        pair: sentence pair; only ``pair[0]`` (the source sentence) is read.
        input_lang: vocabulary used to index the source sentence.
        output_lang: vocabulary used to map predicted indices back to words.

    Returns:
        List of predicted words, cut after the first ``EOS_token`` (which is
        rendered as ``'<EOS>'``).
    """
    with torch.no_grad():
        input_ids = padd_seq(input_lang.sent_to_index(pair[0]), MAXLENGTH+1)
        input_tensor = torch.tensor(
            input_ids, dtype=torch.long, device=device
        ).view(1, -1)

        # The reference translation must not reach the model at inference
        # time. decode() still takes a second tensor, so hand it one of the
        # same shape as before, filled with the start token.
        start_tensor = torch.full_like(input_tensor, SOS_token)

        outputs = transformer.decode(input_tensor, start_tensor)

        # (1, steps, vocab) -> (steps,): best token per step for the single
        # sentence in the batch.
        decoded_ids = outputs.argmax(dim=-1)[0]

        decoded_words = []
        for idx in decoded_ids.tolist():
            if idx == EOS_token:
                decoded_words.append('<EOS>')
                break
            decoded_words.append(output_lang.index2word[idx])
    return decoded_words

In [None]:
# Switch the model to evaluation mode (turns its nn.Dropout layers off), then
# print translations for a handful of randomly drawn sentence pairs.
transformer.eval()
evaluateRandomly(transformer)

> he died yesterday
= il a clamece hier
< SOS SOS SOS SOS SOS SOS SOS SOS SOS SOS SOS

> what s the name of your insurance company ?
= quel est le nom de ta compagnie d assurance ?
< SOS SOS SOS SOS SOS SOS SOS SOS SOS SOS SOS

> he never cared much for me
= il ne s est jamais beaucoup preoccupe de moi
< SOS SOS SOS SOS SOS SOS SOS SOS SOS SOS SOS

> don t tell my girlfriend
= ne dites rien a ma petite amie !
< SOS SOS SOS SOS SOS SOS SOS SOS SOS SOS SOS

> i asked for a seat in the no smoking section
= j ai demande une place en non fumeur
< SOS SOS SOS SOS SOS SOS SOS SOS SOS SOS SOS

> please eat some cake
= s il te plait mange un peu de gateau !
< SOS SOS SOS SOS SOS SOS SOS SOS SOS SOS SOS

> i think it s time for me to shove off
= je pense qu il est temps pour moi de partir
< SOS SOS SOS SOS SOS SOS SOS SOS SOS SOS SOS

> they lie all the time
= elles mentent tout le temps
< SOS SOS SOS SOS SOS SOS SOS SOS SOS SOS SOS

> can t we keep this between us ?
= ne pouvons nous pas garder