In [1]:
import requests
from pathlib import Path

from sympy.physics.units import frequency
from torch.nn.functional import embedding


In [2]:
dir=Path("files")
dir.mkdir(parents=True, exist_ok=True)


In [3]:
req=requests.get("https://raw.githubusercontent.com/neychev/"
     "small_DL_repo/master/datasets/Multi30k/training.tar.gz")
with open("files/training.tar.gz", "wb") as f:
    f.write(req.content)


Extracting the tar file

In [1]:
import tarfile
#To read TarFile
train=tarfile.open('files/training.tar.gz')    #C
train.extractall('files',filter="fully_trusted")    #D
train.close()

In [3]:
with open("./train.de", "rb") as f:
    trainde=f.readlines()

In [4]:
with open("./train.en", "rb") as f:
    trainen=f.readlines()

#Converting the read(decompressed files into arrays of phrases seperated by Lines)

In [5]:

trainde=[i.decode("utf-8").strip() for i in trainde]

In [6]:
trainen=[i.decode("utf-8").strip() for i in trainen]

In [7]:
trainen[2900]

'A person looks out of the window on the A bus.'

SPACY:Introduction

In [8]:
import spacy,os


Setup for spacy

En->English, de->German Tokenizer

In [9]:

de_token=spacy.load('de_core_news_sm')



In [10]:
en_token=spacy.load('en_core_web_sm')



In [11]:
token_single_de=[tok.text for tok in
              de_token.tokenizer(trainde[0])]
token_single_en=[tok.text for tok in
              en_token.tokenizer(trainen[0])]
print(token_single_en)
print(token_single_de)

['Two', 'young', ',', 'White', 'males', 'are', 'outside', 'near', 'many', 'bushes', '.']
['Zwei', 'junge', 'weiße', 'Männer', 'sind', 'im', 'Freien', 'in', 'der', 'Nähe', 'vieler', 'Büsche', '.']


In [11]:
trainen[0]

'Two young, White males are outside near many bushes.'

Building A dictionary for the above Token; Each Unique word is reprensted by a unique indecx in the dictionary

In [12]:
from collections import Counter


In [13]:
token_full_en=[["BOS"]+[tok.text for tok in en_token.tokenizer(x)]+["EOS"] for x in trainen ]
token_full_de=[["BOS"]+[tok.text for tok in en_token.tokenizer(x)]+["EOS"] for x in trainde ]

English Dict

In [14]:
en_word_count=Counter()
for sent in token_full_en:
    for word in sent:
        en_word_count[word]+=1
PAD=0
UNK=1
frequency_en=en_word_count.most_common(50000)
en_total_word=len(frequency_en)+2
#Counter-> Returns the word Itself and the count

en_word_dict={w[0]:idx+2 for idx,w in enumerate(frequency_en)}
en_word_dict["PAD"]=PAD
en_word_dict["UNK"]=UNK
en_idx_dict={v:k for k,v in en_word_dict.items()}



In [15]:
endix=[en_word_dict.get(i,UNK) for i in token_single_en]
print(endix)

[19, 25, 15, 1165, 804, 17, 57, 84, 334, 1329, 5]


In [16]:
print([en_idx_dict.get(i,"UNK") for i in endix])

['Two', 'young', ',', 'White', 'males', 'are', 'outside', 'near', 'many', 'bushes', '.']


German Dict

In [17]:
de_word_count=Counter()
for sent in token_full_de:
    for word in sent:
        de_word_count[word]+=1
de_frequency=de_word_count.most_common(50000)
de_total_word=len(de_frequency)+2
de_word_dict={w[0]:idx+2 for idx,w in enumerate(de_frequency)}
de_word_dict["PAD"]=PAD
de_word_dict["UNK"]=UNK
de_idx_dict={v:k for k,v in de_word_dict.items()}

In [18]:
de_idx=[de_word_dict.get(i,UNK) for i in token_single_de]
print(de_idx)

[21, 87, 221, 33, 89, 1, 97, 7, 16, 117, 5538, 3208, 4]


In [19]:
print([en_word_dict.get(i,"Unknown") for i in token_full_en[1]])

[3, 164, 36, 7, 335, 286, 17, 1208, 2, 753, 3933, 2710, 5, 4]


In [20]:
a=[en_word_dict.get(i,"Unknown") for i in token_full_en[1]]
print([en_idx_dict.get(i,"Unknown") for i in a])

['BOS', 'Several', 'men', 'in', 'hard', 'hats', 'are', 'operating', 'a', 'giant', 'pulley', 'system', '.', 'EOS']


In [21]:
print([de_idx_dict.get(i,UNK) for i in de_idx])

['Zwei', 'junge', 'weiße', 'Männer', 'sind', 'UNK', 'Freien', 'in', 'der', 'Nähe', 'vieler', 'Büsche', '.']


Word dictionary


Key,Val=word,id


Index Dictionary


Key,val->Index,word

In [22]:
#Adding Padding So that the dictionary is of Equal Length


out_en_ids=[[en_word_dict.get(w,UNK) for w in s]
            for s in token_full_en]
out_de_ids=[[de_word_dict.get(w,UNK) for w in s]
            for s in token_full_de]


In [23]:
#Sorts the lists basesd on the lomngest german phrase : and Aligns it with the English Phrases for a Sample -Label Pairing
sorted_ids=sorted(range(len(out_de_ids)),
                  key=lambda x:len(out_de_ids[x]))

out_de_ids=[out_de_ids[x] for x in sorted_ids]
out_en_ids=[out_en_ids[x] for x in sorted_ids]

Making Training Batches

In [24]:
import numpy as np
batch_size=128
idx_list=np.arange(0,len(token_full_de),batch_size)
np.random.shuffle(idx_list)
batch_indexs=[]
for idx in idx_list:
    batch_indexs.append(np.arange(idx,min(len(token_full_de),idx+batch_size)))

#Similar to dataloader ,just more formal approach

In [25]:
len(out_en_ids)

29001

In [32]:
def seq_padding(X, padding=PAD):
    L = [len(x) for x in X]
    ML = max(L)
    padded_seq = np.array([np.concatenate([x,
                   [padding] * (ML - len(x))])
        if len(x) < ML else x for x in X])
    return padded_seq

In [27]:
import torch

In [28]:
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

In [29]:


device

device(type='cuda', index=0)

Self-Attention MASKING

In [33]:
#During Training Decoder is autoregressive via masking"
def subsequent_mask(size):
    attn_shape = (1, size, size)
    subsequent_mask = np.triu(np.ones(attn_shape),
                              k=1).astype('uint8')
    output = torch.from_numpy(subsequent_mask) == 0
    return output

def make_std_mask(tgt, pad):
    tgt_mask=(tgt != pad).unsqueeze(-2)
    output=tgt_mask & subsequent_mask(\
        tgt.size(-1)).type_as(tgt_mask.data)
    return output

class Batch:
    def __init__(self, src, trg=None, pad=0):
        src = torch.from_numpy(src).to(device).long()
        self.src = src
        self.src_mask = (src != pad).unsqueeze(-2)
        if trg is not None:
            trg = torch.from_numpy(trg).to(device).long()
            #Targ is the decoder input during training
            self.trg = trg[:, :-1]

            #Targ_y is the decoder expected output->during testing
            #Non Auto Regressive
            self.trg_y = trg[:, 1:]

            self.trg_mask = make_std_mask(self.trg, pad)
            #ntokens counts the number of real tokens
            self.ntokens = (self.trg_y != pad).data.sum()


In [31]:

batches=[]
for b in batch_indexs:

    batch_en=[out_en_ids[x] for x in b]
    batch_de=[out_de_ids[x] for x in b]

    batch_en=seq_padding(batch_en)
    batch_de=seq_padding(batch_de)
    batches.append(Batch(batch_de,batch_en))

torch.Size([128, 12])

In [34]:
print([en_idx_dict.get(i,UNK) for i in out_en_ids[100]])

['BOS', 'Looks', 'like', 'students', 'are', 'in', 'a', 'laboratory', '.', 'EOS']


In [35]:
print([de_idx_dict.get(i,UNK) for i in out_de_ids[100]])

['BOS', 'Schüler', 'in', 'einem', 'Labor', '.', 'EOS']


Word Embedding

In [36]:
print(len(de_word_dict)),print(len(en_word_dict))
src_vocab=len(de_word_dict)
trg_vocab=len(en_word_dict)
#UniqueWords int german and English Phrases

18544
10837


In [37]:
import math
from torch import nn
d_model=256

In [39]:
class Embeddings(nn.Module):
    def __init__(self,d_model,vocab):
        super().__init__()
        self.lut=nn.Embedding(vocab,d_model)
        self.d_model=d_model

    def forward(self, x):
        out=self.lut(x)*math.sqrt(self.d_model)
        return out


In [40]:
#Cool Python Facts
[1,2]*4

[1, 2, 1, 2, 1, 2, 1, 2]

# Final input to transformer
input_representation = embedding × √d_model + positional_encoding

Positonal Encoding

In [40]:
#INPUT to Positional Encoding will be the output of Embeddings of the sentences

class PositionalEncoding(nn.Module):
    def __init__(self, d_model, dropout, max_len=5000):
        super().__init__()
        self.dropout = nn.Dropout(p=dropout)
        pe = torch.zeros(max_len, d_model, device=device)
        position = torch.arange(0., max_len,
                                device=device).unsqueeze(1)
        div_term = torch.exp(torch.arange(
            0., d_model, 2, device=device)
            * -(math.log(10000.0) / d_model))
        pe_pos = torch.mul(position, div_term)
        pe[:, 0::2] = torch.sin(pe_pos)
        pe[:, 1::2] = torch.cos(pe_pos)
        pe = pe.unsqueeze(0)
        self.register_buffer('pe', pe)

    def forward(self, x):
        x = x + self.pe[:, :x.size(1)].requires_grad_(False)
        out = self.dropout(x)
        return out

tensor([[  0.],
        [  1.],
        [  2.],
        [  3.],
        [  4.],
        [  5.],
        [  6.],
        [  7.],
        [  8.],
        [  9.],
        [ 10.],
        [ 11.],
        [ 12.],
        [ 13.],
        [ 14.],
        [ 15.],
        [ 16.],
        [ 17.],
        [ 18.],
        [ 19.],
        [ 20.],
        [ 21.],
        [ 22.],
        [ 23.],
        [ 24.],
        [ 25.],
        [ 26.],
        [ 27.],
        [ 28.],
        [ 29.],
        [ 30.],
        [ 31.],
        [ 32.],
        [ 33.],
        [ 34.],
        [ 35.],
        [ 36.],
        [ 37.],
        [ 38.],
        [ 39.],
        [ 40.],
        [ 41.],
        [ 42.],
        [ 43.],
        [ 44.],
        [ 45.],
        [ 46.],
        [ 47.],
        [ 48.],
        [ 49.],
        [ 50.],
        [ 51.],
        [ 52.],
        [ 53.],
        [ 54.],
        [ 55.],
        [ 56.],
        [ 57.],
        [ 58.],
        [ 59.],
        [ 60.],
        [ 61.],
        

Multi Head Attention & Feed Forward Network

In [41]:
from copy import deepcopy

In [44]:
def attention(query, key, value, mask=None, dropout=None):
    d_k = query.size(-1)
    scores = torch.matmul(query,
              key.transpose(-2, -1)) / math.sqrt(d_k)
    if mask is not None:
        scores = scores.masked_fill(mask == 0, -1e9)
    p_attn = nn.functional.softmax(scores, dim=-1)
    if dropout is not None:
        p_attn = dropout(p_attn)
    return torch.matmul(p_attn, value), p_attn
class MultiHeadedAttention(nn.Module):
    def __init__(self, h, d_model, dropout=0.1):
        super().__init__()
        assert d_model % h == 0
        self.d_k = d_model // h
        self.h = h
        self.linears = nn.ModuleList([deepcopy(
            nn.Linear(d_model, d_model)) for i in range(4)])
        self.attn =None
        self.dropout = nn.Dropout(p=dropout)

    def forward(self, query, key, value, mask=None):
        if mask is not None:
            mask = mask.unsqueeze(1)
        nbatches = query.size(0)
        query, key, value = [l(x).view(nbatches, -1, self.h,
           self.d_k).transpose(1, 2)
        #The transpose is done to enable Matrix Multiplication->mentioned in Hugging Face

         for l, x in zip(self.linears, (query, key, value))]
        x, self.attn = attention(
            query, key, value, mask=mask, dropout=self.dropout)
        x = x.transpose(1, 2).contiguous().view(
            nbatches, -1, self.h * self.d_k)
        output = self.linears[-1](x)
        return output
class PositionwiseFeedForward(nn.Module):
    def __init__(self, d_model, d_ff, dropout=0.1):
        super().__init__()
        self.w_1 = nn.Linear(d_model, d_ff)
        self.w_2 = nn.Linear(d_ff, d_model)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        h1 = self.w_1(x)
        h2 = self.dropout(h1)
        return self.w_2(h2)



ADD & Norm Layers

In [43]:
class SublayerConnection(nn.Module):
    def __init__(self, size, dropout):
        super().__init__()
        self.norm = LayerNorm(size)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x, sublayer):
        output = x + self.dropout(sublayer(self.norm(x)))
        return output

class LayerNorm(nn.Module):
    def __init__(self, features, eps=1e-6):
        super().__init__()
        self.a_2 = nn.Parameter(torch.ones(features))
        self.b_2 = nn.Parameter(torch.zeros(features))
        self.eps = eps

    def forward(self, x):
        mean = x.mean(-1, keepdim=True)
        std = x.std(-1, keepdim=True)
        x_zscore = (x - mean) / torch.sqrt(std ** 2 + self.eps)
        output = self.a_2*x_zscore+self.b_2

        return output

Encoder Schema

In [45]:
class Encoder(nn.Module):
    def __init__(self, layer, N):
        super().__init__()
        self.layers = nn.ModuleList(
            [deepcopy(layer) for i in range(N)])
        self.norm = LayerNorm(layer.size)

    def forward(self, x, mask):
        for layer in self.layers:
            x = layer(x, mask)
            output = self.norm(x)
        return output




Encoder Layers

In [47]:
class EncoderLayer(nn.Module):
    def __init__(self, size, self_attn, feed_forward, dropout):
        super().__init__()
        self.self_attn = self_attn
        self.feed_forward = feed_forward
        self.sublayer = nn.ModuleList([deepcopy(
        SublayerConnection(size, dropout)) for i in range(2)])
        self.size = size

    def forward(self, x, mask):
        x = self.sublayer[0](x, lambda x: self.self_attn(x, x, x, mask))
        output = self.sublayer[1](x, self.feed_forward)
        return output

Decoder Schema

In [49]:
  class Decoder(nn.Module):
    def __init__(self, layer, N):
        super().__init__()
        self.layers = nn.ModuleList(
            [deepcopy(layer) for i in range(N)])
        self.norm = LayerNorm(layer.size)


    def forward(self, x, memory, src_mask, tgt_mask):
        for layer in self.layers:
            x = layer(x, memory, src_mask, tgt_mask)
        output = self.norm(x)
        return output

Decoder Layers

In [52]:
class DecoderLayer(nn.Module):
    def __init__(self, size, self_attn, src_attn,
                 feed_forward, dropout):
        super().__init__()
        self.size = size
        self.self_attn = self_attn
        self.src_attn = src_attn
        self.feed_forward = feed_forward
        self.sublayer = nn.ModuleList([deepcopy(
        SublayerConnection(size, dropout)) for i in range(3)])

    def forward(self, x, memory, src_mask, tgt_mask):
        x = self.sublayer[0](x, lambda x:
                 self.self_attn(x, x, x, tgt_mask))
        x = self.sublayer[1](x, lambda x:
                 self.src_attn(x, memory, memory, src_mask))
        output = self.sublayer[2](x, self.feed_forward)
        return output

Genarator

In [53]:
class Generator(nn.Module):
    def __init__(self, d_model, vocab):
        super().__init__()
        self.proj = nn.Linear(d_model, vocab)

    def forward(self, x):
        out = self.proj(x)
        probs = nn.functional.log_softmax(out, dim=-1)
        return probs

Transformer Combining All

In [54]:
class Transformer(nn.Module):
    def __init__(self, encoder, decoder,
                 src_embed, tgt_embed, generator):
        super().__init__()
        self.encoder = encoder
        self.decoder = decoder
        self.src_embed = src_embed
        self.tgt_embed = tgt_embed
        self.generator = generator

    def encode(self, src, src_mask):
        return self.encoder(self.src_embed(src), src_mask)

    def decode(self, memory, src_mask, tgt, tgt_mask):
        return self.decoder(self.tgt_embed(tgt),
                            memory, src_mask, tgt_mask)

    def forward(self, src, tgt, src_mask, tgt_mask):
        memory = self.encode(src, src_mask)
        output = self.decode(memory, src_mask, tgt, tgt_mask)
        return output

Final Model

In [55]:
def create_model(src_vocab, tgt_vocab, N, d_model,
                 d_ff, h, dropout=0.1):
    attn=MultiHeadedAttention(h, d_model).to(device)

    ff=PositionwiseFeedForward(d_model, d_ff, dropout).to(device)

    pos=PositionalEncoding(d_model, dropout).to(device )

    model = Transformer(
        Encoder(EncoderLayer(d_model,deepcopy(attn),deepcopy(ff),
                             dropout).to(device),N).to(device),
        Decoder(DecoderLayer(d_model,deepcopy(attn),
             deepcopy(attn),deepcopy(ff), dropout).to(device),
                N).to(device),
        nn.Sequential(Embeddings(d_model, src_vocab).to(device),
                      deepcopy(pos)),
        nn.Sequential(Embeddings(d_model, tgt_vocab).to(device),
                      deepcopy(pos)),
        Generator(d_model, tgt_vocab)).to(device)
    for p in model.parameters():
        if p.dim() > 1:
            nn.init.xavier_uniform_(p)
    return model.to(device)

In [57]:
model = create_model(src_vocab, trg_vocab, N=6,
    d_model=256, d_ff=1024, h=8, dropout=0.1)

Loss Functions

LOSS & OPTIMIZER

In [58]:
class LabelSmoothing(nn.Module):
    def __init__(self, size, padding_idx, smoothing=0.0):
        super().__init__()
        self.criterion = nn.KLDivLoss(reduction='sum')
        self.padding_idx = padding_idx
        self.confidence = 1.0 - smoothing
        self.smoothing = smoothing
        self.size = size
        self.true_dist = None

    def forward(self, x, target):
        assert x.size(1) == self.size
        true_dist = x.data.clone()
        true_dist.fill_(self.smoothing / (self.size - 2))
        true_dist.scatter_(1,
               target.data.unsqueeze(1), self.confidence)
        true_dist[:, self.padding_idx] = 0
        mask = torch.nonzero(target.data == self.padding_idx)
        if mask.dim() > 0:
            true_dist.index_fill_(0, mask.squeeze(), 0.0)
        self.true_dist = true_dist
        output = self.criterion(x, true_dist.detach())
        return output

In [59]:
class NoamOpt:
    def __init__(self, model_size, factor, warmup, optimizer):
        self.optimizer = optimizer
        self._step = 0
        self.warmup = warmup
        self.factor = factor
        self.model_size = model_size
        self._rate = 0

    def step(self):
        self._step += 1
        rate = self.rate()
        for p in self.optimizer.param_groups:
            p['lr'] = rate
        self._rate = rate
        self.optimizer.step()

    def rate(self, step=None):
        if step is None:
            step = self._step
        output = self.factor * (self.model_size ** (-0.5) *
        min(step ** (-0.5), step * self.warmup ** (-1.5)))
        return output

In [60]:
class SimpleLossCompute:
    def __init__(self, generator, criterion, opt=None):
        self.generator = generator
        self.criterion = criterion
        self.opt = opt

    def __call__(self, x, y, norm):
        x = self.generator(x)
        loss = self.criterion(x.contiguous().view(-1, x.size(-1)),
                              y.contiguous().view(-1)) / norm
        loss.backward()
        if self.opt is not None:
            self.opt.step()
            self.opt.optimizer.zero_grad()
        return loss.data.item() * norm.float()

In [61]:
optimizer = NoamOpt(256, 1, 2000, torch.optim.Adam(
    model.parameters(), lr=0, betas=(0.9, 0.98), eps=1e-9))
criterion = LabelSmoothing(trg_vocab,
                           padding_idx=0, smoothing=0.0)
loss_func = SimpleLossCompute(
            model.generator, criterion, optimizer)

In [63]:
model.load_state_dict(torch.load("files/de2en.pth"))

<All keys matched successfully>

The training Process

In [68]:
for epoch in range(50):
    model.train()
    tloss=0
    tokens=0
    for batch in batches:
        out = model(batch.src, batch.trg,
                    batch.src_mask, batch.trg_mask)
        loss = loss_func(out, batch.trg_y, batch.ntokens)
        tloss += loss
        tokens += batch.ntokens
    print(f"Epoch {epoch}, average loss: {tloss/tokens}")
torch.save(model.state_dict(),"files/de2en.pth")

Epoch 0, average loss: 7.292831897735596
Epoch 1, average loss: 4.555441856384277
Epoch 2, average loss: 3.8222193717956543
Epoch 3, average loss: 3.1755287647247314
Epoch 4, average loss: 2.642914056777954
Epoch 5, average loss: 2.3091909885406494
Epoch 6, average loss: 2.0936484336853027
Epoch 7, average loss: 1.92238450050354
Epoch 8, average loss: 1.78951895236969
Epoch 9, average loss: 1.6443781852722168
Epoch 10, average loss: 1.494490623474121
Epoch 11, average loss: 1.363502025604248
Epoch 12, average loss: 1.2579774856567383
Epoch 13, average loss: 1.1630743741989136
Epoch 14, average loss: 1.081270694732666
Epoch 15, average loss: 1.0096262693405151
Epoch 16, average loss: 0.9453006982803345
Epoch 17, average loss: 0.8964048624038696
Epoch 18, average loss: 0.8463780283927917
Epoch 19, average loss: 0.8027791380882263
Epoch 20, average loss: 0.7681121230125427
Epoch 21, average loss: 0.7279702425003052
Epoch 22, average loss: 0.6965804696083069
Epoch 23, average loss: 0.66397

In [64]:
def de2en(ger):
    tokenized_ger= [tok.text for tok in de_token.tokenizer(ger)]
    tokenized_ger=["BOS"]+tokenized_ger+["EOS"]
    geridx=[de_word_dict.get(i,UNK) for i in tokenized_ger]
    src=torch.tensor(geridx).long().to(device).unsqueeze(0)
    src_mask=(src!=0).unsqueeze(-2)
    memory=model.encode(src,src_mask)    #A
    start_symbol=en_word_dict["BOS"]
    ys = torch.ones(1, 1).fill_(start_symbol).type_as(src.data)
    translation=[]
    for i in range(100):
        out = model.decode(memory,src_mask,ys,
        subsequent_mask(ys.size(1)).type_as(src.data))
        prob = model.generator(out[:, -1])    #B
        _, next_word = torch.max(prob, dim=1)
        next_word = next_word.data[0]
        ys = torch.cat([ys, torch.ones(1, 1).type_as(
            src.data).fill_(next_word)], dim=1)
        sym = en_idx_dict[ys[0, -1].item()]
        if sym != 'EOS':    #C
            translation.append(sym)
        else:
            break
    trans=" ".join(translation)
    for x in '''?:;.,'("-!&)%''':
        trans=trans.replace(f" {x}",f"{x}")    #D
    return trans

In [96]:
trainde[2900]

'Eine Person blickt aus dem Fenster auf einen Bus der Linie A.'

In [124]:
de2en("Ich gehe jetzt nach Hause ")

'I am going home.'

In [92]:
trainen[2900]

'A person looks out of the window on the A bus.'

29001