In [1]:
import torch
import torch.nn as nn
import torch.optim as optim

from torchtext.legacy.datasets import Multi30k
from torchtext.legacy.data import Field, BucketIterator

import spacy
import numpy as np

import random
import math
import time

In [2]:
def setSeed(seed):
  torch.manual_seed(seed)
  random.seed(seed)
  np.random.seed(seed)
  torch.cuda.manual_seed(seed)


In [3]:
setSeed(1234)

In [5]:
!pip3 install https://github.com/explosion/spacy-models/releases/download/de_core_news_sm-3.0.0/de_core_news_sm-3.0.0.tar.gz

Collecting https://github.com/explosion/spacy-models/releases/download/de_core_news_sm-3.0.0/de_core_news_sm-3.0.0.tar.gz
[?25l  Downloading https://github.com/explosion/spacy-models/releases/download/de_core_news_sm-3.0.0/de_core_news_sm-3.0.0.tar.gz (19.3MB)
[K     |████████████████████████████████| 19.3MB 1.5MB/s 
[?25hCollecting spacy<3.1.0,>=3.0.0
[?25l  Downloading https://files.pythonhosted.org/packages/1b/d8/0361bbaf7a1ff56b44dca04dace54c82d63dad7475b7d25ea1baefafafb2/spacy-3.0.6-cp37-cp37m-manylinux2014_x86_64.whl (12.8MB)
[K     |████████████████████████████████| 12.8MB 233kB/s 
[?25hCollecting spacy-legacy<3.1.0,>=3.0.4
  Downloading https://files.pythonhosted.org/packages/8d/67/d4002a18e26bf29b17ab563ddb55232b445ab6a02f97bf17d1345ff34d3f/spacy_legacy-3.0.5-py2.py3-none-any.whl
Collecting pydantic<1.8.0,>=1.7.1
[?25l  Downloading https://files.pythonhosted.org/packages/ca/fa/d43f31874e1f2a9633e4c025be310f2ce7a8350017579e9e837a62630a7e/pydantic-1.7.4-cp37-cp37m-manylin

In [6]:
!pip3 install https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.0.0/en_core_web_sm-3.0.0.tar.gz

Collecting https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.0.0/en_core_web_sm-3.0.0.tar.gz
[?25l  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.0.0/en_core_web_sm-3.0.0.tar.gz (13.7MB)
[K     |████████████████████████████████| 13.7MB 221kB/s 
Building wheels for collected packages: en-core-web-sm
  Building wheel for en-core-web-sm (setup.py) ... [?25l[?25hdone
  Created wheel for en-core-web-sm: filename=en_core_web_sm-3.0.0-cp37-none-any.whl size=13704313 sha256=a6a89e534c0726b163cf2e896d0903e5238fa16e778fd3ef07d81fd5c74d72c5
  Stored in directory: /root/.cache/pip/wheels/91/2b/a1/d83336e8dfaacbbcdfc805b2c7195dd3ea10d507396fe31cac
Successfully built en-core-web-sm
Installing collected packages: en-core-web-sm
  Found existing installation: en-core-web-sm 2.2.5
    Uninstalling en-core-web-sm-2.2.5:
      Successfully uninstalled en-core-web-sm-2.2.5
Successfully installed en-core-web-sm-3.0.0


In [4]:
en=spacy.load('en_core_web_sm')
de=spacy.load('de_core_news_sm')

In [5]:
def tokenize_en(text):
  return [tok.text for tok in en.tokenizer(text)]

def tokenize_de(text):
  return [tok.text for tok in de.tokenizer(text)]  

In [None]:
tokenize_en('hello my friend')

['hello', 'my', 'friend']

In [6]:
SRC=Field(init_token='sos',eos_token='eos',lower=True, tokenize=tokenize_en)
TRG=Field(init_token='sos',eos_token='eos',lower=True, tokenize=tokenize_de)

In [7]:
train_data,valid_data,test_data=Multi30k.splits(exts=('.en','.de'),fields=(SRC,TRG))

In [18]:
SRC.build_vocab(train_data,min_freq=2)
TRG.build_vocab(train_data,min_freq=2)

In [9]:
device=torch.device('cuda') if torch.cuda.is_available()else torch.device('cpu')

In [10]:
train_loader,valid_loader,test_loader=BucketIterator.splits((train_data,valid_data,test_data),batch_size=128,device=device)

In [11]:
len(train_loader)

227

In [11]:
class Encoder(nn.Module):
  def __init__(self, embed_size, enc_hid_size,dec_hid_size,input_dim,dropout):
    super().__init__()
    self.embedding=nn.Embedding(input_dim,embed_size)
    self.rnn=nn.GRU(embed_size,hidden_size=enc_hid_size,bidirectional=True,dropout=dropout)
    self.dropout=nn.Dropout(dropout)
    self.fc=nn.Linear(enc_hid_size*2,dec_hid_size)
    self.enc_hid_size=enc_hid_size
    self.dec_hid_size=dec_hid_size
  def forward(self,input):
    embed=self.dropout(self.embedding(input))
    #output of shape (seq_len, batch, num_directions * hidden_size)
    #hidden of shape (num_layers * num_directions, batch, hidden_size)(last hidden state)
    output,hidden=self.rnn(embed)
    hidden=self.fc(torch.cat((hidden[-2,:,:],hidden[-1,:,:]),dim=1))
    return output,hidden

In [12]:
class Attention(nn.Module):
  def __init__(self,enc_hid_size,dec_hid_size):
      super().__init__()
      self.att=nn.Linear((enc_hid_size * 2) + dec_hid_size,dec_hid_size)
      self.v=nn.Linear(dec_hid_size,1,bias=False)
  def forward(self,encoder_outputs,hidden):
    hidden=hidden.unsqueeze(1)
    seq_len=encoder_outputs.shape[0]
    hidden=hidden.repeat(1,seq_len,1)
    energy=torch.tanh(self.att(torch.cat((encoder_outputs.permute(1,0,2),hidden),dim=2)))
    weighted=self.v(energy)
    ##shape of [batch  seq_len  1]
    return nn.functional.softmax(weighted,dim=1)

    

In [13]:
class Decoder(nn.Module):
  def __init__(self,embed_size,enc_hid_size,dec_hid_size,output_dim,dropout,attn):
    super().__init__()
    self.output_dim=output_dim
    self.attn=attn
    self.embedding=nn.Embedding(output_dim,embed_size)
    self.rnn=nn.GRU(embed_size+(2*enc_hid_size),hidden_size=dec_hid_size,dropout=dropout)
    self.fc=nn.Linear(embed_size+(2*enc_hid_size)+dec_hid_size,output_dim)
    self.dropout=nn.Dropout(dropout)
  def forward(self,input,encoder_outputs,hidden):
    #shape of (batch   seq_len      hidden_size*2)

    input=input.unsqueeze(0)
    embed=self.dropout(self.embedding(input))
    #shape of [batch  1  seq_len]
    att= (self.attn(encoder_outputs,hidden))
    ##shape of [batch     2*enc_hid_size   1]
    w=torch.bmm(encoder_outputs.permute(1,2,0),att)
    #embed of shape[1 batch     embedsize]
    #hidden of shape([1, batch,     hiddensize])
    hidden =hidden.unsqueeze(0)
    output, hidden=self.rnn(torch.cat((embed.squeeze(0),w.squeeze(2)),dim=1).unsqueeze(0),hidden)
    #hidden=hidden.unsqueeze(0)
    ##shape of [1 batch output_dim]
    predictions=self.fc(torch.cat((hidden,embed,w.permute(2,0,1)),dim=-1))
    return predictions.squeeze(0),hidden.squeeze(0)





In [14]:
class seq2seq(nn.Module):
  def __init__(self,encoder,decoder,device):
     super().__init__()
     self.encoder=encoder
     self.decoder=decoder
     self.device=device
  def forward(self,src,trg,teacher_force=0.5):
    encoder_outputs,hidden=self.encoder(src)  
    seqlen=trg.shape[0]
    batch=src.shape[1]
    output_dim=self.decoder.output_dim
    input=trg[0]
    predict=torch.zeros((seqlen,batch,output_dim),device=self.device)
    pred,hidden=self.decoder(input,encoder_outputs,hidden)
    for i in range(1,seqlen):
      predict[i,:,:]=pred
      if random.random()<teacher_force:
        input=trg[i]
      else:
        input=pred.argmax(-1)
      pred,hidden=self.decoder(input,encoder_outputs,hidden)
    return predict


In [19]:
input_dim,output_dim=len(SRC.vocab),len(TRG.vocab)
embed_size=256
enc_hid_size,dec_hid_size=512,512
dropout=0.5
encoder=Encoder(embed_size,enc_hid_size,dec_hid_size,input_dim,dropout)
attn=Attention(enc_hid_size,dec_hid_size)

decoder=Decoder(embed_size,enc_hid_size,dec_hid_size,output_dim,dropout,attn)

model=seq2seq(encoder,decoder,device).to(device=device)

optimizor=optim.Adam(model.parameters())
pad=TRG.vocab.stoi[TRG.pad_token]
criterion=nn.CrossEntropyLoss(ignore_index=pad)


  "num_layers={}".format(dropout, num_layers))


In [30]:
model

seq2seq(
  (encoder): Encoder(
    (embedding): Embedding(5893, 256)
    (rnn): GRU(256, 512, dropout=0.5, bidirectional=True)
    (dropout): Dropout(p=0.5, inplace=False)
    (fc): Linear(in_features=1024, out_features=512, bias=True)
  )
  (decoder): Decoder(
    (attn): Attention(
      (att): Linear(in_features=1024, out_features=512, bias=True)
      (v): Linear(in_features=512, out_features=1, bias=False)
    )
    (embedding): Embedding(7853, 256)
    (rnn): GRU(1280, 512, dropout=0.5)
    (fc): Linear(in_features=1792, out_features=7853, bias=True)
  )
)

In [20]:
def init_weight(model):
  for name,param in model.named_parameters():
    nn.init.uniform(param.data,-0.08,0.08)

model.apply(init_weight)

  This is separate from the ipykernel package so we can avoid doing imports until


seq2seq(
  (encoder): Encoder(
    (embedding): Embedding(5893, 256)
    (rnn): GRU(256, 512, dropout=0.5, bidirectional=True)
    (dropout): Dropout(p=0.5, inplace=False)
    (fc): Linear(in_features=1024, out_features=512, bias=True)
  )
  (decoder): Decoder(
    (attn): Attention(
      (att): Linear(in_features=1536, out_features=512, bias=True)
      (v): Linear(in_features=512, out_features=1, bias=False)
    )
    (embedding): Embedding(7853, 256)
    (rnn): GRU(1280, 512, dropout=0.5)
    (fc): Linear(in_features=1792, out_features=7853, bias=True)
    (dropout): Dropout(p=0.5, inplace=False)
  )
)

In [21]:
def train(model,train_loader,optimizor,criterion,clip=1):
  model.train()
  total_loss=0
  for i , batch in enumerate(train_loader):
    optimizor.zero_grad()
    predictions=model(batch.src,batch.trg)
    output_dim=predictions.shape[2]
    predictions,trg=predictions.view(-1,output_dim),batch.trg.view(-1)
    loss=criterion(predictions,trg)
    loss.backward()
    torch.nn.utils.clip_grad_norm_(model.parameters(), clip)
    optimizor.step()
    total_loss+=loss
  return total_loss/len(train_loader),torch.exp(total_loss/len(train_loader))


In [None]:
train(model,train_loader,optimizor,criterion)

In [23]:
def evaluate(model,valid_loader,criterion):
  model.eval()
  total_loss=0
  for i , batch in enumerate(valid_loader):
   
    predictions=model(batch.src,batch.trg,0)
    output_dim=predictions.shape[2]
    predictions,trg=predictions.view(-1,output_dim),batch.trg.view(-1)
    loss=criterion(predictions,trg)
   
    total_loss+=loss
  return total_loss/len(valid_loader),torch.exp(total_loss/len(valid_loader))


In [24]:
best_loss=float('inf')
for epoch in range(4):
  loss_train,prep_train=train(model,train_loader,optimizor,criterion)
  loss_val,prep_val=evaluate(model,valid_loader,criterion)
  if loss_val<best_loss:
    best_loss=loss_val
    torch.save(model.state_dict(),'seq2seq.pt')
  print(f'\tTrain Loss: {loss_train:.3f} | Train PPL: {prep_train:7.3f}')
  print(f'\t Val. Loss: {loss_val:.3f} |  Val. PPL: {prep_val:7.3f}')



	Train Loss: 4.810 | Train PPL: 122.694
	 Val. Loss: 4.187 |  Val. PPL:  65.795
	Train Loss: 3.548 | Train PPL:  34.740
	 Val. Loss: 3.686 |  Val. PPL:  39.885
	Train Loss: 3.018 | Train PPL:  20.447
	 Val. Loss: 3.447 |  Val. PPL:  31.421
	Train Loss: 2.650 | Train PPL:  14.153
	 Val. Loss: 3.425 |  Val. PPL:  30.732
