<a href="https://colab.research.google.com/github/Elman295/English-to-Persian-Translator-with-LSTM/blob/main/Translator_LSTM.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import numpy as np
import matplotlib.pyplot as plt
import zipfile
import os
import pandas as pd
from sklearn.model_selection import train_test_split
import random

import torch
import torchtext
from torchtext.data.utils import get_tokenizer
from torchtext.vocab import build_vocab_from_iterator
import torchtext.transforms as T
from torch.utils.data import DataLoader, Dataset
from torch import nn



In [2]:
if torch.cuda.is_available():
  device = torch.device("cuda")
else:
  device = torch.device("cpu")

#`Data`

In [3]:
with zipfile.ZipFile("pes-eng.zip","r") as f:
  f.extractall("data")

In [4]:
pes_path = os.path.join("data","pes.txt")

In [5]:
file = open(pes_path, "r")

In [6]:
file.read()




In [7]:
data_csv = pd.read_csv(pes_path, sep="\t",names=["en","pr","att"])

In [8]:
del data_csv["att"]
data_csv.tail(3)

Unnamed: 0,en,pr
3227,The difference between the right word and almo...,تفاوت بین کلمهٔ صحیح و کلمهٔ تقریباً صحیح مانن...
3228,If you talk to a man in a language he understa...,اگر با یک فرد با زبانی که می‌فهمد حرف بزنید، ب...
3229,Don't lend books; no one gives them back. The ...,کتاب‌ها را امانت ندهید؛ هیچکس آنها را پس نمی‌د...


In [9]:
len(np.array(data_csv["en"]))

3230

In [10]:
tokenizer = get_tokenizer("basic_english")

In [11]:
def find_max(data):
  l = []
  for text in data:
    l.append(len(tokenizer(text)))
  return max(l)

In [12]:
find_max(data_csv["en"])

35

In [13]:
find_max(data_csv["pr"])

31

In [14]:
def token_generator(data):
  for text in data:
    yield tokenizer(text)

In [15]:
vocab_en = build_vocab_from_iterator(
    iterator = token_generator(data_csv["en"]),
    specials = ["<pad>","<sos>","<eos>","<unk>"],
    special_first = True
)
vocab_en.set_default_index(vocab_en["<unk>"])

vocab_pr = build_vocab_from_iterator(
    iterator = token_generator(data_csv["pr"]),
    specials = ["<pad>","<sos>","<eos>","<unk>"],
    special_first = True
)
vocab_pr.set_default_index(vocab_pr["<unk>"])

In [16]:
tfms_en = T.Sequential(
    T.VocabTransform(vocab_en),
    T.AddToken(1,begin = True),
    T.AddToken(2, begin = False),
    T.ToTensor(),
    T.PadTransform(max_length=37, pad_value=0)
)

tfms_pr = T.Sequential(
    T.VocabTransform(vocab_pr),
    T.AddToken(1,begin = True),
    T.AddToken(2, begin = False),
    T.ToTensor(),
    T.PadTransform(max_length=33, pad_value=0)
)

In [17]:
# def add_token_first_final(vector, sos = 1,eos = 2):
#   vector = list(vector.numpy())
#   return torch.tensor([sos] + vector + [eos])

In [18]:
text = "My name is Elman"
res = tfms_en(tokenizer(text))
res

tensor([  1,  27, 262,  11,   3,   2,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0,   0,   0])

In [19]:
text = "اسم من ائلمان هست"
res = tfms_pr(tokenizer(text))
res

tensor([  1, 866,   7,   3, 813,   2,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0])

In [20]:
class EN_2_PR(Dataset):

  def __init__(self,en,pr,en_tfms,pr_tfms):
    self.en = np.array(en)
    self.pr = np.array(pr)
    self.en_T = en_tfms
    self.pr_T = pr_tfms
  def __len__(self):
    return len(self.en)

  def __getitem__(self, idx):

    en_text = self.en[idx]
    pr_text = self.pr[idx]
    en_text = self.en_T(tokenizer(en_text))
    en_pr = self.pr_T(tokenizer(pr_text))

    return en_text, en_pr



In [21]:
data = EN_2_PR(data_csv["en"], data_csv["pr"],tfms_en, tfms_pr)

In [22]:
en,pr = data[3228]
print(en.shape[0])
print(pr.shape)
print(vocab_en.lookup_tokens(list(en.numpy())))
print(vocab_pr.lookup_tokens(list(pr.numpy())))

37
torch.Size([33])
['<sos>', 'if', 'you', 'talk', 'to', 'a', 'man', 'in', 'a', 'language', 'he', 'understands', ',', 'it', 'will', 'go', 'to', 'his', 'head', '.', 'if', 'you', 'talk', 'to', 'him', 'in', 'his', 'language', ',', 'it', 'will', 'go', 'to', 'his', 'heart', '.', '<eos>']
['<sos>', 'اگر', 'با', 'یک', 'فرد', 'با', 'زبانی', 'که', 'می\u200cفهمد', 'حرف', 'بزنید،', 'به', 'مغزش', 'فرو', 'می\u200cرود', '.', 'اگر', 'با', 'زبان', 'خودش', 'با', 'او', 'حرف', 'بزنید،', 'به', 'قلبش', 'فرو', 'می\u200cرود', '.', '<eos>', '<pad>', '<pad>', '<pad>']


In [23]:
l = []
for i in range(3230):
  e,p = data[i]
  l.append(p.shape[0])

In [24]:
max(l)

33

In [25]:
train_ds, test_ds = train_test_split(data, test_size = 0.1, shuffle = True)

In [26]:
train_dl = DataLoader(dataset = train_ds, batch_size = 16, shuffle = True)
test_dl = DataLoader(dataset = test_ds, batch_size=16, shuffle = True)

In [27]:
en,pr = next(iter(train_dl))
print(en.shape)
print(pr.shape)

torch.Size([16, 37])
torch.Size([16, 33])


#`Encoder`

In [28]:
len(vocab_en)

3047

In [125]:
class Encoder(nn.Module):

  def __init__(self):
    super(Encoder, self).__init__()
    self.embed = nn.Embedding(len(vocab_en),256)
    self.dropout = nn.Dropout(0.5)
    self.lstm = nn.LSTM(256,512,batch_first = True,num_layers=2)

  def forward(self, x):
    embed = self.embed(x)
    embed_dropped = self.dropout(embed)
    output, (h,c) = self.lstm(embed_dropped)

    return h,c


In [126]:
encoder = Encoder().to(device)
r = torch.randint(high = 3000, size = (16,37)).to(device)
h2,c2 = encoder(r)
print(h2.shape)

torch.Size([2, 16, 512])


#`Decoder`

In [127]:
len(vocab_pr)

4707

In [128]:
class Decoder(nn.Module):

  def __init__(self):
    super(Decoder, self).__init__()
    self.embed = nn.Embedding(len(vocab_pr),256)
    self.dropout = nn.Dropout(0.5)
    self.mlp_0 = nn.Linear(37*512,33*256)
    self.lstm = nn.LSTM(256,512, num_layers=2, batch_first=True)
    self.mlp = nn.Linear(512,4707)

  def forward(self, x,he,ce):
    x = x.unsqueeze(1)

    embed = self.embed(x)
    embed_dropped = self.dropout(embed)

    output,(h,c) = self.lstm(embed_dropped,(he,ce))

    out = self.mlp(output)



    return out.squeeze(1),h,c

In [129]:
decoder = Decoder().to(device)
r = torch.randint(high = 3000, size = (16,)).to(device)
out,h1,c1 = decoder(r,h2,c2)
print(out.shape)
# print(out.argmax(2))
print(h1.shape)
print(c1.shape)

torch.Size([16, 4707])
torch.Size([2, 16, 512])
torch.Size([2, 16, 512])


#`Seq2Seq`

In [130]:
class Seq(nn.Module):

  def __init__(self, encoder, decoder):
    super(Seq, self).__init__()
    self.encoder = encoder
    self.decoder = decoder

  def forward(self, source, target):
    bs = target.shape[0]
    t_l = target.shape[1]
    t_dim = len(vocab_pr)
    outputs = torch.zeros(size = (t_l,bs,t_dim)).to(device)
    h,c = self.encoder(source)
    input = target[:,0]
    for t in range(1,t_l):
      output, h,c = self.decoder(input,h,c)
      outputs[t] = output
      input = output.argmax(1)
    return outputs.permute(1,0,2)








In [131]:
trans = Seq(encoder, decoder).to(device)
r1 = torch.randint(high = 3000, size = (16,37)).to(device)
r2 = torch.randint(high = 3000, size = (16,33)).to(device)
y = trans(r1,r2)
print(y.shape)

torch.Size([16, 33, 4707])


#`Loss and Optimizer`

In [132]:
opt = torch.optim.Adam(params = trans.parameters(),lr = 1e-3)

In [133]:
loss_fn = nn.CrossEntropyLoss()

#`Train LOOP`

In [134]:
len(vocab_pr)

4707

In [136]:
def train(data, model, opt, loss_fn):
  model.train()
  size = len(data.dataset)
  for b,(source, target) in enumerate(data):
    source, target = source.to(device), target.to(device)
    b = source.shape[0]

    opt.zero_grad()
    y_pred = model(source,target)
    target = target.reshape(-1)
    y_pred = y_pred.reshape(-1,len(vocab_pr))
    # print(target.shape)
    loss = loss_fn(y_pred, target)
    loss.backward()
    opt.step()


    print(f"loss:{loss.item()} [{b*len(source)} | {size}]")


In [137]:
for e in range(50):
  print(f"epoch:{e+1}=-=-=-=-=-")
  train(train_dl, trans,opt, loss_fn)

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
loss:0.9316972494125366 [256 | 2907]
loss:1.2122387886047363 [256 | 2907]
loss:0.8819832801818848 [256 | 2907]
loss:1.2212135791778564 [256 | 2907]
loss:1.1237530708312988 [256 | 2907]
loss:1.092911958694458 [256 | 2907]
loss:1.053110957145691 [256 | 2907]
loss:0.9392445087432861 [256 | 2907]
loss:1.0924593210220337 [256 | 2907]
loss:0.9646024107933044 [256 | 2907]
loss:0.9099088907241821 [256 | 2907]
loss:1.2566455602645874 [256 | 2907]
loss:1.0289567708969116 [256 | 2907]
loss:0.8989477157592773 [256 | 2907]
loss:1.0539063215255737 [256 | 2907]
loss:0.988694965839386 [256 | 2907]
loss:1.1121559143066406 [256 | 2907]
loss:0.9911696910858154 [256 | 2907]
loss:0.9973812699317932 [256 | 2907]
loss:1.277005910873413 [256 | 2907]
loss:1.1027556657791138 [256 | 2907]
loss:1.0319870710372925 [256 | 2907]
loss:1.1451553106307983 [256 | 2907]
loss:0.9551585912704468 [256 | 2907]
loss:0.9694228768348694 [256 | 2907]
loss:1.0791349

In [102]:
trans.eval()

In [181]:
en,pr = test_ds[5]
print(en)
print(pr)
print(en.shape[0])
print(pr.shape)
print(vocab_en.lookup_tokens(list(en.numpy())))
print(vocab_pr.lookup_tokens(list(pr.numpy())))

tensor([   1,    6,   23, 2013,    4,    2,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0])
tensor([   1,    7, 2023,  322,    4,    2,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0])
37
torch.Size([33])
['<sos>', 'i', 'was', 'fired', '.', '<eos>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>']
['<sos>', 'من', 'اخراج', 'شدم', '.', '<eos>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>

In [182]:
en = en.unsqueeze(0).to(device)
pr = pr.unsqueeze(0).to(device)
y = trans(en,pr)


In [183]:
y.shape

torch.Size([1, 33, 4707])

In [184]:
y.argmax(2).shape

torch.Size([1, 33])

In [185]:
t = y.argmax(2).view(33)

In [186]:
t

tensor([   0,    7, 2023,  322,    4,    2,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0], device='cuda:0')

In [187]:
print(vocab_pr.lookup_tokens(list(t.cpu().numpy())))

['<pad>', 'من', 'اخراج', 'شدم', '.', '<eos>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>']
