In [2]:
import pandas as pd
from sklearn.model_selection import train_test_split
import string
from string import digits
import re
from sklearn.utils import shuffle
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.layers import LSTM, Input, Dense,Embedding, Concatenate, TimeDistributed
from tensorflow.keras.models import Model,load_model, model_from_json
from tensorflow.keras.utils import plot_model
from tensorflow.keras.preprocessing.text import one_hot, Tokenizer
from tensorflow.keras.callbacks import EarlyStopping
import pickle as pkl
import numpy as np

In [3]:
with open('.//pol-eng//pol.txt','r',encoding="utf8") as f:
  data = f.read()

In [4]:
data[:6]

'Go.\tId'

# Data Preprocessing

In [5]:
uncleaned_data_list = data.split('\n')
len(uncleaned_data_list)
uncleaned_data_list[0]

'Go.\tIdź.\tCC-BY 2.0 (France) Attribution: tatoeba.org #2877272 (CM) & #7818294 (Hanyel)'

In [6]:
uncleaned_data_list = uncleaned_data_list[:40000]
len(uncleaned_data_list)

40000

In [7]:
english_word = []
pol_word = []
cleaned_data_list = []
for word in uncleaned_data_list:
  english_word.append(word.split('\t')[:-1][0])
  pol_word.append(word.split('\t')[:-1][1])

In [8]:
english_word

['Go.',
 'Hi.',
 'Run!',
 'Run.',
 'Run.',
 'Who?',
 'Wow!',
 'Wow!',
 'Fire!',
 'Fire!',
 'Fire!',
 'Help!',
 'Jump!',
 'Jump.',
 'Stop!',
 'Stop!',
 'Wait!',
 'Wait!',
 'Wait!',
 'Wait!',
 'Wait!',
 'Wait!',
 'Wait!',
 'Wait!',
 'Wait.',
 'Wait.',
 'Wait.',
 'Wait.',
 'Wait.',
 'Wait.',
 'Begin.',
 'Begin.',
 'Do it.',
 'Hello!',
 'Hurry!',
 'I see.',
 'I see.',
 'I try.',
 'I won!',
 'Oh no!',
 'Relax.',
 'Shoot!',
 'Shoot!',
 'Smile.',
 'Attack!',
 'Cheers!',
 'Cheers!',
 'Eat it.',
 'Freeze!',
 'Freeze!',
 'Get up.',
 'Go now.',
 'Got it?',
 'Got it?',
 'He ran.',
 'Hop in.',
 'Hug me.',
 'I fell.',
 'I know.',
 'I left.',
 'I left.',
 'I left.',
 'I left.',
 'I left.',
 'I left.',
 'I lost.',
 'I quit.',
 "I'm OK.",
 'Listen.',
 'No way!',
 'Really?',
 'Thanks.',
 'Thanks.',
 'We try.',
 'Why me?',
 'Ask Tom.',
 'Ask Tom.',
 'Awesome!',
 'Be calm.',
 'Be cool.',
 'Be fair.',
 'Be kind.',
 'Be nice.',
 'Beat it.',
 'Call me.',
 'Call us.',
 'Come in.',
 'Come in.',
 'Come in.',
 '

In [9]:
pol_word

['Idź.',
 'Cześć.',
 'Uciekaj!',
 'Biegnij.',
 'Uciekaj.',
 'Kto?',
 'O, dziamdzia zaprzała jej szadź!',
 'Łał!',
 'Pali się!',
 'Strzelaj!',
 'Ognia!',
 'Pomocy!',
 'Skacz!',
 'Skok.',
 'Stój!',
 'Zatrzymaj się!',
 'Czekaj!',
 'Zaczekaj!',
 'Poczekaj!',
 'Czekajcie!',
 'Poczekajcie!',
 'Zaczekajcie!',
 'Niech pan zaczeka!',
 'Niech pani zaczeka!',
 'Czekajcie.',
 'Zaczekaj.',
 'Czekaj.',
 'Poczekaj.',
 'Poczekajcie.',
 'Zaczekajcie.',
 'Zaczynaj.',
 'Zaczynajcie.',
 'Zrób to.',
 'Cześć.',
 'Pośpiesz się!',
 'Rozumiem.',
 'Widzę.',
 'Próbuje.',
 'Wygrałem!',
 'O nie!',
 'Wyluzuj.',
 'Strzelaj!',
 'Ognia!',
 'Uśmiech.',
 'Atak!',
 'Na zdrowie!',
 'Twoje zdrowie!',
 'Zjedz to.',
 'Stój!',
 'Nie ruszaj się!',
 'Wstawaj.',
 'Idź już.',
 'Rozumiesz?',
 'Kapujesz?',
 'On pobiegł.',
 'Wskakuj.',
 'Przytul mnie.',
 'Przewróciłam się.',
 'Wiem.',
 'Wyjechałam.',
 'Odszedłem.',
 'Wyjechałam',
 'Wyszłam.',
 'Odeszłam.',
 'Wyszedłem.',
 'Przegrałem.',
 'Wychodzę.',
 'Ze mną wszystko w porządku.',


In [10]:
language_data = pd.DataFrame(columns=['English','Polish'])
language_data['English'] = english_word
language_data['Polish'] = pol_word

In [11]:
language_data

Unnamed: 0,English,Polish
0,Go.,Idź.
1,Hi.,Cześć.
2,Run!,Uciekaj!
3,Run.,Biegnij.
4,Run.,Uciekaj.
...,...,...
39995,Tom would've done that correctly the first tim...,"Tom zrobiłby to poprawnie za pierwszym razem, ..."
39996,"Tom, Mary, John and Alice were sitting around ...","Tom, Mary, John i Alice siedzieli dookoła stoł..."
39997,We consider it the citizens' legitimate right ...,Biblioteki publiczne uznajemy za podstawowe pr...
39998,We just want to ask him a few questions about ...,Chcemy tylko zadać mu kilka pytań na temat teg...


In [12]:
language_data.to_csv('language_data.csv', index=False)

In [13]:
english_text = language_data['English'].values
polish_text = language_data['Polish'].values
len(english_text), len(polish_text)

(40000, 40000)

## Data Cleaning

In [14]:
# converting all sentences to lower case
english_text = [x.lower() for x in english_text]
polish_text = [x.lower() for x in polish_text]

In [15]:
# removing inverted commas
english_text = [re.sub("'",'',x) for x in english_text]
polish_text = [re.sub("'",'',x) for x in polish_text]

In [16]:
english_text

['go.',
 'hi.',
 'run!',
 'run.',
 'run.',
 'who?',
 'wow!',
 'wow!',
 'fire!',
 'fire!',
 'fire!',
 'help!',
 'jump!',
 'jump.',
 'stop!',
 'stop!',
 'wait!',
 'wait!',
 'wait!',
 'wait!',
 'wait!',
 'wait!',
 'wait!',
 'wait!',
 'wait.',
 'wait.',
 'wait.',
 'wait.',
 'wait.',
 'wait.',
 'begin.',
 'begin.',
 'do it.',
 'hello!',
 'hurry!',
 'i see.',
 'i see.',
 'i try.',
 'i won!',
 'oh no!',
 'relax.',
 'shoot!',
 'shoot!',
 'smile.',
 'attack!',
 'cheers!',
 'cheers!',
 'eat it.',
 'freeze!',
 'freeze!',
 'get up.',
 'go now.',
 'got it?',
 'got it?',
 'he ran.',
 'hop in.',
 'hug me.',
 'i fell.',
 'i know.',
 'i left.',
 'i left.',
 'i left.',
 'i left.',
 'i left.',
 'i left.',
 'i lost.',
 'i quit.',
 'im ok.',
 'listen.',
 'no way!',
 'really?',
 'thanks.',
 'thanks.',
 'we try.',
 'why me?',
 'ask tom.',
 'ask tom.',
 'awesome!',
 'be calm.',
 'be cool.',
 'be fair.',
 'be kind.',
 'be nice.',
 'beat it.',
 'call me.',
 'call us.',
 'come in.',
 'come in.',
 'come in.',
 'c

### Removing Punctuations

In [17]:
def remove_punc(text_list):
    table = str.maketrans('', '', string.punctuation)
    removed_punc_text = []
    for sent in text_list:
        sentence = [word.translate(table) for word in sent.split(' ')]
        removed_punc_text.append(' '.join(sentence))
    return removed_punc_text

In [18]:
english_text = remove_punc(english_text)
polish_text = remove_punc(polish_text)


In [19]:
english_text

['go',
 'hi',
 'run',
 'run',
 'run',
 'who',
 'wow',
 'wow',
 'fire',
 'fire',
 'fire',
 'help',
 'jump',
 'jump',
 'stop',
 'stop',
 'wait',
 'wait',
 'wait',
 'wait',
 'wait',
 'wait',
 'wait',
 'wait',
 'wait',
 'wait',
 'wait',
 'wait',
 'wait',
 'wait',
 'begin',
 'begin',
 'do it',
 'hello',
 'hurry',
 'i see',
 'i see',
 'i try',
 'i won',
 'oh no',
 'relax',
 'shoot',
 'shoot',
 'smile',
 'attack',
 'cheers',
 'cheers',
 'eat it',
 'freeze',
 'freeze',
 'get up',
 'go now',
 'got it',
 'got it',
 'he ran',
 'hop in',
 'hug me',
 'i fell',
 'i know',
 'i left',
 'i left',
 'i left',
 'i left',
 'i left',
 'i left',
 'i lost',
 'i quit',
 'im ok',
 'listen',
 'no way',
 'really',
 'thanks',
 'thanks',
 'we try',
 'why me',
 'ask tom',
 'ask tom',
 'awesome',
 'be calm',
 'be cool',
 'be fair',
 'be kind',
 'be nice',
 'beat it',
 'call me',
 'call us',
 'come in',
 'come in',
 'come in',
 'come in',
 'come in',
 'come in',
 'come in',
 'come in',
 'come in',
 'come on',
 'come o

In [20]:
polish_text

['idź',
 'cześć',
 'uciekaj',
 'biegnij',
 'uciekaj',
 'kto',
 'o dziamdzia zaprzała jej szadź',
 'łał',
 'pali się',
 'strzelaj',
 'ognia',
 'pomocy',
 'skacz',
 'skok',
 'stój',
 'zatrzymaj się',
 'czekaj',
 'zaczekaj',
 'poczekaj',
 'czekajcie',
 'poczekajcie',
 'zaczekajcie',
 'niech pan zaczeka',
 'niech pani zaczeka',
 'czekajcie',
 'zaczekaj',
 'czekaj',
 'poczekaj',
 'poczekajcie',
 'zaczekajcie',
 'zaczynaj',
 'zaczynajcie',
 'zrób to',
 'cześć',
 'pośpiesz się',
 'rozumiem',
 'widzę',
 'próbuje',
 'wygrałem',
 'o nie',
 'wyluzuj',
 'strzelaj',
 'ognia',
 'uśmiech',
 'atak',
 'na zdrowie',
 'twoje zdrowie',
 'zjedz to',
 'stój',
 'nie ruszaj się',
 'wstawaj',
 'idź już',
 'rozumiesz',
 'kapujesz',
 'on pobiegł',
 'wskakuj',
 'przytul mnie',
 'przewróciłam się',
 'wiem',
 'wyjechałam',
 'odszedłem',
 'wyjechałam',
 'wyszłam',
 'odeszłam',
 'wyszedłem',
 'przegrałem',
 'wychodzę',
 'ze mną wszystko w porządku',
 'słuchaj',
 'nie mam mowy',
 'naprawdę',
 'dziękuję',
 'dzięki',
 '

In [21]:
clear_digits = str.maketrans('', '', digits)
clear_digits_text = []

In [22]:
for sent in english_text:
    sentence = [word.translate(clear_digits) for word in sent.split(' ')]
    clear_digits_text.append(' '.join(sentence))

In [23]:
clear_digits_text

['go',
 'hi',
 'run',
 'run',
 'run',
 'who',
 'wow',
 'wow',
 'fire',
 'fire',
 'fire',
 'help',
 'jump',
 'jump',
 'stop',
 'stop',
 'wait',
 'wait',
 'wait',
 'wait',
 'wait',
 'wait',
 'wait',
 'wait',
 'wait',
 'wait',
 'wait',
 'wait',
 'wait',
 'wait',
 'begin',
 'begin',
 'do it',
 'hello',
 'hurry',
 'i see',
 'i see',
 'i try',
 'i won',
 'oh no',
 'relax',
 'shoot',
 'shoot',
 'smile',
 'attack',
 'cheers',
 'cheers',
 'eat it',
 'freeze',
 'freeze',
 'get up',
 'go now',
 'got it',
 'got it',
 'he ran',
 'hop in',
 'hug me',
 'i fell',
 'i know',
 'i left',
 'i left',
 'i left',
 'i left',
 'i left',
 'i left',
 'i lost',
 'i quit',
 'im ok',
 'listen',
 'no way',
 'really',
 'thanks',
 'thanks',
 'we try',
 'why me',
 'ask tom',
 'ask tom',
 'awesome',
 'be calm',
 'be cool',
 'be fair',
 'be kind',
 'be nice',
 'beat it',
 'call me',
 'call us',
 'come in',
 'come in',
 'come in',
 'come in',
 'come in',
 'come in',
 'come in',
 'come in',
 'come in',
 'come on',
 'come o

In [24]:
polish_text = [re.sub("[1234567890]","",x) for x in polish_text]

In [25]:
polish_text = [re.sub("[\u200d]","",x) for x in polish_text]

In [26]:
# triming the sentences in both the texts
english_text = [x.strip() for x in english_text]
polish_text = [x.strip() for x in polish_text]
english_text

['go',
 'hi',
 'run',
 'run',
 'run',
 'who',
 'wow',
 'wow',
 'fire',
 'fire',
 'fire',
 'help',
 'jump',
 'jump',
 'stop',
 'stop',
 'wait',
 'wait',
 'wait',
 'wait',
 'wait',
 'wait',
 'wait',
 'wait',
 'wait',
 'wait',
 'wait',
 'wait',
 'wait',
 'wait',
 'begin',
 'begin',
 'do it',
 'hello',
 'hurry',
 'i see',
 'i see',
 'i try',
 'i won',
 'oh no',
 'relax',
 'shoot',
 'shoot',
 'smile',
 'attack',
 'cheers',
 'cheers',
 'eat it',
 'freeze',
 'freeze',
 'get up',
 'go now',
 'got it',
 'got it',
 'he ran',
 'hop in',
 'hug me',
 'i fell',
 'i know',
 'i left',
 'i left',
 'i left',
 'i left',
 'i left',
 'i left',
 'i lost',
 'i quit',
 'im ok',
 'listen',
 'no way',
 'really',
 'thanks',
 'thanks',
 'we try',
 'why me',
 'ask tom',
 'ask tom',
 'awesome',
 'be calm',
 'be cool',
 'be fair',
 'be kind',
 'be nice',
 'beat it',
 'call me',
 'call us',
 'come in',
 'come in',
 'come in',
 'come in',
 'come in',
 'come in',
 'come in',
 'come in',
 'come in',
 'come on',
 'come o

In [27]:
# adding start and end tag to polish sentences so the decoder understands when to start and when to stop
polish_text = ["start " + x + " end" for x in polish_text]

In [28]:
polish_text[0]

'start idź end'

In [29]:
english_text[0],polish_text[0]

('go', 'start idź end')

In [30]:
X = english_text
y = polish_text

In [31]:
X_train, X_test, y_train, y_test=train_test_split(X,y,test_size=0.1,train_size=0.9,random_state=0)

In [32]:
# determining the maximum length of sentence for pad sequences for english
max_english_text = max([len(x.split(' ')) for x in X_train])
max_english_text

17

In [33]:
# determining the maximum length of sentence for pad sequences for polish
max_polish_text = max([len(x.split(' ')) for x in y_train])
max_polish_text

20

In [34]:
# similarly we find it for the test set as well
max_test_english_text = max([len(x.split(' ')) for x in X_test])
max_test_english_text

17

In [35]:
max_test_polish_text = max([len(x.split(' ')) for x in y_test])
max_test_polish_text

16

### Tokenizing the data

In [36]:
# english text
tokenize_eng = Tokenizer()
tokenize_eng.fit_on_texts(X_train)

In [37]:
word2index_eng = tokenize_eng.word_index

In [38]:
vocab_size_eng = len(word2index_eng) + 1
vocab_size_eng

7921

In [39]:
X_train = tokenize_eng.texts_to_sequences(X_train)

In [40]:
X_train = pad_sequences(X_train, maxlen=max_english_text, padding='post')

In [41]:
X_test = tokenize_eng.texts_to_sequences(X_test)
X_test = pad_sequences(X_test, maxlen = max_english_text, padding='post')

In [42]:
# tokenizing polish text
tokenize_pol = Tokenizer()
tokenize_pol.fit_on_texts(y_train)
word2index_pol = tokenize_pol.word_index
vocab_size_pol = len(word2index_pol) + 1
vocab_size_pol

19516

In [43]:
y_train = tokenize_pol.texts_to_sequences(y_train)
y_train = pad_sequences(y_train, maxlen=max_polish_text, padding='post')
y_test = tokenize_pol.texts_to_sequences(y_test)
y_test = pad_sequences(y_test, maxlen = max_polish_text, padding='post')

In [44]:
X_train[0]

array([  37,  111, 1530,    4, 1644,   14,    6,  881,    0,    0,    0,
          0,    0,    0,    0,    0,    0])

In [45]:
y_train[0]

array([   1,    8,  966, 1182, 1567,    5, 6164,    2,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0])

In [46]:
with open('NMT_data.pkl','wb') as f:
  pkl.dump([X_train, y_train, X_test, y_test],f)
with open('NMT_Etokenizer.pkl','wb') as f:
  pkl.dump([vocab_size_eng, word2index_eng, tokenize_eng], f)
with open('NMT_Ptokenizer.pkl', 'wb') as f:
  pkl.dump([vocab_size_pol, word2index_pol, tokenize_pol], f)

In [47]:
X_train = np.array(X_train)
y_train = np.array(y_train)
X_test = np.array(X_test)
y_test = np.array(y_test)

In [48]:
import torch

In [49]:
import torch.nn as nn
from torch import optim
import torch.nn.functional as F

In [50]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM
from tensorflow.keras.layers import Dense
from tensorflow.keras.layers import TimeDistributed
from tensorflow.keras.layers import RepeatVector

In [51]:
from attention import AttentionLayer 

In [52]:

from tensorflow.keras import backend as K 
K.clear_session() 
latent_dim = 500
# Encoder 
encoder_inputs = Input(shape=(max_english_text,)) 
enc_emb = Embedding(vocab_size_eng, latent_dim,trainable=True)(encoder_inputs)
#LSTM 1 
encoder_lstm1 = LSTM(latent_dim,return_sequences=True,return_state=True) 
encoder_output1, state_h1, state_c1 = encoder_lstm1(enc_emb)
#LSTM 2 
encoder_lstm2 = LSTM(latent_dim,return_sequences=True,return_state=True) 
encoder_output2, state_h2, state_c2 = encoder_lstm2(encoder_output1)
#LSTM 3 
encoder_lstm3=LSTM(latent_dim, return_state=True, return_sequences=True) 
encoder_outputs, state_h, state_c= encoder_lstm3(encoder_output2)
# Set up the decoder. 
decoder_inputs = Input(shape=(None,)) 
dec_emb_layer = Embedding(vocab_size_pol, latent_dim,trainable=True) 
dec_emb = dec_emb_layer(decoder_inputs)
#LSTM using encoder_states as initial state
decoder_lstm = LSTM(latent_dim, return_sequences=True, return_state=True) 
decoder_outputs,decoder_fwd_state, decoder_back_state = decoder_lstm(dec_emb,initial_state=[state_h, state_c])
#Attention Layer
attn_layer = AttentionLayer(name="attention") 
attn_out, attn_states = attn_layer([encoder_outputs, decoder_outputs])
# Concat attention output and decoder LSTM output 
decoder_concat_input = Concatenate(axis=-1, name='concat_layer')([decoder_outputs, attn_out])
#Dense layer
decoder_dense = TimeDistributed(Dense(vocab_size_pol, activation='softmax')) 
decoder_outputs = decoder_dense(decoder_concat_input)
# Define the model
model = Model([encoder_inputs, decoder_inputs], decoder_outputs) 

In [53]:
model.compile(optimizer='rmsprop',
              loss='sparse_categorical_crossentropy', 
              metrics=['accuracy'])

In [54]:
es = EarlyStopping(monitor='val_loss', mode='min', verbose=1)

In [55]:
history = model.fit([X_train, y_train[:,:-1]], y_train.reshape(y_train.shape[0], y_train.shape[1],1)[:,1:], 
                    epochs=50, 
                    callbacks=[es],
                    batch_size=512,
                    validation_data = ([X_test, y_test[:,:-1]], y_test.reshape(y_test.shape[0], y_test.shape[1], 1)[:,1:]))

Epoch 1/50
13/71 [====>.........................] - ETA: 17:54 - loss: 5.7956 - accuracy: 0.5225

KeyboardInterrupt: 

In [None]:
from matplotlib import pyplot 
pyplot.plot(history.history['loss'], label='train') 
pyplot.plot(history.history['val_loss'], label='test') 
pyplot.legend() 
pyplot.show()

In [None]:
model_json = model.to_json()
with open("NMT_model.json", "w") as json_file:
    json_file.write(model_json)
# serialize weights to HDF5
model.save_weights("NMT_model_weight.h5")
print("Saved model to disk")

In [None]:
# loading the model architecture and asigning the weights
json_file = open('NMT_model.json', 'r')
loaded_model_json = json_file.read()
json_file.close()
model_loaded = model_from_json(loaded_model_json, custom_objects={'AttentionLayer': AttentionLayer})
# load weights into new model
model_loaded.load_weights("NMT_model_weight.h5")