In [1]:
import numpy as np
import tensorflow as tf
from tensorflow import keras 
import re
from collections import Counter
from tensorflow.keras.layers import TimeDistributed

In [2]:
urlOriginalSentence='/home/farrukh/Work/Datasets/UN_ru-en/en-ru/UNv1.0.en-ru.en'
urlTargetSentence='/home/farrukh/Work/Datasets/UN_ru-en/en-ru/UNv1.0.en-ru.ru'

In [3]:
def loadDataset(url,intInd,lastInd):
    sentences=[]
    with open(url,'r') as f:
        sentences=f.read().split('\n')[intInd:lastInd]
    return sentences

In [4]:
startInd=0
lastInd=40000
originalSentence=loadDataset(urlOriginalSentence,startInd,lastInd)
targetSentence=loadDataset(urlTargetSentence,startInd,lastInd)

originalEvalSentence=loadDataset(urlOriginalSentence,lastInd,lastInd+2000)
targetEvalSentence=loadDataset(urlTargetSentence,lastInd,lastInd+2000)

In [None]:
with open('/home/farrukh/Work/Datasets/UN_ru-en/en-ru/originalSentence','w') as f:
    f.write("\n".join(originalSentence))
with open('/home/farrukh/Work/Datasets/UN_ru-en/en-ru/originalEvalSentence','w') as f:
    f.write("\n".join(originalEvalSentence))
with open('/home/farrukh/Work/Datasets/UN_ru-en/en-ru/targetSentence','w') as f:
    f.write("\n".join(targetSentence))
with open('/home/farrukh/Work/Datasets/UN_ru-en/en-ru/targetEvalSentence','w') as f:
    f.write("\n".join(targetEvalSentence))

In [5]:
originalSentence=originalSentence[6:]

In [6]:
targetSentence=targetSentence[6:]

In [7]:
vocabulary_size=12000
max_length_original=50 
max_length_target=50
print(max_length_original)
print(max_length_target)

50
50


In [8]:
def cleanOriginalSentence(sentences,wordList=None, addHeaders=False,max_length=None):
    cleanedSentences=[]
    for sentence in sentences:
        sentence=re.sub('[.,\'\"?~!#@$%^&*()]+',"",sentence.lower())
        sentence=re.sub("[ ]+"," ",sentence)
        sentence=re.sub('[^a-zA-Z]+'," ",sentence)
        if wordList is not None:
            temp=[]
            for word in sentence.split(" "):
                if word in wordList:
                    temp.append(word)
                else:
                    temp.append('unk')
            if len(temp)>max_length-2:
                temp=temp[:max_length-2]
            sentence=" ".join(temp)
        sentence=' '.join([w for w in sentence.split(' ') ])
        if addHeaders==True:
            sentence='<start> ' + sentence+ ' <end>'

        cleanedSentences.append(sentence)
    return cleanedSentences

In [9]:
def cleanTargetSentence(sentences,wordList=None,addHeaders=False,max_length=None):
    cleanedSentences=[]
    for sentence in sentences:
        sentence=re.sub('[.,\'\"?~!#@$%^&*()]+',"",sentence.lower())
        sentence=re.sub("[ ]+"," ",sentence)
        sentence=re.sub('[^а-яА-Я]+'," ",sentence) 
        
        if wordList is not None:
            temp=[]
            for word in sentence.split(" "):
                if word in wordList:
                    temp.append(word)
                else:
                    temp.append('unk')
            if len(temp)>max_length-2:
                temp=temp[:max_length-2]
            sentence=" ".join(temp)
            
        sentence=' '.join([w for w in sentence.split(" ")])
        if addHeaders==True:
            sentence='<start> ' + sentence+ ' <end>'

        cleanedSentences.append(sentence)
    return cleanedSentences

In [10]:
originalSentence=cleanOriginalSentence(originalSentence)
counterOriginal=Counter(" ".join(originalSentence).split(" ")).most_common(vocabulary_size)
print(counterOriginal[-1])
counterOriginal={x:y for x,y in counterOriginal}
print(len(counterOriginal))

targetSentence=cleanTargetSentence(targetSentence)
counterTarget=Counter(" ".join(targetSentence).split(" ")).most_common(vocabulary_size)
counterTarget={x:y for x,y in counterTarget}
print(len(counterTarget))

('commercialization', 1)
12000
12000


In [11]:
originalSentence=cleanOriginalSentence(originalSentence,counterOriginal.keys(),True,max_length_original)
originalEvalSentence=cleanOriginalSentence(originalEvalSentence,counterOriginal.keys(),True,max_length_original)

In [12]:
targetSentence=cleanTargetSentence(targetSentence,counterTarget.keys(),True,max_length_target)
targetEvalSentence=cleanTargetSentence(targetEvalSentence,counterTarget.keys(),True,max_length_target)

In [13]:
def tokenize(sentences,max_length):
    tokenizer=keras.preprocessing.text.Tokenizer(filters="",)
    tokenizer.fit_on_texts(sentences)
    tensor=tokenizer.texts_to_sequences(sentences)
    tensor=keras.preprocessing.sequence.pad_sequences(tensor,padding='post',maxlen=max_length)
    return tensor,tokenizer

In [14]:
original,originalTokenizer=tokenize(originalSentence,max_length_original)
target,targetTokenizer=tokenize(targetSentence,max_length_target)


In [15]:

originalEval=keras.preprocessing.sequence.pad_sequences(originalTokenizer.texts_to_sequences(originalEvalSentence),maxlen=max_length_original,padding='post')

targetEval=keras.preprocessing.sequence.pad_sequences(targetTokenizer.texts_to_sequences(targetEvalSentence),maxlen=max_length_target,padding='post')

In [16]:
len(originalTokenizer.index_word)

11913

In [17]:
len(targetTokenizer.index_word)

12002

In [18]:
batch_size=64
steps_per_epoch=len(original)//batch_size
embedding_dims=300
units=300
vocab_original_size=len(originalTokenizer.index_word)+1
vocab_target_size=len(targetTokenizer.index_word)+1

In [19]:
def generator(encoderInp,decoderInp,max_length,batch_size,vocab_size):
    X1,X2,y=[],[],[]
    n=0
    while 1:
        for enc,dec in zip(encoderInp,decoderInp):
            n+=1
            for i in range(1,len(dec)):
                in_seq,out_seq=dec[:i],dec[i]
                in_seq=keras.preprocessing.sequence.pad_sequences([in_seq],maxlen=max_length)[0]
                out_seq=keras.utils.to_categorical([out_seq],num_classes=vocab_size)[0]
                X1.append(enc[::-1])
                X2.append(in_seq)
                y.append(out_seq)
            if n==batch_size:
                n=0
                yield ([np.array(X1),np.array(X2)],np.array(y))
                X1,X2,y=[],[],[]


In [20]:
train_generator=generator(original,target,max_length_target,batch_size,vocab_target_size)
val_generator=generator(originalEval,targetEval,max_length_target,batch_size,vocab_target_size)

In [None]:
inputEncoder=keras.Input((max_length_original,),name='encoderInput')

embedEnc=keras.layers.Embedding(vocab_original_size,embedding_dims,name='Encoder_Embedding')
gruEnc=keras.layers.GRU(units,return_sequences=True,return_state=True,dropout=0.25,recurrent_dropout=0.25,name='Encoder_GRU1')
se1=embedEnc(inputEncoder)
se2_out,se2_hidden=gruEnc(se1) 
'''
se2_out shape= (batch_size,max_length,units)
se2_hidden shape= (batch_size,units)
'''

inputDecoder=keras.Input((max_length_target,),name='decoderInput')
embedDec=keras.layers.Embedding(vocab_target_size,embedding_dims,name='Decoder_Embedding')
gruDec=keras.layers.GRU(units,dropout=0.25,recurrent_dropout=0.25,return_sequences=True,return_state=True,name='Decoder_GRU1') ## try with statefull
denseDec=keras.layers.Dense(vocab_target_size,activation='softmax')
sd1=embedDec(inputDecoder)
sd2_out,sd2_hidden,=gruDec(sd1,initial_state=[se2_hidden])
sd3=denseDec(sd2_out[:,-1])
model=keras.Model([inputEncoder,inputDecoder],sd3)


In [None]:
encoder_model=keras.Model(inputEncoder,[se2_hidden])

decoder_state_inp=keras.Input((units,))
decoder_out,decoder_state_out,=gruDec(sd1,initial_state=[decoder_state_inp])
eval_out=denseDec(decoder_out[:,-1])
decoder_model=keras.Model([decoder_state_inp,inputDecoder],[eval_out,decoder_state_out])


In [None]:
model.compile(optimizer='adam',loss='categorical_crossentropy',metrics=['acc'])


In [None]:
with open('/home/farrukh/Work/Datasets/GLOVE/glove.6B.300d.txt') as f:
    lines=f.read()

In [None]:
embedding_index={}
for line in lines.split('\n'):
    values=line.split(" ")
    word=values[0]
    values=np.asarray(values[1:],dtype='float64')
    if word in originalTokenizer.word_index.keys():
        embedding_index[word]=values
    

In [None]:
embedding_matrix=np.zeros((vocab_original_size,embedding_dims),dtype='float64')

for word,i in originalTokenizer.word_index.items():
    vec=embedding_index.get(word,None)
    if vec is not None:
        embedding_matrix[i]=vec
        
        


In [None]:
model.layers[2].set_weights([embedding_matrix])
model.layers[2].trainable=False
del(embedding_index)
del(embedding_matrix)
del(lines)
del(counterOriginal)
del(counterTarget)

In [None]:
model.summary()

In [None]:
callback=keras.callbacks.ModelCheckpoint('/home/farrukh/Work/Machine Translation/NMT_Vanilla/NMT_VanillaGRU.hdf5',save_weights_only=True,monitor='val_loss')

In [None]:
model.

In [None]:
model.fit_generator(train_generator,steps_per_epoch=len(target)//batch_size,epochs=10,callbacks=[callback],verbose=True,validation_data=val_generator,
                    validation_steps=len(targetEval),shuffle=False)

In [None]:
batch_size=24
train_generator=generator(original,target,max_length_target,batch_size,vocab_target_size)
val_generator=generator(originalEval,targetEval,max_length_target,batch_size,vocab_target_size)
model.fit_generator(train_generator,steps_per_epoch=len(target)//batch_size,epochs=10,callbacks=[callback],verbose=True,validation_data=val_generator,
                    validation_steps=len(targetEval),shuffle=False)

In [None]:
targetTokenizer.index_word[0]='unk'

In [None]:
inp1=(original[2222])[::-1].reshape(1,-1)
inp2='<start>'
pred=[targetTokenizer.word_index[inp2]]
text=""
inp1=encoder_model.predict([inp1])
for i in range(max_length_target):
    seq=[w for w in pred]
    seq=keras.preprocessing.sequence.pad_sequences([seq],maxlen=max_length_target)
    y_hat,inp1=decoder_model.predict([inp1,seq])
    y_hat=np.argmax(y_hat)
    pred.append(y_hat)
    text=text+ " " + targetTokenizer.index_word[y_hat]
    if targetTokenizer.index_word[y_hat]=='<end>':
        break
    decoder_model.reset_states()
print(text)

In [None]:
print(" ".join(originalTokenizer.index_word[w] for w in original[2222] if w!=0))

In [None]:
print(" ".join(targetTokenizer.index_word[w] for w in target[2222] if w!=0))

In [None]:
len(targetEval[321])