In [1]:
import numpy as np
import tensorflow as tf
from tensorflow import keras 
import re
from collections import Counter
from tensorflow.keras.layers import TimeDistributed

In [2]:
urlOriginalSentence='/home/farrukh/Work/Machine Translation/Datasets/en-ru/original'
urlTargetSentence='/home/farrukh/Work/Machine Translation/Datasets/en-ru/target'

In [3]:
def loadDataset(url,intInd,lastInd):
    sentences=[]
    with open(url,'r') as f:
        sentences=f.read().split('\n')[intInd:lastInd]
    return sentences

In [4]:
startInd=0
lastInd=25000
originalSentence=loadDataset(urlOriginalSentence,startInd,lastInd)
targetSentence=loadDataset(urlTargetSentence,startInd,lastInd)

originalEvalSentence=loadDataset(urlOriginalSentence,lastInd,lastInd+1000)
targetEvalSentence=loadDataset(urlTargetSentence,lastInd,lastInd+1000)

In [5]:
originalSentence=originalSentence[6:]

In [6]:
targetSentence=targetSentence[6:]

In [7]:
max_length_original=50 
max_length_target=50
print(max_length_original)
print(max_length_target)

50
50


In [8]:
def cleanOriginalSentence(sentences,wordList=None, addHeaders=False,max_length=None):
    cleanedSentences=[]
    for sentence in sentences:
        sentence=re.sub('[.,\'\"?~!#@$%^&*()]+',"",sentence.lower())
        sentence=re.sub("[ ]+"," ",sentence)
        sentence=re.sub('[^a-zA-Z]+'," ",sentence)
        if wordList is not None:
            temp=[]
            for word in sentence.split(" "):
                if word in wordList:
                    temp.append(word)
                else:
                    temp.append('unk')
            if len(temp)>max_length-2:
                temp=temp[:max_length-2]
            sentence=" ".join(temp)
        sentence=' '.join([w for w in sentence.split(' ') if len(w)>1 ])
        if addHeaders==True:
            sentence='<start> ' + sentence+ ' <end>'

        cleanedSentences.append(sentence)
    return cleanedSentences

In [9]:
def cleanTargetSentence(sentences,wordList=None,addHeaders=False,max_length=None):
    cleanedSentences=[]
    for sentence in sentences:
        sentence=re.sub('[.,\'\"?~!#@$%^&*()]+',"",sentence.lower())
        sentence=re.sub("[ ]+"," ",sentence)
        sentence=re.sub('[^а-яА-Я]+'," ",sentence) 
        
        if wordList is not None:
            temp=[]
            for word in sentence.split(" "):
                if word in wordList:
                    temp.append(word)
                else:
                    temp.append('unk')
            if len(temp)>max_length-2:
                temp=temp[:max_length-2]
            sentence=" ".join(temp)
            
        sentence=' '.join([w for w in sentence.split(" ")])
        if addHeaders==True:
            sentence='<start> ' + sentence+ ' <end>'

        cleanedSentences.append(sentence)
    return cleanedSentences

In [10]:
originalSentence=cleanOriginalSentence(originalSentence)
counterOriginal=Counter(" ".join(originalSentence).split(" "))
counterOriginal={word:k for word,k in counterOriginal.items() if k>2}
print(len(counterOriginal))

targetSentence=cleanTargetSentence(targetSentence)
counterTarget=Counter(" ".join(targetSentence).split(" "))
counterTarget={word:k for word,k in counterTarget.items() if k>1}
print(len(counterTarget))

6580
16512


In [11]:
originalSentence=cleanOriginalSentence(originalSentence,counterOriginal.keys(),True,max_length_original)
originalEvalSentence=cleanOriginalSentence(originalEvalSentence,counterOriginal.keys(),True,max_length_original)

In [12]:
targetSentence=cleanTargetSentence(targetSentence,counterTarget.keys(),True,max_length_target)
targetEvalSentence=cleanTargetSentence(targetEvalSentence,counterTarget.keys(),True,max_length_target)

In [13]:
def tokenize(sentences,max_length):
    tokenizer=keras.preprocessing.text.Tokenizer(filters="",)
    tokenizer.fit_on_texts(sentences)
    tensor=tokenizer.texts_to_sequences(sentences)
    tensor=keras.preprocessing.sequence.pad_sequences(tensor,padding='post',maxlen=max_length)
    return tensor,tokenizer

In [14]:
original,originalTokenizer=tokenize(originalSentence,max_length_original)
target,targetTokenizer=tokenize(targetSentence,max_length_target)


In [16]:

originalEval=keras.preprocessing.sequence.pad_sequences(originalTokenizer.texts_to_sequences(originalEvalSentence),maxlen=max_length_original,padding='post')

targetEval=keras.preprocessing.sequence.pad_sequences(targetTokenizer.texts_to_sequences(targetEvalSentence),maxlen=max_length_target,padding='post')

In [17]:
batch_size=24
steps_per_epoch=len(original)//batch_size
embedding_dims=300
units=300
vocab_original_size=len(originalTokenizer.index_word)+1
vocab_target_size=len(targetTokenizer.index_word)+1

In [18]:
vocab_original_size

6583

In [19]:
vocab_target_size

16487

In [20]:
def generator(encoderInp,decoderInp,max_length,batch_size,vocab_size):
    X1,X2,y=[],[],[]
    n=0
    while 1:
        for enc,dec in zip(encoderInp,decoderInp):
            n+=1
            for i in range(1,len(dec)):
                in_seq,out_seq=dec[:i],dec[i]
                in_seq=keras.preprocessing.sequence.pad_sequences([in_seq],maxlen=max_length)[0]
                out_seq=keras.utils.to_categorical([out_seq],num_classes=vocab_size)[0]
                X1.append(enc[::-1])
                X2.append(in_seq)
                y.append(out_seq)
            if n==batch_size:
                n=0
                yield ([np.array(X1),np.array(X2)],np.array(y))
                X1,X2,y=[],[],[]


In [21]:
train_generator=generator(original,target,max_length_target,batch_size,vocab_target_size)
val_generator=generator(originalEval,targetEval,max_length_target,batch_size,vocab_target_size)

In [22]:
inputEncoder=keras.Input((max_length_original,),name='encoderInput')

embedEnc=keras.layers.Embedding(vocab_original_size,embedding_dims,name='Encoder_Embedding')
gruEnc=keras.layers.GRU(units,return_sequences=True,return_state=True,dropout=0.25,recurrent_dropout=0.25,name='Encoder_GRU1')
se1=embedEnc(inputEncoder)
se2_out,se2_hidden=gruEnc(se1) 
'''
se2_out shape= (batch_size,max_length,units)
se2_hidden shape= (batch_size,units)
'''

inputDecoder=keras.Input((max_length_target,),name='decoderInput')
embedDec=keras.layers.Embedding(vocab_target_size,embedding_dims,name='Decoder_Embedding')
gruDec=keras.layers.GRU(units,dropout=0.25,recurrent_dropout=0.25,return_sequences=True,return_state=True,name='Decoder_GRU1') ## try with statefull
denseDec=keras.layers.Dense(vocab_target_size,activation='softmax')
sd1=embedDec(inputDecoder)
sd2_out,sd2_hidden,=gruDec(sd1,initial_state=[se2_hidden])
sd3=denseDec(sd2_out[:,-1])
model=keras.Model([inputEncoder,inputDecoder],sd3)


W0304 13:46:10.360821 140243442886400 deprecation.py:506] From /home/farrukh/anaconda3/envs/tensorEnv/lib/python3.6/site-packages/tensorflow/python/keras/initializers.py:119: calling RandomUniform.__init__ (from tensorflow.python.ops.init_ops) with dtype is deprecated and will be removed in a future version.
Instructions for updating:
Call initializer instance with the dtype argument instead of passing it to the constructor
W0304 13:46:10.363305 140243442886400 deprecation.py:506] From /home/farrukh/anaconda3/envs/tensorEnv/lib/python3.6/site-packages/tensorflow/python/ops/init_ops.py:1251: calling VarianceScaling.__init__ (from tensorflow.python.ops.init_ops) with dtype is deprecated and will be removed in a future version.
Instructions for updating:
Call initializer instance with the dtype argument instead of passing it to the constructor


In [23]:
encoder_model=keras.Model(inputEncoder,[se2_hidden])

decoder_state_inp=keras.Input((units,))
decoder_out,decoder_state_out,=gruDec(sd1,initial_state=[decoder_state_inp])
eval_out=denseDec(decoder_out[:,-1])
decoder_model=keras.Model([decoder_state_inp,inputDecoder],[eval_out,decoder_state_out])


In [24]:
model.compile(optimizer='adam',loss='categorical_crossentropy',metrics=['acc'])


In [25]:
with open('/home/farrukh/Work/Image Caption/Glove/glove.6B.300d.txt') as f:
    lines=f.read()

In [26]:
embedding_index={}
for line in lines.split('\n'):
    values=line.split(" ")
    word=values[0]
    values=np.asarray(values[1:],dtype='float64')
    if word in originalTokenizer.word_index.keys():
        embedding_index[word]=values
    

In [27]:
embedding_matrix=np.zeros((vocab_original_size,embedding_dims),dtype='float64')

for word,i in originalTokenizer.word_index.items():
    vec=embedding_index.get(word,None)
    if vec is not None:
        embedding_matrix[i]=vec
        
        


In [28]:
model.layers[2].set_weights([embedding_matrix])
model.layers[2].trainable=False
del(embedding_index)
del(embedding_matrix)
del(lines)
del(counterOriginal)
del(counterTarget)

In [29]:
model.summary()

W0304 13:48:42.713305 140243442886400 training.py:2197] Discrepancy between trainable weights and collected trainable weights, did you set `model.trainable` without calling `model.compile` after ?


Model: "model"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
encoderInput (InputLayer)       [(None, 50)]         0                                            
__________________________________________________________________________________________________
decoderInput (InputLayer)       [(None, 50)]         0                                            
__________________________________________________________________________________________________
Encoder_Embedding (Embedding)   (None, 50, 300)      1974900     encoderInput[0][0]               
__________________________________________________________________________________________________
Decoder_Embedding (Embedding)   (None, 50, 300)      4946100     decoderInput[0][0]               
______________________________________________________________________________________________

In [28]:
callback=keras.callbacks.ModelCheckpoint('/home/farrukh/Work/Machine Translation/NMT_simple.hdf5',save_weights_only=True,monitor='val_loss')

In [45]:
model.load_weights('/home/farrukh/Work/Machine Translation/NMT_simple2.hdf5')

In [None]:
model.fit_generator(train_generator,steps_per_epoch=len(target)//batch_size,epochs=10,callbacks=[callback],verbose=True,validation_data=val_generator,
                    validation_steps=len(targetEval),shuffle=False)

In [30]:
batch_size=24
train_generator=generator(original,target,max_length_target,batch_size,vocab_target_size)
val_generator=generator(originalEval,targetEval,max_length_target,batch_size,vocab_target_size)
model.fit_generator(train_generator,steps_per_epoch=len(target)//batch_size,epochs=10,callbacks=[callback],verbose=True,validation_data=val_generator,
                    validation_steps=len(targetEval),shuffle=False)

W0224 02:30:42.032688 139999204845312 training.py:2197] Discrepancy between trainable weights and collected trainable weights, did you set `model.trainable` without calling `model.compile` after ?


Epoch 1/10


W0224 02:30:42.269499 139999204845312 deprecation.py:323] From /home/farrukh/anaconda3/envs/tensorEnv/lib/python3.6/site-packages/tensorflow/python/ops/math_grad.py:1250: add_dispatch_support.<locals>.wrapper (from tensorflow.python.ops.array_ops) is deprecated and will be removed in a future version.
Instructions for updating:
Use tf.where in 2.0, which has the same broadcast rule as np.where


Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10

KeyboardInterrupt: 

In [46]:
targetTokenizer.index_word[0]='unk'

In [54]:
inp1=(originalEval[3])[::-1].reshape(1,-1)
inp2='<start>'
pred=[targetTokenizer.word_index[inp2]]
text=""
inp1=encoder_model.predict([inp1])
for i in range(max_length_target):
    seq=[w for w in pred]
    seq=keras.preprocessing.sequence.pad_sequences([seq],maxlen=max_length_target)
    y_hat,inp1=decoder_model.predict([inp1,seq])
    y_hat=np.argmax(y_hat)
    pred.append(y_hat)
    text=text+ " " + targetTokenizer.index_word[y_hat]
    if targetTokenizer.index_word[y_hat]=='<end>':
        break
    decoder_model.reset_states()
print(text)

 кроме того правительство австралии весьма unk и что правительство квазулу состоит из сторон сама по себе и пересмотр членского состава совета безопасности <end>


In [55]:
targetEvalSentence[3]

'<start>  кроме того правительство unk предлагает чтобы новые члены совета unk региональными группами и при этом проводилась ротация <end>'

In [56]:
originalEvalSentence[3]

'<start> in addition the government of madagascar proposes that the new members of the council should be elected by regional group and by rotation <end>'

In [70]:
len(targetEval[321])

50

In [None]:
y_hat