In [1]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

In [2]:
from keras.models import Sequential,Model
from keras.layers import Bidirectional,LSTM,Dense,Dropout,Activation,RepeatVector,Concatenate,Dot,Input

Using TensorFlow backend.


In [3]:
def loaddata(filename):
    file=open(filename,mode='rt',encoding='utf-8')
    text=file.read()
    file.close()
    return text

In [4]:
bilng_text=loaddata("deu.txt")

In [5]:
def pairs(text):
    lines=text.strip().split('\n')
    pairs=[line.split('\t') for line in lines]
    return pairs

In [6]:
bilng_pairs=pairs(bilng_text)

In [7]:
len(bilng_pairs)

169813

In [8]:
bilng_pairs[10]

['I ran.', 'Ich rannte.']

In [9]:
import re
from unicodedata import normalize
import string
table=str.maketrans('','',string.punctuation)
non_print=re.compile('[^%s]' % re.escape(string.printable))
def clean_text(pairs):
    cleaned=[]
    for pair in pairs:
        cleaned_pair=[]
        for line in pair:
            # normalize unicode characters
            line = normalize('NFD', line).encode('ascii', 'ignore')
            line = line.decode('UTF-8')
            line=line.split()
            line=[word.lower() for word in line]
            line=[non_print.sub('',word) for word in line]
            line=[word.translate(table) for word in line]
            line=[word for word in line if word.isalpha()]
            cleaned_pair.append(' '.join(line))
        cleaned.append(cleaned_pair)
    return np.array(cleaned)

In [10]:
bilng_cleaned_pairs=clean_text(bilng_pairs)

In [11]:
len(bilng_cleaned_pairs)

169813

In [12]:
bilng_cleaned_pairs[10]

array(['i ran', 'ich rannte'], dtype='<U291')

In [13]:
'''from pickle import dump
# save a list of clean sentences to file
def save_clean_data(sentences, filename):
    dump(sentences, open(filename, 'wb'))
    print('Saved: %s' % filename)'''

In [14]:
#save_clean_data(bilng_cleaned_pairs, 'english-german.pkl')

Saved: english-german.pkl


In [16]:
#using the first 10000 examples for simple model
from numpy.random import shuffle
dataset=bilng_cleaned_pairs[:20000]
shuffle(dataset)
train=dataset[:18000]
test=dataset[18000:]

In [17]:
train.shape

(18000, 2)

In [18]:
test.shape

(2000, 2)

In [19]:
from keras.preprocessing.text import Tokenizer

def create_tokenizer(lines):
    tokenizer=Tokenizer()
    tokenizer.fit_on_texts(lines)
    return tokenizer

In [20]:
# max sentence length
def max_length(lines):
    return max(len(line.split()) for line in lines)

In [21]:
from keras.preprocessing.sequence import pad_sequences
def encode_seq(tokenizer,length,line):
    seq=tokenizer.texts_to_sequences(line)
    seq=pad_sequences(seq,maxlen=length,padding='post')
    return seq

In [22]:
from keras.utils import to_categorical
def encode_oh(seq,vocab_size):
    y=[]
    for line in seq:
        line=to_categorical(line,num_classes=vocab_size)
        y.append(line)
    y=np.array(y)
    y=np.reshape(y,(seq.shape[0],seq.shape[1],vocab_size))
    return y

In [23]:
#prepare training data and validation data
eng_tokenizer=create_tokenizer(dataset[:,0])
ger_tokenizer=create_tokenizer(dataset[:,1])
eng_vocab_size=len(eng_tokenizer.word_index)+1
ger_vocab_size=len(ger_tokenizer.word_index)+1
eng_len=max_length(dataset[:,0])
ger_len=max_length(dataset[:,1])

In [24]:
print(eng_len)
print(ger_len)
print(eng_vocab_size)
print(ger_vocab_size)

6
10
3753
5814


In [25]:
trainX=encode_seq(ger_tokenizer,ger_len,train[:,1])
trainY=encode_seq(eng_tokenizer,eng_len,train[:,0])
trainYoh=encode_oh(trainY,eng_vocab_size)
trainXoh=encode_oh(trainX,ger_vocab_size)

In [26]:
print(trainXoh.shape)
print(trainYoh.shape)

(18000, 10, 5814)
(18000, 6, 3753)


In [27]:
print(train[0,0])
print(trainY[0])
print(train[0,1])
print(trainX[0])

who hired you
[ 59 835   4   0   0   0]
wer hat dich angestellt
[  46   10   25 2244    0    0    0    0    0    0]


In [27]:
inv_dict={v:k for k,v in eng_tokenizer.word_index.items()}

In [28]:
testX=encode_seq(ger_tokenizer,ger_len,test[:,1])
testY=encode_seq(eng_tokenizer,eng_len,test[:,0])
testYoh=encode_oh(testY,eng_vocab_size)
testXoh=encode_oh(testX,ger_vocab_size)

In [29]:
import keras.backend as K
def softmax(x, axis=1):
    """Softmax activation function.
    # Arguments
        x : Tensor.
        axis: Integer, axis along which the softmax normalization is applied.
    # Returns
        Tensor, output of softmax transformation.
    # Raises
        ValueError: In case `dim(x) == 1`.
    """
    ndim = K.ndim(x)
    if ndim == 2:
        return K.softmax(x)
    elif ndim > 2:
        e = K.exp(x - K.max(x, axis=axis, keepdims=True))
        s = K.sum(e, axis=axis, keepdims=True)
        return e / s
    else:
        raise ValueError('Cannot apply softmax to a tensor that is 1D')

In [32]:
Tx=ger_len
Ty=eng_len
n_a=32
n_s=64

In [33]:
#global variables for attention part of the model
repeator=RepeatVector(Tx)
concat=Concatenate(axis=-1)
dense1=Dense(10,activation='tanh')
dense2=Dense(1,activation='relu')
dense3=Activation(softmax)
dot=Dot(axes=1)

In [34]:
def one_step_attention(a,s_prev):
    s_prev=repeator(s_prev)
    X=concat([s_prev,a])
    X=dense1(X)
    X=dense2(X)
    X=dense3(X)
    context=dot([X,a])
    return context

In [35]:
#global variables for NMT model
post_attention_LSTM=LSTM(n_s,return_state=True)
output_layer=Dense(eng_vocab_size,activation=softmax)

In [36]:
def model(Tx,Ty,n_a,n_s,inp_len):
    X=Input(shape=(Tx,inp_len))
    s0=Input(shape=(n_s,),name='s0')
    c0=Input(shape=(n_s,),name='c0')
    s=s0
    c=c0
    a=Bidirectional(LSTM(n_a,return_sequences=True))(X)
    output=[]
    for _ in range(Ty):
        context=one_step_attention(a,s)
        s,_,c=post_attention_LSTM(context)
        out=output_layer(s)
        output.append(out)
    model=Model(inputs=[X,s0,c0],outputs=output)
    return model

In [37]:
model1=model(Tx,Ty,n_a,n_s,ger_vocab_size)

In [38]:
model1.summary()

__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
s0 (InputLayer)                 (None, 64)           0                                            
__________________________________________________________________________________________________
input_1 (InputLayer)            (None, 10, 5814)     0                                            
__________________________________________________________________________________________________
repeat_vector_2 (RepeatVector)  (None, 10, 64)       0           s0[0][0]                         
                                                                 lstm_1[0][0]                     
                                                                 lstm_1[1][0]                     
                                                                 lstm_1[2][0]                     
          

In [39]:
model1.compile(optimizer='adam',loss='categorical_crossentropy',metrics=['accuracy'])

In [40]:
m=trainX.shape[0]
s0=np.zeros((m,n_s))
c0=np.zeros((m,n_s))
outputs=list(trainYoh.swapaxes(0,1))

In [41]:
model1.fit([trainXoh,s0,c0],outputs,epochs=500,verbose=2,batch_size=100)

Epoch 1/500
 - 69s - loss: 30.8921 - dense_5_loss: 3.0136 - dense_5_acc: 0.0000e+00 - dense_5_acc_1: 0.0021 - dense_5_acc_2: 0.1346 - dense_5_acc_3: 0.6193 - dense_5_acc_4: 0.9555 - dense_5_acc_5: 0.9887
Epoch 2/500
 - 20s - loss: 23.4186 - dense_5_loss: 0.7640 - dense_5_acc: 0.0000e+00 - dense_5_acc_1: 0.0021 - dense_5_acc_2: 0.1364 - dense_5_acc_3: 0.6267 - dense_5_acc_4: 0.9659 - dense_5_acc_5: 0.9998
Epoch 3/500
 - 21s - loss: 22.8419 - dense_5_loss: 0.6856 - dense_5_acc: 0.0000e+00 - dense_5_acc_1: 0.0021 - dense_5_acc_2: 0.1364 - dense_5_acc_3: 0.6267 - dense_5_acc_4: 0.9659 - dense_5_acc_5: 0.9998
Epoch 4/500
 - 21s - loss: 22.3270 - dense_5_loss: 0.6377 - dense_5_acc: 0.1858 - dense_5_acc_1: 0.0021 - dense_5_acc_2: 0.1364 - dense_5_acc_3: 0.6267 - dense_5_acc_4: 0.9659 - dense_5_acc_5: 0.9998
Epoch 5/500
 - 20s - loss: 22.1009 - dense_5_loss: 0.6225 - dense_5_acc: 0.1953 - dense_5_acc_1: 0.0021 - dense_5_acc_2: 0.1364 - dense_5_acc_3: 0.6267 - dense_5_acc_4: 0.9659 - dense_5_ac

<keras.callbacks.History at 0x1c7563a4eb8>

In [42]:
def inv_dict(tokenizer):
    inv_dict1={v:k for k,v in tokenizer.word_index.items()}
    inv_dict1[0]='<EOS>'
    return inv_dict1

In [43]:
def decode_seq_sent(inv_dict,seq):
    x=np.argmax(seq,axis=-1)
    outputs=[]
    for i in range(len(x)):
        seq=[inv_dict[ind] for ind in x[i,:]]
        outputs.append(seq)
    return outputs

In [124]:
def decode_seq_ref(inv_dict,seq):
    x=np.argmax(seq,axis=-1)
    outputs=[]
    for i in range(len(x)):
        seq=[inv_dict[ind] for ind in x[i,:]]
        outputs.append([seq])
    return outputs

In [44]:
eng_inv_dict=inv_dict(eng_tokenizer)
ger_inv_dict=inv_dict(ger_tokenizer)

In [45]:
def predict_sequence(model,inv_dict,text):
    ypred=model.predict([text,s0,c0])
    y=np.argmax(ypred,axis=-1)
    y=y.swapaxes(0,1)
    outputs=[]
    for i in range(len(y)):
        seq=[inv_dict[ind] for ind in y[i,:]]
        outputs.append(seq)
    return outputs

In [125]:
from nltk.translate.bleu_score import corpus_bleu
def evaluate_model(model,inv_dict,textX,textY):
    ypred=predict_sequence(model,inv_dict,textX)
    yact=decode_seq_ref(inv_dict,textY)
    print("score is %f"%(corpus_bleu(yact,ypred)))

In [126]:
evaluate_model(model1,eng_inv_dict,testXoh,testYoh)

score is 0.351035


In [127]:
#evaluating on train data
evaluate_model(model1,eng_inv_dict,trainXoh,trainYoh)

score is 0.918848


In [110]:
def decode_seq_ind(inv_dict,seq):
    x=np.argmax(seq,axis=-1)
    words=[inv_dict[ind] for ind in x]
    lst=' '.join(words)
    return lst

In [118]:
from nltk.translate.bleu_score import sentence_bleu
def evaluate_model2(model,inv_dict,textX,textY):
    ypred=predict_sequence(model,inv_dict,textX)
    yact=decode_seq(inv_dict,textY)
    bleu_score=0
    for i in range(len(ypred)):
        bleu_score+=sentence_bleu([yact[i]],ypred[i])
    print("score is %f"%(bleu_score/float(len(ypred))))

In [119]:
evaluate_model2(model1,eng_inv_dict,testXoh,testYoh)

score is 0.589757


In [128]:
evaluate_model2(model1,eng_inv_dict,trainXoh,trainYoh)

score is 0.936413
