In [1]:
from keras.models import Model
from keras.layers import Input, LSTM, Dense
import numpy as np

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


In [2]:
from tf.app import use
# Load the BHSA Text-Fabric app. NOTE(review): hoist=globals() is presumably
# what makes the F, L and T objects used by the data-preparation functions
# below available as notebook globals — confirm against the Text-Fabric docs.
A = use('bhsa', hoist=globals())
# Ask the app to include the `g_cons` feature as an extra feature in displays.
A.displaySetup(extraFeatures='g_cons')

TF app is up-to-date.
Using annotation/app-bhsa commit 43c1c5e88b371f575cdbbf57e38167deb8725f7f (=latest)
  in C:\Users\geitb/text-fabric-data/__apps__/bhsa.
Using etcbc/bhsa/tf - c r1.5 in C:\Users\geitb/text-fabric-data
Using etcbc/phono/tf - c r1.2 in C:\Users\geitb/text-fabric-data
Using etcbc/parallels/tf - c r1.2 in C:\Users\geitb/text-fabric-data


**Documentation:** <a target="_blank" href="https://etcbc.github.io/bhsa" title="provenance of BHSA = Biblia Hebraica Stuttgartensia Amstelodamensis">BHSA</a> <a target="_blank" href="https://annotation.github.io/text-fabric/Writing/Hebrew" title="('Hebrew characters and transcriptions',)">Character table</a> <a target="_blank" href="https://etcbc.github.io/bhsa/features/hebrew/c/0_home.html" title="BHSA feature documentation">Feature docs</a> <a target="_blank" href="https://github.com/annotation/app-bhsa" title="bhsa API documentation">bhsa API</a> <a target="_blank" href="https://annotation.github.io/text-fabric/Api/Fabric/" title="text-fabric-api">Text-Fabric API 7.3.15</a> <a target="_blank" href="https://annotation.github.io/text-fabric/Use/Search/" title="Search Templates Introduction and Reference">Search Reference</a>

In [13]:
# Books whose clauses are used to train the model (38 names, as they are
# compared against the book name returned by T.sectionFromNode).
train_books = [
    'Genesis', 'Exodus', 'Leviticus', 'Numbers', 'Deuteronomy',
    'Joshua', 'Judges', '1_Samuel', '2_Samuel', '1_Kings', '2_Kings',
    'Isaiah', 'Jeremiah', 'Ezekiel',
    'Hosea', 'Joel', 'Amos', 'Obadiah', 'Micah', 'Nahum', 'Habakkuk',
    'Zephaniah', 'Haggai', 'Zechariah', 'Malachi',
    'Psalms', 'Job', 'Proverbs', 'Ruth', 'Song_of_songs', 'Ecclesiastes',
    'Lamentations', 'Esther', 'Daniel', 'Ezra', 'Nehemiah',
    '1_Chronicles', '2_Chronicles',
]

# Held-out book: its clauses are only decoded, never trained on.
test_books = ['Jonah']

In [14]:
def prepare_train_data(books, max_words=7):
    """Collect clause texts and their part-of-speech sequences for training.

    Walks over every clause node, keeps the clauses that belong to one of
    ``books`` and contain at most ``max_words`` words, and builds:

    * the input string: the ``g_cons`` values of the clause's words joined by
      single spaces (e.g. ``'W JQM JWNH'``);
    * the output sequence: the ``sp`` value of every word, wrapped in a
      ``'\\t'`` start marker and a ``'\\n'`` end marker.

    Relies on the notebook-level Text-Fabric objects ``F``, ``L`` and ``T``.

    Parameters
    ----------
    books : container of str
        Book names to include, compared with the first element returned by
        ``T.sectionFromNode``.
    max_words : int, optional
        Clauses with more words than this are skipped (default 7).

    Returns
    -------
    input_clauses : list of str
    output_pos : list of list of str
    input_chars : list of str
        Sorted distinct characters occurring in ``input_clauses``.
    output_vocab : list of str
        Sorted distinct symbols occurring in ``output_pos`` (markers included).
    max_len_input : int
        Length in characters of the longest input string.
    max_len_output : int
        Length in symbols of the longest output sequence.

    Raises
    ------
    ValueError
        If no clause matches ``books`` and ``max_words``.
    """
    input_clauses = []
    output_pos = []
    input_chars = set()
    output_vocab = set()

    for cl in F.otype.s("clause"):

        bo, _, _ = T.sectionFromNode(cl)
        if bo not in books:
            continue

        # Look the clause's words up once and reuse them below.
        word_nodes = L.d(cl, "word")
        if len(word_nodes) > max_words:
            continue

        words = " ".join([F.g_cons.v(w) for w in word_nodes])

        # '\t' marks the start of the tag sequence and '\n' its end.
        poss = ['\t'] + [F.sp.v(w) for w in word_nodes] + ['\n']

        input_clauses.append(words)
        output_pos.append(poss)

        # The input alphabet is per character, the output vocabulary per tag.
        input_chars.update(words)
        output_vocab.update(poss)

    if not input_clauses:
        raise ValueError(
            "no clauses selected: check the book names and max_words"
        )

    output_vocab = sorted(output_vocab)
    input_chars = sorted(input_chars)

    max_len_input = max(len(clause) for clause in input_clauses)
    max_len_output = max(len(tags) for tags in output_pos)

    return input_clauses, output_pos, input_chars, output_vocab, max_len_input, max_len_output

In [15]:
def prepare_test_data(books, max_words=7):
    """Collect the input strings of the clauses in ``books``.

    Keeps every clause that belongs to one of ``books`` and contains at most
    ``max_words`` words, and represents it as the ``g_cons`` values of its
    words joined by single spaces. No part-of-speech sequences are produced.

    Relies on the notebook-level Text-Fabric objects ``F``, ``L`` and ``T``.

    Parameters
    ----------
    books : container of str
        Book names to include, compared with the first element returned by
        ``T.sectionFromNode``.
    max_words : int, optional
        Clauses with more words than this are skipped (default 7).

    Returns
    -------
    list of str
        One string per selected clause, in the order the clause nodes are
        yielded by ``F.otype.s("clause")``. Empty if nothing matches.
    """
    input_clauses = []

    for cl in F.otype.s("clause"):

        bo, _, _ = T.sectionFromNode(cl)
        if bo not in books:
            continue

        # Look the clause's words up once and reuse them below.
        word_nodes = L.d(cl, "word")
        if len(word_nodes) > max_words:
            continue

        input_clauses.append(" ".join([F.g_cons.v(w) for w in word_nodes]))

    return input_clauses

In [16]:
def create_dicts(input_chars, output_vocab):
    """Build index <-> symbol lookup tables for the input and output side.

    The ``eng_*`` dictionaries cover ``input_chars`` and the ``fra_*``
    dictionaries cover ``output_vocab``; a symbol's index is its position in
    the given sequence.

    Returns
    -------
    tuple of four dicts
        ``(input index -> symbol, input symbol -> index,
        output index -> symbol, output symbol -> index)``
    """
    # Input side: position -> character and character -> position.
    input_pairs = list(enumerate(input_chars))
    eng_index_to_char_dict = dict(input_pairs)
    eng_char_to_index_dict = {symbol: index for index, symbol in input_pairs}

    # Output side: position -> tag and tag -> position.
    output_pairs = list(enumerate(output_vocab))
    fra_index_to_char_dict = dict(output_pairs)
    fra_char_to_index_dict = {symbol: index for index, symbol in output_pairs}

    return (
        eng_index_to_char_dict,
        eng_char_to_index_dict,
        fra_index_to_char_dict,
        fra_char_to_index_dict,
    )

In [17]:
def one_hot_encode(nb_samples, max_len_input, max_len_output, input_chars, output_vocab, eng_char_to_index_dict, fra_char_to_index_dict, input_clauses, output_pos):
    """One-hot encode the first ``nb_samples`` inputs and output sequences.

    Returns three float32 arrays:

    * input data, shape ``(nb_samples, max_len_input, len(input_chars))``:
      one row per character of each input string;
    * output data, shape ``(nb_samples, max_len_output, len(output_vocab))``:
      one row per symbol of each output sequence;
    * target data, same shape as the output data: the output sequence shifted
      one step to the left, so the first symbol is dropped.

    Positions past the end of a sequence stay all-zero.
    """
    input_width = len(input_chars)
    output_width = len(output_vocab)

    tokenized_input_data = np.zeros((nb_samples, max_len_input, input_width), dtype='float32')
    tokenized_output = np.zeros((nb_samples, max_len_output, output_width), dtype='float32')
    target_data = np.zeros((nb_samples, max_len_output, output_width), dtype='float32')

    for sample in range(nb_samples):
        # Input side: one time step per character of the clause string.
        for step, symbol in enumerate(input_clauses[sample]):
            tokenized_input_data[sample, step, eng_char_to_index_dict[symbol]] = 1

        # Output side: one time step per tag (including the markers).
        for step, tag in enumerate(output_pos[sample]):
            tag_index = fra_char_to_index_dict[tag]
            tokenized_output[sample, step, tag_index] = 1

            # The target runs one step ahead and omits the first symbol.
            if step > 0:
                target_data[sample, step - 1, tag_index] = 1

    return tokenized_input_data, tokenized_output, target_data

In [26]:
# Upper bound on the number of training clauses that get one-hot encoded.
max_samples = 10000

input_clauses, output_pos, input_chars, output_vocab, max_len_input, max_len_output = prepare_train_data(train_books)

# one_hot_encode indexes input_clauses[0 .. nb_samples-1], so never ask for
# more samples than were actually collected.
# NOTE(review): only the first nb_samples clauses (in corpus order) are used;
# consider shuffling if the training set should cover all train_books.
nb_samples = min(max_samples, len(input_clauses))

eng_index_to_char_dict, eng_char_to_index_dict, fra_index_to_char_dict, fra_char_to_index_dict = create_dicts(input_chars, output_vocab)
tokenized_input, tokenized_output, target_data = one_hot_encode(nb_samples, max_len_input, max_len_output, input_chars, output_vocab, eng_char_to_index_dict, fra_char_to_index_dict, input_clauses, output_pos)

In [27]:
test_clauses = prepare_test_data(test_books)

# Only the encoded inputs are needed for decoding. Pass one empty tag sequence
# per test clause instead of the training labels, so this cell does not depend
# on the training set having at least as many samples as the test set.
placeholder_pos = [[] for _ in test_clauses]
tokenized_test_data, _, _ = one_hot_encode(len(test_clauses), max_len_input, max_len_output, input_chars, output_vocab, eng_char_to_index_dict, fra_char_to_index_dict, test_clauses, placeholder_pos)

In [29]:
def define_LSTM_model(input_chars, output_vocab):
    """Build the encoder-decoder training model and print its summary.

    The encoder is a stack of two 512-unit LSTMs over one-hot input
    characters; the decoder is a 512-unit LSTM over one-hot output symbols,
    initialised with the final states of the second encoder LSTM and followed
    by a softmax layer over the output vocabulary.

    Parameters
    ----------
    input_chars : sized
        Input alphabet; only its length is used (width of the encoder input).
    output_vocab : sized
        Output vocabulary; only its length is used (width of the decoder
        input and of the softmax output).

    Returns
    -------
    The (uncompiled) Keras ``Model`` taking ``[encoder_input, decoder_input]``.
    """
    n_input_symbols = len(input_chars)
    n_output_symbols = len(output_vocab)

    # --- Encoder ---------------------------------------------------------
    encoder_input = Input(shape=(None, n_input_symbols))
    first_layer_outputs = LSTM(
        512, activation='relu', return_state=True, return_sequences=True
    )(encoder_input)
    # The complete output list of the first layer is handed to the second one.
    # NOTE(review): presumably Keras treats the extra list entries as the
    # initial state of the second LSTM — confirm against the RNN layer docs.
    encoder_outputs, encoder_h, encoder_c = LSTM(512, return_state=True)(first_layer_outputs)
    encoder_states = [encoder_h, encoder_c]

    # --- Decoder ---------------------------------------------------------
    decoder_input = Input(shape=(None, n_output_symbols))
    decoder_LSTM = LSTM(512, return_sequences=True, return_state=True)
    # Only the output sequence is needed for training; the states are dropped.
    decoder_out, _, _ = decoder_LSTM(decoder_input, initial_state=encoder_states)
    decoder_dense = Dense(n_output_symbols, activation='softmax')
    decoder_out = decoder_dense(decoder_out)

    model = Model(inputs=[encoder_input, decoder_input], outputs=[decoder_out])
    model.summary()

    return model

In [30]:
def compile_and_train(model, tokenized_input, tokenized_output, batch_size, epochs, validation_split, targets=None):
    """Compile ``model`` with Adam / categorical cross-entropy and fit it.

    Parameters
    ----------
    model
        Object exposing ``compile`` and ``fit`` (a Keras model).
    tokenized_input, tokenized_output
        The two model inputs, passed to ``fit`` as ``x=[input, output]``.
    batch_size, epochs, validation_split
        Forwarded unchanged to ``model.fit``.
    targets : optional
        Training targets passed to ``fit`` as ``y``. When omitted, the
        notebook-level ``target_data`` is used, which is what this function
        always did before the parameter existed.

    Returns
    -------
    The same ``model`` object, after fitting.
    """
    if targets is None:
        # Backward-compatible fallback to the notebook global.
        targets = target_data

    model.compile(optimizer='adam', loss='categorical_crossentropy')
    model.fit(x=[tokenized_input, tokenized_output],
              y=targets,
              batch_size=batch_size,
              epochs=epochs,
              validation_split=validation_split)

    return model

In [32]:
# Build the network, then train it: mini-batches of 128 samples, 5 epochs,
# 10% of the samples held back for validation.
model = define_LSTM_model(input_chars, output_vocab)
model = compile_and_train(
    model,
    tokenized_input,
    tokenized_output,
    batch_size=128,
    epochs=5,
    validation_split=0.1,
)

__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_13 (InputLayer)           (None, None, 25)     0                                            
__________________________________________________________________________________________________
lstm_19 (LSTM)                  [(None, None, 512),  1101824     input_13[0][0]                   
__________________________________________________________________________________________________
input_14 (InputLayer)           (None, None, 16)     0                                            
__________________________________________________________________________________________________
lstm_20 (LSTM)                  [(None, 512), (None, 2099200     lstm_19[0][0]                    
                                                                 lstm_19[0][1]                    
          

In [14]:
# Inference models for testing
#
# The tensors and layers created inside define_LSTM_model() are local to that
# function, so they are recovered here from the trained `model` instead of
# being referenced by name (which fails on a fresh kernel).

# Inputs of the training graph: [encoder input, decoder input].
encoder_input, decoder_input = model.input

# NOTE(review): relies on model.layers listing layers in graph order, so that
# the last LSTM is the decoder and the one before it is the top encoder layer
# — confirm against the Keras Model documentation.
lstm_layers = [layer for layer in model.layers if isinstance(layer, LSTM)]
encoder_top_LSTM, decoder_LSTM = lstm_layers[-2], lstm_layers[-1]
decoder_dense = [layer for layer in model.layers if isinstance(layer, Dense)][-1]

# Encoder inference model: input characters -> final [h, c] states.
_, encoder_h, encoder_c = encoder_top_LSTM.output
encoder_states = [encoder_h, encoder_c]
encoder_model_inf = Model(encoder_input, encoder_states)

# Decoder inference model: the states are fed in explicitly, so decoding can
# advance one step at a time. The state size is taken from the trained layer.
latent_dim = decoder_LSTM.units
decoder_state_input_h = Input(shape=(latent_dim,))
decoder_state_input_c = Input(shape=(latent_dim,))
decoder_input_states = [decoder_state_input_h, decoder_state_input_c]

decoder_out, decoder_h, decoder_c = decoder_LSTM(decoder_input,
                                                 initial_state=decoder_input_states)

decoder_states = [decoder_h, decoder_c]

decoder_out = decoder_dense(decoder_out)

decoder_model_inf = Model(inputs=[decoder_input] + decoder_input_states,
                          outputs=[decoder_out] + decoder_states)

In [15]:
def decode_seq(inp_seq, sep='', max_steps=None):
    """Greedily decode the tag sequence for one encoded input clause.

    The encoder inference model turns ``inp_seq`` into the initial decoder
    states. Starting from the ``'\\t'`` start marker, the decoder is then run
    one step at a time, each time feeding back the most probable symbol, until
    the ``'\\n'`` end marker is produced or ``max_steps`` symbols have been
    generated.

    Uses the notebook-level ``encoder_model_inf``, ``decoder_model_inf``,
    ``output_vocab``, ``fra_char_to_index_dict``, ``fra_index_to_char_dict``
    and (for the default step limit) ``max_len_output``.

    Parameters
    ----------
    inp_seq : array
        One-hot encoded input passed to ``encoder_model_inf.predict``.
    sep : str, optional
        String placed between the decoded symbols. The default ``''``
        concatenates them directly (e.g. ``'conjverbsubs\\n'``); pass ``' '``
        to keep the tags apart.
    max_steps : int, optional
        Maximum number of symbols to generate. Defaults to ``max_len_output``.
        Without such a bound the loop would never end if the model does not
        predict the end marker.

    Returns
    -------
    str
        The decoded symbols joined by ``sep``; the end marker ``'\\n'`` is
        included when it was predicted.
    """
    if max_steps is None:
        max_steps = max_len_output

    # Initial states value is coming from the encoder
    states_val = encoder_model_inf.predict(inp_seq)

    vocab_size = len(output_vocab)
    target_seq = np.zeros((1, 1, vocab_size))
    target_seq[0, 0, fra_char_to_index_dict['\t']] = 1

    decoded_symbols = []

    while len(decoded_symbols) < max_steps:

        decoder_out, decoder_h, decoder_c = decoder_model_inf.predict(x=[target_seq] + states_val)

        # Greedy choice: most probable symbol at the last time step.
        max_val_index = np.argmax(decoder_out[0, -1, :])
        sampled_symbol = fra_index_to_char_dict[max_val_index]
        decoded_symbols.append(sampled_symbol)

        if sampled_symbol == '\n':
            break

        # Feed the chosen symbol and the updated states into the next step.
        target_seq = np.zeros((1, 1, vocab_size))
        target_seq[0, 0, max_val_index] = 1

        states_val = [decoder_h, decoder_c]

    return sep.join(decoded_symbols)



In [16]:
# Show at most the first 100 test clauses; the bound is clamped so the loop
# cannot index past the end of test_clauses when fewer clauses are available.
n_examples = min(100, len(test_clauses))

for seq_index in range(n_examples):
    # Slice (rather than index) to keep the leading batch dimension of size 1.
    inp_seq = tokenized_test_data[seq_index:seq_index+1]

    translated_sent = decode_seq(inp_seq)
    print('-')
    print('Input sentence:', test_clauses[seq_index])
    print('Decoded sentence:', translated_sent)

-
Input sentence: L >MR
Decoded sentence: prepverb

-
Input sentence: QWM
Decoded sentence: verb

-
Input sentence: LK >L NJNWH H <JR H GDWLH
Decoded sentence: verbprepsubsartsubsartprde

-
Input sentence: W QR> <LJH
Decoded sentence: conjverbprep

-
Input sentence: KJ <LTH R<TM L PNJ
Decoded sentence: conjverbsubsprepsubs

-
Input sentence: W JQM JWNH
Decoded sentence: conjverbnmpr

-
Input sentence: L BRX TRCJCH M L PNJ JHWH
Decoded sentence: prepverbsubsprepsubsartsubs

-
Input sentence: W JRD JPW
Decoded sentence: conjverbsubs

-
Input sentence: W JMY> >NJH
Decoded sentence: conjverbprep

-
Input sentence: B>H TRCJC
Decoded sentence: verbnmpr

-
Input sentence: W JTN FKRH
Decoded sentence: conjverbnmpr

-
Input sentence: W JRD BH
Decoded sentence: conjverbprep

-
Input sentence: W JHJ S<R GDWL B  JM
Decoded sentence: conjverbsubsprepsubsartsubs

-
Input sentence: W H >NJH XCBH
Decoded sentence: conjartsubsverb

-
Input sentence: L HCBR
Decoded sentence: prepverb

-
Input sentence: 