# Predict POS of non-biblical scrolls

This notebook is a follow-up to our blog post on etcbc.nl of May 2019. In this notebook a model is trained on the BHSA texts and some scrolls in the extrabiblical package. Predictions of the part of speech (POS) are made on the texts of the dss package. Various refinements can be made, but it works so far!

In [None]:
import collections
import pickle

import pandas as pd
import numpy as np

from sklearn.utils import shuffle
from statistics import mode

from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, LSTM, Dense
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.optimizers import Adam

from tensorflow.keras.models import model_from_json

Import and load the extrabiblical data

In [None]:
from tf.fabric import Fabric

# Create the Text-Fabric handle for the extrabiblical data.
# NOTE(review): the location is a hard-coded path below the user's home directory
# (directory tf/0.2 of a local extrabiblical clone); adjust it when the data live elsewhere.
TF = Fabric(locations='~/github/extrabiblical/tf/0.2')

In [None]:
# Load the listed features of the extrabiblical data.
api = TF.load('''
    otype mother lex typ code sp book chapter verse label language
''')

api.loadLog()
# Make the API members (among them T, L and F, which are aliased below) available as globals.
api.makeAvailableIn(globals())

# Keep the extrabiblical handles under their own names: the use(..., hoist=globals())
# calls in the following cells bind T, L and F again and would otherwise overwrite them.
Tx = T
Lx = L
Fx = F

Load the dss package and give classes a new name.

In [None]:
from tf.app import use
# Load the dss app; hoist=globals() binds T, L and F to the dss data.
A = use('dss', hoist=globals())

# Keep the dss handles under their own names, because loading the BHSA in the
# next cell binds T, L and F once more.
Tdss = T
Ldss = L
Fdss = F

Load the BHSA.

In [None]:
from tf.app import use
# The BHSA is loaded last, so the plain T, L and F used in prepare_train_data() refer to the BHSA.
# NOTE(review): this rebinds A, so the dss app object of the previous cell is no longer reachable through A.
A = use('bhsa', hoist=globals())

In the function prepare_train_data() the train set is created, and some other useful information is collected. The argument of the function, test_book, is the book which will be excluded from the train set, because it is upon this book that the model will be tested. In the blog we trained on sequences of eight words, which worked well, but this is less convenient when making predictions on the scrolls, because in the scrolls many small pieces of text occur with only a few words. Therefore the training set consists of sequences of 2, 4, 6 and 8 words.

Also, 1QHa and 1QS are included in the training set.

In [None]:
def prepare_train_data(test_book, random_state=None):
    """
    Build the training samples for the POS model.

    Two sources are combined:

    * the BHSA (through the globals F and L): for every word of a book except
      the last seven, windows of 2, 4, 6 and 8 consecutive words are turned
      into samples; a start word whose 8-word window contains Aramaic is skipped;
    * the extrabiblical books 'B_1QS' and 'B_1QHa' (through the globals Fx and
      Lx): windows of 8 words, kept only when at least 6 words remain after
      filtering.

    Parameters
    ----------
    test_book : str
        Value of the BHSA `book` feature that is left out of the train set.
        A value that matches no book keeps all books.
    random_state : int, RandomState instance or None, optional
        Passed on to sklearn.utils.shuffle; give an int for a reproducible
        order. The default (None) keeps the original, unseeded behaviour.

    Returns
    -------
    input_seqs : list of str
        One string per sample: the consonantal words joined by a space.
    output_pos : list of list of str
        Per sample the parts of speech, preceded by the tab character (start
        symbol) and followed by the newline character (stop symbol).
    input_chars : list of str
        Sorted characters that occur in input_seqs.
    output_vocab : list of str
        Sorted output symbols (parts of speech plus start and stop symbol).
    max_len_input : int
        Number of characters of the longest input string.
    max_len_output : int
        Number of symbols of the longest output list.

    input_seqs and output_pos are shuffled in unison before they are returned.
    """

    input_seqs = []
    output_pos = []
    input_chars = set()
    output_vocab = set()

    # ---- BHSA -------------------------------------------------------------
    for bo in F.otype.s("book"):

        # exclude the test_book
        if F.book.v(bo) == test_book:
            continue

        # all the words from a book are collected
        words = L.d(bo, 'word')

        # the last seven words cannot start a complete window of 8 words
        for first in words[0:-7]:

            # Skip every window that contains Aramaic. The language feature has to
            # be inspected here: the consonantal text itself never equals 'Aramaic'.
            window_languages = [F.language.v(w) for w in range(first, first + 8) if F.g_cons.v(w) != '']
            if 'Aramaic' in window_languages:
                continue

            # create sequences of various lengths
            for length in [2, 4, 6, 8]:
                nodes = [w for w in range(first, first + length) if F.g_cons.v(w) != '']
                g_cons_train = " ".join(F.g_cons.v(w) for w in nodes).strip()
                parts_of_speech = ['\t'] + [F.sp.v(w) for w in nodes] + ['\n']

                input_seqs.append(g_cons_train)
                output_pos.append(parts_of_speech)

                # collect the input characters and the output vocabulary
                input_chars.update(g_cons_train)
                output_vocab.update(parts_of_speech)

    # ---- extrabiblical data -------------------------------------------------
    for bo in Fx.otype.s("book"):

        if Fx.book.v(bo) not in {'B_1QS', 'B_1QHa'}:
            continue

        words = Lx.d(bo, 'word')

        for first in words[0:-7]:

            # Here only sequences of 8 words are selected, maybe improve
            window = range(first, first + 8)

            # The language check is done once per window, so that a window with
            # Aramaic is skipped as a whole (as in the BHSA part above).
            window_languages = [Fx.language.v(w) for w in window if Fx.g_cons.v(w) != '']
            if 'Aramaic' in window_languages:
                continue

            g_cons_list = []
            parts_of_speech = []

            for w in window:
                g_cons = Fx.g_cons.v(w)

                if g_cons in {'', None}:
                    continue

                elif Fx.g_suffix.v(w) == '' and Fx.sp.v(w + 1) == 'prps':
                    # A word followed by a separate 'prps' word: glue both forms together
                    # and keep the part of speech of the first word.
                    # isinstance() is required here; comparing type(...) with the string
                    # 'str' is never true, which silently dropped these words.
                    # NOTE(review): confirm on the data that this condition only matches
                    # the cases that are meant to be merged.
                    next_g_cons = Fx.g_cons.v(w + 1)
                    if isinstance(g_cons, str) and isinstance(next_g_cons, str):
                        g_cons_list.append(g_cons + next_g_cons)
                        parts_of_speech.append(Fx.sp.v(w))

                elif Fx.g_suffix.v(w - 1) == '' and Fx.sp.v(w) == 'prps':
                    # this 'prps' word is handled together with the preceding word
                    continue

                else:
                    g_cons_list.append(g_cons)
                    parts_of_speech.append(Fx.sp.v(w))

            # windows that became too short are not used
            if len(g_cons_list) < 6:
                continue

            g_cons_train = ' '.join(g_cons_list)
            parts_of_speech = ['\t'] + parts_of_speech + ['\n']

            input_chars.update(g_cons_train)
            output_vocab.update(parts_of_speech)

            input_seqs.append(g_cons_train)
            output_pos.append(parts_of_speech)

    input_chars = sorted(input_chars)
    output_vocab = sorted(output_vocab)

    # in the LSTM network all the sequences have to have the same length
    max_len_input = max(len(clause) for clause in input_seqs)
    max_len_output = max(len(poss) for poss in output_pos)

    # shuffle the data, keeping inputs and outputs aligned
    input_seqs, output_pos = shuffle(input_seqs, output_pos, random_state=random_state)

    return input_seqs, output_pos, input_chars, output_vocab, max_len_input, max_len_output

In [None]:
def create_dicts(input_voc, output_voc):
    """
    Build the lookup tables between symbols and their integer index.

    input_voc and output_voc are sequences of symbols; the index of a symbol
    is its position in the sequence.

    Returns the tuple
    (input_idx2char, input_char2idx, output_idx2char, output_char2idx).
    """

    # index -> symbol follows directly from the position in the vocabulary
    input_idx2char = dict(enumerate(input_voc))
    output_idx2char = dict(enumerate(output_voc))

    # symbol -> index is the inverse mapping
    input_char2idx = {symbol: idx for idx, symbol in input_idx2char.items()}
    output_char2idx = {symbol: idx for idx, symbol in output_idx2char.items()}

    return input_idx2char, input_char2idx, output_idx2char, output_char2idx

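Now the final data preparation function is made. Categorical data are generally fed to the LSTM network in one-hot encoded form. All inputs are padded to the same length, and so are all outputs. Also created is the decoder target array (called target_data in the code), which is the output sequence shifted one timestep ahead, without the start symbol.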

In [None]:
def one_hot_encode(nb_samples, max_len_input, max_len_output, input_chars, output_vocab, input_char2idx, output_char2idx, input_clauses, output_pos):
    """
    One-hot encode the first nb_samples inputs and outputs.

    Returns three float32 arrays:
    tokenized_input  (nb_samples, max_len_input, len(input_chars)),
    tokenized_output (nb_samples, max_len_output, len(output_vocab)) and
    target_data, which has the shape of tokenized_output but holds the output
    symbols one timestep earlier and without the first (start) symbol.
    Positions beyond the end of a sequence stay zero.
    """

    n_in_symbols = len(input_chars)
    n_out_symbols = len(output_vocab)

    # three-dimensional arrays: sample x timestep x symbol
    tokenized_input = np.zeros((nb_samples, max_len_input, n_in_symbols), dtype='float32')
    tokenized_output = np.zeros((nb_samples, max_len_output, n_out_symbols), dtype='float32')
    target_data = np.zeros((nb_samples, max_len_output, n_out_symbols), dtype='float32')

    for sample in range(nb_samples):

        for step, char in enumerate(input_clauses[sample]):
            tokenized_input[sample, step, input_char2idx[char]] = 1

        for step, tag in enumerate(output_pos[sample]):
            tag_idx = output_char2idx[tag]
            tokenized_output[sample, step, tag_idx] = 1

            # the target is ahead by one timestep and skips the start symbol
            if step > 0:
                target_data[sample, step - 1, tag_idx] = 1

    return tokenized_input, tokenized_output, target_data

In the function define_LSTM_model() the architecture of the model is created. Neural networks are very flexible structures and a variety of architectures have been developed for various tasks. Here we use the encoder-decoder architecture with two LSTM layers in the encoder. In the architecture there is a variety of hyperparameters that you have to choose. Better hyperparameters lead to better predictions, so it is important to spend time on optimizing this. Hyperparameters in this architecture are the number of LSTM layers, the number of cells in each LSTM layer and the activation function.

In [None]:
def define_LSTM_model(input_chars, output_vocab):
    """
    Define the encoder-decoder training model.

    The encoder consists of two LSTM layers of 350 cells, the decoder of one
    LSTM layer of 350 cells followed by a softmax Dense layer over the output
    vocabulary. The decoder starts from the final states of the encoder.

    Returns (encoder_input, encoder_states, decoder_input, decoder_LSTM,
    decoder_dense, model); the first five are needed to build the inference
    models afterwards.
    """

    n_input_symbols = len(input_chars)
    n_output_symbols = len(output_vocab)

    # encoder: one-hot characters -> final hidden and cell state
    encoder_input = Input(shape=(None, n_input_symbols))
    first_layer_result = LSTM(350, activation='relu', return_state=True, return_sequences=True)(encoder_input)
    # NOTE(review): with return_state=True the first layer returns more than the
    # sequence output; the complete result is handed to the second layer as is.
    # Confirm how the installed Keras version interprets the extra entries.
    second_layer_result = LSTM(350, return_state=True)(first_layer_result)
    encoder_outputs, encoder_h, encoder_c = second_layer_result
    encoder_states = [encoder_h, encoder_c]

    # decoder: one-hot output symbols, initialised with the encoder states
    decoder_input = Input(shape=(None, n_output_symbols))
    decoder_LSTM = LSTM(350, return_sequences=True, return_state=True)
    decoder_sequence, _, _ = decoder_LSTM(decoder_input, initial_state=encoder_states)
    decoder_dense = Dense(n_output_symbols, activation='softmax')
    decoder_out = decoder_dense(decoder_sequence)

    model = Model(inputs=[encoder_input, decoder_input], outputs=[decoder_out])

    # print the architecture and the number of parameters
    model.summary()

    return encoder_input, encoder_states, decoder_input, decoder_LSTM, decoder_dense, model

Now the model is compiled and trained using the function compile_and_train(). The data are fed to the model in small batches. The train data are split in a train and validation set. The latter data consist of 5% of the original train set. The model is trained on the train set, and makes a prediction on these data. The difference between the predictions and the true values of the output are calculated with categorical crossentropy and is called the loss. During training this loss becomes smaller, which means that the predictions become more accurate. However, we want the model not only to become good on the train data, but it should be general enough to make accurate predictions on unseen data. Therefore, after every epoch a prediction is made on the small validation set and the validation loss is calculated. Ideally, the validation loss is more or less equal to the train loss. After a number of epochs, you will notice that the train loss keeps decreasing, while the validation loss remains equal or even increases. At this point the model starts to overfit, which means that the algorithm is modeling idiosyncrasies in the train data instead of general patterns. In that case it is time to stop training and make predictions on the test set.

Again, you have to choose a number of hyperparameters. These are the optimizer, the loss function, the batch size, the number of epochs and the learning rate. If you want, you can even tune more hyperparameters.

With EarlyStopping() the training process can be stopped earlier than the given number of epochs. This is useful if the model starts overfitting and the validation loss does not decrease anymore.

Note that training an LSTM model is a computationally intensive process. It is recommended to run the script on a GPU.

In [None]:
def compile_and_train(model, one_hot_in, one_hot_out, targets, batch_size, epochs, val_split):
    """
    Compile the model with the Adam optimizer and categorical crossentropy,
    and train it.

    Parameters
    ----------
    model : the Keras model returned by define_LSTM_model()
    one_hot_in : one-hot encoded encoder input
    one_hot_out : one-hot encoded decoder input
    targets : one-hot encoded decoder target
    batch_size : number of samples per batch
    epochs : maximum number of epochs
    val_split : fraction of the data that is used as validation set

    Training stops early when the validation loss has not improved for three
    epochs in a row.

    Returns the trained model (the same object that was passed in).
    """

    callback = EarlyStopping(monitor='val_loss', patience=3, verbose=0, mode='auto')

    # `learning_rate` is the supported name of this argument; the old alias `lr`
    # is deprecated and rejected by recent Keras versions.
    adam = Adam(learning_rate=0.00055, beta_1=0.99, beta_2=0.999, epsilon=0.00000001)

    model.compile(optimizer=adam, loss='categorical_crossentropy')
    model.fit(x=[one_hot_in, one_hot_out],
              y=targets,
              batch_size=batch_size,
              epochs=epochs,
              validation_split=val_split,
              callbacks=[callback])

    return model

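The train data are prepared. The function allows one book (for example Nehemiah) to be excluded from the train set, so that the model can be tested on sequences of words from that book. In the cell below, however, test_book is set to "no_test_book", which matches no book, so no book is excluded from the train set.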

In [None]:
# This value is compared with the BHSA book names in prepare_train_data();
# "no_test_book" is meant to match none of them, so that no book is held out.
test_book = "no_test_book"

# samples, vocabularies and maximal sequence lengths
input_clauses, output_pos, input_chars, output_vocab, max_len_input, max_len_output = prepare_train_data(test_book)
# lookup tables between symbols and indices
input_idx2char, input_char2idx, output_idx2char, output_char2idx = create_dicts(input_chars, output_vocab)

# encoder input, decoder input and decoder target as one-hot arrays
nb_samples = len(input_clauses)
one_hot_input, one_hot_output, target_data = one_hot_encode(nb_samples, max_len_input, max_len_output, input_chars, output_vocab, input_char2idx, output_char2idx, input_clauses, output_pos)

In [None]:
# number of training samples (displayed as the cell output)
len(input_clauses)

Here the functions define_LSTM_model() and compile_and_train() are called. A neural network learns in an iterative process. One iteration is called an epoch. In each iteration a prediction is made, and the train and validation loss are calculated, as you can see in the output.

The architecture of the model is also printed with the number of parameters. You also see the number of train samples (397552 samples).

In [None]:
# Build the training model; the separate layers are kept for the inference models below.
encoder_input, encoder_states, decoder_input, decoder_LSTM, decoder_dense, model = define_LSTM_model(input_chars, output_vocab)
# Train with batch size 1024, at most 150 epochs, and 5% of the data as validation set.
model = compile_and_train(model, one_hot_input, one_hot_output, target_data, 1024, 150, 0.05)

In [None]:
# Encoder inference model: maps an encoded input sequence to the encoder states.
encoder_model_inf = Model(encoder_input, encoder_states)

# Decoder inference model
# The states are fed in explicitly, so that decoding can proceed one step at a time.
# The size 350 has to be equal to the number of cells of the LSTM layers in define_LSTM_model().
decoder_state_input_h = Input(shape=(350,))
decoder_state_input_c = Input(shape=(350,))
decoder_input_states = [decoder_state_input_h, decoder_state_input_c]

# The trained decoder layers are reused; only the source of the initial state differs.
decoder_out, decoder_h, decoder_c = decoder_LSTM(decoder_input, 
                                                 initial_state=decoder_input_states)

decoder_states = [decoder_h , decoder_c]

decoder_out = decoder_dense(decoder_out)

# inputs: previous output symbol plus states; outputs: prediction plus updated states
decoder_model_inf = Model(inputs=[decoder_input] + decoder_input_states,
                          outputs=[decoder_out] + decoder_states )

### Make predictions on scroll

In [None]:
# which scroll?
# The value is compared with Tdss.scrollName() when the scroll is looked up,
# and it is used as prefix of the name of the csv file that is written at the end.

dss_book = '11Q19'

Many scrolls are in a fragmentary state, which means that only small pieces of the text remain. The function find_seqs finds places where the text of a scroll is continuous.

In [None]:
def find_seqs(words):
    """
    Split the word nodes of a scroll into runs of continuous text.

    A word whose text in the format 'text-trans-extra' equals ' 0  ' closes
    the current run and is itself left out. Empty runs are not stored.

    Returns a list of lists of word nodes.
    """

    runs = []
    current_run = []

    for node in words:

        if Tdss.text(node, fmt='text-trans-extra') == ' 0  ':
            # separator: store what has been collected so far and start again
            if current_run:
                runs.append(current_run)
                current_run = []
            continue

        current_run.append(node)

    # the text after the last separator
    if current_run:
        runs.append(current_run)

    return runs

In [None]:
# Look up the scroll named in dss_book and split its words into continuous sequences.
# all_seqs is reset first: otherwise a name that matches no scroll would leave
# all_seqs undefined, or silently reuse the sequences of an earlier run.
all_seqs = None

for scr in Fdss.otype.s('scroll'):
    scroll_name = Tdss.scrollName(scr)

    if scroll_name != dss_book:
        continue

    words = Ldss.d(scr, 'word')
    all_seqs = find_seqs(words)

if all_seqs is None:
    raise ValueError("scroll %r not found in the dss package" % (dss_book,))

# Prepare data and predict

The analysis is based on sequences of 8 words. However, some scattered texts contain shorter sequences. These are processed separately.

In [None]:
def preprocess_glyphs(all_glyphs, all_lexemes):
    """
    Convert dss glyphs to the spelling used in the training data.

    all_glyphs is changed in place and also returned; all_lexemes holds the
    lexeme (or None) of the word at the same position.

    Non-breaking spaces become ordinary spaces, apostrophes are removed and
    the lower case letters k, n, m, y, p are replaced by their upper case
    forms.

    The consonant '#' is used for both 'C' and 'F'. The lexeme decides which
    of the two is chosen: 'F' when the lexeme contains an 'F', otherwise
    (also when there is no lexeme) 'C'. This approach is crude, but works
    well in general.
    """

    # convert final dss consonants to their etcbc counterparts
    conversion = str.maketrans({
        u'\xa0': u' ',
        "'": None,
        "k": "K",
        "n": "N",
        "m": "M",
        "y": "Y",
        "p": "P",
    })

    for position, glyphs in enumerate(all_glyphs):

        glyphs = glyphs.translate(conversion)

        if '#' in glyphs:
            lexeme = all_lexemes[position]
            sibilant = 'F' if (lexeme is not None and 'F' in lexeme) else 'C'
            glyphs = glyphs.replace('#', sibilant)

        all_glyphs[position] = glyphs

    return all_glyphs
            

In [None]:
def prepare_test_data_dss_module(seq, max_len_input):
    """
    Function used for preparation of data from dss package

    Parameters
    ----------
    seq : list of word nodes of the dss package, forming one continuous piece of text
    max_len_input : int, maximal number of characters the encoder accepts

    A sequence of fewer than 8 words yields at most one sample; a longer
    sequence yields one sample per window of 8 consecutive words. Words
    without glyphs are left out of a sample.

    A sample is only produced when all its characters are known from the
    training data (global input_char2idx) and its text is not longer than
    max_len_input. This holds for short sequences as well, because otherwise
    the one-hot encoding fails on such a sample.

    Returns
    -------
    input_seqs_test : list of str, the texts of the samples
    output_seqs_test : list of list of str, per sample the parts of speech
        according to the dss package, between a start (tab) and a stop
        (newline) symbol
    word_nodes : list of list of int, per sample the word nodes, aligned with
        the words in the text
    """

    input_seqs_test = []
    output_seqs_test = []
    word_nodes = []

    def has_glyphs(node):
        # excluded are the glyphe values '' and None
        return Fdss.glyphe.v(node) not in ('', None)

    def can_be_encoded(text):
        # sometimes greek letters occur, check for this; also check the length
        return len(text) <= max_len_input and all(ch in input_char2idx for ch in text)

    def add_sample(nodes):
        # Turn the given word nodes into one sample, if the text can be encoded.
        lexemes = [Fdss.glexe.v(node) for node in nodes]
        glyphs = preprocess_glyphs([Fdss.glyphe.v(node) for node in nodes], lexemes)
        text = " ".join(glyphs).strip()

        if not can_be_encoded(text):
            return

        input_seqs_test.append(text)
        output_seqs_test.append(['\t'] + [Fdss.sp.v(node) for node in nodes] + ['\n'])
        word_nodes.append(nodes)

    if len(seq) < 8:
        # Short piece of text: one sample with all its words.
        # NOTE(review): the language of the words is not checked for short sequences.
        add_sample([node for node in seq if has_glyphs(node)])
        return input_seqs_test, output_seqs_test, word_nodes

    # now longer sequences are processed
    for start in range(len(seq) - 7):

        window = seq[start:start + 8]

        if not has_glyphs(window[0]):
            continue

        # windows with words in language 'a' or 'g' (presumably Aramaic and Greek) are skipped
        languages = [Fdss.lang.v(node) for node in window if Fdss.glyphe.v(node) != '']
        if 'a' in languages or 'g' in languages:
            continue

        add_sample([node for node in window
                    if has_glyphs(node) and Tdss.text(node, fmt='text-trans-extra') != "00 "])

    return input_seqs_test, output_seqs_test, word_nodes

In [None]:
def one_hot_encode2(nb_samples, max_len_input, max_len_output, input_chars, output_vocab, input_char2idx, output_char2idx, input_clauses, output_pos):
    """
    Function is used for new data, without validation

    Only the input is encoded: the result is a float32 array of shape
    (nb_samples, max_len_input, len(input_chars)). The arguments that concern
    the output are accepted for symmetry with one_hot_encode() but not used.
    """

    encoded_shape = (nb_samples, max_len_input, len(input_chars))

    # three-dimensional array: sample x timestep x character
    tokenized_input = np.zeros(encoded_shape, dtype='float32')

    for sample in range(nb_samples):
        text = input_clauses[sample]
        for step, char in enumerate(text):
            char_idx = input_char2idx[char]
            tokenized_input[sample, step, char_idx] = 1

    return tokenized_input

In [None]:
def decode_seq(inp_seq, max_steps=None):
    """
    This function predicts the POS on the basis of an input sequence
    The input is a one-hot encoded sequence of Hebrew words
    The output is a list of POS

    Parameters
    ----------
    inp_seq : one-hot encoded input, as a batch with a single sample
    max_steps : int, optional
        Maximal number of symbols that is predicted. The default is the
        global max_len_output. Without such a limit the loop would never end
        for an input on which the model does not predict the stop symbol.

    Returns
    -------
    list of str
        The predicted symbols. The last one is the stop symbol (newline),
        unless decoding was ended by max_steps.
    """

    if max_steps is None:
        max_steps = max_len_output

    # the encoder states are the initial states of the decoder
    states_val = encoder_model_inf.predict(inp_seq)

    # the first decoder input is the start symbol
    target_seq = np.zeros((1, 1, len(output_vocab)))
    target_seq[0, 0, output_char2idx['\t']] = 1

    pred_pos = []

    while len(pred_pos) < max_steps:

        decoder_out, decoder_h, decoder_c = decoder_model_inf.predict(x=[target_seq] + states_val)

        # greedy decoding: take the most probable symbol
        max_val_index = np.argmax(decoder_out[0, -1, :])
        sampled_out_char = output_idx2char[max_val_index]
        pred_pos.append(sampled_out_char)

        if sampled_out_char == '\n':
            break

        # the prediction and the new states are the input of the next step
        target_seq = np.zeros((1, 1, len(output_vocab)))
        target_seq[0, 0, max_val_index] = 1

        states_val = [decoder_h, decoder_c]

    return pred_pos

In [None]:
# word node -> list of the POS predicted for that word; a word that occurs in
# several overlapping windows gets several predictions
all_predictions = collections.defaultdict(list)

# loop over continuous pieces of text
for seq in all_seqs:

    if len(seq) == 0:
        continue
    
    # exclude 'interpunction signs'
    seq = [w for w in seq if Tdss.text(w, fmt='text-trans-extra') != "00 "]
    
    # prepare data and create one-hot encoding
    input_seqs_dss, output_seqs_test_dss, words_list = prepare_test_data_dss_module(seq, max_len_input)
    one_hot_dss = one_hot_encode2(len(input_seqs_dss), max_len_input, max_len_output, input_chars, output_vocab, input_char2idx, output_char2idx, input_seqs_dss, output_seqs_test_dss)

    print(dss_book, len(input_seqs_dss), one_hot_dss.shape)

    for seq_index in range(len(one_hot_dss)):
    
        # samples without words cannot receive predictions
        if len(words_list[seq_index]) == 0:
            continue
            
        # slice instead of index, so that the batch dimension is kept
        inp_seq = one_hot_dss[seq_index:seq_index+1]
    
        pred_pos = decode_seq(inp_seq)
        # drop the last symbol, which is expected to be the stop symbol
        pred_pos = pred_pos[:-1]

        print(input_seqs_dss[seq_index])
        print(pred_pos)
        print(' ')
        
        # predictions can only be assigned to words if there is exactly one per word
        if len(words_list[seq_index]) != len(pred_pos):
            continue
            
        for pred_ind in range(len(pred_pos)):
            all_predictions[words_list[seq_index][pred_ind]].append(pred_pos[pred_ind]) 

## Process predictions

In [None]:
# the word nodes that received at least one prediction, in ascending order
all_words = sorted(list(all_predictions.keys()))

In [None]:
from collections import Counter 
  
def most_frequent(item_list):
    """Return the item that occurs most often in item_list."""
    tally = Counter(item_list)
    top_item, _top_count = tally.most_common(1)[0]
    return top_item

In [None]:
# parallel lists, one entry per word; they become the columns of the csv file
tf_word_id = []
glyphe = []
poss_dss = []
pos_etcbc = []

for key in all_words:

    if len(all_predictions[key]) == 0:
        continue
        
    # NOTE(review): `data` is not used in this cell.
    data = collections.Counter(all_predictions[key])

    # the part of speech according to the dss package
    pos_dss = Fdss.sp.v(key)
    # show: word node, glyphs, POS in the dss package, most frequent prediction
    print(key , Fdss.glyphe.v(key), Fdss.sp.v(key), most_frequent(all_predictions[key]))
    
    tf_word_id.append(key)
    glyphe.append(Fdss.glyphe.v(key))
    poss_dss.append(pos_dss)
    
    # if the dss package says the pos is unknown, we adopt that and overrule our prediction
    if pos_dss == 'unknown':
        pos_etcbc.append('unknown')
        
    else:
        # the final prediction is the one that was made most often for this word
        pos_etcbc.append(most_frequent(all_predictions[key]))

In [None]:
# One row per word: node, glyphs, POS in the dss package and predicted POS.
dss_df = pd.DataFrame(list(zip(tf_word_id, glyphe, poss_dss, pos_etcbc)), 
               columns =['tf_word_id', 'g_cons', 'pos_dss', 'pos_etcbc']) 

# The file name is relative, so the file ends up in the current working directory.
file_name = dss_book + '_pos.csv'

dss_df.to_csv(file_name, index=False)

In [None]:
# Print the names of all scrolls in the dss package; these are the values that can be used for dss_book.
for scr in Fdss.otype.s('scroll'):
    scroll_name = Tdss.scrollName(scr)
    print(scroll_name)