In [1]:
import os, collections

import pandas as pd
import numpy as np

from sklearn.utils import shuffle

from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, LSTM, Dense
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.optimizers import Adam

In [2]:
from google.colab import drive

# Mount Google Drive into the Colab filesystem under /content/gdrive.
drive.mount('/content/gdrive')
# NOTE(review): this is a relative path and it is not referenced by any cell visible here — confirm it is still needed.
root_path = 'gdrive/My Drive/your_project_folder/' 

Mounted at /content/gdrive


In [3]:
def read_data_from_file(filename, data_dict):
    """Read a tab-separated corpus file and collect its words per book.

    Every non-blank line must consist of exactly four tab-separated fields:
    book, chapter, verse and text. The text is split on whitespace, every
    word is split again on '_', and the resulting pieces are appended (in
    file order) to the list stored under the book key.

    Args:
        filename: path of the file to read.
        data_dict: mapping from book key to a list of word pieces. It is
            updated in place; a plain dict works as well as a
            ``collections.defaultdict(list)``.

    Returns:
        The same ``data_dict`` object that was passed in.

    Raises:
        ValueError: if a non-blank line does not have exactly four fields.
    """
    with open(filename) as fp:
        for line_no, raw_line in enumerate(fp, start=1):
            line = raw_line.strip()
            if not line:
                # Blank lines (for instance an extra newline at the end of the
                # file) carry no data; skip them instead of failing to unpack.
                continue

            fields = line.split('\t')
            if len(fields) != 4:
                raise ValueError(
                    "{}:{}: expected 4 tab-separated fields, got {}".format(
                        filename, line_no, len(fields)))
            bo, _ch, _ve, text = fields

            for w in text.split():
                # in the output data, composite placenames have a '_', which cannot be found in the input data
                data_dict.setdefault(bo, []).extend(w.split('_'))

    return data_dict

In [7]:
# NOTE(review): these absolute paths assume the ssi_morphology data already exists under /content;
# no cell visible here creates it — confirm the step that provides it.
input_file = '/content/ssi_morphology/data/t-in_voc'
input_data = collections.defaultdict(list)

output_file = '/content/ssi_morphology/data/t-out'
output_data = collections.defaultdict(list)

# Fill both dicts: each maps a book key to the list of word pieces read from its file.
input_data = read_data_from_file(input_file, input_data)
output_data = read_data_from_file(output_file, output_data)

In [8]:
# Sanity check: inputs and outputs are paired by position later on, so both counts should match.
print(len(input_data['Gen']))
print(len(output_data['Gen']))

20611
20611


In [9]:
# Show the first ten input entries stored under the 'Gen' key.
input_data['Gen'][0:10]

['B.:R;>CIJT',
 'B.@R@>',
 '>:ELOHIJM',
 '>;T',
 'HAC.@MAJIM',
 'W:>;T',
 'H@>@REY',
 'W:H@>@REY',
 'H@J:T@H',
 'TOHW.']

In [11]:
# Show the first ten output entries stored under the 'Gen' key.
output_data['Gen'][0:10]

['B-R>CJT/',
 'BR>[',
 '>LH(J(M/JM',
 '>T',
 'H-CMJ(M/(JM',
 'W->T',
 'H->RY/:a',
 'W-H->RY/:a',
 'HJ(H[&TH',
 'THW/']

In [12]:
def make_in_out_sequences(data_dict, sequence_length):
    """Build space-joined sliding windows of words for every list in ``data_dict``.

    For each value (a list of words) every run of ``sequence_length``
    consecutive words is joined with a single space. Windows never span two
    lists. A fixed series of textual substitutions is then applied to each
    joined string, and the results are returned in iteration order.

    Args:
        data_dict: mapping whose values are lists of word strings.
        sequence_length: number of consecutive words per sequence.

    Returns:
        A list with one cleaned string per window.
    """
    # Substitutions are applied strictly in this order: '=' is dropped and the
    # ':' in front of 'a', 'c' and 'd' is removed to shorten the sequences.
    # ':d' runs before ':du', so the last rule usually has nothing left to match.
    substitutions = (
        ("=", ""),
        (":a", "a"),
        (":c", "c"),
        (":d", "d"),
        (":du", "du"),
    )

    all_sequences = []
    for book_words in data_dict.values():
        window_count = len(book_words) - sequence_length + 1

        for start in range(window_count):
            window = (book_words[pos] for pos in range(start, start + sequence_length))
            joined = ' '.join(window)

            for old, new in substitutions:
                joined = joined.replace(old, new)

            all_sequences.append(joined)

    return all_sequences

In [13]:
# Number of consecutive words joined into one sequence; with 1 every sequence is a single word.
sequence_length = 1

# Input and output sequences are built the same way so that they stay aligned by position.
all_in_seqs = make_in_out_sequences(input_data, sequence_length)
all_out_seqs = make_in_out_sequences(output_data, sequence_length)


In [14]:
# Show the first ten input sequences.
all_in_seqs[0:10]

['B.:R;>CIJT',
 'B.@R@>',
 '>:ELOHIJM',
 '>;T',
 'HAC.@MAJIM',
 'W:>;T',
 'H@>@REY',
 'W:H@>@REY',
 'H@J:T@H',
 'TOHW.']

In [15]:
# Sanity check: both lists are paired by index during training preparation, so the counts should match.
print(len(all_in_seqs))
print(len(all_out_seqs))

300676
300676


In [16]:
# Spot-check the alignment of twenty input/output pairs from the middle of the data.
for i in range(206000,206020):
  print(all_in_seqs[i], '---', all_out_seqs[i])

B.:BOW>@M --- B-!!BW>[/+M
J@BOW> --- !J!BW>[
W.B:Y;>T@M --- W-B-!!(JY>[/T+M
J;Y;>W. --- !J!(JY>[W
W.BAXAG.IJM --- W-B-(H-XG/JM
W.BAM.OW<:ADIJM --- W-B-(H-MW<D/JM
T.IH:JEH --- !T!HJH[
HAM.IN:X@H --- H-MNX(H/H
>;JP@H --- >JP(H/H
LAP.@R --- L-(H-PR/a
W:>;JP@H --- W->JP(H/H
L@>AJIL --- L-(H->JL/a
W:LAK.:B@FIJM --- W-L-(H-KBF/JM
MAT.AT --- MTT/
J@DOW --- JD/+W
W:CEMEN --- W-CMN/
HIJN --- HJN/
L@>;JP@H --- L-(H->JP(H/H
W:KIJ --- W-KJ
JA<:AFEH --- !J!<FH[


In [17]:
def prepare_train_data(input_data, output_data, random_state=None):
    """Turn paired input/output strings into shuffled character sequences.

    Pairs whose input contains '*' are dropped. Every remaining input string
    becomes a list of characters; every output string becomes a list of
    characters wrapped in a leading '\\t' (start marker) and a trailing '\\n'
    (end marker).

    Args:
        input_data: sequence of input strings.
        output_data: sequence of output strings, aligned with ``input_data``
            by position.
        random_state: optional seed (or RandomState) forwarded to
            ``sklearn.utils.shuffle`` for a reproducible order. The default
            ``None`` gives a different order on every call.

    Returns:
        A tuple ``(input_seqs, output_seqs, input_chars, output_chars,
        max_len_input, max_len_output)``: the two shuffled lists of character
        lists, the sorted input and output vocabularies, and the longest
        input and output sequence lengths (0 when no pair is kept).

    Raises:
        ValueError: if ``input_data`` and ``output_data`` differ in length.
    """
    if len(input_data) != len(output_data):
        # Pairing is purely positional, so a length mismatch means misaligned data.
        raise ValueError(
            "input_data and output_data must have the same length, got {} and {}".format(
                len(input_data), len(output_data)))

    input_seqs = []
    output_seqs = []
    input_chars = set()
    output_chars = set()

    # iterate over all the input/output sequence pairs
    for in_seq, out_seq in zip(input_data, output_data):

        if "*" in in_seq: # cases of ketiv/qere are complicated, just skip them!
            continue

        input_list = list(in_seq)
        output_list = ['\t'] + list(out_seq) + ['\n']

        input_seqs.append(input_list)
        output_seqs.append(output_list)

        input_chars.update(input_list)
        output_chars.update(output_list)

    input_chars = sorted(input_chars)
    output_chars = sorted(output_chars)

    # default=0 keeps empty or fully filtered input from raising inside max()
    max_len_input = max((len(seq) for seq in input_seqs), default=0)
    max_len_output = max((len(seq) for seq in output_seqs), default=0)

    # shuffle the data. The model will get the data in small batches, it is preferable if the batches are more or less homogeneous
    # of course the inputs and outputs have to be shuffled identically
    if input_seqs:
        input_seqs, output_seqs = shuffle(input_seqs, output_seqs, random_state=random_state)

    return input_seqs, output_seqs, input_chars, output_chars, max_len_input, max_len_output

In [18]:
def create_dicts(input_voc, output_voc):
    """Build index<->character lookup tables for both vocabularies.

    Each vocabulary is enumerated from 0 in the order given.

    Args:
        input_voc: iterable of input characters.
        output_voc: iterable of output characters.

    Returns:
        A tuple ``(input_idx2char, input_char2idx, output_idx2char,
        output_char2idx)`` of plain dicts.
    """
    def _two_way(vocabulary):
        # index -> character, then the inverse character -> index mapping
        idx2char = dict(enumerate(vocabulary))
        char2idx = {char: idx for idx, char in idx2char.items()}
        return idx2char, char2idx

    input_idx2char, input_char2idx = _two_way(input_voc)
    output_idx2char, output_char2idx = _two_way(output_voc)

    return input_idx2char, input_char2idx, output_idx2char, output_char2idx

In [19]:
def one_hot_encode(nb_samples, max_len_input, max_len_output, input_chars, output_chars, input_char2idx, output_char2idx, input_seqs, output_seqs):
    """One-hot encode the first ``nb_samples`` input/output character sequences.

    Args:
        nb_samples: number of sequences to encode.
        max_len_input: size of the time axis of the input array.
        max_len_output: size of the time axis of the output and target arrays.
        input_chars: input vocabulary; only its length is used.
        output_chars: output vocabulary; only its length is used.
        input_char2idx: mapping from input character to column index.
        output_char2idx: mapping from output character to column index.
        input_seqs: list of input character sequences.
        output_seqs: list of output character sequences.

    Returns:
        Three float32 arrays ``(tokenized_input, tokenized_output,
        target_data)``. The first has shape
        ``(nb_samples, max_len_input, len(input_chars))``, the other two
        ``(nb_samples, max_len_output, len(output_chars))``. ``target_data``
        is ``tokenized_output`` moved one timestep earlier, so it does not
        contain the first character of each output sequence. Positions past
        the end of a sequence stay all-zero.
    """
    n_in_symbols = len(input_chars)
    n_out_symbols = len(output_chars)

    tokenized_input = np.zeros((nb_samples, max_len_input, n_in_symbols), dtype='float32')
    tokenized_output = np.zeros((nb_samples, max_len_output, n_out_symbols), dtype='float32')
    target_data = np.zeros_like(tokenized_output)

    for sample in range(nb_samples):
        for step, char in enumerate(input_seqs[sample]):
            tokenized_input[sample, step, input_char2idx[char]] = 1

        for step, char in enumerate(output_seqs[sample]):
            column = output_char2idx[char]
            tokenized_output[sample, step, column] = 1

            # the target is ahead by one timestep and skips the start character
            if step >= 1:
                target_data[sample, step - 1, column] = 1

    return tokenized_input, tokenized_output, target_data

In [116]:
from tensorflow.keras.layers import Attention

def define_LSTM_model(input_chars, output_chars):
    """Build the character-level encoder-decoder LSTM model used for training.

    The encoder stacks two 500-unit LSTMs; the final states of the second one
    initialise a 500-unit decoder LSTM, whose per-timestep outputs go through
    a softmax Dense layer over the output vocabulary. The model summary is
    printed as a side effect.

    Args:
        input_chars: input vocabulary; its length is the encoder feature size.
        output_chars: output vocabulary; its length is the decoder feature
            size and the width of the softmax layer.

    Returns:
        A tuple ``(encoder_input, encoder_states, decoder_input,
        decoder_out2, decoder_dense, model)``: the encoder Input tensor, the
        ``[h, c]`` states of the last encoder LSTM, the decoder Input tensor,
        the softmax output tensor, the Dense layer object, and the uncompiled
        Model mapping ``[encoder_input, decoder_input]`` to ``decoder_out2``.
    """
    # encoder model
    encoder_input = Input(shape=(None, len(input_chars)))
    first_seq, first_h, first_c = LSTM(
        500, activation='relu', return_state=True, return_sequences=True)(encoder_input)
    # The first layer's final states seed the second layer. This used to happen
    # implicitly by passing the whole [sequence, h, c] list as the layer input,
    # which tf.keras 2.x splits into input and initial_state; it is spelled out
    # here so it does not depend on that behaviour.
    _, encoder_h, encoder_c = LSTM(500, return_state=True)(
        first_seq, initial_state=[first_h, first_c])
    encoder_states = [encoder_h, encoder_c]

    # decoder model
    decoder_input = Input(shape=(None, len(output_chars)))
    decoder_LSTM = LSTM(500, return_sequences=True, return_state=True)
    decoder_out1, _, _ = decoder_LSTM(decoder_input, initial_state=encoder_states)
    decoder_dense = Dense(len(output_chars), activation='softmax')
    decoder_out2 = decoder_dense(decoder_out1)

    # NOTE(review): no attention layer is part of this model. Adding one would
    # need the last encoder LSTM to return its full sequence (a 3-D tensor) and
    # the attention output to be fed into the dense layer.

    model = Model(inputs=[encoder_input, decoder_input], outputs=[decoder_out2])

    model.summary()

    return encoder_input, encoder_states, decoder_input, decoder_out2, decoder_dense, model

In [24]:
def compile_and_train(model, one_hot_in, one_hot_out, targets, batch_size, epochs, val_split):
    """Compile ``model`` with Adam and categorical cross-entropy, then fit it.

    Training stops early once the validation loss has not improved for three
    consecutive epochs.

    Args:
        model: Keras model taking ``[one_hot_in, one_hot_out]`` as inputs.
        one_hot_in: encoder input array.
        one_hot_out: decoder input array.
        targets: array the model output is trained against.
        batch_size: number of samples per gradient update.
        epochs: maximum number of epochs.
        val_split: fraction of the data handed to ``validation_split``.

    Returns:
        The same ``model`` object after fitting.
    """
    callback = EarlyStopping(monitor='val_loss', patience=3, verbose=0, mode='auto')
    # 'learning_rate' replaces the deprecated 'lr' keyword
    adam = Adam(learning_rate=0.0006, beta_1=0.995, beta_2=0.999, epsilon=0.00000001)
    model.compile(optimizer=adam, loss='categorical_crossentropy')
    model.fit(x=[one_hot_in, one_hot_out],
              y=targets,
              batch_size=batch_size,
              epochs=epochs,
              validation_split=val_split,
              callbacks=[callback])

    return model

In [25]:
# Convert the paired strings into shuffled character sequences, vocabularies and maximum lengths.
input_seqs, output_seqs, input_chars, output_chars, max_len_input, max_len_output = prepare_train_data(all_in_seqs, all_out_seqs)
# Number of pairs kept after the '*' (ketiv/qere) cases were skipped.
print(len(input_seqs))

299488


In [26]:
# Lookup tables between characters and their column index, for both vocabularies.
input_idx2char, input_char2idx, output_idx2char, output_char2idx = create_dicts(input_chars, output_chars)

# One-hot encode every prepared sequence; target_data is the decoder input moved one timestep earlier.
nb_samples = len(input_seqs)
one_hot_input, one_hot_output, target_data = one_hot_encode(nb_samples, max_len_input, max_len_output, input_chars, output_chars, input_char2idx, output_char2idx, input_seqs, output_seqs)

In [27]:
# Keep the first train_size samples for training (the data were already shuffled in prepare_train_data).
# NOTE(review): the remaining samples are presumably held out for evaluation — no cell visible here uses them; confirm.
train_size = 290000

one_hot_input_train = one_hot_input[0:train_size]
one_hot_output_train = one_hot_output[0:train_size]
target_data_train = target_data[0:train_size]

In [28]:
# Longest input and output sequence lengths (the output length includes the start and end markers).
print(max_len_input, max_len_output)


23 28


In [None]:
# NOTE(review): define_LSTM_model returns (..., decoder_out2, decoder_dense, model), so the names
# decoder_out1 and attention_layer below actually hold the softmax output tensor and the Dense layer.
encoder_input, encoder_states, decoder_input, decoder_out1, attention_layer, model = define_LSTM_model(input_chars, output_chars)
# Positional arguments: batch_size=1024, epochs=150, val_split=0.05.
model = compile_and_train(model, one_hot_input_train, one_hot_output_train, target_data_train, 1024, 150, 0.05)