In [1]:
import os, collections

import pandas as pd
import numpy as np

from sklearn.utils import shuffle

from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, LSTM, Dense
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.optimizers import Adam

In [2]:
from google.colab import drive

drive.mount('/content/gdrive')
root_path = 'gdrive/My Drive/your_project_folder/' 

Mounted at /content/gdrive


In [3]:
def read_data_from_file(filename, data_dict):

    with open(filename) as fp:
        line = fp.readline()
        while line:
            bo, ch, ve, text = tuple(line.strip().split('\t'))
            words = text.split()
            for w in words:  
                # in the output data, composite placenames have a '_', which cannot be found in the input data
                words_split = w.split('_')               
                for word_split in words_split:
                    data_dict[bo].append(word_split)
        
            line = fp.readline()
            
    return data_dict

# Nieuwe sectie

In [4]:
input_file = '/content/t-in_voc'
input_data = collections.defaultdict(list)

output_file = '/content/t-out'
output_data = collections.defaultdict(list)

input_data = read_data_from_file(input_file, input_data)
output_data = read_data_from_file(output_file, output_data)

In [5]:
print(len(input_data['Gen']))
print(len(output_data['Gen']))

20611
20611


In [6]:
input_data['Gen'][0:10]

['B.:R;>CIJT',
 'B.@R@>',
 '>:ELOHIJM',
 '>;T',
 'HAC.@MAJIM',
 'W:>;T',
 'H@>@REY',
 'W:H@>@REY',
 'H@J:T@H',
 'TOHW.']

In [7]:
output_data['Gen'][0:10]

['B-R>CJT/',
 'BR>[',
 '>LH(J(M/JM',
 '>T',
 'H-CMJ(M/(JM',
 'W->T',
 'H->RY/:a',
 'W-H->RY/:a',
 'HJ(H[&TH',
 'THW/']

In [8]:
def make_in_out_sequences(data_dict, sequence_length):
    
    all_sequences = []
    for words_list in data_dict.values():

        for w in range(len(words_list) - sequence_length + 1):
    
            seq = ' '.join([words_list[ind] for ind in list(range(w, w + sequence_length))])
        
            # remove some special signs from output data (':', and '='). These only make the sequences longer.
            seq = seq.replace("=", "").replace(":a", "a").replace(":c", "c").replace(":d", "d").replace(":du", "du")
            all_sequences.append(seq)
        
    return all_sequences

In [9]:
sequence_length = 1

all_in_seqs = make_in_out_sequences(input_data, sequence_length)
all_out_seqs = make_in_out_sequences(output_data, sequence_length)


In [10]:
all_in_seqs[0:10]

['B.:R;>CIJT',
 'B.@R@>',
 '>:ELOHIJM',
 '>;T',
 'HAC.@MAJIM',
 'W:>;T',
 'H@>@REY',
 'W:H@>@REY',
 'H@J:T@H',
 'TOHW.']

In [11]:
print(len(all_in_seqs))
print(len(all_out_seqs))

300676
300676


In [12]:
for i in range(206000,206020):
  print(all_in_seqs[i], '---', all_out_seqs[i])

B.:BOW>@M --- B-!!BW>[/+M
J@BOW> --- !J!BW>[
W.B:Y;>T@M --- W-B-!!(JY>[/T+M
J;Y;>W. --- !J!(JY>[W
W.BAXAG.IJM --- W-B-(H-XG/JM
W.BAM.OW<:ADIJM --- W-B-(H-MW<D/JM
T.IH:JEH --- !T!HJH[
HAM.IN:X@H --- H-MNX(H/H
>;JP@H --- >JP(H/H
LAP.@R --- L-(H-PR/a
W:>;JP@H --- W->JP(H/H
L@>AJIL --- L-(H->JL/a
W:LAK.:B@FIJM --- W-L-(H-KBF/JM
MAT.AT --- MTT/
J@DOW --- JD/+W
W:CEMEN --- W-CMN/
HIJN --- HJN/
L@>;JP@H --- L-(H->JP(H/H
W:KIJ --- W-KJ
JA<:AFEH --- !J!<FH[


In [13]:
def prepare_train_data(input_data, output_data):

    input_seqs = []
    output_seqs = []
    input_chars = set()
    output_chars = set()

    # iterate over all the books
    for seq in range(len(input_data)): 
      
        #if len(output_data[seq]) > 40:
        #  continue
          
        if "*" in input_data[seq]: # cases of ketiv/qere are complicated, just skip them!
          continue

        input_list = list(input_data[seq])

        output_list = list(output_data[seq])
        output_list = ['\t'] + output_list + ['\n']

        input_seqs.append(input_list)
        output_seqs.append(output_list)

        for input_ch in input_list:
            input_chars.add(input_ch)
        
        for output_ch in output_list:
            output_chars.add(output_ch)
                
    
    input_chars = sorted(list(input_chars))
    output_chars = sorted(list(output_chars))
    
    max_len_input = max([len(seq) for seq in input_seqs])
    max_len_output = max([len(seq) for seq in output_seqs])
    
    # shuffle the data. The model will get the data in small batches, it is preferable if the batches are more or less homogeneous
    # of course the inputs and outputs have to be shuffled identically
    input_seqs, output_seqs = shuffle(input_seqs, output_seqs)
    
    return input_seqs, output_seqs, input_chars, output_chars, max_len_input, max_len_output

In [14]:
def create_dicts(input_voc, output_voc):
    
    # these dicts map the input sequences
    input_idx2char = {}
    input_char2idx = {}

    for k, v in enumerate(input_voc):
        input_idx2char[k] = v
        input_char2idx[v] = k
     
    # and these dicts map the output sequences of parts of speech
    output_idx2char = {}
    output_char2idx = {}
    
    for k, v in enumerate(output_voc):
        output_idx2char[k] = v
        output_char2idx[v] = k
        
    return input_idx2char, input_char2idx, output_idx2char, output_char2idx

In [15]:
def one_hot_encode(nb_samples, max_len_input, max_len_output, input_chars, output_chars, input_char2idx, output_char2idx, input_seqs, output_seqs):
    
    # three-dimensional numpy arrays are created 
    tokenized_input = np.zeros(shape = (nb_samples, max_len_input, len(input_chars)), dtype='float32')
    tokenized_output = np.zeros(shape = (nb_samples, max_len_output, len(output_chars)), dtype='float32')
    target_data = np.zeros((nb_samples, max_len_output, len(output_chars)), dtype='float32')

    for i in range(nb_samples):
        for k, ch in enumerate(input_seqs[i]):
            tokenized_input[i, k, input_char2idx[ch]] = 1
        
        for k, ch in enumerate(output_seqs[i]):
            tokenized_output[i, k, output_char2idx[ch]] = 1

            # decoder_target_data will be ahead by one timestep and will not include the start character.
            if k > 0:
                target_data[i, k-1, output_char2idx[ch]] = 1
                
    return tokenized_input, tokenized_output, target_data

In [27]:
from tensorflow.keras.layers import Attention, GlobalAveragePooling1D

def define_LSTM_model(input_chars, output_chars):

    # encoder model
    encoder_input = Input(shape=(None,len(input_chars)))
    encoder_LSTM = LSTM(50,activation='relu',return_state=True, return_sequences=True)(encoder_input)
    encoder_LSTM = LSTM(50,return_state=True)(encoder_LSTM)
    encoder_outputs, encoder_h, encoder_c = encoder_LSTM
    encoder_states = [encoder_h, encoder_c]
    

    # decoder model
    decoder_input = Input(shape=(None,len(output_chars)))
    decoder_LSTM = LSTM(50, return_sequences=True, return_state = True)
    decoder_out1, _ , _ = decoder_LSTM(decoder_input, initial_state=encoder_states)
    decoder_dense = Dense(len(output_chars), activation='softmax')
    decoder_out2 = decoder_dense(decoder_out1)

    # Attention layer
    attention_layer = Attention()([encoder_outputs, decoder_out2])
    pooled_attention_layer = GlobalAveragePooling1D()(attention_layer)
  

    model = Model(inputs=[encoder_input, decoder_input],outputs=[decoder_out2])

    model.summary()

    return encoder_input, encoder_states, decoder_input, decoder_out2, decoder_dense, pooled_attention_layer, model

In [17]:
def compile_and_train(model, one_hot_in, one_hot_out, targets, batch_size, epochs, val_split):

    callback = EarlyStopping(monitor='val_loss', patience=3, verbose=0, mode='auto')
    adam = Adam(lr=0.0006, beta_1=0.995, beta_2=0.999, epsilon=0.00000001)
    model.compile(optimizer=adam, loss='categorical_crossentropy')
    model.fit(x=[one_hot_in,one_hot_out], 
              y=targets,
              batch_size=batch_size,
              epochs=epochs,
              validation_split=val_split,
              callbacks=[callback])
    
    return model

In [18]:
input_seqs, output_seqs, input_chars, output_chars, max_len_input, max_len_output = prepare_train_data(all_in_seqs, all_out_seqs)
print(len(input_seqs))

299488


In [19]:
input_idx2char, input_char2idx, output_idx2char, output_char2idx = create_dicts(input_chars, output_chars)

nb_samples = len(input_seqs)
one_hot_input, one_hot_output, target_data = one_hot_encode(nb_samples, max_len_input, max_len_output, input_chars, output_chars, input_char2idx, output_char2idx, input_seqs, output_seqs)

In [20]:
train_size = 290000

one_hot_input_train = one_hot_input[0:train_size]
one_hot_output_train = one_hot_output[0:train_size]
target_data_train = target_data[0:train_size]

In [21]:
print(max_len_input, max_len_output)


23 28


In [28]:
encoder_input, encoder_states, decoder_input, decoder_out2, decoder_dense, pooled_attention_layer, model = define_LSTM_model(input_chars, output_chars)
model = compile_and_train(model, one_hot_input_train, one_hot_output_train, target_data_train, 1024, 150, 0.05)

Model: "model"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 input_7 (InputLayer)           [(None, None, 32)]   0           []                               
                                                                                                  
 lstm_9 (LSTM)                  [(None, None, 50),   16600       ['input_7[0][0]']                
                                 (None, 50),                                                      
                                 (None, 50)]                                                      
                                                                                                  
 input_8 (InputLayer)           [(None, None, 42)]   0           []                               
                                                                                              

  super(Adam, self).__init__(name, **kwargs)


Epoch 1/150
Epoch 2/150
Epoch 3/150
Epoch 4/150
Epoch 5/150
Epoch 6/150
Epoch 7/150
Epoch 8/150
Epoch 9/150
Epoch 10/150
Epoch 11/150
Epoch 12/150
Epoch 13/150
Epoch 14/150
Epoch 15/150
Epoch 16/150
Epoch 17/150
Epoch 18/150
Epoch 19/150
Epoch 20/150
Epoch 21/150
Epoch 22/150
Epoch 23/150
Epoch 24/150
Epoch 25/150
Epoch 26/150
Epoch 27/150
Epoch 28/150
Epoch 29/150
Epoch 30/150
Epoch 31/150
Epoch 32/150
Epoch 33/150
Epoch 34/150
Epoch 35/150
Epoch 36/150
Epoch 37/150
Epoch 38/150
Epoch 39/150
Epoch 40/150
Epoch 41/150
Epoch 42/150
Epoch 43/150
Epoch 44/150
Epoch 45/150
Epoch 46/150
Epoch 47/150
Epoch 48/150
Epoch 49/150
Epoch 50/150
Epoch 51/150
Epoch 52/150
Epoch 53/150
Epoch 54/150
Epoch 55/150
Epoch 56/150
Epoch 57/150
Epoch 58/150
Epoch 59/150
Epoch 60/150
Epoch 61/150
Epoch 62/150
Epoch 63/150
Epoch 64/150
Epoch 65/150
Epoch 66/150
Epoch 67/150
Epoch 68/150
Epoch 69/150
Epoch 70/150
Epoch 71/150
Epoch 72/150
Epoch 73/150
Epoch 74/150
Epoch 75/150
Epoch 76/150
Epoch 77/150
Epoch 78

In [29]:
# Encoder inference model
encoder_model_inf = Model(encoder_input, encoder_states)

# Decoder inference model
decoder_input = Input(shape=(None,len(output_chars)))
decoder_LSTM = LSTM(50, return_sequences=True, return_state = True)
decoder_out1, _ , _ = decoder_LSTM(decoder_input, initial_state=encoder_states)
decoder_dense = Dense(len(output_chars), activation='softmax')
decoder_out2 = decoder_dense(decoder_out1)
decoder_state_input_h = Input(shape=(50,))
decoder_state_input_c = Input(shape=(50,))
decoder_input_states = [decoder_state_input_h, decoder_state_input_c]
decoder_LSTM = LSTM(50, return_sequences=True, return_state = True)

decoder_out1, decoder_h, decoder_c = decoder_LSTM(decoder_input)

decoder_states = [decoder_h , decoder_c]

decoder_out = decoder_dense(decoder_out1)

decoder_model_inf = Model(inputs=[decoder_input] + decoder_input_states,
                          outputs=[decoder_out] + decoder_states )

In [30]:
def decode_seq(inp_seq):

    states_val = encoder_model_inf.predict(inp_seq)

    target_seq = np.zeros((1, 1, len(output_chars)))
    target_seq[0, 0, output_char2idx['\t']] = 1

    predicted_seq = []
    stop_condition = False
    

    while not stop_condition:

        decoder_out, decoder_h, decoder_c = decoder_model_inf.predict(x=[target_seq] + states_val)

        max_val_index = np.argmax(decoder_out[0,-1,:])
        sampled_out_char = output_idx2char[max_val_index]
        predicted_seq.append(sampled_out_char)

        if (sampled_out_char == '\n' or len(predicted_seq) >= max_length):
            stop_condition = True

        target_seq = np.zeros((1, 1, len(output_chars)))
        target_seq[0, 0, max_val_index] = 1

        states_val = [decoder_h, decoder_c]

    return predicted_seq

In [31]:
all_preds = []

in_ind = train_size + 1
out_ind = len(one_hot_input)
max_length = 15

#in_ind = 0
#out_ind = 1000
  
inputs = input_seqs[in_ind:out_ind]
one_hot_test_data = one_hot_input[in_ind:out_ind]
output_seqs_test = output_seqs[in_ind:out_ind]


for seq_index in range(len(one_hot_test_data)):
    inp_seq = one_hot_test_data[seq_index:seq_index+1]
    
    predicted_sequence = decode_seq(inp_seq)
    
    true_val = ''.join(output_seqs_test[seq_index][1:-1])
    pred_val = ''.join(predicted_sequence[0:-1])
    #print(''.join(inputs[seq_index]), true_val, pred_val)
    print(seq_index)
    all_preds.append((''.join(inputs[seq_index]), true_val, pred_val))

[1;30;43mStreaminguitvoer ingekort tot de laatste 5000 regels.[0m
4487
4488
4489
4490
4491
4492
4493
4494
4495
4496
4497
4498
4499
4500
4501
4502
4503
4504
4505
4506
4507
4508
4509
4510
4511
4512
4513
4514
4515
4516
4517
4518
4519
4520
4521
4522
4523
4524
4525
4526
4527
4528
4529
4530
4531
4532
4533
4534
4535
4536
4537
4538
4539
4540
4541
4542
4543
4544
4545
4546
4547
4548
4549
4550
4551
4552
4553
4554
4555
4556
4557
4558
4559
4560
4561
4562
4563
4564
4565
4566
4567
4568
4569
4570
4571
4572
4573
4574
4575
4576
4577
4578
4579
4580
4581
4582
4583
4584
4585
4586
4587
4588
4589
4590
4591
4592
4593
4594
4595
4596
4597
4598
4599
4600
4601
4602
4603
4604
4605
4606
4607
4608
4609
4610
4611
4612
4613
4614
4615
4616
4617
4618
4619
4620
4621
4622
4623
4624
4625
4626
4627
4628
4629
4630
4631
4632
4633
4634
4635
4636
4637
4638
4639
4640
4641
4642
4643
4644
4645
4646
4647
4648
4649
4650
4651
4652
4653
4654
4655
4656
4657
4658
4659
4660
4661
4662
4663
4664
4665
4666
4667
4668
4669
4670
4671
4672
46

In [32]:
corr = 0

for item in all_preds:
  if item[1] == item[2]:
    corr += 1
    print(item[0], item[1], item[2], 'correct')
  else:
    print(item[0], item[1], item[2], 'wrong')
 
print(len(all_preds))
print(corr/len(all_preds))

[1;30;43mStreaminguitvoer ingekort tot de laatste 5000 regels.[0m
V@M;> VM>/a N-K[K[K[K[K[K[ wrong
L:P@N@JW L-PN(H/J+W N-K[K[K[K[K[K[ wrong
WAJ.AN:<ILW.M W:n-!J!](H]N<L[W+M N-K[K[K[K[K[K[ wrong
C:MOWT CM/WTc N-K[K[K[K[K[K[ wrong
W:H;M W-HM N-K[K[K[K[K[K[ wrong
L:D@WID L-DWD/ N-K[K[K[K[K[K[ wrong
CIC.IJM CC/JM N-K[K[K[K[K[K[ wrong
BIL:H@N BLHN/ N-K[K[K[K[K[K[ wrong
HAX:AJ@LIJM H-XJL/JM N-K[K[K[K[K[K[ wrong
WAJ.A<AN W:n-!J!<N(H[ N-K[K[K[K[K[K[ wrong
J:HW@H JHWH/ N-K[K[K[K[K[K[ wrong
VOWB@H VWB/H N-K[K[K[K[K[K[ wrong
W:LO> W-L> N-K[K[K[K[K[K[ wrong
W.M;<AJ W-M<(J(M/J+(J N-K[K[K[K[K[K[ wrong
VOWB VWB/ N-K[K[K[K[K[K[ wrong
<@LAJIK: <L&J+K N-K[K[K[K[K[K[ wrong
>IT.OW >T+W N-K[K[K[K[K[K[ wrong
B.AXAL.OWN B-(H-XLWN/ N-K[K[K[K[K[K[ wrong
<AM <M/ N-K[K[K[K[K[K[ wrong
W:>EL W->L N-K[K[K[K[K[K[ wrong
JIF:R@>;L JFR>L/ N-K[K[K[K[K[K[ wrong
>:ACER >CR N-K[K[K[K[K[K[ wrong
>AT.: >T N-K[K[K[K[K[K[ wrong
P.EVER PVR/ N-K[K[K[K[K[K[ wrong
P.AR:<OC PR<C/ N-K[K[K[K[K[K[ wrong
HAN.IC:>@RIJM H-]N]C>R[/JM N-

In [29]:
len(all_preds)

9487