## Imports and variable setup

In [1]:
import os
import pickle

import numpy as np
from keras.models import load_model
from tqdm import tqdm

from utils import retrieve_texts, DataObject, data_shapes, token_integer_mapping, prepare_model_data, shape_info


data_dir_train = 'data/train/'                                    # Directory holding the training-split txt files
data_dir_test = 'data/test/'                                      # Directory holding the test-split txt files
model_path = 'checkpoints/c2p_att_plain_lat3072_b32_iter23.hdf5'  # Checkpoint file of the trained model loaded below
result_dir = 'results/'                                           # Directory the decoded outputs are written to
num_samples_train = 5001                                          # Sample count passed to retrieve_texts for the train split (the run below reports 5000 loaded)
num_samples_test = 200                                            # Sample count passed to retrieve_texts for the test split
# Maximum length for inputs and outputs (in terms of characters, not tokens)
max_input_length = 1000000                                        # Largest acceptable input length
max_target_length = 1800                                          # Largest acceptable target length

Using TensorFlow backend.


## Data processing

In [2]:
# ---- Training split: read the texts, wrap them, and record the shapes ----
(input_texts, target_texts,
 input_lists, target_lists,
 input_tokens, target_tokens) = retrieve_texts(
    data_dir_train, num_samples_train, max_input_length, max_target_length)
train_do = DataObject(input_texts, target_texts, input_lists, target_lists, input_tokens, target_tokens)
(num_encoder_tokens_train,
 num_decoder_tokens_train,
 max_encoder_seq_length_train,
 max_decoder_seq_length_train,
 n_input_samples_train) = data_shapes(train_do)

# ---- Test split: same steps (the temporary names above are reused) ----
(input_texts, target_texts,
 input_lists, target_lists,
 input_tokens, target_tokens) = retrieve_texts(
    data_dir_test, num_samples_test, max_input_length, max_target_length)
test_do = DataObject(input_texts, target_texts, input_lists, target_lists, input_tokens, target_tokens)
(num_encoder_tokens_test,
 num_decoder_tokens_test,
 max_encoder_seq_length_test,
 max_decoder_seq_length_test,
 n_input_samples_test) = data_shapes(test_do)

# Token <-> integer lookup tables, built from the TRAINING tokens only.
# The reverse tables are used later to turn integer ids back into readable tokens.
(input_token_index,
 target_token_index,
 reverse_input_token_index,
 reverse_target_token_index) = token_integer_mapping(train_do.input_tokens, train_do.target_tokens)

# Encode the TEST samples with the training lookup tables. Only the first
# returned array (the encoder input) is needed for prediction; the other two
# return values are discarded.
encoder_input_data, _, _ = prepare_model_data(
    test_do.input_lists,
    test_do.target_lists,
    input_token_index,
    target_token_index,
    n_input_samples_test,
    max_encoder_seq_length_test,
    max_decoder_seq_length_test,
    num_decoder_tokens_train,
)

In [3]:
# Report the shape statistics computed by data_shapes() for each split,
# training first, then testing, separated by a ruler line.
print("Training data info:-")
shape_info(n_input_samples_train, num_encoder_tokens_train, num_decoder_tokens_train, max_encoder_seq_length_train, max_decoder_seq_length_train)
print("====================")
print("Testing data info:-")
shape_info(n_input_samples_test, num_encoder_tokens_test, num_decoder_tokens_test, max_encoder_seq_length_test, max_decoder_seq_length_test)

Training data info:-
Number of samples: 5000
Number of unique input tokens: 4339
Number of unique output tokens: 4159
Max sequence length for inputs: 527
Max sequence length for outputs: 231
Testing data info:-
Number of samples: 200
Number of unique input tokens: 431
Number of unique output tokens: 547
Max sequence length for inputs: 47
Max sequence length for outputs: 82


## Loading the model

In [4]:
# Load the trained Keras model from the checkpoint file configured in `model_path`
generator = load_model(model_path)
# Print the layer-by-layer summary as a sanity check on what was loaded
generator.summary()

W0825 19:35:08.632159 140669728122624 deprecation_wrapper.py:119] From /home/aziz/anaconda3/envs/tf/lib/python3.7/site-packages/keras/backend/tensorflow_backend.py:517: The name tf.placeholder is deprecated. Please use tf.compat.v1.placeholder instead.

W0825 19:35:08.649503 140669728122624 deprecation_wrapper.py:119] From /home/aziz/anaconda3/envs/tf/lib/python3.7/site-packages/keras/backend/tensorflow_backend.py:74: The name tf.get_default_graph is deprecated. Please use tf.compat.v1.get_default_graph instead.

W0825 19:35:08.651515 140669728122624 deprecation_wrapper.py:119] From /home/aziz/anaconda3/envs/tf/lib/python3.7/site-packages/keras/backend/tensorflow_backend.py:4138: The name tf.random_uniform is deprecated. Please use tf.random.uniform instead.

W0825 19:35:20.986823 140669728122624 deprecation.py:323] From /home/aziz/anaconda3/envs/tf/lib/python3.7/site-packages/keras/backend/tensorflow_backend.py:2974: add_dispatch_support.<locals>.wrapper (from tensorflow.python.ops.ar

__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_1 (InputLayer)            (None, None)         0                                            
__________________________________________________________________________________________________
input_2 (InputLayer)            (None, None)         0                                            
__________________________________________________________________________________________________
embedding_1 (Embedding)         (None, None, 3072)   13329408    input_1[0][0]                    
__________________________________________________________________________________________________
embedding_2 (Embedding)         (None, None, 3072)   12776448    input_2[0][0]                    
__________________________________________________________________________________________________
lstm_1 (LS

## Generate predictions

In [None]:
def decode_sequence(input_seq, model, max_decoder_seq_length, target_token_index, reverse_target_token_index):
    """Greedily decode an output token sequence for ``input_seq``.

    The decoder input starts as an all-zero array whose first position holds
    the ``"<sop>"`` id. At every step ``i`` the model is run on the encoder
    input plus the tokens generated so far, and the arg-max prediction at
    position ``i`` is written back into the decoder input.

    Only the FIRST sequence of the batch drives the stop test and is
    returned, so this is effectively meant for a batch of one.

    Args:
        input_seq: Encoder input batch; ``len(input_seq)`` is the batch size.
        model: Object whose ``predict([input_seq, target_seq])`` returns
            per-position scores with the vocabulary on axis 2.
        max_decoder_seq_length: Length of the decoder input; at most
            ``max_decoder_seq_length - 1`` tokens are generated.
        target_token_index: Mapping from token to integer id; must contain
            ``"<sop>"``.
        reverse_target_token_index: Mapping from integer id back to token.

    Returns:
        list: Decoded tokens of the first sequence, without ``"<sop>"`` and
        without ``"<eop>"``.
    """
    target_seq = np.zeros(shape=(len(input_seq), max_decoder_seq_length))
    # Populate the first position of the target sequence with the start token.
    target_seq[:, 0] = target_token_index["<sop>"]

    for i in range(1, max_decoder_seq_length):
        prediction = model.predict([input_seq, target_seq]).argmax(axis=2)
        next_id = int(prediction[0, i])
        # The read-out loop below stops at the first 0 in the sequence, so
        # once id 0 is predicted nothing generated afterwards can reach the
        # result. Stopping here skips the remaining (expensive) predict calls
        # and avoids looking up an id that may be absent from the reverse map.
        # (id 0 is presumably the padding id -- confirm against prepare_model_data.)
        if next_id == 0:
            break
        if reverse_target_token_index[next_id] == "<eop>":
            break
        target_seq[:, i] = prediction[:, i]

    decoded_sentence = []
    for idx in target_seq[0, 1:]:
        if idx == 0:
            break
        # target_seq is a float array; cast so the lookup key is a plain int.
        decoded_sentence.append(reverse_target_token_index[int(idx)])
    return decoded_sentence

In [5]:
# Create the result directory up front: otherwise the first write fails with
# FileNotFoundError only after the first sample has already been decoded.
if result_dir:
    os.makedirs(result_dir, exist_ok=True)
output_path = os.path.join(result_dir, "testing_output.txt")

predicted_lists = []
# Mode "w" truncates a report left over from a previous run, which replaces
# the former "delete the file, then append once per sample" sequence.
with open(output_path, "w") as f:
    # Decode every sample of the test set, in order.
    for seq_index in tqdm(range(n_input_samples_test)):
        c = seq_index + 1  # 1-based sample number used in the report

        # Keep the batch axis (slice, not index): the model expects a batch.
        input_seq = encoder_input_data[seq_index:seq_index+1]

        # Map the encoded ids back to tokens, ignoring the positions beyond
        # the sample's real length.
        n_tokens = len(test_do.input_lists[seq_index])
        input_list_tbp = [reverse_input_token_index[idx]
                          for idx in encoder_input_data[seq_index][:n_tokens]]
        # Every token is followed by one space (including the last one).
        to_print_out = ''.join(token + ' ' for token in input_list_tbp)

        # The decoder length must be the one from training (max_decoder_seq_length_train).
        decoded_sentence = decode_sequence(input_seq, generator, max_decoder_seq_length_train, target_token_index, reverse_target_token_index)
        predicted_lists.append(decoded_sentence)
        to_print_out2 = ''.join(token + ' ' for token in decoded_sentence)

        # Write this sample's entry to the report
        f.write(str(c) + '.a) Input sentence:   ' + test_do.input_texts[seq_index] + "\n")
        f.write(str(c) + '.b) Encoded sentence: ' + to_print_out + "\n")
        f.write(str(c) + '.c) Target sentence:  ' + test_do.target_texts[seq_index] + "\n")
        f.write(str(c) + '.d) Decoded sentence: ' + to_print_out2 + "\n-\n")
        # Flush per sample so partial results are on disk if the run is interrupted.
        f.flush()

# Save predicted lists to disk for result evaluation
with open(os.path.join(result_dir, 'predicted_lists.pkl'), 'wb') as f:
    pickle.dump(predicted_lists, f)
print('Predicted lists are saved to disk')

100%|██████████| 200/200 [13:35<00:00,  4.22s/it]

Predicted lists are saved to disk



