In [None]:
from math import sqrt
import os
import string

import numpy as np

import seq2seq_base
import simple_char_fe as simple_fe

In [None]:
# Sequence lengths shared by feature extraction and the model:
# MAX_LEN is passed as `max_len` / `encoder_seq_len`,
# SNIPPET_LEN is passed as `snippet_len` / `decoder_seq_len`.
MAX_LEN = 150
SNIPPET_LEN = 40
# Root folder handed to simple_fe as the location of the source text.
# The default is the original machine-specific path; set the
# SEQ2SEQ_SOURCE_DIR environment variable to point at another folder
# without editing the notebook.
folder = os.environ.get("SEQ2SEQ_SOURCE_DIR",
                        "/usr/local/lib/python3.5/dist-packages/")


def n_samples(n_lines):
    """Estimate how many samples to take from a file with ``n_lines`` lines.

    The estimate is the square root of the line count and is returned as a
    float (it is not rounded). ``math.sqrt`` raises ``ValueError`` for a
    negative ``n_lines``.
    """
    estimate = sqrt(n_lines)
    return estimate

# Run simple_fe's `stat` helper over the source folder with the sqrt-based
# sample estimator. Presumably this reports corpus statistics -- confirm
# against simple_char_fe.
simple_fe.stat(
    folder,
    estimator_n_samples=n_samples,
)

In [None]:
# Prepare features
feature_extraction_pipe = simple_fe.feature_extraction_pipe

# The token map assigns every character of string.printable a 1-based index,
# leaving index 0 unassigned (presumably reserved for padding -- confirm
# against simple_char_fe).
(before_vec, after_vec), snippet_vec, char_id = feature_extraction_pipe(
    folder_loc=folder,
    max_len=MAX_LEN,
    snippet_len=SNIPPET_LEN,
    n_lines=10,
    text_to_ind_vec=True,
    estimator_n_samples=n_samples,
    token_ind={ch: pos for pos, ch in enumerate(string.printable, start=1)},
)

# Decoder input drops the last column and the target drops the first one,
# so the target is the input shifted by one position.
snippet_in = snippet_vec[:, :-1]
snippet_target = snippet_vec[:, 1:]

In [None]:
# Prepare model
# prepare_seq2seq_model returns three objects, unpacked below as the
# trainable model, a decoding callable, and an index -> token mapping.
# NOTE(review): snippet_in has one column fewer than snippet_vec; confirm
# that decoder_seq_len=SNIPPET_LEN matches the width of the decoder input.
res = seq2seq_base.prepare_seq2seq_model(
    token2ind=char_id,
    encoder_seq_len=MAX_LEN,
    decoder_seq_len=SNIPPET_LEN,
)
train_model, decode_sequence, ind2token = res

In [None]:
# Train model
batch_size = 32
epochs = 50

# Inputs are the encoder sequence and the shifted decoder sequence; the
# target gets a trailing axis of size 1 added before being passed to fit.
# 20% of the data is held out for validation.
train_model.fit(
    [before_vec, snippet_in],
    np.expand_dims(snippet_target, axis=-1),
    batch_size=batch_size,
    epochs=epochs,
    validation_split=0.2,
)

In [None]:
# Inference step for several samples
# Decode at most 100 sequences. The count is clamped to the number of
# available samples: past the end, the slice below would be empty and
# indexing its first row would raise IndexError.
n_to_decode = min(100, len(before_vec))
for seq_index in range(n_to_decode):
    # Take one sequence (from the data that was passed to fit) for trying
    # out decoding. Slicing rather than indexing keeps a leading axis of
    # length 1 on the input handed to decode_sequence.
    input_seq = before_vec[seq_index: seq_index + 1]
    # Map indices back to tokens; indices missing from ind2token are
    # skipped, and surrounding whitespace is stripped for display.
    input_sent = "".join(ind2token.get(ind, "")
                         for ind in input_seq[0]).strip()
    decoded_sentence = decode_sequence(input_seq)
    print('Input sentence:', input_sent)
    print('Decoded sentence:', decoded_sentence)
    print("-" * 20)