In [39]:
import tensorflow as tf
from tensorflow.keras import models, activations
from tensorflow.keras.layers import Input, LSTM, Dense, Embedding
from tensorflow.keras.models import Sequential, Model
import tensorflow.keras.backend as K
import numpy as np

def byte_me(input_string, sl=32):
    """Encode a string as one fixed-width row of UTF-8 byte values.

    The string is UTF-8 encoded, cut to at most ``sl`` bytes and
    right-padded with zeros.

    Parameters
    ----------
    input_string : str
        Text to encode.
    sl : int, optional
        Sequence length, i.e. the number of bytes kept per row. Defaults
        to 32, the value that used to be hard-coded.

    Returns
    -------
    numpy.ndarray
        ``uint8`` array of shape ``(1, sl)``.

    Notes
    -----
    Truncation is byte-wise: a multi-byte character that straddles the
    ``sl`` boundary is cut, so the row may end in an incomplete UTF-8
    sequence.
    """
    encoded = input_string.encode()[:sl]
    output = np.zeros((1, sl), dtype=np.uint8)
    if encoded:
        # frombuffer exposes the bytes as uint8 without an intermediate
        # bytearray; the slice assignment copies them into the padded row.
        output[0, :len(encoded)] = np.frombuffer(encoded, dtype=np.uint8)
    return output

In [42]:
def build_seq2seq_lstm(num_encoder_tokens, latent_dim, num_decoder_tokens=None, emb_dim=64):
    """Build the training graph of an embedding + LSTM encoder/decoder.

    Parameters
    ----------
    num_encoder_tokens : int
        Vocabulary size of the encoder-side embedding.
    latent_dim : int
        Number of units in both the encoder and the decoder LSTM.
    num_decoder_tokens : int or None, optional
        Vocabulary size of the decoder-side embedding and of the softmax
        output. When ``None`` the decoder reuses the encoder embedding
        layer and ``num_encoder_tokens`` as its vocabulary size.
    emb_dim : int, optional
        Width of the token embeddings.

    Returns
    -------
    tuple
        ``(model, encoder_states)``: the Keras ``Model`` mapping
        ``[encoder_inputs, decoder_inputs]`` to per-step softmax outputs,
        and the list ``[state_h, state_c]`` of encoder state tensors.
    """
    source_embedding = Embedding(num_encoder_tokens, emb_dim)

    if num_decoder_tokens is None:
        # One shared vocabulary: both sides use the same embedding layer.
        target_embedding = source_embedding
        num_decoder_tokens = num_encoder_tokens
    else:
        target_embedding = Embedding(num_decoder_tokens, emb_dim)

    # Encoder: variable-length token ids -> embeddings -> final LSTM states.
    encoder_inputs = Input(shape=(None,))
    encoder_embedded = source_embedding(encoder_inputs)
    encoder_lstm = LSTM(latent_dim, return_state=True)
    # The encoder's output tensor is not used; only its states are kept.
    _, state_h, state_c = encoder_lstm(encoder_embedded)
    encoder_states = [state_h, state_c]

    # Decoder: starts from the encoder states and emits one output per step.
    decoder_inputs = Input(shape=(None,))
    decoder_embedded = target_embedding(decoder_inputs)

    # return_state=True is kept although the returned states are discarded
    # here; the original comment says they are meant for inference.
    decoder_lstm = LSTM(latent_dim, return_sequences=True, return_state=True)
    decoder_sequence, _, _ = decoder_lstm(decoder_embedded,
                                          initial_state=encoder_states)
    output_layer = Dense(num_decoder_tokens, activation='softmax')
    token_probs = output_layer(decoder_sequence)

    model = Model([encoder_inputs, decoder_inputs], token_probs)

    return model, encoder_states

# Vocabulary size: byte_me produces uint8 values, so 256 distinct tokens.
num_encoder_tokens = 256
# Number of units in the encoder and decoder LSTMs.
latent_dim = 256

# num_decoder_tokens is left at None, so encoder and decoder share one
# embedding layer and one vocabulary size.
autoencoder, states = build_seq2seq_lstm(num_encoder_tokens, latent_dim)

autoencoder.summary()

__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_9 (InputLayer)            (None, None)         0                                            
__________________________________________________________________________________________________
input_8 (InputLayer)            (None, None)         0                                            
__________________________________________________________________________________________________
embedding_5 (Embedding)         (None, None, 64)     16384       input_8[0][0]                    
                                                                 input_9[0][0]                    
__________________________________________________________________________________________________
lstm_4 (LSTM)                   [(None, 256), (None, 328704      embedding_5[0][0]                
__________

In [43]:
# Smoke test on the untrained model. The sample starts with a multi-byte
# character and is longer than 32 bytes, so byte_me truncates it.
string = 'きa completely different example from the one I had been using before this allwent to shit'
out = byte_me(string)

# Duplicate the row to form a batch of two, then pass the same array as both
# the encoder input and the decoder input.
out = np.concatenate([out, out])
autoencoder.predict([out, out])

array([[[0.00387643, 0.00391192, 0.00389487, ..., 0.00389812,
         0.00390685, 0.00391295],
        [0.00387639, 0.00389949, 0.00390452, ..., 0.00389897,
         0.00390226, 0.00390655],
        [0.00387394, 0.00390868, 0.00390311, ..., 0.00390289,
         0.00390836, 0.00391922],
        ...,
        [0.00391605, 0.00390067, 0.003907  , ..., 0.00390029,
         0.00390275, 0.00391897],
        [0.00390654, 0.00388774, 0.00390362, ..., 0.00390388,
         0.00390672, 0.00391707],
        [0.00391581, 0.0038845 , 0.00390939, ..., 0.0039181 ,
         0.00388666, 0.00392168]],

       [[0.00387643, 0.00391192, 0.00389487, ..., 0.00389812,
         0.00390685, 0.00391295],
        [0.00387639, 0.00389949, 0.00390452, ..., 0.00389897,
         0.00390226, 0.00390655],
        [0.00387394, 0.00390868, 0.00390311, ..., 0.00390289,
         0.00390836, 0.00391922],
        ...,
        [0.00391605, 0.00390067, 0.003907  , ..., 0.00390029,
         0.00390275, 0.00391897],
        [0.0

In [44]:
import pandas as pd
# Load the query data; later cells read its 'query' column.
df = pd.read_csv('many_queries.csv')

In [45]:
# Replace missing values with '' so len() can be applied to every row.
# Note: this rebinds df, so the frame as loaded is no longer available.
df = df.fillna('')
# 'lns' = character length of each query.
df['lns'] = df['query'].apply(len)

In [46]:
# Summary statistics of query length, including the 99th percentile.
df[['lns']].describe(percentiles=[.99])

Unnamed: 0,lns
count,21326630.0
mean,10.34532
std,6.876606
min,0.0
50%,9.0
99%,32.0
max,1418.0


In [47]:
# Query strings as a numpy array. NaNs were already replaced by '' in the
# fillna cell, so dropna() only matters if that cell was skipped.
vals = df['query'].dropna().values

In [48]:
# Work on the first 100000 queries only.
rows = vals[:100000]
# Encode each query with byte_me; every element is a (1, 32) uint8 array.
rows = [byte_me(x) for x in rows]

In [49]:
# Stack the per-query rows into a single 2-D array (one row per query).
X = np.concatenate(rows)
X.shape

(100000, 32)

In [50]:
from sklearn.model_selection import train_test_split

# Hold out 1% of the rows for validation. No random_state is passed, so the
# split differs from run to run.
train, val = train_test_split(X, test_size=.01)

In [53]:
from tensorflow.keras.utils import to_categorical

# Characters that insertion() draws from by default: ASCII lowercase,
# ASCII uppercase and the ten digits.
charbank = 'abcdefghijklmnopqrstuvwxyz'
charbank = charbank + charbank.upper()
charbank = charbank + '0123456789'

def deletion(string):
    """Remove one character at a uniformly random position.

    Parameters
    ----------
    string : str
        Text to corrupt.

    Returns
    -------
    str
        ``string`` with one character removed; an empty string is
        returned unchanged (there is nothing to delete).
    """
    if not string:
        # randint(0, 0) would raise ValueError.
        return string
    to_del = np.random.randint(0, high=len(string))
    return string[:to_del] + string[to_del+1:]

def insertion(string, charbank=charbank):
    """Insert one random character at a uniformly random position.

    Parameters
    ----------
    string : str
        Text to corrupt.
    charbank : str, optional
        Non-empty pool of characters to draw the inserted character from.

    Returns
    -------
    str
        ``string`` with one extra character. Every gap is a candidate
        position, including before the first and after the last character.
    """
    # There are len(string) + 1 gaps; the exclusive upper bound must
    # therefore be len(string) + 1 so that appending at the end is possible
    # and an empty string is handled.
    to_ins = np.random.randint(0, high=len(string) + 1)
    char = charbank[np.random.randint(0, high=len(charbank))]
    return string[:to_ins] + char + string[to_ins:]

def swap(string):
    """Exchange one randomly chosen pair of adjacent characters.

    The left index of the pair is drawn uniformly from
    ``0 .. len(string) - 2``; ``string`` needs at least two characters.
    """
    left = np.random.randint(0, high=(len(string)-1))
    chars = list(string)
    chars[left], chars[left+1] = chars[left+1], chars[left]
    return ''.join(chars)

def apply_noise(string):
    """Corrupt ``string`` with one randomly chosen edit.

    The edit is picked among ``deletion``, ``insertion`` and ``swap``.
    Strings shorter than two characters are returned untouched.
    """
    if len(string) >= 2:
        noise_fn = np.random.choice([deletion, insertion, swap])
        return noise_fn(string)
    return string

def random_gen(batch_size=32):
    """
    Endless generator of random identity-task batches.

    Each step yields ``(tokens, one_hot)`` where ``tokens`` is a
    ``(batch_size, 32)`` array of random integers in ``[0, 256)`` and
    ``one_hot`` is its 256-class one-hot encoding.
    """
    seq_len = 32
    while True:
        tokens = np.random.randint(0, high=256, size=(batch_size, seq_len))
        # to_categorical keeps the leading dimensions of its input, so the
        # whole batch can be encoded in a single call.
        one_hot = to_categorical(tokens, num_classes=256)
        yield tokens, one_hot


def data_gen(X, batch_size=32):
    """
    Endless generator of identity-task batches sampled from ``X``.

    Rows are drawn with replacement. Each step yields
    ``([encoder_in, decoder_in], targets)`` where ``decoder_in`` is
    ``encoder_in`` shifted one position to the right with a leading 0, and
    ``targets`` is the 256-class one-hot encoding of ``encoder_in``.
    """
    while True:
        picks = np.random.randint(len(X), size=(batch_size))
        encoder_in = X[picks]
        # At step t the decoder is fed the token of step t-1; position 0
        # gets a 0.
        decoder_in = np.zeros(encoder_in.shape, dtype=encoder_in.dtype)
        decoder_in[:, 1:] = encoder_in[:, :-1]
        targets = np.array(
            [to_categorical(row, num_classes=256) for row in encoder_in]
        )
        yield [encoder_in, decoder_in], targets
        
def noise_gen(X, batch_size=32):
    """
    Endless generator of denoising batches sampled from ``X``.

    Rows are drawn with replacement. Each row is decoded back to text (up
    to its first zero byte), corrupted with ``apply_noise`` and re-encoded
    with ``byte_me``. Rows that are not valid UTF-8 are skipped.

    Yields
    ------
    (X_out, Y_out)
        ``X_out`` holds the noisy encoded rows and ``Y_out`` the 256-class
        one-hot encoding of the matching clean rows. A batch holds fewer
        than ``batch_size`` rows when some rows were skipped; a draw in
        which every row was skipped is not yielded.

    NOTE(review): this yields a single input array, whereas the model built
    by ``build_seq2seq_lstm`` takes ``[encoder_inputs, decoder_inputs]`` -
    confirm which model this generator is meant to feed.
    """
    while True:
        idx = np.random.randint(len(X), size=(batch_size))
        noisy_rows = []
        clean_rows = []
        for row in X[idx]:
            try:
                text = bytearray(row).split(b'\0', 1)[0].decode()
            except UnicodeDecodeError:
                # byte_me truncates byte-wise, so a row can end in a cut
                # multi-byte character; such rows are skipped.
                continue
            noisy_rows.append(byte_me(apply_noise(text)))
            # Input and target are collected together so they cannot go
            # out of step when a row is skipped.
            clean_rows.append(row)
        if not noisy_rows:
            # np.concatenate([]) would raise; draw a fresh batch instead.
            continue
        X_out = np.concatenate(noisy_rows)
        Y_out = np.array([to_categorical(x, num_classes=256) for x in clean_rows])
        yield X_out, Y_out

        
# Identity-task generators over the training and validation rows.
trg = data_gen(train)
teg = data_gen(val)
# Random-sequence generator (not referenced by any later cell shown here).
gen = random_gen()
# Noisy-input generators over the training and validation rows.
ntrg = noise_gen(train)
nteg = noise_gen(val)

In [54]:
# Pull one batch: a is [encoder_input, decoder_input], b the one-hot targets.
a, b = next(trg)

In [55]:
# First encoder-input row of the sampled batch.
a[0][0]

array([ 74, 111, 114, 100,  97, 110,   0,   0,   0,   0,   0,   0,   0,
         0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
         0,   0,   0,   0,   0,   0], dtype=uint8)

In [None]:
# Identity step: train the model to reproduce its (clean) input.

from tensorflow.keras import optimizers

# NOTE(review): newer tf.keras releases name this argument `learning_rate`
# and deprecate `fit_generator` in favour of `fit` - confirm against the
# installed TensorFlow version before changing.
autoencoder.compile(
    optimizer=optimizers.Adam(lr=.001), 
    loss='categorical_crossentropy',
    metrics=['accuracy']
)
# Both generators are endless, so epoch and validation lengths are set by
# steps_per_epoch / validation_steps.
autoencoder.fit_generator(
    generator=trg,
    validation_data=teg,
    steps_per_epoch=512,
    validation_steps=100,
    epochs=5
)

Epoch 1/5

In [28]:
def test_model(model, example='testthisthang'):
    """Run one string through ``model`` and return the decoded prediction.

    The string is encoded with ``byte_me``, passed to ``model.predict``,
    reduced to the most probable byte per position and converted back to
    text with ``encode_output``. The encoded input and the predicted bytes
    are printed along the way.

    NOTE(review): ``predict`` receives a single array here, whereas the
    model built by ``build_seq2seq_lstm`` takes two inputs - confirm which
    model this helper is meant for.
    """
    encoded = byte_me(example)
    print(encoded)
    probabilities = model.predict(encoded)
    # argmax over the class axis, cast to uint8 so each value is one byte.
    predicted = np.argmax(probabilities, axis=2).astype(np.uint8)
    print(predicted)
    return encode_output(predicted)

def encode_output(array):
    """
    Decode network output bytes to text.

    Parameters
    ----------
    array : bytes-like
        Anything ``bytearray()`` accepts, e.g. a ``uint8`` numpy array.

    Returns
    -------
    str
        The bytes before the first zero byte, decoded as UTF-8. If they
        are not valid UTF-8, the longest prefix that decodes cleanly is
        returned (possibly the empty string).
    """
    # Everything from the first zero byte on is treated as padding.
    raw = bytearray(array).split(b'\0', 1)[0]
    try:
        return raw.decode()
    except UnicodeDecodeError as err:
        # err.start is the offset of the first undecodable byte, so the
        # bytes before it are exactly the longest valid prefix; no need to
        # trim one byte at a time and retry.
        return raw[:err.start].decode()
                
# Check what the model currently reproduces for a short sample string.
test_model(autoencoder, 'test this string')

[[116 101 115 116  32 116 104 105 115  32 115 116 114 105 110 103   0   0
    0   0   0   0   0   0   0   0   0   0   0   0   0   0]]
[[ 26  13 210 217  32  88 220 139 145 225 112 180 114  86 186 220   0   0
    7   0 243  15   0  97   0  31  31  31   0   7 169 254]]


'\x1a\r'

In [29]:
# No noise: one longer epoch on clean identity batches (trg / teg).

autoencoder.fit_generator(
    generator=trg,
    validation_data=teg,
    steps_per_epoch=4096,
    validation_steps=42,
    epochs=1
)



<tensorflow.python.keras.callbacks.History at 0x7fe72c4246d8>

In [30]:
# Re-check after the clean-data epoch; this sample exceeds 32 bytes, so
# byte_me truncates it.
test_model(autoencoder, 'test this string out see what you get')

[[116 101 115 116  32 116 104 105 115  32 115 116 114 105 110 103  32 111
  117 116  32 115 101 101  32 119 104  97 116  32 121 111]]
[[ 69 101  42 118  32 184  67 239 115  32 115 180 114 105 110 103  55 111
  117 116  32 107  97 139  48 158 104  97  66  32  29 111]]


'Ee*v '

In [31]:
# Noise: train on corrupted inputs (ntrg / nteg) for 100 epochs.

autoencoder.fit_generator(
    generator=ntrg,
    validation_data=nteg,
    steps_per_epoch=4096,
    validation_steps=42,
    epochs=100
)

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100


Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77/100
Epoch 78/100
Epoch 79/100
Epoch 80/100
Epoch 81/100
Epoch 82/100
Epoch 83/100
Epoch 84/100
Epoch 85/100
Epoch 86/100
Epoch 87/100
Epoch 88/100
Epoch 89/100
Epoch 90/100
Epoch 91/100
Epoch 92/100
Epoch 93/100
Epoch 94/100


Epoch 95/100
Epoch 96/100
Epoch 97/100
Epoch 98/100
Epoch 99/100
Epoch 100/100


<tensorflow.python.keras.callbacks.History at 0x7fe72c4242e8>

In [35]:
# Scratch check: 100 random integers drawn from {0, 1, 2}.
np.random.randint(3, size=(100,))

array([0, 0, 1, 1, 0, 2, 2, 0, 1, 2, 2, 1, 1, 2, 1, 2, 2, 2, 0, 2, 2, 2,
       2, 2, 0, 1, 0, 2, 2, 2, 1, 1, 0, 0, 0, 2, 2, 0, 1, 2, 1, 1, 1, 1,
       0, 1, 0, 2, 0, 2, 0, 2, 2, 1, 0, 2, 2, 0, 2, 1, 2, 1, 2, 2, 1, 2,
       0, 0, 1, 0, 2, 2, 2, 1, 0, 2, 1, 1, 2, 2, 0, 1, 0, 0, 1, 1, 0, 0,
       1, 0, 0, 1, 0, 1, 1, 2, 2, 0, 1, 0])

In [32]:
# Try the trained model on a long sentence and two misspelled-looking queries.
print(test_model(autoencoder, 'test this string out see what you get'))
print(test_model(autoencoder, 'iarmax 365'))
print(test_model(autoencoder, 'give me krie'))

[[116 101 115 116  32 116 104 105 115  32 115 116 114 105 110 103  32 111
  117 116  32 115 101 101  32 119 104  97 116  32 121 111]]
[[ 84   1  97 116 116  32 105 105 112 115 112 115 105 105 110 103 103 116
  114 116  32  32 104 104 114  32  32  97  97 101   0 101]]
Tatt iipspsiinggtrt  hhr  aae
[[105  97 114 109  97 120  32  51  54  53   0   0   0   0   0   0   0   0
    0   0   0   0   0   0   0   0   0   0   0   0   0   0]]
[[ 97 105 114 109  97   1  32  51  53  53   0   0   0   0   0   0   0   0
    0   0   0   0   0   0   0   0   0   0   0   0   0   0]]
airma 355
[[103 105 118 101  32 109 101  32 107 114 105 101   0   0   0   0   0   0
    0   0   0   0   0   0   0   0   0   0   0   0   0   0]]
[[103 105   1  98  32 109 101  32 101 114 101 101   0   0   0   0   0   0
    0   0   0   0   0   0   0   0   0   0   0   0   0   0]]
gib me eree
