In [1]:
import tensorflow as tf
from tensorflow.keras import models, layers, activations
from tensorflow.keras.models import Sequential, Model
import tensorflow.keras.backend as K
import numpy as np

def byte_me(input_string):
    """Converts the input string to an array of
    integers."""
    sl = 32 #sequence length
    b = bytearray()
    b.extend(input_string.encode())
    output = np.zeros(sl, dtype=np.uint8)
    result = np.array(b)[:sl]
    x = min(len(result), sl)
    output[:x] = result
    return output.reshape(1, -1)

In [26]:
sl = 32 #sequence length

inp = layers.Input(shape=(sl,), dtype=tf.uint8)
x = layers.Embedding(input_dim=256, output_dim=64, input_length=64)(inp)
x = layers.Reshape((2048,))(x)
#x = layers.Dropout(.2)(x)
x = layers.Dense(1024, activation='softsign')(x)
x = layers.Dense(1024, activation='softsign')(x)
x = layers.Dense(1024, activation='softsign')(x)
#x = layers.Dropout(.2)(x)
x = layers.Dense(1024, activation='softsign')(x)
x = layers.Reshape((32, 32))(x)
x = layers.Conv1D(filters=256, kernel_size=1, activation=(lambda x: activations.softmax(x, axis=1)))(x)


autoencoder = models.Model(inputs=inp, outputs=x)
autoencoder.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_3 (InputLayer)         (None, 32)                0         
_________________________________________________________________
embedding_2 (Embedding)      (None, 32, 64)            16384     
_________________________________________________________________
reshape_4 (Reshape)          (None, 2048)              0         
_________________________________________________________________
dense_6 (Dense)              (None, 1024)              2098176   
_________________________________________________________________
dense_7 (Dense)              (None, 1024)              1049600   
_________________________________________________________________
dense_8 (Dense)              (None, 1024)              1049600   
_________________________________________________________________
dense_9 (Dense)              (None, 1024)              1049600   
__________

In [4]:
string = 'きa completely different example from the one I had been using before this allwent to shit'
out = byte_me(string)

out = np.concatenate([out, out])
autoencoder.predict(out)

array([[[0.03105543, 0.03188496, 0.0315535 , ..., 0.03190678,
         0.03119541, 0.03067248],
        [0.03149303, 0.03128268, 0.03171589, ..., 0.03093702,
         0.03091418, 0.0308835 ],
        [0.03106962, 0.03081398, 0.03052501, ..., 0.03119929,
         0.03201878, 0.03141157],
        ...,
        [0.03154055, 0.03171684, 0.03163086, ..., 0.03081964,
         0.03090471, 0.03054261],
        [0.03135443, 0.03113623, 0.03108686, ..., 0.0311783 ,
         0.03131009, 0.0312447 ],
        [0.03110107, 0.03147691, 0.03189255, ..., 0.03142247,
         0.03141307, 0.03184773]],

       [[0.03105543, 0.03188496, 0.0315535 , ..., 0.03190678,
         0.03119541, 0.03067248],
        [0.03149303, 0.03128268, 0.03171589, ..., 0.03093702,
         0.03091418, 0.0308835 ],
        [0.03106962, 0.03081398, 0.03052501, ..., 0.03119929,
         0.03201878, 0.03141157],
        ...,
        [0.03154055, 0.03171684, 0.03163086, ..., 0.03081964,
         0.03090471, 0.03054261],
        [0.0

In [5]:
import pandas as pd
df = pd.read_csv('many_queries.csv')

In [6]:
df = df.fillna('')
df['lns'] = df['query'].apply(len)

In [7]:
df[['lns']].describe(percentiles=[.99])

Unnamed: 0,lns
count,21326630.0
mean,10.34532
std,6.876606
min,0.0
50%,9.0
99%,32.0
max,1418.0


In [8]:
vals = df['query'].dropna().values

In [9]:
rows = vals[:1000000]
rows = [byte_me(x) for x in rows]

In [10]:
X = np.concatenate(rows)
X.shape

(1000000, 32)

In [11]:
from sklearn.model_selection import train_test_split

train, val = train_test_split(X, test_size=.01)

In [21]:
from tensorflow.keras.utils import to_categorical

charbank = 'abcdefghijklmnopqrstuvwxyz'
charbank = charbank + charbank.upper()
charbank = charbank + '0123456789'

def deletion(string):
    """Performs a random deletion of input string"""
    to_del = np.random.randint(0, high=len(string))
    return string[:to_del] + string[to_del+1:]

def insertion(string, charbank=charbank):
    """Performs a random insertion into input string"""
    to_ins = np.random.randint(0, high=len(string))
    char = np.random.choice(list(charbank))
    return string[:to_ins] + char + string[to_ins:]

def swap(string):
    """swaps two consecutive characters in string."""
    to_swap = np.random.randint(0, high=(len(string)-1))
    return string[:to_swap] + string[to_swap+1] + string[to_swap] + string[to_swap+2:]

def apply_noise(string):
    """Randomly applies one type of noise."""
    if len(string) < 2:
        return string
    func = np.random.choice([deletion, insertion, swap])
    return func(string)

def random_gen(batch_size=32):
    """
    For training the identity function.
    Generates random sequences.
    """
    sl = 32 #sequence length
    while True:
        X_out = np.random.randint(0, high=256, size=(batch_size, sl))
        Y_out = np.array([to_categorical(x, num_classes=256) for x in X_out])
        yield X_out, Y_out


def data_gen(X, batch_size=32):
    """
    For training the identity function on real queries.
    Generates identity samples of queries.
    """
    sl = 32 #sequence length
    while True:
        idx = np.random.randint(len(X), size=(batch_size))
        X_out = X[idx]
        Y_out = np.array([to_categorical(x, num_classes=256) for x in X_out])
        yield X_out, Y_out
        
def noise_gen(X, batch_size=32):
    """
    For training the identity function on real queries.
    Generates identity samples of queries.
    """
    sl = 32 #sequence length
    while True:
        idx = np.random.randint(len(X), size=(batch_size))
        X_out = X[idx]
        Y_out = np.array([to_categorical(x, num_classes=256) for x in X_out])
        X_ = []
        for i, x in enumerate(X_out):
            try:
                x = bytearray(x).split(b'\0',1)[0].decode()
            except UnicodeDecodeError:
                #remove examples where bad unicode.
                Y_out = np.concatenate([Y_out[:i], Y_out[i+1:]], axis=0)
                continue
            x = apply_noise(x)
            x = byte_me(x)
            X_.append(x)
        X_out = np.concatenate(X_)
        #todo: debug cases where len(x) != len(y)
        if len(X_out) == len(Y_out):
            yield X_out, Y_out
        else:
            pass

        
trg = data_gen(train)
teg = data_gen(val)
gen = random_gen()
ntrg = noise_gen(train)
nteg = noise_gen(val)

In [27]:
#identity step

from tensorflow.keras import optimizers

autoencoder.compile(optimizer=optimizers.Adam(lr=.001), loss='categorical_crossentropy')
autoencoder.fit_generator(
    generator=gen,
    steps_per_epoch=4096,
    epochs=5
)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<tensorflow.python.keras.callbacks.History at 0x7fe72c424048>

In [28]:
def test_model(model, example='testthisthang'):
    example = example
    inp = byte_me(example)
    print(inp)
    out = model.predict(inp)
    out = np.argmax(out, axis=2).astype(np.uint8)
    print(out)
    return encode_output(out)

def encode_output(array):
    """
    encodes neural network output to unicode.
    """
    try:
        return bytearray(array).split(b'\0',1)[0].decode()
    except UnicodeDecodeError:
        i = 1
        while True:
            try:
                return bytearray(array).split(b'\0',1)[0][:-i].decode()
            except UnicodeDecodeError:
                i+=1
                
test_model(autoencoder, 'test this string')

[[116 101 115 116  32 116 104 105 115  32 115 116 114 105 110 103   0   0
    0   0   0   0   0   0   0   0   0   0   0   0   0   0]]
[[ 26  13 210 217  32  88 220 139 145 225 112 180 114  86 186 220   0   0
    7   0 243  15   0  97   0  31  31  31   0   7 169 254]]


'\x1a\r'

In [29]:
#no noise

autoencoder.fit_generator(
    generator=trg,
    validation_data=teg,
    steps_per_epoch=4096,
    validation_steps=42,
    epochs=1
)



<tensorflow.python.keras.callbacks.History at 0x7fe72c4246d8>

In [30]:
test_model(autoencoder, 'test this string out see what you get')

[[116 101 115 116  32 116 104 105 115  32 115 116 114 105 110 103  32 111
  117 116  32 115 101 101  32 119 104  97 116  32 121 111]]
[[ 69 101  42 118  32 184  67 239 115  32 115 180 114 105 110 103  55 111
  117 116  32 107  97 139  48 158 104  97  66  32  29 111]]


'Ee*v '

In [None]:
#noise

autoencoder.fit_generator(
    generator=ntrg,
    validation_data=nteg,
    steps_per_epoch=4096,
    validation_steps=42,
    epochs=100
)

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100

In [24]:
print(test_model(autoencoder, 'test this string out see what you get'))
print(test_model(autoencoder, 'iarmax 365'))
print(test_model(autoencoder, 'give me krie'))

[[116 101 115 116  32 116 104 105 115  32 115 116 114 105 110 103  32 111
  117 116  32 115 101 101  32 119 104  97 116  32 121 111]]
[[  3 101 101 115 116  32 104 117 115   0 115 114 114 105 110 103  32 111
  117 117 116 101 101 101  32  97 104 104 116 104 111 105]]
eest hus
[[105  97 114 109  97 120  32  51  54  53   0   0   0   0   0   0   0   0
    0   0   0   0   0   0   0   0   0   0   0   0   0   0]]
[[  3 105   1 109  97 120  32  54  51   0   0   0   0   0   0   0   0   0
    0   0   0   0   0   0   0   0   0   0   0   0   0   0]]
imax 63
[[103 105 118 101  32 109 101  32 107 114 105 101   0   0   0   0   0   0
    0   0   0   0   0   0   0   0   0   0   0   0   0   0]]
[[  3 105 118 101  32 109 101  32 107   1 114 101   0   0   0   0   0   0
    0   0   0   0   0   0   0   0   0   0   0   0   0   0]]
ive me kre
