In [6]:
import tensorflow as tf
from tensorflow.keras import models, layers, activations
from tensorflow.keras.models import Sequential, Model
import tensorflow.keras.backend as K
import numpy as np

def byte_me(input_string):
    """Converts the input string to an array of
    integers."""
    b = bytearray()
    b.extend(input_string.encode())
    output = np.zeros(64, dtype=np.uint8)
    result = np.array(b)[:64]
    x = min(len(result), 64)
    output[:x] = result
    return output.reshape(1, -1)

In [19]:
inp = layers.Input(shape=(64,), dtype=tf.uint8)
x = layers.Embedding(input_dim=256, output_dim=64, input_length=64)(inp)
x = layers.Dropout(.2)(x)
x = layers.Conv1D(filters=64, kernel_size=3, activation='relu')(x)
x = layers.Conv1D(filters=64, kernel_size=3, activation='relu', strides=2)(x)

x = layers.Reshape(target_shape=(-1, 1920))(x)
x = layers.Dropout(.2)(x)
#x = layers.Dense(256, activation='relu')(x)

#"bottleneck" layer
#embedding = layers.Dense(256)(x)

#x = layers.Dense(256, activation='relu')(x)

x = layers.Dense(2048, activation='tanh')(x)
x = layers.Reshape((64, 32))(x)
x = layers.Conv1D(filters=256, kernel_size=1, activation=(lambda x: activations.softmax(x, axis=1)))(x)


autoencoder = models.Model(inputs=inp, outputs=x)
autoencoder.summary()

Model: "model_3"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_4 (InputLayer)         [(None, 64)]              0         
_________________________________________________________________
embedding_3 (Embedding)      (None, 64, 64)            16384     
_________________________________________________________________
dropout_1 (Dropout)          (None, 64, 64)            0         
_________________________________________________________________
conv1d_9 (Conv1D)            (None, 62, 64)            12352     
_________________________________________________________________
conv1d_10 (Conv1D)           (None, 30, 64)            12352     
_________________________________________________________________
reshape_6 (Reshape)          (None, None, 1920)        0         
_________________________________________________________________
dropout_2 (Dropout)          (None, None, 1920)        0   

In [8]:
string = 'きa completely different example from the one I had been using before this allwent to shit'
out = byte_me(string)

out = np.concatenate([out, out])
autoencoder.predict(out)

array([[[0.01552029, 0.01566873, 0.01559961, ..., 0.01564367,
         0.01562447, 0.01570763],
        [0.01571059, 0.01574207, 0.015644  , ..., 0.01569359,
         0.0156624 , 0.01558378],
        [0.01569369, 0.01550022, 0.01547854, ..., 0.0154737 ,
         0.01556558, 0.01563093],
        ...,
        [0.01555829, 0.01568589, 0.01560984, ..., 0.01568319,
         0.0156093 , 0.01556973],
        [0.01568218, 0.01562481, 0.01572322, ..., 0.01578906,
         0.01581811, 0.01560044],
        [0.01558359, 0.01556862, 0.01562321, ..., 0.01572043,
         0.01570599, 0.01559613]],

       [[0.01552029, 0.01566873, 0.01559961, ..., 0.01564367,
         0.01562447, 0.01570763],
        [0.01571059, 0.01574207, 0.015644  , ..., 0.01569359,
         0.0156624 , 0.01558378],
        [0.01569369, 0.01550022, 0.01547854, ..., 0.0154737 ,
         0.01556558, 0.01563093],
        ...,
        [0.01555829, 0.01568589, 0.01560984, ..., 0.01568319,
         0.0156093 , 0.01556973],
        [0.0

In [31]:
import pandas as pd
df = pd.read_csv('many_queries.csv')

In [32]:
vals = df['query'].fillna('').values

In [33]:
blob = ''.join(vals)

In [34]:
text = blob

In [35]:
chunks = len(text)//64
print('{} chunks.'.format(chunks))
rows = []

for i in range(chunks):
    rows.append(byte_me(text[i*64:(i+1)*64]))

3447357 chunks.


In [36]:
X = np.concatenate(rows)
X.shape

(3447357, 64)

In [37]:
from sklearn.model_selection import train_test_split

train, val = train_test_split(X, test_size=.01)

In [38]:
from tensorflow.keras.utils import to_categorical

def random_gen(batch_size=32):
    while True:
        X_out = np.random.randint(0, high=256, size=(batch_size, 64))
        Y_out = np.array([to_categorical(x, num_classes=256) for x in X_out])
        yield X_out, Y_out


def data_gen(X, batch_size=32):
    while True:
        idx = np.random.randint(len(X), size=(batch_size))
        X_out = X[idx]
        Y_out = np.array([to_categorical(x, num_classes=256) for x in X_out])
        yield X_out, Y_out
        

        
trg = data_gen(train)
teg = data_gen(val)
gen = random_gen()

In [25]:
#identity step

from tensorflow.keras import optimizers

autoencoder.compile(optimizer=optimizers.Adam(lr=.001), loss='categorical_crossentropy')
autoencoder.fit_generator(
    generator=gen,
    steps_per_epoch=4096,
    epochs=15
)

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100

KeyboardInterrupt: 

In [39]:
#spaceless step

autoencoder.fit_generator(
    generator=trg,
    validation_data=teg,
    steps_per_epoch=4096,
    validation_steps=42,
    epochs=100
)

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100

KeyboardInterrupt: 

In [89]:
autoencoder.save('./models/')

(21326631,)

In [87]:
x.shape

(32, 64)

In [76]:
def test_model(model, example='testthisthang'):
    example = example * 8
    inp = byte_me(example)
    out = model.predict(inp)
    out = np.argmax(out, axis=2).astype(np.uint8)
    return encode_output(out)
    stop = np.argwhere(out==0).flatten()
    return stop
    res = out[:stop]
    print(res)
    return encode_output(res)

def encode_output(array):
    """
    encodes neural network output to unicode.
    """
    return bytearray(array).decode()
    try:
        return bytearray(array).decode()
    except:
        return bytearray([115] + array.tolist()).decode()
    
test_model(autoencoder, example='anotheringstinrgdsfasdfkjasdhflasflsdjflakdfdfgfffffffffffffffffffffffffffffffffffsdfd')

'anotheringstinrgdsfasdfkjasdhflasflsdjflakdfdfgffffffffffffff\x00fo'

In [22]:
test_model(autoencoder)
#bytearray([116, 101, 115, 116, 116, 104, 105, 115, 116, 104, 97, 110, 103]).decode()

[196  91 115 179 124 104 105 115 124 104  15 110 223 184 107 115  92 232
 104 105 115 169 104 167   9 192  60 107 115 204 176 104 105 115  60 104
 166   9 103 200 107 115 176 176 104 105 115  33 104 220 240 128  34 136
 115  37  71 104 105 115 124  83 114   2]


UnicodeDecodeError: 'utf-8' codec can't decode byte 0xc4 in position 1: invalid continuation byte

In [58]:
byte_me('some text here')

array([[115, 111, 109, 101,  32, 116, 101, 120, 116,  32, 104, 101, 114,
        101,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0]],
      dtype=uint8)