In [1]:
import tensorflow as tf
from tensorflow.keras import models, layers, activations
from tensorflow.keras.models import Sequential, Model
import tensorflow.keras.backend as K
import numpy as np

def byte_me(input_string):
    """Converts the input string to an array of
    integers."""
    b = bytearray()
    b.extend(input_string.encode())
    return np.array(b)

In [18]:
inp = layers.Input(shape=(64,), dtype=tf.uint8)
x = layers.Embedding(input_dim=256, output_dim=64, input_length=64)(inp)
x = layers.Conv1D(filters=64, kernel_size=3, activation='relu')(x)
x = layers.Conv1D(filters=64, kernel_size=3, activation='relu', strides=2)(x)

x = layers.Reshape(target_shape=(-1, 1920))(x)
#x = layers.Dense(256, activation='relu')(x)

#"bottleneck" layer
#embedding = layers.Dense(256)(x)

#x = layers.Dense(256, activation='relu')(x)

x = layers.Dense(2048, activation='relu')(x)
x = layers.Reshape((64, 32))(x)
x = layers.Conv1D(filters=256, kernel_size=1, activation=(lambda x: activations.softmax(x, axis=1)))(x)


autoencoder = models.Model(inputs=inp, outputs=x)
autoencoder.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_4 (InputLayer)         (None, 64)                0         
_________________________________________________________________
embedding_3 (Embedding)      (None, 64, 64)            16384     
_________________________________________________________________
conv1d_9 (Conv1D)            (None, 62, 64)            12352     
_________________________________________________________________
conv1d_10 (Conv1D)           (None, 30, 64)            12352     
_________________________________________________________________
reshape_6 (Reshape)          (None, None, 1920)        0         
_________________________________________________________________
dense_7 (Dense)              (None, None, 2048)        3934208   
_________________________________________________________________
reshape_7 (Reshape)          (None, 64, 32)            0         
__________

In [6]:
def byte_me(input_string):
    """Converts the input string to an array of
    integers."""
    b = bytearray()
    b.extend(input_string.encode())
    output = np.zeros(64, dtype=np.uint8)
    result = np.array(b)[:64]
    x = min(len(result), 64)
    output[:x] = result
    return output.reshape(1, -1)

string = 'きa completely different example from the one I had been using before this allwent to shit'
out = byte_me(string)

out = np.concatenate([out, out])
autoencoder.predict(out)

array([[[0.0156486 , 0.01561856, 0.01558934, ..., 0.01570393,
         0.01559692, 0.01554755],
        [0.01561294, 0.01559056, 0.01562465, ..., 0.01561609,
         0.01559465, 0.01563474],
        [0.01567719, 0.01556424, 0.01561314, ..., 0.01554751,
         0.01562736, 0.01566313],
        ...,
        [0.0156566 , 0.01563647, 0.01561924, ..., 0.01566086,
         0.01559545, 0.01555576],
        [0.0155953 , 0.01571142, 0.01565355, ..., 0.01561982,
         0.0156517 , 0.01562703],
        [0.01552223, 0.01566094, 0.01563862, ..., 0.01564148,
         0.01564958, 0.0156145 ]],

       [[0.0156486 , 0.01561856, 0.01558934, ..., 0.01570393,
         0.01559692, 0.01554755],
        [0.01561294, 0.01559056, 0.01562465, ..., 0.01561609,
         0.01559465, 0.01563474],
        [0.01567719, 0.01556424, 0.01561314, ..., 0.01554751,
         0.01562736, 0.01566313],
        ...,
        [0.0156566 , 0.01563647, 0.01561924, ..., 0.01566086,
         0.01559545, 0.01555576],
        [0.0

In [7]:
import pandas as pd
df = pd.read_csv('many_queries.csv')

In [8]:
vals = df['query'].fillna('').values

In [9]:
blob = ''.join(vals)

In [10]:
text = blob

In [11]:
chunks = len(text)//64
print('{} chunks.'.format(chunks))
rows = []

for i in range(chunks):
    rows.append(byte_me(text[i*64:(i+1)*64]))

3447357 chunks.


In [12]:
X = np.concatenate(rows)
X.shape

(3447357, 64)

In [13]:
from sklearn.model_selection import train_test_split

train, val = train_test_split(X, test_size=.01)

In [14]:
from tensorflow.keras.utils import to_categorical


def data_gen(X=train, batch_size=32):
    while True:
        idx = np.random.randint(len(X), size=(batch_size))
        X_out = X[idx]
        Y_out = np.array([to_categorical(x, num_classes=256) for x in X_out])
        yield X_out, Y_out
        
trg = data_gen(train)
teg = data_gen(val)

In [22]:
from tensorflow.keras import optimizers

autoencoder.compile(optimizer=optimizers.Adam(lr=.001), loss='categorical_crossentropy')
autoencoder.fit_generator(
    generator=trg,
    validation_data=teg,
    steps_per_epoch=4096,
    validation_steps=42,
    epochs=100
)

Epoch 1/100
 237/4096 [>.............................] - ETA: 3:48 - loss: 0.3101

KeyboardInterrupt: 

In [40]:
def test_model(model, example='testthisthang'):
    example = example * 8
    inp = byte_me(example)
    out = model.predict(inp)[0]
    out = np.argmax(out, axis=1)
    print(out)
    return encode_output(out)

def encode_output(array):
    """
    encodes neural network output to unicode.
    """
    try:
        return bytearray(array).decode()
    except:
        return bytearray([115] + array.tolist()).decode()

In [42]:
test_model(autoencoder)
#bytearray([116, 101, 115, 116, 116, 104, 105, 115, 116, 104, 97, 110, 103]).decode()

[116 101 115 116 116 104 105 115 116 104  97 110 103 116 101 115 116 116
 104 105 115 116 104  97 110 103 116 101 115 116 116 104 105 115 116 104
  97   2 103 116 101 115 116 116 104 105   0 116 104  97 110 103 116 101
 115 116 116 104 105 115 116 104  97 111]


't\x00\x00\x00\x00\x00\x00\x00e\x00\x00\x00\x00\x00\x00\x00s\x00\x00\x00\x00\x00\x00\x00t\x00\x00\x00\x00\x00\x00\x00t\x00\x00\x00\x00\x00\x00\x00h\x00\x00\x00\x00\x00\x00\x00i\x00\x00\x00\x00\x00\x00\x00s\x00\x00\x00\x00\x00\x00\x00t\x00\x00\x00\x00\x00\x00\x00h\x00\x00\x00\x00\x00\x00\x00a\x00\x00\x00\x00\x00\x00\x00n\x00\x00\x00\x00\x00\x00\x00g\x00\x00\x00\x00\x00\x00\x00t\x00\x00\x00\x00\x00\x00\x00e\x00\x00\x00\x00\x00\x00\x00s\x00\x00\x00\x00\x00\x00\x00t\x00\x00\x00\x00\x00\x00\x00t\x00\x00\x00\x00\x00\x00\x00h\x00\x00\x00\x00\x00\x00\x00i\x00\x00\x00\x00\x00\x00\x00s\x00\x00\x00\x00\x00\x00\x00t\x00\x00\x00\x00\x00\x00\x00h\x00\x00\x00\x00\x00\x00\x00a\x00\x00\x00\x00\x00\x00\x00n\x00\x00\x00\x00\x00\x00\x00g\x00\x00\x00\x00\x00\x00\x00t\x00\x00\x00\x00\x00\x00\x00e\x00\x00\x00\x00\x00\x00\x00s\x00\x00\x00\x00\x00\x00\x00t\x00\x00\x00\x00\x00\x00\x00t\x00\x00\x00\x00\x00\x00\x00h\x00\x00\x00\x00\x00\x00\x00i\x00\x00\x00\x00\x00\x00\x00s\x00\x00\x00\x00\x00\x00\x00t\x00\x00\x00

In [58]:
byte_me('some text here')

array([[115, 111, 109, 101,  32, 116, 101, 120, 116,  32, 104, 101, 114,
        101,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0]],
      dtype=uint8)