In [4]:
# Load the TensorBoard notebook extension.
# This is an IPython line magic, so the cell only runs inside a Jupyter/IPython kernel.
%load_ext tensorboard

In [5]:
import string
import copy
import random
import datetime
import math

import numpy as np

import tensorflow as tf

# Print NumPy arrays in full instead of summarising long ones with "...".
# NOTE(review): this makes every printed encoding/prediction very long; consider
# printing slices instead if the cell outputs become unwieldy.
np.set_printoptions(threshold=np.inf)

2023-01-16 15:15:44.343697: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 AVX512F AVX512_VNNI FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2023-01-16 15:15:44.664191: I tensorflow/core/util/port.cc:104] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2023-01-16 15:15:45.896059: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory
2023-01-16 15:15:45.896761: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] 

In [6]:
# Read the vocabulary from a local text file into a NumPy array of strings
# (space-delimited, UTF-8 encoded), then display how many entries were loaded.
words = np.loadtxt("words100.txt", dtype=str, delimiter=" ", encoding="utf-8")
len(words)

100

In [48]:
class WordsDataset(tf.keras.utils.Sequence):
    """Keras ``Sequence`` producing (corrupted word, clean word) training batches.

    Each word is split into characters, right-padded with ``padding_char`` to
    the length of the longest word, and every character slot is turned into a
    vector by ``one_hot_encoding_model``. The per-slot vectors are flattened
    end to end into one feature row per word.

    The input half of every batch has exactly one randomly chosen character of
    each word replaced by a different character from ``alphabet``; the target
    half is the unmodified word.

    Args:
        phase: free-form label for the dataset (e.g. ``"train"``); stored on
            the instance but not otherwise used by this class.
        words: iterable of strings. It is copied into a NumPy string array so
            that shuffling never reorders the caller's data.
        batch_size: maximum number of words per batch.
        alphabet: iterable of characters used both as the encoder vocabulary
            and as the pool for random replacements.

    Raises:
        ValueError: if ``words`` is empty.
    """

    padding_char = " "

    def __init__(self, phase, words, batch_size = 100, alphabet = list(string.ascii_lowercase)):
        super().__init__()
        self.phase = phase
        # A private NumPy copy: ``on_epoch_end`` relies on fancy indexing, and
        # the caller's array must not be affected by the shuffle.
        self.words = np.array(words, dtype=str)
        if self.words.size == 0:
            raise ValueError("words must contain at least one word")
        self.batch_size = batch_size
        # Copy so the shared default list (or a caller's list) is never aliased.
        self.alphabet = list(alphabet)
        self.one_hot_encoding_model = tf.keras.models.Sequential(
            [
                tf.keras.Input(shape=(1,), dtype=tf.string),
                tf.keras.layers.TextVectorization(
                    output_mode="multi_hot",
                    vocabulary=self.alphabet)
            ]
        )
        self.longest_word = max(self.words, key=len)
        # character -> 1-D vector produced by ``one_hot_encoding_model``.
        # Filled lazily so the Keras model runs once per distinct character
        # instead of once per word per batch.
        self._char_vector_cache = {}

    def __len__(self):
        """Return the number of batches per epoch (the last one may be short)."""
        return math.ceil(len(self.words) / self.batch_size)

    def __getitem__(self, idx):
        """Return batch ``idx`` as ``(corrupted, clean)`` arrays.

        Both arrays have shape ``(words_in_batch, features)``, i.e. one flat
        feature row per word.

        Raises:
            IndexError: if ``idx`` is outside ``range(len(self))``.
        """
        if not 0 <= idx < len(self):
            raise IndexError(f"batch index {idx} out of range for {len(self)} batches")
        start = idx * self.batch_size
        batch_words = self.words[start:start + self.batch_size]
        # ``encode`` returns a (1, features) row per word; joining the rows on
        # axis 0 yields (batch, features) rather than (batch, 1, features).
        x_rand = np.concatenate(
            [self.encode(self.change_random_char(word)) for word in batch_words],
            axis=0,
        )
        x = np.concatenate(
            [self.encode(word) for word in batch_words],
            axis=0,
        )
        return x_rand, x

    def __call__(self):
        """Generate every batch of one epoch, then reshuffle the words."""
        for i in range(len(self)):
            yield self[i]
        self.on_epoch_end()

    # shuffles the dataset at the end of each epoch
    def on_epoch_end(self):
        """Shuffle the word order in place of the previous order.

        The permutation covers every *word* (not every batch), so the dataset
        keeps its full size from epoch to epoch.
        """
        word_count = len(self.words)
        reidx = random.sample(range(word_count), k=word_count)
        self.words = self.words[reidx]

    def random_char(self, exclude):
        """Return a random alphabet character different from ``exclude``.

        Raises:
            ValueError: if the alphabet holds no character other than
                ``exclude``.
        """
        candidates = [char for char in self.alphabet if char != exclude]
        if not candidates:
            raise ValueError(
                f"alphabet has no character other than {exclude!r} to choose from"
            )
        return random.choice(candidates)

    def change_random_char(self, word):
        """Return ``word`` with one randomly chosen character replaced.

        An empty word has no position to corrupt and is returned unchanged.
        """
        if len(word) == 0:
            return word
        idx = random.randint(0, len(word) - 1)
        char = self.random_char(word[idx])
        return word[:idx] + char + word[idx+1:]

    # splits word into list of characters
    def split_word(self, x):
        """Return the characters of ``x`` as a list."""
        return list(x)

    def pad(self, x):
        """Right-pad the character list ``x`` in place to the longest word's length.

        Lists that are already at least that long are returned unchanged (they
        are not truncated).
        """
        missing = len(self.longest_word) - len(x)
        x.extend([self.padding_char] * missing)
        return x

    def _char_vector(self, char):
        """Return the encoder's vector for one character, computing it at most once."""
        vector = self._char_vector_cache.get(char)
        if vector is None:
            vector = self.one_hot_encoding_model.predict([char], verbose=0).reshape(-1)
            self._char_vector_cache[char] = vector
        return vector

    # performs one-hot encoding on x
    def encode(self, x):
        """Encode the word ``x`` as a single flat feature row.

        Returns:
            Array of shape ``(1, slots * vector_width)`` where ``slots`` is the
            padded length of the word. Words longer than ``longest_word`` are
            not truncated and therefore produce a wider row.
        """
        chars = self.pad(self.split_word(x))
        # Every character is vectorised independently, so concatenating the
        # cached per-character vectors equals encoding the whole list at once.
        return np.concatenate([self._char_vector(char) for char in chars]).reshape(1, -1)

    def decode(self, x):
        """Reshape a flat encoding into one row per character slot.

        Returns:
            Array of shape ``(len(longest_word), len(alphabet) + 1)``.
        """
        x = x.reshape(len(self.longest_word), len(self.alphabet) + 1)
        return x

In [49]:
# Build the batch generators consumed by `autoencoder.fit`.
# NOTE(review): both generators are constructed from the same `words` array, so
# the validation metrics are not measured on held-out words.
training_generator = WordsDataset(phase="train", words=words, batch_size=32)
validation_generator = WordsDataset(phase="validation", words=words, batch_size=32)

In [9]:
# The longest word is used further down to size the autoencoder's input/output layer.
longest_word = max(words, key=len)
print(
    f"Longest word: {longest_word}",
    f"Longest word length: {len(longest_word)}",
    sep="\n",
)

Longest word: destruction
Longest word length: 11


In [10]:
# Character vocabulary: the lowercase ASCII letters, one list entry per letter.
alphabet = [letter for letter in string.ascii_lowercase]
# Show the letters and, on the next line, how many there are.
print(alphabet, len(alphabet), sep="\n")

['a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z']
26


In [22]:
# Number of units in the autoencoder's bottleneck layer.
latent_dim = 30

# Flattened width of one encoded word: one vector per character slot of the
# longest word, each vector being (number of lowercase letters + 1) wide —
# the same "+ 1" that `WordsDataset.decode` uses when reshaping.
output_size = len(longest_word) * (1 + len(string.ascii_lowercase))
print(f"Input/output size = {output_size}")

class Autoencoder(tf.keras.Model):
    """Dense autoencoder with a single ReLU bottleneck and a sigmoid output.

    The encoder maps a flat input vector to ``latent_dim`` units; the decoder
    maps those units back to a vector of the input's width with per-element
    sigmoid activations.

    Args:
        latent_dim: number of units in the bottleneck layer.
        input_size: width of the flat input (and reconstructed output) vector.
            When omitted, the notebook-level ``output_size`` variable is used,
            which keeps existing ``Autoencoder(latent_dim)`` calls working.
    """

    def __init__(self, latent_dim, input_size=None):
        super().__init__()
        if input_size is None:
            # Backward-compatible fallback to the global defined in this notebook.
            input_size = output_size
        self.latent_dim = latent_dim
        self.encoder = tf.keras.Sequential([
            # Explicit shape tuple rather than a bare positional int.
            tf.keras.layers.Input(shape=(input_size,)),
            tf.keras.layers.Dense(latent_dim, activation="relu")
        ])
        self.decoder = tf.keras.Sequential([
            tf.keras.layers.Dense(input_size, activation="sigmoid")
        ])

    def call(self, x):
        """Encode ``x`` to the bottleneck and return its reconstruction."""
        encoded = self.encoder(x)
        decoded = self.decoder(encoded)
        return decoded

# Instantiate the model with the bottleneck width chosen above.
autoencoder = Autoencoder(latent_dim=latent_dim)

# Earlier plain-Sequential variant, kept for reference:
# autoencoder = tf.keras.models.Sequential(
#     [
#         tf.keras.layers.Input(shape=(input_size,)),
#         tf.keras.layers.Dense(hidden_layer_size, activation="relu"),
#         tf.keras.layers.Dense(input_size, activation="sigmoid")
#     ]
# )

# Each output element is an independent sigmoid, so the model is trained with
# binary cross-entropy and monitored with element-wise binary accuracy.
autoencoder.compile(
    loss=tf.keras.losses.BinaryCrossentropy(),
    metrics=[tf.keras.metrics.BinaryAccuracy()],
    optimizer=tf.keras.optimizers.Adam(),
)

Input/output size = 297


In [27]:
# Write TensorBoard logs to a fresh, timestamped directory for every run.
log_dir = "logs/fit/" + datetime.datetime.now().strftime("%Y%m%d-%H%M%S")
tensorboard_callback = tf.keras.callbacks.TensorBoard(log_dir=log_dir, histogram_freq=1)

# `batch_size` is deliberately not passed here: both inputs are `Sequence`
# objects that already yield whole batches (their size is set where the
# generators are constructed), and Keras documents that `batch_size` must not
# be specified for such inputs.
autoencoder.fit(
    training_generator,
    epochs=100,
    shuffle=True,
    validation_data=validation_generator,
    callbacks=[tensorboard_callback])

Epoch 1/100
['after']
Epoch 2/100
['after']
Epoch 3/100
['after']
Epoch 4/100
['after']
Epoch 5/100
['after']
Epoch 6/100
['after']
Epoch 7/100
['after']
Epoch 8/100
['after']
Epoch 9/100
['after']
Epoch 10/100
['after']
Epoch 11/100
['after']
Epoch 12/100
['after']
Epoch 13/100
['after']
Epoch 14/100
['after']
Epoch 15/100
['after']
Epoch 16/100
['after']
Epoch 17/100
['after']
Epoch 18/100
['after']
Epoch 19/100
['after']
Epoch 20/100
['after']
Epoch 21/100
['after']
Epoch 22/100
['after']
Epoch 23/100
['after']
Epoch 24/100
['after']
Epoch 25/100
['after']
Epoch 26/100
['after']
Epoch 27/100
['after']
Epoch 28/100
['after']
Epoch 29/100
['after']
Epoch 30/100
['after']
Epoch 31/100
['after']
Epoch 32/100
['after']
Epoch 33/100
['after']
Epoch 34/100
['after']
Epoch 35/100
['after']
Epoch 36/100
['after']
Epoch 37/100
['after']
Epoch 38/100
['after']
Epoch 39/100
['after']
Epoch 40/100
['after']
Epoch 41/100
['after']
Epoch 42/100
['after']
Epoch 43/100
['after']
Epoch 44/100
['after

<keras.callbacks.History at 0x7fb1b5be0bb0>

In [28]:
# Sanity-check the encoder on a single word ("end") and print the result.
end_encoded = training_generator.encode("end")
# 1-D view of the same encoding; only referenced by a commented-out print below.
end_encoded_reshaped = end_encoded.reshape(-1,)
# `decode` reshapes the encoding into one row per character slot.
end_decoded = training_generator.decode(end_encoded)
# NOTE(review): the layer name "text_vectorization_6" in the two lines below
# looks auto-generated — confirm it still exists before re-enabling them.
# lookup = tf.keras.layers.StringLookup(vocabulary=training_generator.one_hot_encoding_model.get_layer("text_vectorization_6").get_vocabulary(), invert=True)
# vocab = training_generator.one_hot_encoding_model.get_layer("text_vectorization_6").get_vocabulary()
# print(vocab)
print(end_encoded)
# print(end_encoded_reshaped)
# print(end_decoded)
# print(end_encoded==end_decoded)
# print(lookup)


[0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0.]


In [52]:
# Encode one word ("farts") and pass it through the autoencoder; the printed
# values are the decoder's sigmoid activations, one per element of the flat encoding.
prediction = autoencoder.predict(training_generator.encode("farts"))
print(prediction)
# autoencoder.summary()
# autoencoder.predict(training_generator.encode("farts"))

[[0.24785538 0.7464843  0.23704754 0.28257364 0.23298262 0.2643738
  0.2604562  0.25370482 0.2634375  0.23266163 0.26129532 0.25558382
  0.2724916  0.22224587 0.2859145  0.25303578 0.22898032 0.27738267
  0.24465336 0.24342085 0.26277605 0.227334   0.2854598  0.2613166
  0.27125108 0.25013727 0.24037239 0.25242922 0.25554422 0.2521487
  0.24242331 0.25304204 0.25296202 0.72087216 0.25176314 0.2450378
  0.24288566 0.26151878 0.2637278  0.2215964  0.2575819  0.2121628
  0.2526189  0.25425357 0.254407   0.26619726 0.27523348 0.2424623
  0.2410545  0.2598795  0.26659772 0.2045465  0.27393317 0.21071509
  0.25972825 0.25095633 0.26119855 0.2421889  0.2751087  0.24005279
  0.25717932 0.2045456  0.26519296 0.24588034 0.2698602  0.23477662
  0.26288158 0.26720518 0.24796736 0.22912897 0.27689356 0.2527421
  0.26855356 0.24849445 0.7595825  0.25248688 0.23844019 0.24783733
  0.22396268 0.2504417  0.27177778 0.2491801  0.2456874  0.22381173
  0.24825448 0.27242923 0.77221584 0.22967567 0.251034 

In [None]:
# x_train = []
# y_train = []

# x_test = []
# y_test = []

# for i in range(10):
#     for x, y in training_generator:
#         x_train.append(x)
#         y_train.append(y)

# for i in range(2):
#     for x, y in validation_generator:
#         x_test.append(x)
#         y_test.append(y)

# x_train = np.asarray(x_train)
# y_train = np.asarray(y_train)
# x_test = np.asarray(x_test)
# y_test = np.asarray(y_test)

In [None]:
# char_tokenizer = tf.keras.preprocessing.text.Tokenizer(
#     num_words=len(alphabet),
#     oov_token=" ",
#     char_level=True
# )
# char_tokenizer.fit_on_texts(alphabet)

# sequences = char_tokenizer.texts_to_sequences(alphabet)

# one_hot_chars = char_tokenizer.texts_to_matrix(alphabet, mode="binary")

# char_index = char_tokenizer.word_index
# print('Found %s unique tokens.' % len(char_index))
# print(char_index)

# print(char_tokenizer.texts_to_matrix(words[0], mode="binary"))

In [None]:
# one_hot_encoding_model = tf.keras.models.Sequential(
#     [
#         tf.keras.Input(shape=(1,), dtype=tf.string),
#         tf.keras.layers.TextVectorization(
#             output_mode="multi_hot",
#             vocabulary=alphabet
#         )
#     ]
# )
# one_hot_encoded = np.array([one_hot_encoding_model.predict(word).reshape(-1,) for word in padded_chars])


In [None]:
# one_hot_encoded = np.array([one_hot_encoding_model.predict(word).reshape(-1,) for word in padded_chars])
# print(len(one_hot_encoded[0]))
# print(words[0])
# print(one_hot_encoded[0])

In [None]:
# tf.keras.models.Sequential(
#             [
#                 tf.keras.Input(shape=(1,), dtype=tf.string),
#                 tf.keras.layers.TextVectorization(
#                     output_mode="multi_hot",
#                     vocabulary=self.alphabet)
#             ]
#         )