In [1]:
# Load the TensorBoard notebook extension.
%load_ext tensorboard

In [2]:
import string
import copy
import random
import datetime
import math

import numpy as np

import tensorflow as tf

# Print NumPy arrays in full instead of abbreviating long ones with "...",
# so complete encoded word vectors are visible when they are printed.
np.set_printoptions(threshold=np.inf)

2023-01-18 02:54:21.101463: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 AVX512F AVX512_VNNI FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2023-01-18 02:54:21.196225: I tensorflow/core/util/port.cc:104] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2023-01-18 02:54:21.596733: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory
2023-01-18 02:54:21.596797: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] 

In [3]:
# Read the vocabulary file into a NumPy array of strings; fields are
# separated by a single space and the file is decoded as UTF-8.
words = np.loadtxt(
    "words100.txt",
    dtype=str,
    delimiter=" ",
    encoding="utf-8",
)
# Number of loaded entries (displayed as the cell result).
len(words)

100

In [4]:
class WordsDataset(tf.keras.utils.Sequence):
    """Keras ``Sequence`` yielding (corrupted word, clean word) training pairs.

    Each word is split into characters, right-padded with ``padding_char`` up
    to the length of the longest word, one-hot encoded per character and
    flattened into a single 1-D vector.  For every word of a batch, ``n``
    corrupted variants (one randomly chosen character replaced by a different
    alphabet character) are produced as inputs, each paired with the encoding
    of the unmodified word as the target.

    Args:
        phase: Free-form label stored on the instance (e.g. "train").
        words: Collection of words; a deep copy is kept so shuffling does not
            affect the caller's data.
        batch_size: Number of *words* per batch (each batch therefore holds
            ``batch_size * n`` rows).
        alphabet: Characters the tokenizer is fitted on and from which
            replacement characters are drawn.
    """

    # Character used for padding; also registered as the tokenizer's OOV token.
    padding_char = " "
    # Number of corrupted variants generated for every word.
    n = 100

    def __init__(self, phase, words, batch_size=100, alphabet=list(string.ascii_lowercase)):
        super().__init__()
        self.phase = phase
        self.words = copy.deepcopy(words)
        self.batch_size = batch_size
        # Copy so instances never share (or mutate) the default-argument list.
        self.alphabet = list(alphabet)
        self.one_hot_encoding_tokenizer = tf.keras.preprocessing.text.Tokenizer(
            oov_token=self.padding_char,
            char_level=True)
        self.one_hot_encoding_tokenizer.fit_on_texts(self.alphabet)
        # Reverse lookup: tokenizer index -> character.
        self.char_index = {v: k for k, v in self.one_hot_encoding_tokenizer.word_index.items()}
        self.longest_word = max(words, key=len)

    def __len__(self):
        """Return the number of batches per epoch."""
        return math.ceil(len(self.words) / self.batch_size)

    def __getitem__(self, idx):
        """Build batch ``idx``.

        Returns:
            Tuple ``(x_rand, x)`` of arrays with one row per generated sample:
            ``x_rand`` holds encoded corrupted words, ``x`` the encoding of the
            corresponding clean word repeated ``n`` times.
        """
        batch_words = self.words[idx * self.batch_size:(idx + 1) * self.batch_size]
        x_rand = []
        x = []
        for word in batch_words:
            encoded = self.encode(word)
            x.extend([encoded] * self.n)
            x_rand.extend(
                self.encode(self.change_random_char(word)) for _ in range(self.n)
            )
        return np.asarray(x_rand), np.asarray(x)

    def __call__(self):
        """Generator over all batches; reshuffles once the last one is consumed."""
        batch_count = len(self)
        for i in range(batch_count):
            yield self[i]

            if i == batch_count - 1:
                self.on_epoch_end()

    # shuffles the dataset at the end of each epoch
    def on_epoch_end(self):
        random.shuffle(self.words)

    def random_char(self, exclude):
        """Return a random alphabet character different from ``exclude``.

        Raises:
            ValueError: If the alphabet contains no character other than
                ``exclude`` (the previous retry-by-recursion never terminated
                in that situation).
        """
        candidates = [char for char in self.alphabet if char != exclude]
        if not candidates:
            raise ValueError(f"alphabet has no character other than {exclude!r}")
        return random.choice(candidates)

    def change_random_char(self, word):
        """Return ``word`` with one randomly chosen position replaced.

        Raises:
            ValueError: If ``word`` is empty.
        """
        if len(word) == 0:
            raise ValueError("cannot replace a character in an empty word")
        idx = random.randint(0, len(word) - 1)
        char = self.random_char(word[idx])
        return word[:idx] + char + word[idx+1:]

    # splits word into list of characters
    def split_word(self, x):
        return list(x)

    def pad(self, x):
        """Right-pad the character list ``x`` in place to the longest word's length.

        Lists that are already long enough are left unchanged. Returns ``x``.
        """
        # A non-positive difference yields an empty list, i.e. no padding.
        x.extend([self.padding_char] * (len(self.longest_word) - len(x)))
        return x

    def encode(self, x):
        """One-hot encode word ``x`` and flatten the result to a 1-D vector."""
        chars = self.pad(self.split_word(x))
        matrix = self.one_hot_encoding_tokenizer.texts_to_matrix(chars, mode="binary")
        return matrix.reshape(-1)

    def decode(self, x):
        """Turn a flattened (possibly soft) encoding back into a padded string.

        The vector is viewed as one row per character position and the
        highest-scoring index of every row is mapped back to its character.
        Index 0 has no entry in ``char_index`` (the Keras tokenizer does not
        assign it to any character), so a row whose maximum lies there decodes
        to ``padding_char`` instead of raising ``KeyError``.
        """
        rows = np.asarray(x).reshape(len(self.longest_word), len(self.char_index) + 1)
        return "".join(
            self.char_index.get(int(best), self.padding_char)
            for best in rows.argmax(axis=1)
        )

In [5]:
# Build the two batch generators; each batch covers 20 words.
# NOTE(review): both generators are given the same `words`, so validation
# metrics are computed on the training vocabulary rather than on held-out words.
training_generator = WordsDataset(
    phase="train",
    words=words,
    batch_size=20,
)
validation_generator = WordsDataset(
    phase="validation",
    words=words,
    batch_size=20,
)

In [6]:
# Longest entry of the vocabulary (first one found if several share the
# maximum length); its length is used below to size the model input/output.
longest_word = max(words, key=len)
print(f"Longest word: {longest_word}")
print(f"Longest word length: {len(longest_word)}")

Longest word: destruction
Longest word length: 11


In [7]:
# The lowercase ASCII letters as a list of single-character strings.
alphabet = [*string.ascii_lowercase]
print(alphabet)
print(len(alphabet))

['a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z']
26


In [8]:
# Width of a flattened encoded word: one group of indicator values per
# character position of the longest word, each group holding one value per
# tokenizer entry plus one extra slot.
output_size = (len(training_generator.char_index) + 1) * len(longest_word)
print(f"Input/output size = {output_size}")

# Number of units in the autoencoder's bottleneck layer.
latent_dim = 50

class Autoencoder(tf.keras.Model):
    """Dense autoencoder with a single bottleneck layer.

    The encoder maps a flat input vector to ``latent_dim`` ReLU units; the
    decoder maps that code back to a vector of the input width through a
    sigmoid layer.

    Args:
        latent_dim: Number of units in the bottleneck layer.
        output_dim: Width of the input and of the reconstructed output.  When
            omitted, the notebook-level ``output_size`` variable is used, so
            existing ``Autoencoder(latent_dim)`` calls behave as before.
    """

    def __init__(self, latent_dim, output_dim=None):
        super().__init__()
        if output_dim is None:
            # Backward-compatible fallback to the notebook global.
            output_dim = output_size
        self.latent_dim = latent_dim
        self.output_dim = output_dim
        self.encoder = tf.keras.Sequential([
            tf.keras.layers.Input(shape=(output_dim,)),
            tf.keras.layers.Dense(latent_dim, activation="relu")
        ])
        self.decoder = tf.keras.Sequential([
            tf.keras.layers.Dense(output_dim, activation="sigmoid")
        ])

    def call(self, x):
        """Encode ``x`` to the latent space and return its reconstruction."""
        return self.decoder(self.encoder(x))

# Instantiate the model with the bottleneck width chosen above.
autoencoder = Autoencoder(latent_dim)

# Earlier Sequential formulation, kept for reference only. It refers to
# `input_size` and `hidden_layer_size`, which are not defined in the visible
# cells, so it would need adapting before being re-enabled.
# autoencoder = tf.keras.models.Sequential(
#     [
#         tf.keras.layers.Input(shape=(input_size,)),
#         tf.keras.layers.Dense(hidden_layer_size, activation="relu"),
#         tf.keras.layers.Dense(input_size, activation="sigmoid")
#     ]
# )

# Targets are 0/1 indicator vectors and the decoder ends in a sigmoid, so every
# output unit is scored as an independent binary prediction.
autoencoder.compile(
    optimizer=tf.keras.optimizers.Adam(),
    loss=tf.keras.losses.BinaryCrossentropy(),
    metrics=[tf.keras.metrics.BinaryAccuracy()]
)

Input/output size = 308


2023-01-18 02:54:22.525525: I tensorflow/compiler/xla/stream_executor/cuda/cuda_gpu_executor.cc:967] could not open file to read NUMA node: /sys/bus/pci/devices/0000:01:00.0/numa_node
Your kernel may have been built without NUMA support.
2023-01-18 02:54:22.541133: I tensorflow/compiler/xla/stream_executor/cuda/cuda_gpu_executor.cc:967] could not open file to read NUMA node: /sys/bus/pci/devices/0000:01:00.0/numa_node
Your kernel may have been built without NUMA support.
2023-01-18 02:54:22.541174: I tensorflow/compiler/xla/stream_executor/cuda/cuda_gpu_executor.cc:967] could not open file to read NUMA node: /sys/bus/pci/devices/0000:01:00.0/numa_node
Your kernel may have been built without NUMA support.
2023-01-18 02:54:22.541629: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 AVX512F AVX512_VNNI FMA
To enable them i

In [9]:
# Timestamped directory so every training run writes its own TensorBoard logs.
log_dir = f"logs/fit/{datetime.datetime.now():%Y%m%d-%H%M%S}"
tensorboard_callback = tf.keras.callbacks.TensorBoard(
    log_dir=log_dir,
    histogram_freq=1,
)

# Train on the corrupted -> clean pairs produced by the generators.
autoencoder.fit(
    x=training_generator,
    validation_data=validation_generator,
    epochs=1000,
    shuffle=True,
    callbacks=[tensorboard_callback],
)

Epoch 1/1000


2023-01-18 02:54:24.397784: I tensorflow/compiler/xla/stream_executor/cuda/cuda_blas.cc:630] TensorFloat-32 will be used for the matrix multiplication. This will only be logged once.
2023-01-18 02:54:25.010852: I tensorflow/compiler/xla/service/service.cc:173] XLA service 0x7f03ec00ebb0 initialized for platform CUDA (this does not guarantee that XLA will be used). Devices:
2023-01-18 02:54:25.010929: I tensorflow/compiler/xla/service/service.cc:181]   StreamExecutor device (0): NVIDIA GeForce RTX 3060 Ti, Compute Capability 8.6
2023-01-18 02:54:25.061101: I tensorflow/compiler/mlir/tensorflow/utils/dump_mlir_util.cc:268] disabling MLIR crash reproducer, set env var `MLIR_CRASH_REPRODUCER_DIRECTORY` to enable.
2023-01-18 02:54:25.380105: I tensorflow/compiler/jit/xla_compilation_cache.cc:477] Compiled cluster using XLA!  This line is logged at most once for the lifetime of the process.


Epoch 2/1000
Epoch 3/1000
Epoch 4/1000
Epoch 5/1000
Epoch 6/1000
Epoch 7/1000
Epoch 8/1000
Epoch 9/1000
Epoch 10/1000
Epoch 11/1000
Epoch 12/1000
Epoch 13/1000
Epoch 14/1000
Epoch 15/1000
Epoch 16/1000
Epoch 17/1000
Epoch 18/1000
Epoch 19/1000
Epoch 20/1000
Epoch 21/1000
Epoch 22/1000
Epoch 23/1000
Epoch 24/1000
Epoch 25/1000
Epoch 26/1000
Epoch 27/1000
Epoch 28/1000
Epoch 29/1000
Epoch 30/1000
Epoch 31/1000
Epoch 32/1000
Epoch 33/1000
Epoch 34/1000
Epoch 35/1000
Epoch 36/1000
Epoch 37/1000
Epoch 38/1000
Epoch 39/1000
Epoch 40/1000
Epoch 41/1000
Epoch 42/1000
Epoch 43/1000
Epoch 44/1000
Epoch 45/1000
Epoch 46/1000
Epoch 47/1000
Epoch 48/1000
Epoch 49/1000
Epoch 50/1000
Epoch 51/1000
Epoch 52/1000
Epoch 53/1000
Epoch 54/1000
Epoch 55/1000
Epoch 56/1000
Epoch 57/1000
Epoch 58/1000
Epoch 59/1000
Epoch 60/1000
Epoch 61/1000
Epoch 62/1000
Epoch 63/1000
Epoch 64/1000
Epoch 65/1000
Epoch 66/1000
Epoch 67/1000
Epoch 68/1000
Epoch 69/1000
Epoch 70/1000
Epoch 71/1000
Epoch 72/1000
Epoch 73/1000


<keras.callbacks.History at 0x7f051328bdf0>

In [60]:
# Probe the trained model with a single word: encode it, show the encoding,
# run it through the autoencoder as a batch of one and decode the result.
enc_fur = training_generator.encode("cay")
print(enc_fur[np.newaxis, :])
print(enc_fur.shape)
print(type(enc_fur))
prediction = autoencoder.predict(enc_fur[np.newaxis, :])
print(training_generator.decode(prediction[0]))

[[0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
  0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
  0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
  0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
  0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0.
  0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0.
  0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
  0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
  0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
  0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
  0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
  0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0.
  0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]]
(308,)
<class 'numpy.ndarray'>
day        


In [11]:
# Sanity check: walk over every batch once and report how many rows each side
# holds. With batch_size=20 words and n=100 variants per word, every batch
# contains 20 * 100 = 2000 (input, target) rows.
for x, y in training_generator:
    # print(type(x[0][0]))
    print(f"x ({len(x)}): ")
    print(f"y ({len(y)}): ")
    # Uncomment to list every pair as "<clean word> -> <corrupted word>":
    # for xx, yy in zip(x, y):
    #     print(f"{training_generator.decode(yy)} -> {training_generator.decode(xx)}")
# Reshuffle the generator's word order (on_epoch_end shuffles its words).
training_generator.on_epoch_end()

x (2000): 
y (2000): 
x (2000): 
y (2000): 
x (2000): 
y (2000): 
x (2000): 
y (2000): 
x (2000): 
y (2000): 


In [12]:
# char_tokenizer = tf.keras.preprocessing.text.Tokenizer(
#     num_words=len(alphabet),
#     oov_token=" ",
#     char_level=True
# )
# char_tokenizer.fit_on_texts(alphabet)

# sequences = char_tokenizer.texts_to_sequences(alphabet)

# one_hot_chars = char_tokenizer.texts_to_matrix(alphabet, mode="binary")

# char_index = char_tokenizer.word_index
# print('Found %s unique tokens.' % len(char_index))
# print(char_index)

# print(char_tokenizer.texts_to_matrix(words[0], mode="binary"))

In [13]:
# one_hot_encoding_model = tf.keras.models.Sequential(
#     [
#         tf.keras.Input(shape=(1,), dtype=tf.string),
#         tf.keras.layers.TextVectorization(
#             output_mode="multi_hot",
#             vocabulary=alphabet
#         )
#     ]
# )
# one_hot_encoded = np.array([one_hot_encoding_model.predict(word).reshape(-1,) for word in padded_chars])


In [14]:
# one_hot_encoded = np.array([one_hot_encoding_model.predict(word).reshape(-1,) for word in padded_chars])
# print(len(one_hot_encoded[0]))
# print(words[0])
# print(one_hot_encoded[0])

In [15]:
# tf.keras.models.Sequential(
#             [
#                 tf.keras.Input(shape=(1,), dtype=tf.string),
#                 tf.keras.layers.TextVectorization(
#                     output_mode="multi_hot",
#                     vocabulary=self.alphabet)
#             ]
#         )