In [1]:
# Load the TensorBoard notebook extension.
%load_ext tensorboard

In [2]:
import string
import copy
import random
import datetime

import numpy as np

import tensorflow as tf

np.set_printoptions(threshold=np.inf)

2023-01-16 06:10:51.982134: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 AVX512F AVX512_VNNI FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2023-01-16 06:10:52.070238: I tensorflow/core/util/port.cc:104] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2023-01-16 06:10:52.477806: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory
2023-01-16 06:10:52.477912: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] 

In [3]:
words = np.loadtxt("words100.txt", dtype=str, delimiter=" ", encoding="utf-8")
len(words)

100

In [4]:
class WordsDataset(tf.keras.utils.Sequence):
    padding_char = " "

    def __init__(self, 
                 phase, 
                 words,
                 alphabet = list(string.ascii_lowercase)):
        self.phase = phase
        self.words = copy.deepcopy(words)
        self.alphabet = alphabet
        self.one_hot_encoding_model = tf.keras.models.Sequential(
            [
                tf.keras.Input(shape=(1,), dtype=tf.string),
                tf.keras.layers.TextVectorization(
                    output_mode="multi_hot",
                    vocabulary=self.alphabet)
            ]
        )
        self.longest_word = max(words, key=len)

    def __len__(self):
        return self.words.shape[0]

    def __getitem__(self, idx):
        x = self.words[idx]
        x = self.encode(x)
        x_prim = self.words[idx]
        x_prim = self.change_random_char(x_prim)
        print(f"word: {self.words[idx]}\nrandom_word: {x_prim}")
        x_prim = self.encode(x_prim)
        return x_prim, x
    
    def __call__(self):
        for i in range(self.__len__()):
            yield self.__getitem__(i)
            
            if i == self.__len__()-1:
                self.on_epoch_end()

    # shuffles the dataset at the end of each epoch
    def on_epoch_end(self):
        reidx = random.sample(population = list(range(self.__len__())),k = self.__len__())
        self.words = self.words[reidx]

    def random_char(self, exclude):
        char = random.choice(self.alphabet)
        return self.random_char(exclude) if char == exclude else char
    
    def change_random_char(self, word):
        idx = random.randint(0, len(word) - 1)
        char = self.random_char(word[idx])
        return word[:idx] + char + word[idx+1:]

    # splits word into list of characters
    def split_word(self, x):
        return list(x)

    # pads 
    def pad(self, x):
        for _ in range(len(self.longest_word) - len(x)):
            x.append(self.padding_char)
        return x
    
    # performs one-hot encoding on x
    def encode(self, x):
        x = self.split_word(x)
        x = self.pad(x)
        x = np.array(self.one_hot_encoding_model.predict(x).reshape(-1,))
        return x

In [5]:
training_generator = WordsDataset("train", words)
validation_generator = WordsDataset("validation", words)

2023-01-16 06:10:53.601228: I tensorflow/compiler/xla/stream_executor/cuda/cuda_gpu_executor.cc:967] could not open file to read NUMA node: /sys/bus/pci/devices/0000:01:00.0/numa_node
Your kernel may have been built without NUMA support.
2023-01-16 06:10:53.639219: I tensorflow/compiler/xla/stream_executor/cuda/cuda_gpu_executor.cc:967] could not open file to read NUMA node: /sys/bus/pci/devices/0000:01:00.0/numa_node
Your kernel may have been built without NUMA support.
2023-01-16 06:10:53.639259: I tensorflow/compiler/xla/stream_executor/cuda/cuda_gpu_executor.cc:967] could not open file to read NUMA node: /sys/bus/pci/devices/0000:01:00.0/numa_node
Your kernel may have been built without NUMA support.
2023-01-16 06:10:53.639883: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 AVX512F AVX512_VNNI FMA
To enable them i

In [6]:
longest_word = max(words, key=len)
print(f"Longest word: {longest_word}")
print(f"Longest word length: {len(longest_word)}")

Longest word: destruction
Longest word length: 11


In [7]:
alphabet = list(string.ascii_lowercase)
print(alphabet)
print(len(alphabet))

['a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z']
26


In [27]:
input_size = (len(string.ascii_lowercase) + 1) * len(longest_word)
print(f"Input size = {input_size}")

hidden_layer_size = 20

autoencoder = tf.keras.models.Sequential(
    [
        tf.keras.layers.Input(shape=(input_size,)),
        tf.keras.layers.Dense(hidden_layer_size, activation="relu"),
        tf.keras.layers.Dense(input_size, activation="sigmoid")
    ]
)

autoencoder.compile(
    optimizer=tf.keras.optimizers.experimental.Adadelta(),
    loss="binary_crossentropy",
    metrics=['accuracy']
)

Input size = 297


In [17]:
x_train = []
y_train = []

x_test = []
y_test = []

for i in range(10):
    for x, y in training_generator:
        x_train.append(x)
        y_train.append(y)

for i in range(2):
    for x, y in validation_generator:
        x_test.append(x)
        y_test.append(y)

x_train = np.asarray(x_train)
y_train = np.asarray(y_train)
x_test = np.asarray(x_test)
y_test = np.asarray(y_test)

word: end
random_word: esd
word: page
random_word: jage
word: after
random_word: afteg
word: board
random_word: doard
word: trade
random_word: tride
word: for
random_word: fjr
word: seed
random_word: sesd
word: bell
random_word: beli
word: down
random_word: ddwn
word: private
random_word: privatj
word: name
random_word: ntme
word: that
random_word: thaf
word: day
random_word: lay
word: wood
random_word: woow
word: cord
random_word: corc
word: linen
random_word: linea
word: drink
random_word: drini
word: coal
random_word: ctal
word: family
random_word: famiuy
word: shelf
random_word: sheyf
word: any
random_word: lny
word: cause
random_word: calse
word: hate
random_word: late
word: fight
random_word: zight
word: hope
random_word: hopl
word: thumb
random_word: thuyb
word: sister
random_word: smster
word: snake
random_word: snaka
word: transport
random_word: lransport
word: map
random_word: uap
word: future
random_word: juture
word: net
random_word: nes
word: circle
random_word: circye
wor

In [21]:
print(len(x_test))
print(len(y_test))

200
200


In [28]:
log_dir = "logs/fit/" + datetime.datetime.now().strftime("%Y%m%d-%H%M%S")
tensorboard_callback = tf.keras.callbacks.TensorBoard(log_dir=log_dir, histogram_freq=1)

autoencoder.fit(
    x=x_train,
    y=y_train,
    epochs=20,
    shuffle=True,
    validation_data=(x_test, y_test),
    callbacks=[tensorboard_callback])

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


<keras.callbacks.History at 0x7f87bbf55ff0>

[0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0.]


ValueError: in user code:

    File "/home/pxy/dev/nlp-autocorrect/neuralnet-based/venv/lib/python3.10/site-packages/keras/engine/training.py", line 2137, in predict_function  *
        return step_function(self, iterator)
    File "/home/pxy/dev/nlp-autocorrect/neuralnet-based/venv/lib/python3.10/site-packages/keras/engine/training.py", line 2123, in step_function  **
        outputs = model.distribute_strategy.run(run_step, args=(data,))
    File "/home/pxy/dev/nlp-autocorrect/neuralnet-based/venv/lib/python3.10/site-packages/keras/engine/training.py", line 2111, in run_step  **
        outputs = model.predict_step(data)
    File "/home/pxy/dev/nlp-autocorrect/neuralnet-based/venv/lib/python3.10/site-packages/keras/engine/training.py", line 2079, in predict_step
        return self(x, training=False)
    File "/home/pxy/dev/nlp-autocorrect/neuralnet-based/venv/lib/python3.10/site-packages/keras/utils/traceback_utils.py", line 70, in error_handler
        raise e.with_traceback(filtered_tb) from None
    File "/home/pxy/dev/nlp-autocorrect/neuralnet-based/venv/lib/python3.10/site-packages/keras/engine/input_spec.py", line 250, in assert_input_compatibility
        raise ValueError(

    ValueError: Exception encountered when calling layer 'sequential_4' (type Sequential).
    
    Input 0 of layer "dense_4" is incompatible with the layer: expected min_ndim=2, found ndim=1. Full shape received: (None,)
    
    Call arguments received by layer 'sequential_4' (type Sequential):
      • inputs=tf.Tensor(shape=(None,), dtype=float32)
      • training=False
      • mask=None


In [None]:
# char_tokenizer = tf.keras.preprocessing.text.Tokenizer(
#     num_words=len(alphabet),
#     oov_token=" ",
#     char_level=True
# )
# char_tokenizer.fit_on_texts(alphabet)

# sequences = char_tokenizer.texts_to_sequences(alphabet)

# one_hot_chars = char_tokenizer.texts_to_matrix(alphabet, mode="binary")

# char_index = char_tokenizer.word_index
# print('Found %s unique tokens.' % len(char_index))
# print(char_index)

# print(char_tokenizer.texts_to_matrix(words[0], mode="binary"))

In [None]:
# one_hot_encoding_model = tf.keras.models.Sequential(
#     [
#         tf.keras.Input(shape=(1,), dtype=tf.string),
#         tf.keras.layers.TextVectorization(
#             output_mode="multi_hot",
#             vocabulary=alphabet
#         )
#     ]
# )
# one_hot_encoded = np.array([one_hot_encoding_model.predict(word).reshape(-1,) for word in padded_chars])


In [None]:
# one_hot_encoded = np.array([one_hot_encoding_model.predict(word).reshape(-1,) for word in padded_chars])
# print(len(one_hot_encoded[0]))
# print(words[0])
# print(one_hot_encoded[0])

In [None]:
# tf.keras.models.Sequential(
#             [
#                 tf.keras.Input(shape=(1,), dtype=tf.string),
#                 tf.keras.layers.TextVectorization(
#                     output_mode="multi_hot",
#                     vocabulary=self.alphabet)
#             ]
#         )