<a href="https://colab.research.google.com/github/AlexSbb/Text-Generation-with-RNN-and-GAN/blob/main/CS13ColabNotebooks/RNN_Autoencoder.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

Import all necessary libraries:

In [None]:
import numpy as np
import tensorflow as tf
import tensorflow_probability as tfp
import time
import math
import random
# Show the installed TensorFlow version so it can be compared with 2.4.1,
# the version named in the message below.
print('TensorFlow version (should be 2.4.1):  ',tf.__version__)

TensorFlow version (should be 2.4.1):   2.4.1


# Open, read and process the text file
In our case, we use the log file from an Elevator Group Control System simulation: "ElevatorLogFile.txt".

In [None]:
# Location of the log file that the rest of the notebook reads.
# NOTE(review): this looks like a path inside a mounted Google Drive in Colab —
# confirm the drive is mounted (or adjust the path) before running.
PATH_TO_FILE = "/content/drive/MyDrive/Colab Notebooks/CS13ColabNotebooks/ElevatorLogFile.txt"

def open_and_read_test_file(path: str, encoding: str = "utf-8"):
    """Read a whole text file into memory and print basic size statistics.

    Args:
        path: Location of the text file to read.
        encoding: Text encoding used to decode the file. It is passed
            explicitly so the result does not depend on the platform's
            default locale encoding.

    Returns:
        The complete file content as a single string.
    """
    with open(path, encoding=encoding) as text_file:
        text = text_file.read()
    print('Number of characters in the log file:', len(text))
    print('Number of words in the log file (with whitespace as a splitter):', len(text.split()))
    return text

# Load the complete log file into memory as one string.
file_text = open_and_read_test_file(PATH_TO_FILE)

Number of characters in the log file: 73037
Number of words in the log file (with whitespace as a splitter): 9010


Creating a dictionary from the text file.
We will use a character-level dictionary. To build it, we collect the unique characters in the text file and assign each one a unique index. As a result, we get two dictionaries: one for converting characters to numbers, and a second for the reverse conversion.

In [None]:
def dictionary_from_text(text:str):
  """Build character-level lookup tables for ``text``.

  The unique characters are sorted before indices are assigned. Iterating a
  plain ``set`` of strings can yield a different order in every Python
  process, which would change the character <-> index mapping (and therefore
  the meaning of every encoded sequence) between kernel restarts.

  Args:
    text: The text whose characters form the vocabulary.

  Returns:
    A tuple ``(char_to_index, index_to_char)`` of two dictionaries that are
    inverse to each other.
  """
  unique_chars = sorted(set(text))  # sorted -> reproducible index assignment
  print('Number of uniq characters in the log file:', len(unique_chars))
  char_to_index = {char: index for index, char in enumerate(unique_chars)}
  index_to_char = {index: char for char, index in char_to_index.items()}
  return char_to_index, index_to_char

# Build both lookup tables (character -> index and index -> character) from the log text.
char_to_index, index_to_char =  dictionary_from_text(file_text)

Number of uniq characters in the log file: 54


In order to make it convenient to use dictionaries, I will create two auxiliary functions, for converting text to numbers and for reverse conversion.

In [None]:
def text_to_numbers(dictionary: dict, text: str):
  """Translate every character of ``text`` into its value from ``dictionary``.

  Args:
    dictionary: Mapping from character to number.
    text: The text to encode.

  Returns:
    A list with one number per character, in the order of ``text``.

  Raises:
    KeyError: If ``text`` contains a character missing from ``dictionary``.
  """
  encoded = []
  for symbol in text:
    encoded.append(dictionary[symbol])
  return encoded

def numbers_to_text(dictionary: dict, numbers: list):
  """Translate a sequence of numbers back into text.

  Args:
    dictionary: Mapping from number to the string it stands for.
    numbers: Iterable of numbers to decode.

  Returns:
    The looked-up strings concatenated in the order of ``numbers``.

  Raises:
    KeyError: If a number is missing from ``dictionary``.
  """
  # str.join builds the result in one pass instead of re-allocating the
  # string on every ``+=``.
  return "".join(dictionary[num] for num in numbers)

Now we can easily convert our entire text file to numbers

In [None]:
# Encode the complete log text as a list of integer character indices.
whole_text_as_numbers = text_to_numbers(char_to_index, file_text)

# Sanity check: look at the first 64 encoded characters.
first_64_numbers = whole_text_as_numbers[0:64]
print("First 64 numberst from 'whole_text_as_numbers':")
print(first_64_numbers)
# Round-trip check: decoding those indices should reproduce the start of the file.
reverse_conversion = numbers_to_text(index_to_char, first_64_numbers)
print("Conversin from numbers to text:")
print(reverse_conversion)

First 64 numberst from 'whole_text_as_numbers':
[19, 36, 45, 12, 46, 47, 24, 7, 46, 39, 39, 41, 7, 31, 19, 36, 45, 12, 46, 47, 24, 7, 46, 39, 39, 41, 7, 45, 7, 41, 20, 6, 42, 51, 51, 51, 52, 48, 36, 45, 0, 20, 2, 2, 41, 47, 37, 41, 7, 31, 19, 36, 45, 30, 41, 47, 41, 7, 20, 24, 27, 47, 37, 45]
Conversin from numbers to text:
0: Controller_0: Controller ready...
1: Passenger_0: Generating 


In [None]:
def split_to_x_y(sequence: list, x_length: int = 128, step: int = 3):
    """Cut ``sequence`` into input windows and one-step-shifted target windows.

    A window of ``x_length`` items is taken every ``step`` positions. The
    matching target window has the same length but starts one position later,
    so ``x`` and ``y`` end up with the same shape.

    Args:
        sequence: The full sequence to slice.
        x_length: Number of items in every window.
        step: Distance between the starts of two consecutive windows.

    Returns:
        A tuple ``(x, y)`` of NumPy arrays holding the input windows and the
        target windows.
    """
    window_starts = range(0, len(sequence) - x_length, step)
    inputs = [np.array(sequence[start:start + x_length]) for start in window_starts]
    targets = [np.array(sequence[start + 1:start + 1 + x_length]) for start in window_starts]
    print("Number of sequences:", len(inputs))
    x = np.array(inputs)
    y = np.array(targets)
    print('x.shape=', x.shape)
    print('y.shape=', y.shape)
    return x, y

def take_last_element(original_y: np.array):
  """Keep only the final item of every sequence in ``original_y``.

  Args:
    original_y: Iterable of sequences (for example a 2-D array of windows).

  Returns:
    A NumPy array containing the last item of each sequence. Its shape is
    also printed.
  """
  last_items = np.array([sequence[-1] for sequence in original_y])
  print('y_1.shape=', last_items.shape)
  return last_items

# Build the input/target windows: 32 characters long, a new window every 3 characters.
x, y = split_to_x_y(whole_text_as_numbers, x_length = 32, step = 3)
# Disabled: would keep only the last target character of each window.
# y_1 = take_last_element(y)

Number of sequences: 24335
x.shape= (24335, 32)
y.shape= (24335, 32)


# One-hot encoding

In [None]:
# Vocabulary size: number of entries in the index -> character dictionary.
dictionary_lenth = len(index_to_char)
# Identity matrix: column i (equivalently row i) is the one-hot vector for index i.
one_hot_matrix = np.eye(dictionary_lenth)
print('one_hot_matrix.shape=', one_hot_matrix.shape)

def array_of_seq_to_one_hot(array_of_seq: np.ndarray, matrix: np.ndarray):
  """Replace every index in a batch of sequences by a column of ``matrix``.

  With an identity matrix this is one-hot encoding. The lookup is done with
  a single NumPy fancy-indexing operation instead of building one small
  array per character in nested Python loops.

  Args:
    array_of_seq: Integer indices, e.g. shape ``(num_sequences, seq_length)``.
    matrix: 2-D array whose column ``i`` is the vector for index ``i``.

  Returns:
    Array of shape ``array_of_seq.shape + (matrix.shape[0],)``.
  """
  indices = np.asarray(array_of_seq)
  if indices.size == 0:
    # np.asarray([]) defaults to float64, which is not a valid index dtype.
    indices = indices.astype(np.intp)
  # Rows of the transpose are the columns of ``matrix``, so this selects
  # matrix[:, i] for every index i while keeping the index array's shape.
  return matrix.T[indices]

def array_of_int_to_one_hot(array_of_int: np.ndarray, matrix: np.ndarray):
  """Replace every index in a flat sequence by a column of ``matrix``.

  With an identity matrix this is one-hot encoding. The lookup is a single
  NumPy fancy-indexing operation rather than a Python loop wrapped in two
  nested ``np.array`` calls.

  Args:
    array_of_int: Integer indices, e.g. shape ``(seq_length,)``.
    matrix: 2-D array whose column ``i`` is the vector for index ``i``.

  Returns:
    Array of shape ``array_of_int.shape + (matrix.shape[0],)``.
  """
  indices = np.asarray(array_of_int)
  if indices.size == 0:
    # np.asarray([]) defaults to float64, which is not a valid index dtype.
    indices = indices.astype(np.intp)
  # Rows of the transpose are the columns of ``matrix``.
  return matrix.T[indices]

# One-hot encode the input windows and the shifted target windows.
one_hot_x = array_of_seq_to_one_hot(x,one_hot_matrix)
one_hot_y = array_of_seq_to_one_hot(y,one_hot_matrix)

# Disabled: one-hot encoding of the last-character targets (see the disabled y_1 above).
# one_hot_y_1 = array_of_int_to_one_hot(y_1,one_hot_matrix)

print('one_hot_x.shape:', one_hot_x.shape, "# (batch_size, sequence_length, vocab_size)")
print('one_hot_y.shape:', one_hot_y.shape)
# print('one_hot_y_1.shape:', one_hot_y_1.shape)

one_hot_matrix.shape= (54, 54)
one_hot_x.shape: (24335, 32, 54) # (batch_size, sequence_length, vocab_size)
one_hot_y.shape: (24335, 32, 54)


# Autoencoder

In [None]:
class Autoencoder(tf.keras.Model):
  """LSTM autoencoder operating on sequences of one-hot encoded characters.

  The encoder narrows the per-time-step feature size 64 -> 16 -> 4 (with
  dropout between the LSTM layers); the decoder widens it again
  4 -> 16 -> 64 and ends in a softmax layer with ``dictionary_lenth`` units.
  Every LSTM uses ``return_sequences=True``, so the time axis is preserved
  from input to output.

  Note: the output width is read from the module-level ``dictionary_lenth``,
  which must be defined before the class is instantiated.
  """

  def __init__(self):
    super().__init__()
    # Encoder: compresses each time step down to 4 features.
    self.encoder = tf.keras.Sequential([
        tf.keras.layers.LSTM(64, return_sequences=True),
        tf.keras.layers.Dropout(0.2),
        tf.keras.layers.LSTM(16, return_sequences=True),
        tf.keras.layers.Dropout(0.1),
        tf.keras.layers.LSTM(4, return_sequences=True),
    ])
    # Decoder: expands the 4 features back to a distribution over characters.
    self.decoder = tf.keras.Sequential([
        tf.keras.layers.LSTM(4, return_sequences=True),
        tf.keras.layers.LSTM(16, return_sequences=True),
        tf.keras.layers.LSTM(64, return_sequences=True),
        tf.keras.layers.Dense(dictionary_lenth, activation="softmax"),
    ])

  def call(self, x, training=None):
    """Encode ``x`` and decode it again.

    Args:
      x: Batch of input sequences fed to the encoder.
      training: Keras training flag. It is forwarded explicitly to both
        sub-models so the encoder's Dropout layers follow the caller's
        mode; ``None`` lets Keras decide from the calling context.

    Returns:
      The decoder output for the encoded input.
    """
    encoded = self.encoder(x, training=training)
    decoded = self.decoder(encoded, training=training)
    return decoded

# Create the model instance that the following cells compile, train and query.
autoencoder = Autoencoder()

In [None]:
# Train with the Adam optimizer and categorical cross-entropy.
# The target passed as `y` is the input itself (one_hot_x), so the network is
# trained to reproduce its input; the shifted one_hot_y is not used here.
autoencoder.compile(optimizer='adam', loss=tf.losses.categorical_crossentropy)
autoencoder.fit(x = one_hot_x,y = one_hot_x, epochs=10)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<tensorflow.python.keras.callbacks.History at 0x7f3ac01a4d90>

Encode a text to a latent vector 'z'

In [None]:
# Use the first 32 characters of the log as a sample (32 is the window length used for the training data).
sample_text = file_text[0:32]
converted_sample_text = text_to_numbers(char_to_index, sample_text)
# Wrapping the encoded sample in a list adds a leading batch axis of size 1.
one_hot_sample_text = np.array([array_of_int_to_one_hot(converted_sample_text, one_hot_matrix)])
encoder_input = one_hot_sample_text
print("encoder_input shape", encoder_input.shape)

# Run only the encoder half of the model to obtain the compressed representation.
encoder_output = autoencoder.encoder(one_hot_sample_text)
print("encoder output shape", encoder_output.shape)

encoder_input shape (1, 32, 54)
encoder output shape (1, 32, 4)


Decode text from the latent vector 'z'

In [None]:
# Run only the decoder half of the model on the encoder output.
decoder_output = autoencoder.decoder(encoder_output)
print('Decoder output shape', decoder_output.shape)

Decoder output shape (1, 32, 54)


In [None]:
# Remove only the batch axis. Squeezing without `axis` would also drop any
# other dimension that happens to have size 1 (e.g. a one-character sequence).
# NOTE(review): assumes decoder_output holds exactly one sequence (batch size 1).
pred_array = tf.squeeze(decoder_output, axis=0).numpy()
# Greedy decoding: pick the most probable character index at every time step.
predicted_indices = np.argmax(pred_array, axis=-1)
temp_texp = numbers_to_text(index_to_char, predicted_indices)
print(temp_texp)

0: Controller_0: Controller read
