In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


##Character Level Text Generation with an LSTM Model

A Language Model can be trained to generate text character-by-character. In this scenario, each of the input and output tokens is a character. Moreover, Language Model outputs a conditional probability distribution over character set.

##TensorFlow Pipeline

In [4]:
import tensorflow as tf
import tensorflow.keras.backend as K
import numpy as np
from tensorflow.keras import layers
from tensorflow.keras import layers, Model
import os
from tensorflow.keras.layers.experimental.preprocessing import TextVectorization
import string
import re

In [5]:
def save_dataset(dataset,fileName):
  path = os.path.join('./tfDatasets/', fileName)
  tf.data.experimental.save(dataset, path)

def load_dataset(fileName):
  path = os.path.join("./tfDatasets/", fileName)
  new_dataset = tf.data.experimental.load(path,
      tf.TensorSpec(shape=(), dtype=tf.string))
  return new_dataset

In [6]:
batch_size = 64
raw_data_ds = tf.data.TextLineDataset(["/content/drive/My Drive/Colab Notebooks/NLP/republic_clean.txt"])

In [7]:
!pwd

/content


In [8]:
## converted into text
text=""
for elem in raw_data_ds:
   text=text+(elem.numpy().decode('utf-8'))

In [9]:

chars = sorted(list(set(text)))
print("Total disctinct chars:", len(chars))

Total disctinct chars: 79


In [10]:
# cutting the text in semi-redundant sequences of maxlen characters
maxlen = 20
step = 2
input_chars = []
next_char = []

In [11]:
for i in range(0, len(text) - maxlen, step):
    input_chars.append(text[i : i + maxlen])
    next_char.append(text[i + maxlen])

In [12]:
print("Number of sequences:", len(input_chars))
print("input X  (input_chars)  --->   output y (next_char) ")

for i in range(5):
  print( input_chars[i],"   --->  ", next_char[i])



Number of sequences: 585326
input X  (input_chars)  --->   output y (next_char) 
ï»¿INTRODUCTION AND AN    --->   A
NTRODUCTION AND ANAL    --->   Y
RODUCTION AND ANALYS    --->   I
DUCTION AND ANALYSIS    --->   .
CTION AND ANALYSIS.T    --->   h


In [13]:
X_train_ds_raw=tf.data.Dataset.from_tensor_slices(input_chars)
y_train_ds_raw=tf.data.Dataset.from_tensor_slices(next_char)

In [14]:
print(X_train_ds_raw)

<TensorSliceDataset element_spec=TensorSpec(shape=(), dtype=tf.string, name=None)>


In [15]:
def custom_standardization(input_data):
    lowercase     = tf.strings.lower(input_data)
    stripped_html = tf.strings.regex_replace(lowercase, "<br />", " ")
    stripped_num  = tf.strings.regex_replace(stripped_html, "[\d-]", " ")
    stripped_punc  =tf.strings.regex_replace(stripped_num, 
                             "[%s]" % re.escape(string.punctuation), "")    
    return stripped_punc

def char_split(input_data):
  return tf.strings.unicode_split(input_data, 'UTF-8')

def word_split(input_data):
  return tf.strings.split(input_data)

In [16]:
# Model constants.
max_features = 96           # Number of distinct chars / words  
embedding_dim = 16             # Embedding layer output dimension
sequence_length = maxlen       # Input sequence size

In [17]:
vectorize_layer = TextVectorization(
    standardize=custom_standardization,
    max_tokens=max_features,
    split=char_split, # word_split or char_split
    output_mode="int",
    output_sequence_length=sequence_length,
)

In [18]:
vectorize_layer.adapt(X_train_ds_raw.batch(batch_size))

In [19]:
print("The size of the vocabulary (number of distinct characters): ", len(vectorize_layer.get_vocabulary()))

The size of the vocabulary (number of distinct characters):  30


In [20]:
def vectorize_text(text):
  text = tf.expand_dims(text, -1)
  return tf.squeeze(vectorize_layer(text))

In [21]:
# Vectorize the data.
X_train_ds = X_train_ds_raw.map(vectorize_text)
y_train_ds = y_train_ds_raw.map(vectorize_text)

X_train_ds.element_spec, y_train_ds.element_spec

(TensorSpec(shape=(20,), dtype=tf.int64, name=None),
 TensorSpec(shape=(20,), dtype=tf.int64, name=None))

In [22]:
y_train_ds=y_train_ds.map(lambda x: x[0])

In [23]:
train_ds =  tf.data.Dataset.zip((X_train_ds,y_train_ds))

In [24]:
AUTOTUNE = tf.data.AUTOTUNE
train_ds = train_ds.shuffle(buffer_size=512).batch(batch_size, drop_remainder=True).cache().prefetch(buffer_size=AUTOTUNE)

##Result of the Data Pipeline:

In [25]:
for sample in train_ds.take(1):
  print("input (X) dimension: ", sample[0].numpy().shape, "\noutput (y) dimension: ",sample[1].numpy().shape)

input (X) dimension:  (64, 20) 
output (y) dimension:  (64,)


In [26]:
for sample in train_ds.take(1):
  print("input (sequence of chars): ", sample[0][0].numpy(), "\noutput (next char to complete the input): ",sample[1][0].numpy())

input (sequence of chars):  [ 9  3  2 12  5 18 10  2  5 10  2 18  6 11 24 10  2  6 17  0] 
output (next char to complete the input):  2


In [27]:
for sample in train_ds.take(2):
  print("input (sequence of chars): ", decode_sequence (sample[0][0].numpy()), "\noutput (next char to complete the input): ",vectorize_layer.get_vocabulary()[sample[1][0].numpy()])

NameError: ignored

##PREPARE SAMPLING METHODS

In Text Generation, **sampling** means randomly **picking** the next token according to the generated **conditional probability distribution**.

That is, after generating the conditional  probability distribution over the set of tokens (*vocabulary*) for the given input sequence, we need to  carefully decide how to **select the next token** (***sample***) from this distribution. 



There are **several methods for sampling** in text generation (see [here](https://towardsdatascience.com/how-to-sample-from-language-models-682bceb97277) and [here](https://huggingface.co/blog/how-to-generate)):


* **Greedy Search (Maximization)** 


* **Temperature Sampling**

* **Top-K Sampling**
* **Top-P Sampling (Nucleus sampling)**

* **Beam Search**




In [28]:
def softmax(z):
   return np.exp(z)/sum(np.exp(z))

In [29]:
def greedy_search(conditional_probability):
  return (np.argmax(conditional_probability))

In [None]:
def top_k_sampling(conditional_probability, k):
  top_k_probabilities, top_k_indices= tf.math.top_k(conditional_probability, k=k, sorted=True)
  top_k_probabilities= np.asarray(top_k_probabilities).astype("float32")
  top_k_probabilities= np.squeeze(top_k_probabilities)
  top_k_indices = np.asarray(top_k_indices).astype("int32")
  top_k_redistributed_probability=softmax(top_k_probabilities)
  top_k_redistributed_probability = np.asarray(top_k_redistributed_probability).astype("float32")
  sampled_token = np.random.choice(np.squeeze(top_k_indices), p=top_k_redistributed_probability)
  return sampled_token

##A LSTM-BASED LANGUAGE MODEL FOR TEXT GENERATION

 the length of the input (X) sequence (sequence_length) is 20 tokens (chars).

Adding a layer to map those vocab indices into a space of dimensionality 'embedding_dim'.

After applying Dropout, we use an LSTM layer to process the sequence and learn to generate the next token by the help of a Dense layer.

In [30]:
inputs = tf.keras.Input(shape=(sequence_length), dtype="int64")
x = layers.Embedding(max_features, embedding_dim)(inputs)
x = layers.Dropout(0.5)(x)
x = layers.LSTM(128, return_sequences=True)(x)
x = layers.Flatten()(x)
predictions=  layers.Dense(max_features, activation='softmax')(x)
model_LSTM = tf.keras.Model(inputs, predictions,name="model_LSTM")

Compile the model

In [31]:
model_LSTM.compile(loss='sparse_categorical_crossentropy', 
                   optimizer='adam', metrics=['accuracy'])
print(model_LSTM.summary())

Model: "model_LSTM"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_1 (InputLayer)        [(None, 20)]              0         
                                                                 
 embedding (Embedding)       (None, 20, 16)            1536      
                                                                 
 dropout (Dropout)           (None, 20, 16)            0         
                                                                 
 lstm (LSTM)                 (None, 20, 128)           74240     
                                                                 
 flatten (Flatten)           (None, 2560)              0         
                                                                 
 dense (Dense)               (None, 96)                245856    
                                                                 
Total params: 321,632
Trainable params: 321,632
Non-trai

Train the model

In [None]:
model_LSTM.fit(train_ds, epochs=10)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x7fe472487150>

Below function to convert the given token index to the corresponding character for each token in the generated text

In [32]:
def decode_sequence (encoded_sequence):
  deceoded_sequence=[]
  for token in encoded_sequence:
    deceoded_sequence.append(vectorize_layer.get_vocabulary()[token])
  sequence= ''.join(deceoded_sequence)
  print("\t",sequence)
  return sequence


To generate text with various sampling methods, we prepared the following function. The generate_text(model, prompt, step) function takes the trained Language Model, the prompt, and the length of the text to be generated as the parameters. Then, it generates text with two different sampling methods.

In [33]:
def generate_text(model, seed_original, step):
    seed= vectorize_text(seed_original)
    print("The prompt is")
    decode_sequence(seed.numpy().squeeze())
    

    seed= vectorize_text(seed_original).numpy().reshape(1,-1)
    #Text Generated by Greedy Search Sampling
    generated_greedy_search = (seed)
    for i in range(step):
      predictions=model.predict(seed)
      next_index= greedy_search(predictions.squeeze())
      generated_greedy_search = np.append(generated_greedy_search, next_index)
      seed= generated_greedy_search[-sequence_length:].reshape(1,sequence_length)
    print("Text Generated by Greedy Search Sampling:")
    decode_sequence(generated_greedy_search)

    #Text Generated by Top-K Sampling
    print("Text Generated by Top-K Sampling:")
    for k in [2, 3, 4, 5]:
        print("\tTop-k: ", k)
        seed= vectorize_text(seed_original).numpy().reshape(1,-1)
        generated_top_k = (seed)
        for i in range(step):
            predictions=model.predict(seed)
            next_index = top_k_sampling(predictions.squeeze(), k)
            generated_top_k = np.append(generated_top_k, next_index)
            seed= generated_top_k[-sequence_length:].reshape(1,sequence_length)
        decode_sequence(generated_top_k)




We can call the generate_text() function by providing the trained LM, a prompt and the sequence length of the text to be generated as below.

You can run this method for multiple times to observe the generated text with different sampling methods.

In [34]:
generate_text(model_LSTM,"The Republic of Plato", 20)

The prompt is
	 the republic of plat
Text Generated by Greedy Search Sampling:


IndexError: ignored