# GML - NLP
## NLG

In [None]:
import tensorflow as tf
import tensorflow_datasets as tfds

from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.layers import Embedding, LSTM, Dense, Bidirectional
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.models import Sequential
from tensorflow.keras.optimizers import Adam
import numpy as np 

## Preparing Data: Tiny Shakespeare
**Description:**
40,000 lines of Shakespeare from a variety of Shakespeare's plays.
From: [Tensorflow Datasets - Tiny Shakespeare](https://www.tensorflow.org/datasets/catalog/tiny_shakespeare)

In [None]:
# preparing data
!wget --no-check-certificate \
  https://storage.googleapis.com/download.tensorflow.org/data/shakespeare.txt \
  -O /tmp/tiny_shakespeare.txt

--2022-07-01 10:49:49--  https://storage.googleapis.com/download.tensorflow.org/data/shakespeare.txt
Resolving storage.googleapis.com (storage.googleapis.com)... 173.194.214.128, 173.194.215.128, 173.194.217.128, ...
Connecting to storage.googleapis.com (storage.googleapis.com)|173.194.214.128|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 1115394 (1.1M) [text/plain]
Saving to: ‘/tmp/tiny_shakespeare.txt’


2022-07-01 10:49:49 (127 MB/s) - ‘/tmp/tiny_shakespeare.txt’ saved [1115394/1115394]



In [None]:
# Tokenizer instance shared by the later cells (it is fitted on the corpus further down)
tokenizer = Tokenizer()

# Read the downloaded file inside a context manager so the handle is always
# closed, and state the encoding explicitly instead of relying on the
# platform default.
with open('/tmp/tiny_shakespeare.txt', encoding='utf-8') as corpus_file:
    data = corpus_file.read()

# Lower-case the text and split the paragraphs (or blocks) of text into lines (sentences)
dataList = data.lower().split('\n')

dataList[:25]


['first citizen:',
 'before we proceed any further, hear me speak.',
 '',
 'all:',
 'speak, speak.',
 '',
 'first citizen:',
 'you are all resolved rather to die than to famish?',
 '',
 'all:',
 'resolved. resolved.',
 '',
 'first citizen:',
 'first, you know caius marcius is chief enemy to the people.',
 '',
 'all:',
 "we know't, we know't.",
 '',
 'first citizen:',
 "let us kill him, and we'll have corn at our own price.",
 "is't a verdict?",
 '',
 'all:',
 "no more talking on't; let it be done: away, away!",
 '']

In [None]:
# Drop empty entries and every entry that contains a colon. This removes the
# "speaker:" lines; note that it also removes any dialogue line that happens
# to contain ':'.
corpus = [entry for entry in dataList if entry != '' and ':' not in entry]

print("Total number of lines: ", len(corpus))

# Keep only the first 1500 lines for the rest of the notebook
corpus = corpus[:1500]
corpus[:25]

Total number of lines:  22500


['before we proceed any further, hear me speak.',
 'speak, speak.',
 'you are all resolved rather to die than to famish?',
 'resolved. resolved.',
 'first, you know caius marcius is chief enemy to the people.',
 "we know't, we know't.",
 "let us kill him, and we'll have corn at our own price.",
 "is't a verdict?",
 'one word, good citizens.',
 'we are accounted poor citizens, the patricians good.',
 'would yield us but the superfluity, while it were',
 'wholesome, we might guess they relieved us humanely;',
 'afflicts us, the object of our misery, is as an',
 'inventory to particularise their abundance; our',
 'sufferance is a gain to them let us revenge this with',
 'speak this in hunger for bread, not in thirst for revenge.',
 'would you proceed especially against caius marcius?',
 'consider you what services he has done for his country?',
 'very well; and could be content to give him good',
 'report fort, but that he pays himself with being proud.',
 'nay, but speak not maliciously.

## Preparing The Model Input
For each line in the corpus, we want to generate training data in the form of a stream, one word at a time.
This helps the model learn which word is likely to appear after a given word or sequence of words.

E.g., given the token sequence
>[5, 6, 7, 2, 8, 9, 10]

The resulting training data becomes
>[5,6]\
[5,6,7]\
[5, 6, 7, 2]\
[5, 6, 7, 2, 8]\
[5, 6, 7, 2, 8, 9]\
[5, 6, 7, 2, 8, 9, 10]

In each training sequence, the first (n-1) tokens are used as the input and the nth token is used as the output (label) to train the system.

This tells the system that if the input was "in" (5), the next word (output) is "the" (6), and if the input was "in the", the next word is "town" (7). This goes on until the whole sentence is learned.

In [None]:
# Build the word -> index vocabulary from the corpus lines
tokenizer.fit_on_texts(corpus)

# One more than the number of distinct words: the extra slot leaves room for
# index 0, which the padded sequences use as filler.
total_words = 1 + len(tokenizer.word_index)

total_words

2316

In [None]:
# For every line, collect each prefix of its token sequence that is at least
# two tokens long: [t0, t1], [t0, t1, t2], ... up to the whole line.
inputSequences = []

for line in corpus:
  token_list = tokenizer.texts_to_sequences([line])[0]
  inputSequences.extend(token_list[:end] for end in range(2, len(token_list) + 1))

inputSequences[:25]

[[121, 17],
 [121, 17, 313],
 [121, 17, 313, 130],
 [121, 17, 313, 130, 200],
 [121, 17, 313, 130, 200, 90],
 [121, 17, 313, 130, 200, 90, 31],
 [121, 17, 313, 130, 200, 90, 31, 91],
 [91, 91],
 [3, 28],
 [3, 28, 35],
 [3, 28, 35, 314],
 [3, 28, 35, 314, 109],
 [3, 28, 35, 314, 109, 2],
 [3, 28, 35, 314, 109, 2, 258],
 [3, 28, 35, 314, 109, 2, 258, 36],
 [3, 28, 35, 314, 109, 2, 258, 36, 2],
 [3, 28, 35, 314, 109, 2, 258, 36, 2, 871],
 [314, 314],
 [153, 3],
 [153, 3, 80],
 [153, 3, 80, 140],
 [153, 3, 80, 140, 40],
 [153, 3, 80, 140, 40, 22],
 [153, 3, 80, 140, 40, 22, 872],
 [153, 3, 80, 140, 40, 22, 872, 217]]

In [None]:
# Length of the longest prefix sequence; every sequence is padded up to it
max_sequence_len = max(map(len, inputSequences))

# Left-pad with zeros so that the last column always holds the final token
inputSequences = np.array(
    pad_sequences(inputSequences, padding="pre", maxlen=max_sequence_len)
)
inputSequences[:15, 4:]

array([[  0,   0,   0,   0,   0,   0,   0, 121,  17],
       [  0,   0,   0,   0,   0,   0, 121,  17, 313],
       [  0,   0,   0,   0,   0, 121,  17, 313, 130],
       [  0,   0,   0,   0, 121,  17, 313, 130, 200],
       [  0,   0,   0, 121,  17, 313, 130, 200,  90],
       [  0,   0, 121,  17, 313, 130, 200,  90,  31],
       [  0, 121,  17, 313, 130, 200,  90,  31,  91],
       [  0,   0,   0,   0,   0,   0,   0,  91,  91],
       [  0,   0,   0,   0,   0,   0,   0,   3,  28],
       [  0,   0,   0,   0,   0,   0,   3,  28,  35],
       [  0,   0,   0,   0,   0,   3,  28,  35, 314],
       [  0,   0,   0,   0,   3,  28,  35, 314, 109],
       [  0,   0,   0,   3,  28,  35, 314, 109,   2],
       [  0,   0,   3,  28,  35, 314, 109,   2, 258],
       [  0,   3,  28,  35, 314, 109,   2, 258,  36]], dtype=int32)

In [None]:
# Features: all rows, every column except the last.
# Labels: all rows, last column only (the token to predict).
x, labels = inputSequences[:, :-1], inputSequences[:, -1]

# One-hot encode the label indices over the vocabulary
y = tf.keras.utils.to_categorical(labels, num_classes=total_words)

print("Input dataset size: ", x.shape)   # number of training sequences by number of input positions
print("Output dataset size: ", y.shape)  # number of training sequences by number of words

Input dataset size:  (8978, 12)
Output dataset size:  (8978, 2316)


In [None]:
# Show the first training example next to the sentence it was derived from
first_example = (
    ("Sentence: ", corpus[0]),
    ("Sequence: ", tokenizer.texts_to_sequences([corpus[0]])),
    ("X: ", x[0]),
    ("Label: ", labels[0]),
    ("Y: ", y[0]),
)
for caption, value in first_example:
    print(caption, value)

Sentence:  before we proceed any further, hear me speak.
Sequence:  [[121, 17, 313, 130, 200, 90, 31, 91]]
X:  [  0   0   0   0   0   0   0   0   0   0   0 121]
Label:  17
Y:  [0. 0. 0. ... 0. 0. 0.]


In [None]:
# Next-word prediction model:
#   Embedding (100-dimensional vector per token)
#   -> bidirectional LSTM (150 units per direction)
#   -> softmax over the whole vocabulary.
model = Sequential()
# The input is one token shorter than the padded sequences because the last token is the label
model.add(Embedding(total_words, 100, input_length=max_sequence_len-1))
model.add(Bidirectional(LSTM(150)))
model.add(Dense(total_words, activation='softmax'))
adam = Adam()
model.compile(loss='categorical_crossentropy', optimizer=adam, metrics=['accuracy'])
history = model.fit(x, y, epochs=60, verbose=1)

# summary() prints the table itself and returns None, so it must not be
# wrapped in print() (that would emit a stray "None" line after the table).
model.summary()

Epoch 1/60
Epoch 2/60
Epoch 3/60
Epoch 4/60
Epoch 5/60
Epoch 6/60
Epoch 7/60
Epoch 8/60
Epoch 9/60
Epoch 10/60
Epoch 11/60
Epoch 12/60
Epoch 13/60
Epoch 14/60
Epoch 15/60
Epoch 16/60
Epoch 17/60
Epoch 18/60
Epoch 19/60
Epoch 20/60
Epoch 21/60
Epoch 22/60
Epoch 23/60
Epoch 24/60
Epoch 25/60
Epoch 26/60
Epoch 27/60
Epoch 28/60
Epoch 29/60
Epoch 30/60
Epoch 31/60
Epoch 32/60
Epoch 33/60
Epoch 34/60
Epoch 35/60
Epoch 36/60
Epoch 37/60
Epoch 38/60
Epoch 39/60
Epoch 40/60
Epoch 41/60
Epoch 42/60
Epoch 43/60
Epoch 44/60
Epoch 45/60
Epoch 46/60
Epoch 47/60
Epoch 48/60
Epoch 49/60
Epoch 50/60
Epoch 51/60
Epoch 52/60
Epoch 53/60
Epoch 54/60
Epoch 55/60
Epoch 56/60
Epoch 57/60
Epoch 58/60
Epoch 59/60
Epoch 60/60
Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 12, 100)           231600    
                                                                 
 bidi

In [None]:
seed_text = "shall I compare"
text_size = len(seed_text.split(" "))  # number of words on the current output line
line_length = 8   # maximum number of words per output line
next_words = 60   # number of words to generate

for _ in range(next_words):
    # Encode everything generated so far and left-pad it to the model's input length
    token_list = tokenizer.texts_to_sequences([seed_text])[0]
    token_list = pad_sequences([token_list], maxlen=max_sequence_len-1, padding='pre')

    # Take the index of the highest-scoring word along the last axis. The batch
    # holds a single row, so extract it as a plain int. verbose=0 keeps
    # predict() from printing a progress bar on every iteration.
    predicted = int(np.argmax(model.predict(token_list, verbose=0), axis=-1)[0])

    # Direct index -> word lookup instead of scanning word_index on every step;
    # an index with no associated word yields an empty string.
    output_word = tokenizer.index_word.get(predicted, "")

    if text_size < line_length:
        seed_text += " " + output_word
        text_size += 1
    else:
        # Current line is full: start a new one with the predicted word
        seed_text += "\n" + output_word
        text_size = 1
print(seed_text)

shall I compare had children's voices neither they
did together to do it tune suppliants cup
controversy thus he shall to their blood wherein
he show their love of his bed where
on you curse them that say not as
a little of a necessary parts being proud
parts when he drop will not that have
set me his voices host whereof to
