# GML - NLP
## NLG

In [None]:
import tensorflow as tf

from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.layers import Embedding, LSTM, Dense, Bidirectional
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.models import Sequential
from tensorflow.keras.optimizers import Adam
import numpy as np 

In [None]:
# Toy training text: four lyric lines separated by '\n'.
data = "In the town of Athy one Lanigan\nBattered away til he hadnt a pound.\nHis father died and made him a man again\nLeft him a farm and ten acres of ground."

# Lower-case the whole text, then cut it at the newlines so that each lyric
# line becomes one sentence of the corpus.
corpus = data.lower().split('\n')

# Word-level tokenizer with default settings; its vocabulary is fitted on
# `corpus` in a later cell.
tokenizer = Tokenizer()

corpus


['in the town of athy one lanigan',
 'battered away til he hadnt a pound.',
 'his father died and made him a man again',
 'left him a farm and ten acres of ground.']

For each line in the corpus, we want to generate training data in the form of a stream, one word at a time.
This helps the model learn which word is likely to appear after a given word or sequence of words.

Eg. Given the token sequence
>[5, 6, 7, 2, 8, 9, 10]

The resulting training data becomes
>[5,6]\
[5,6,7]\
[5, 6, 7, 2]\
[5, 6, 7, 2, 8]\
[5, 6, 7, 2, 8, 9]\
[5, 6, 7, 2, 8, 9, 10]

The first (n-1) tokens are used as input, then the nth token as output to train the system.

This tells the system that if the input was "in" (5), the next word (output) is "the" (6), and if the input was "in the", the next word is "town". This goes on until the whole sentence is learned.

In [None]:
# Build the word -> integer-id vocabulary from every sentence in the corpus.
tokenizer.fit_on_texts(corpus)

# Word ids start at 1; id 0 is never assigned to a word and is used as the
# padding value later on, so the number of classes is vocabulary size + 1.
total_words = 1 + len(tokenizer.word_index)

In [None]:
inputSequences = []

for sentence in corpus:
  # texts_to_sequences takes a list of sentences and returns one list of
  # token ids per sentence; a single sentence is passed, so keep element 0.
  sentence_tokens = tokenizer.texts_to_sequences([sentence])[0]
  # Emit every prefix of the sentence that holds at least two tokens:
  # the last token of each prefix is the label, the rest is the input.
  for prefix_len in range(2, len(sentence_tokens) + 1):
    inputSequences.append(sentence_tokens[:prefix_len])

inputSequences

[[5, 6],
 [5, 6, 7],
 [5, 6, 7, 2],
 [5, 6, 7, 2, 8],
 [5, 6, 7, 2, 8, 9],
 [5, 6, 7, 2, 8, 9, 10],
 [11, 12],
 [11, 12, 13],
 [11, 12, 13, 14],
 [11, 12, 13, 14, 15],
 [11, 12, 13, 14, 15, 1],
 [11, 12, 13, 14, 15, 1, 16],
 [17, 18],
 [17, 18, 19],
 [17, 18, 19, 3],
 [17, 18, 19, 3, 20],
 [17, 18, 19, 3, 20, 4],
 [17, 18, 19, 3, 20, 4, 1],
 [17, 18, 19, 3, 20, 4, 1, 21],
 [17, 18, 19, 3, 20, 4, 1, 21, 22],
 [23, 4],
 [23, 4, 1],
 [23, 4, 1, 24],
 [23, 4, 1, 24, 3],
 [23, 4, 1, 24, 3, 25],
 [23, 4, 1, 24, 3, 25, 26],
 [23, 4, 1, 24, 3, 25, 26, 2],
 [23, 4, 1, 24, 3, 25, 26, 2, 27]]

In [None]:
# The longest n-gram sequence determines the common row width.
max_sequence_len = max(map(len, inputSequences))

# Left-pad ("pre") every sequence with zeros up to that width, which keeps
# the label token in the last column of every row.
inputSequences = np.array(
    pad_sequences(
        inputSequences,
        maxlen=max_sequence_len,
        padding="pre",
    )
)
inputSequences

array([[ 0,  0,  0,  0,  0,  0,  0,  5,  6],
       [ 0,  0,  0,  0,  0,  0,  5,  6,  7],
       [ 0,  0,  0,  0,  0,  5,  6,  7,  2],
       [ 0,  0,  0,  0,  5,  6,  7,  2,  8],
       [ 0,  0,  0,  5,  6,  7,  2,  8,  9],
       [ 0,  0,  5,  6,  7,  2,  8,  9, 10],
       [ 0,  0,  0,  0,  0,  0,  0, 11, 12],
       [ 0,  0,  0,  0,  0,  0, 11, 12, 13],
       [ 0,  0,  0,  0,  0, 11, 12, 13, 14],
       [ 0,  0,  0,  0, 11, 12, 13, 14, 15],
       [ 0,  0,  0, 11, 12, 13, 14, 15,  1],
       [ 0,  0, 11, 12, 13, 14, 15,  1, 16],
       [ 0,  0,  0,  0,  0,  0,  0, 17, 18],
       [ 0,  0,  0,  0,  0,  0, 17, 18, 19],
       [ 0,  0,  0,  0,  0, 17, 18, 19,  3],
       [ 0,  0,  0,  0, 17, 18, 19,  3, 20],
       [ 0,  0,  0, 17, 18, 19,  3, 20,  4],
       [ 0,  0, 17, 18, 19,  3, 20,  4,  1],
       [ 0, 17, 18, 19,  3, 20,  4,  1, 21],
       [17, 18, 19,  3, 20,  4,  1, 21, 22],
       [ 0,  0,  0,  0,  0,  0,  0, 23,  4],
       [ 0,  0,  0,  0,  0,  0, 23,  4,  1],
       [ 0

In [None]:
# Split every padded row into model input and target:
#   x      -> all rows, every column except the last one
#   labels -> all rows, last column only (the id of the word to predict)
x, labels = inputSequences[:, :-1], inputSequences[:, -1]

# One-hot encode the target ids, one class per vocabulary id (including 0).
y = tf.keras.utils.to_categorical(labels, num_classes=total_words)

In [None]:
# Sanity check: show the first training example next to the sentence it
# was derived from (caption, value pairs are printed in order).
for caption, value in (
    ("Sentence: ", corpus[0]),
    ("Sequence: ", tokenizer.texts_to_sequences([corpus[0]])),
    ("X: ", x[0]),
    ("Label: ", labels[0]),
    ("Y: ", y[0]),
):
    print(caption, value)

Sentence:  in the town of athy one lanigan
Sequence:  [[5, 6, 7, 2, 8, 9, 10]]
X:  [0 0 0 0 0 0 0 5]
Label:  6
Y:  [0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0.]


In [None]:
# Next-word predictor: embedding -> bidirectional LSTM -> softmax over the
# vocabulary (one output unit per token id, including the padding id 0).
model = Sequential()
# Each input row holds max_sequence_len-1 token ids (the label column was
# split off), and every id is mapped to a 100-dimensional vector.
model.add(Embedding(total_words, 100, input_length=max_sequence_len-1))
model.add(Bidirectional(LSTM(150)))
model.add(Dense(total_words, activation='softmax'))

# The optimizer is created here so the cell runs on a fresh kernel; the
# original referenced an undefined name `adam`, which raises NameError.
# NOTE(review): 0.01 is a chosen learning rate (higher than the Keras
# default) for this tiny corpus and short training run - tune as needed.
adam = Adam(learning_rate=0.01)
model.compile(loss='categorical_crossentropy', optimizer=adam, metrics=['accuracy'])

# `history` keeps the per-epoch loss/accuracy returned by fit().
history = model.fit(x, y, epochs=15, verbose=1)

# summary() prints the table itself and returns None, so it is not wrapped
# in print().
model.summary()

Epoch 1/15


  super(Adam, self).__init__(name, **kwargs)


Epoch 2/15
Epoch 3/15
Epoch 4/15
Epoch 5/15
Epoch 6/15
Epoch 7/15
Epoch 8/15
Epoch 9/15
Epoch 10/15
Epoch 11/15
Epoch 12/15
Epoch 13/15
Epoch 14/15
Epoch 15/15
<keras.engine.sequential.Sequential object at 0x7fd89964cd50>


In [None]:
# Greedy text generation: repeatedly feed the text produced so far back into
# the model and append the most probable next word.
seed_text = "a farm he had"
next_words = 10

for _ in range(next_words):
    # Encode the current text; words the tokenizer has not seen are dropped.
    token_list = tokenizer.texts_to_sequences([seed_text])[0]
    # Pad/truncate on the left to the input width the model was trained on.
    token_list = pad_sequences([token_list], maxlen=max_sequence_len-1, padding='pre')
    # The batch holds a single row, so take element 0 to get a scalar id.
    predicted = int(np.argmax(model.predict(token_list, verbose=0), axis=-1)[0])
    # index_word is the tokenizer's id -> word map; id 0 (padding) has no
    # entry and yields an empty string.
    output_word = tokenizer.index_word.get(predicted, "")
    if not output_word:
        # Nothing to append: the input would not change, so the prediction
        # would repeat and only trailing spaces would be added.
        break
    seed_text += " " + output_word
print(seed_text)

a farm he had a farm and ten acres ground ground ground ground ground
