<a href="https://colab.research.google.com/github/DorothyKingsley/NLP_NextWordPredictions/blob/master/language_modelling.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [0]:
#load data into memory
def load_data(filename):
    #open the file as read only
    file = open(filename, 'r')
    #read all text
    text = file.read()
    #close file
    file.close()
    return text

In [3]:
#loading republic data
input_filename="republic_clean.txt"
doc = load_data(input_filename)
#sanity check
doc[:20]

'BOOK I. The Republic'

In [0]:
#cleaning data
import string
#cleaning document to clean tokens
def clean_data(data):
    #replace '\r' with ''
    data = data.replace('\r',' ')
    #replace '\n' with ''
    data = data.replace('\n',' ')
    #replace '--' with a space ' '
    data = data.replace('--',' ')
    #split into tokens by white space
    tokens = data.split()
    #remove punctuations from each token
    #most efficient way to remove punctuation higher versions should use str.maketrans('', '', string.punctuation)
    tokens=[w.translate(str.maketrans('', '', string.punctuation)) for w in tokens] 
    #remove non-alphabetic words
    tokens=[word for word in tokens if word.isalpha()]
    #make lower case
    tokens = [word.lower() for word in tokens]
    return tokens

In [5]:
#clean doc
tokens = clean_data(doc)
print('Total tokens %d'% len(tokens))
print('Unique tokens %d'% len(set(tokens)))
tokens[:200]

Total tokens 210245
Unique tokens 10270


['book',
 'i',
 'the',
 'republic',
 'opens',
 'with',
 'a',
 'truly',
 'greek',
 'scene',
 'a',
 'festival',
 'in',
 'honour',
 'of',
 'the',
 'goddess',
 'bendis',
 'which',
 'is',
 'held',
 'in',
 'the',
 'piraeus',
 'to',
 'this',
 'is',
 'added',
 'the',
 'promise',
 'of',
 'an',
 'equestrian',
 'torchrace',
 'in',
 'the',
 'evening',
 'the',
 'whole',
 'work',
 'is',
 'supposed',
 'to',
 'be',
 'recited',
 'by',
 'socrates',
 'on',
 'the',
 'day',
 'after',
 'the',
 'festival',
 'to',
 'a',
 'small',
 'party',
 'consisting',
 'of',
 'critias',
 'timaeus',
 'hermocrates',
 'and',
 'another',
 'this',
 'we',
 'learn',
 'from',
 'the',
 'first',
 'words',
 'of',
 'the',
 'timaeus',
 'when',
 'the',
 'rhetorical',
 'advantage',
 'of',
 'reciting',
 'the',
 'dialogue',
 'has',
 'been',
 'gained',
 'the',
 'attention',
 'is',
 'not',
 'distracted',
 'by',
 'any',
 'reference',
 'to',
 'the',
 'audience',
 'nor',
 'is',
 'the',
 'reader',
 'further',
 'reminded',
 'of',
 'the',
 'extrao

In [6]:
#split the tokens further into sequences
length = 50 + 1 #50 words considered to build a context and predict the next word. 
#consider building another model as an extension which considers self-contained sentences using truncations and padding
sequences = list()
for i in range(length,len(tokens)):
    seq = tokens[i-length:i] #in this scenario 0-50, 1-51, 2-52 and so on. Try to build another model using 0-50, 51-101 etc
    #convert it into a line
    line = ' '.join(seq)
    #store them in the sequences list
    sequences.append(line)
print('Total sequences %d'%len(sequences))

Total sequences 210194


In [7]:
len(sequences[0])

263

In [0]:
#saving the sequences to a file for re-use
def save_doc(lines, filename):
    doc = '\n'.join(lines)
    file = open(filename, 'w')
    file.write(doc)
    file.close()
output_filename='republic_sequences.txt'
save_doc(sequences,output_filename)

In [0]:
#Training our language model
#We are using Neural language model
from pickle import dump
from numpy import array
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.layers import LSTM
from tensorflow.keras.layers import Embedding

In [0]:
#integer encode sequences of text
tokenizer = Tokenizer()
tokenizer.fit_on_texts(sequences)
sequence = tokenizer.texts_to_sequences(sequences)

In [0]:
#vocabulary
vocab_size = len(tokenizer.word_index)+1

In [0]:
#separate into input and output
import numpy as np
sequence = np.asarray(sequence)
x,y = sequence[:,:-1], sequence[:,-1]
y = to_categorical(y, num_classes = vocab_size) #converting numerical single column data into a one-hot encoder
seq_length = x.shape[1] #defining sequence length parameter with the number of columns

In [13]:
print(len(x[0]))
x.shape

50


(210194, 50)

In [14]:
model = Sequential()
model.add(Embedding(vocab_size,50,input_length=seq_length))
model.add(LSTM(100, return_sequences=True))
model.add(LSTM(100))
model.add(Dense(100, activation='relu'))
model.add(Dense(vocab_size, activation='softmax'))
print(model.summary())

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, 50, 50)            513550    
_________________________________________________________________
lstm (LSTM)                  (None, 50, 100)           60400     
_________________________________________________________________
lstm_1 (LSTM)                (None, 100)               80400     
_________________________________________________________________
dense (Dense)                (None, 100)               10100     
_________________________________________________________________
dense_1 (Dense)              (None, 10271)             1037371   
Total params: 1,701,821
Trainable params: 1,701,821
Non-trainable params: 0
_________________________________________________________________
None


In [15]:
#compile model
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
#fit model
model.fit(x,y,batch_size=128, epochs=20)

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


<tensorflow.python.keras.callbacks.History at 0x7f4df79d4080>

In [0]:
#save the model to a file. This can be used later
model.save('language_model.h5')
dump(tokenizer,open('tokenizer.pkl','wb'))

In [0]:
# load cleaned text sequences
in_filename = 'republic_sequences.txt'
doc = load_data(in_filename)
lines = doc.split('\n')

In [0]:
seq_length = len(lines[0].split()) - 1

In [23]:
from random import randint
from pickle import load
from keras.models import load_model
from keras.preprocessing.sequence import pad_sequences
# select a seed text
seed_text = lines[randint(0,len(lines))]
print(seed_text + '\n')

the origin of the evil is that all men from the beginning heroes poets instructors of youth have always asserted the temporal dispensation the honours and profits of justice had we been taught in early youth the power of justice and injustice inherent in the soul and unseen by any human



Using TensorFlow backend.


In [0]:
encoded = tokenizer.texts_to_sequences([seed_text])[0]

In [25]:
# predict probabilities for each word
yhat = model.predict_classes(encoded, verbose=0)

Instructions for updating:
Please use instead:* `np.argmax(model.predict(x), axis=-1)`,   if your model does multi-class classification   (e.g. if it uses a `softmax` last-layer activation).* `(model.predict(x) > 0.5).astype("int32")`,   if your model does binary classification   (e.g. if it uses a `sigmoid` last-layer activation).


In [27]:
yhat

array([3912,    6, 4808, 3912, 6152, 4808, 5834,   96, 5294, 7583, 3912,
          1,    1, 5294,    3, 4808, 6152, 4808, 3326, 5294, 3912,  246,
          1, 3912,    1, 3915,    1, 4808, 2013,   10, 4568, 6300, 2223,
       7583, 2880, 6152, 3912,    1, 4808, 2013, 3915, 5294, 4772, 7583,
       3912, 2075, 3915, 4096, 4808,   96, 3429])

In [0]:
out_word = ''
for word, index in tokenizer.word_index.items():
  if index == yhat.any():
		  out_word = word
		  break

In [0]:
encoded = pad_sequences([encoded], maxlen=seq_length, truncating='pre')

In [0]:
# generate a sequence from a language model
def generate_seq(model, tokenizer, seq_length, seed_text, n_words):
	result = list()
	in_text = seed_text
	# generate a fixed number of words
	for _ in range(n_words):
		# encode the text as integer
		encoded = tokenizer.texts_to_sequences([in_text])[0]
		# truncate sequences to a fixed length
		encoded = pad_sequences([encoded], maxlen=seq_length, truncating='pre')
		# predict probabilities for each word
		yhat = model.predict_classes(encoded, verbose=0)
		# map predicted word index to word
		out_word = ''
		for word, index in tokenizer.word_index.items():
			if index == yhat:
				out_word = word
				break
		# append to input
		in_text += ' ' + out_word
		result.append(out_word)
	return ' '.join(result)

In [32]:
# generate new text
generated = generate_seq(model, tokenizer, seq_length, seed_text, 50)
print(generated)

nature be true he said and the other of the soul is the same and the other and the unjust is the best of the soul and the other of the soul in the state and the other of the soul in the state and the other of the soul
