<a href="https://colab.research.google.com/github/Aayush360/data_science/blob/master/word_level_language_model.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

##  Making a word level language model (implementation of RNN)

#  [Source:](https://machinelearningmastery.com/how-to-develop-a-word-level-neural-language-model-in-keras/) 

In [0]:
# import dependencies

import string
from pickle import dump,load
import numpy as np
from random import randint

from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, LSTM, Embedding
from tensorflow.keras.models import load_model
from tensorflow.keras.preprocessing.sequence import pad_sequences




In [0]:
# load document into memory

def load_doc(filename):
  # open the filename in readonly mode
  file = open(filename,'r')
  # read all the content in the file
  text = file.read()
  file.close()
  return text

In [23]:
# load the document
in_filename ='/content/republic.txt'
doc = load_doc(in_filename)
print(doc[:200])


BOOK I.

I went down yesterday to the Piraeus with Glaucon the son of Ariston,
that I might offer up my prayers to the goddess (Bendis, the Thracian
Artemis.); and also because I wanted to see in what


In [0]:
# cleaning the text

def clean_doc(doc):
  # replace -- with ' '
  doc = doc.replace('--',' ')
  # split into tokens by whitespace
  tokens = doc.split()
  # remove punctuations form each token
  table = str.maketrans('','',string.punctuation)
  tokens = [w.translate(table) for w in tokens]
  # remove remaining tokens that are not alphabetic
  tokens = [word for word in tokens if word.isalpha()]
  # make lowercase
  tokens = [ word.lower() for word in tokens]
  return tokens


In [25]:
# clean document
tokens = clean_doc(doc)
print(tokens[:200])
print('total tokens are: ', len(tokens))
print('total unique tokens are: ', len(set(tokens)))

['book', 'i', 'i', 'went', 'down', 'yesterday', 'to', 'the', 'piraeus', 'with', 'glaucon', 'the', 'son', 'of', 'ariston', 'that', 'i', 'might', 'offer', 'up', 'my', 'prayers', 'to', 'the', 'goddess', 'bendis', 'the', 'thracian', 'artemis', 'and', 'also', 'because', 'i', 'wanted', 'to', 'see', 'in', 'what', 'manner', 'they', 'would', 'celebrate', 'the', 'festival', 'which', 'was', 'a', 'new', 'thing', 'i', 'was', 'delighted', 'with', 'the', 'procession', 'of', 'the', 'inhabitants', 'but', 'that', 'of', 'the', 'thracians', 'was', 'equally', 'if', 'not', 'more', 'beautiful', 'when', 'we', 'had', 'finished', 'our', 'prayers', 'and', 'viewed', 'the', 'spectacle', 'we', 'turned', 'in', 'the', 'direction', 'of', 'the', 'city', 'and', 'at', 'that', 'instant', 'polemarchus', 'the', 'son', 'of', 'cephalus', 'chanced', 'to', 'catch', 'sight', 'of', 'us', 'from', 'a', 'distance', 'as', 'we', 'were', 'starting', 'on', 'our', 'way', 'home', 'and', 'told', 'his', 'servant', 'to', 'run', 'and', 'bid',

In [26]:
# organize into sequences of tokens

length = 50 + 1
sequences = list()
for i in range(length, len(tokens)):
  # select sequence of tokens
  seq = tokens[i-length:i]
  # convert into a line
  line = ' '.join(seq)
  #store
  sequences.append(line)

print('total sequences is: %d' % len(sequences))

total sequences is: 118633


In [0]:
# save tokens to file, one dialoge per line
def save_doc(lines,filename):
  data ='\n'.join(lines)
  file = open(filename,'w')
  file.write(data)
  file.close()

In [0]:
# save sequences to file
out_filename ='republic_sequences.txt'
save_doc(sequences, out_filename)

### Train Language Model


In [0]:
# load
in_filename = '/content/republic_sequences.txt'
doc = load_doc(in_filename)
lines = doc.split('\n')


In [0]:
#integer encode sequences of words

tokenizer = Tokenizer()
tokenizer.fit_on_texts(lines)
sequences = tokenizer.texts_to_sequences(lines)


In [0]:
sequences

In [32]:
#vocabulary size
'''The Embedding layer needs to allocate a vector representation for each word in this vocabulary 
  from index 1 to the largest index and because indexing of arrays is zero-offset, 
  the index of the word at the end of the vocabulary will be 7,409; that means the 
  array must be 7,409 + 1 in length.'''

vocab_size = len(tokenizer.word_index)+1
vocab_size

7410

In [0]:
# separate into input and output

sequences = np.array(sequences)
X, y = sequences[:,:-1], sequences[:,-1] # since we have added 1 to vocab size that starts with indexing of 0
y = to_categorical(y, num_classes= vocab_size)
seq_length = X.shape[1] # specify the number of columns i,e 50

## Fit the Model

In [34]:
'''A dense fully connected layer with 
100 neurons connects to the LSTM hidden layers to 
interpret the features extracted from the sequence'''


'A dense fully connected layer with \n100 neurons connects to the LSTM hidden layers to \ninterpret the features extracted from the sequence'

In [35]:
# define model
model = Sequential()
model.add(Embedding(vocab_size, 50, input_length=seq_length))
model.add(LSTM(100, return_sequences=True))
model.add(LSTM(100))
model.add(Dense(100, activation='relu'))
model.add(Dense(vocab_size, activation='softmax'))
print(model.summary())

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, 50, 50)            370500    
_________________________________________________________________
lstm (LSTM)                  (None, 50, 100)           60400     
_________________________________________________________________
lstm_1 (LSTM)                (None, 100)               80400     
_________________________________________________________________
dense (Dense)                (None, 100)               10100     
_________________________________________________________________
dense_1 (Dense)              (None, 7410)              748410    
Total params: 1,269,810
Trainable params: 1,269,810
Non-trainable params: 0
_________________________________________________________________
None


In [36]:
# compile the model

model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

# fit model

model.fit(X,y, batch_size=128, epochs=100)

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77/100
Epoch 78

<tensorflow.python.keras.callbacks.History at 0x7fe220df5160>

In [0]:
# save the model to file

model.save('/content/model.h5')

# save the tokenizer

dump(tokenizer, open('tokenizer.pkl','wb'))

### Use Language Model

In [38]:
'''we can use it to generate new sequences of text that have the same statistical properties as the source text.'''

'we can use it to generate new sequences of text that have the same statistical properties as the source text.'

In [0]:
seq_length = len(lines[0].split()) - 1

### Load the Model

In [48]:
# load the model

model = load_model('model.h5')



In [0]:
# load the tokenizer

tokenizer = load(open('tokenizer.pkl','rb'))

In [54]:
# select a seed

seed_text = lines[randint(0,len(lines))]
print(seed_text+'\n')

any other disease or the knife put to the throat or even the cutting up of the whole body into the minutest pieces can destroy the soul until she herself is proved to become more unholy or unrighteous in consequence of these things being done to the body but that the



In [0]:
# generate a sequence from a language model
def generate_seq(model, tokenizer, seq_length, seed_text, n_words):
	result = list()
	in_text = seed_text
	# generate a fixed number of words
	for _ in range(n_words):
		# encode the text as integer
		encoded = tokenizer.texts_to_sequences([in_text])[0]
		# truncate sequences to a fixed length
		encoded = pad_sequences([encoded], maxlen=seq_length, truncating='pre')
		# predict probabilities for each word
		yhat = model.predict_classes(encoded, verbose=0)
		# map predicted word index to word
		out_word = ''
		for word, index in tokenizer.word_index.items():
			if index == yhat:
				out_word = word
				break
		# append to input
		in_text += ' ' + out_word
		result.append(out_word)
	return ' '.join(result)

In [62]:

# generate new text
generated = generate_seq(model, tokenizer, seq_length, seed_text, 50)
print(generated)


Instructions for updating:
Please use instead:* `np.argmax(model.predict(x), axis=-1)`,   if your model does multi-class classification   (e.g. if it uses a `softmax` last-layer activation).* `(model.predict(x) > 0.5).astype("int32")`,   if your model does binary classification   (e.g. if it uses a `sigmoid` last-layer activation).
other is the goods of life is not the most royal man and not to be the best limit the other and heaviest and again has not the most trustworthy assuredly and i must endeavour to explain i said that the shepherd and creates lies to drink and in the
