In [2]:
import os
os.chdir("/content/drive/MyDrive/Colab/RNN/Language_model")

In [3]:
import string
import pickle
import numpy as np
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.models import Model , load_model
from keras.layers import Dense , Embedding , LSTM , Input
from keras.utils import to_categorical

In [4]:
with open('republic.txt' , mode = 'r') as file:
  print(len(file.readlines()))
  file.close()

15802


Data Prepration


Book/Chapter headings (e.g. “BOOK I.”).

British English spelling (e.g. “honoured”)

Lots of punctuation (e.g. “–“, “;–“, “?–“, and more)

Strange names (e.g. “Polemarchus”).

Some long monologues that go on for hundreds of lines.

Some quoted dialog (e.g. ‘…’)

In [5]:
#load doc into memory
def load_doc(file_name):

  #open the file as read only
  file = open(file_name , mode= 'r')

  #read all text
  text = file.read()

  #close the file
  file.close()

  return text

in_filename = 'republic.txt'
doc = load_doc(in_filename)
print(doc[:200])

BOOK I.

I went down yesterday to the Piraeus with Glaucon the son of Ariston,
that I might offer up my prayers to the goddess (Bendis, the Thracian
Artemis.); and also because I wanted to see in what


Clean Text

Replace ‘–‘ with a white space so we can split words better.

Split words based on white space.

Remove all punctuation from words to reduce the vocabulary size (e.g. ‘What?’ becomes ‘What’).

Remove all words that are not alphabetic to remove standalone punctuation tokens.

Normalize all words to lowercase to reduce the vocabulary size.

In [6]:
#turn a doc into clean tokens
def clean_doc(doc):

  #replace '--' with a space ' '
  doc = doc.replace('--' , ' ')

  #split into tokens by white space
  tokens = doc.split()

  #remove puncutation from each token
  table = str.maketrans('' , '' , string.punctuation)
  tokens = [w.translate(table) for w in tokens]

  #remove remaining tokens that are not alphabetic
  tokens = [word for word in tokens if word.isalpha()]

  #make lower case
  tokens = [word.lower() for word in tokens]

  return tokens

In [7]:
#clean document
tokens = clean_doc(doc)
print(tokens[:200])
print('Total Tokens : %d' % len(tokens))
print('Unique Tokens : %d' % len(set(tokens)))

['book', 'i', 'i', 'went', 'down', 'yesterday', 'to', 'the', 'piraeus', 'with', 'glaucon', 'the', 'son', 'of', 'ariston', 'that', 'i', 'might', 'offer', 'up', 'my', 'prayers', 'to', 'the', 'goddess', 'bendis', 'the', 'thracian', 'artemis', 'and', 'also', 'because', 'i', 'wanted', 'to', 'see', 'in', 'what', 'manner', 'they', 'would', 'celebrate', 'the', 'festival', 'which', 'was', 'a', 'new', 'thing', 'i', 'was', 'delighted', 'with', 'the', 'procession', 'of', 'the', 'inhabitants', 'but', 'that', 'of', 'the', 'thracians', 'was', 'equally', 'if', 'not', 'more', 'beautiful', 'when', 'we', 'had', 'finished', 'our', 'prayers', 'and', 'viewed', 'the', 'spectacle', 'we', 'turned', 'in', 'the', 'direction', 'of', 'the', 'city', 'and', 'at', 'that', 'instant', 'polemarchus', 'the', 'son', 'of', 'cephalus', 'chanced', 'to', 'catch', 'sight', 'of', 'us', 'from', 'a', 'distance', 'as', 'we', 'were', 'starting', 'on', 'our', 'way', 'home', 'and', 'told', 'his', 'servant', 'to', 'run', 'and', 'bid',

In [8]:
#organize into sequences of tokens
length = 50 + 1 # it gives 50 words and give the 51th for (y)
sequences = list()
for i in range(length , len(tokens)):

  #select sequence of tokens
  seq = tokens[i - length : i]

  #convert into a line
  line = ' '.join(seq)

  #store
  sequences.append(line)
print('Total Sequences : %d' %len(sequences))

Total Sequences : 118633


In [9]:
def save_doc(lines , filename):
  data = '\n'.join(lines)
  file = open(filename , 'w')
  file.write(data)
  file.close()

In [10]:
out_filename= 'republic_sequences.txt'
save_doc(sequences , out_filename)

Train Language Model

It uses a distributed representation for words so that different words with similar meanings will have a similar representation.

It learns the representation at the same time as learning the model.

It learns to predict the probability for the next word using the context of the last 100 words

In [11]:
in_filename = 'republic_sequences.txt'
doc = load_doc(in_filename)
lines = doc.split('\n')

In [12]:
tokenizer = Tokenizer()
tokenizer.fit_on_texts(lines)
sequences = tokenizer.texts_to_sequences(lines)

In [13]:
#vocabulary size
vocab_size = len(tokenizer.word_index) +1
print('vocab sie is : ' , vocab_size)

vocab sie is :  7410


In [14]:
#seperate into input and output
sequences = np.array(sequences)

x_train , y_train = sequences[ : , :-1 ] , sequences[ : , -1]
y_train = to_categorical(y_train , num_classes = vocab_size)

seq_length = x_train.shape[1]

print('sequences length is :' , seq_length)
print('x_train shape :'  , x_train.shape )
print('y_train shape :'  , y_train.shape )

sequences length is : 50
x_train shape : (118633, 50)
y_train shape : (118633, 7410)


In [17]:
#define model
inputs = Input(shape = 50 , name='inputs')
x = inputs
x = Embedding(input_dim = vocab_size , output_dim = 50)(x)
x = LSTM(128 , return_sequences = True , name = 'LSTM')(x)
x = LSTM(128)(x)
x = Dense(128 , activation='relu')(x)
outputs = Dense(vocab_size , activation='softmax' , name = 'outputs')(x)
model = Model(inputs , outputs , name = 'model')
model.summary()
model.compile(loss = 'categorical_crossentropy' ,
              optimizer = 'adam' ,
              metrics = 'acc')


Model: "model"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 inputs (InputLayer)         [(None, 50)]              0         
                                                                 
 embedding_1 (Embedding)     (None, 50, 50)            370500    
                                                                 
 LSTM (LSTM)                 (None, 50, 128)           91648     
                                                                 
 lstm_1 (LSTM)               (None, 128)               131584    
                                                                 
 dense_1 (Dense)             (None, 128)               16512     
                                                                 
 outputs (Dense)             (None, 7410)              955890    
                                                                 
Total params: 1566134 (5.97 MB)
Trainable params: 1566134 (5.

In [None]:
if os.path.exists('model.h5'):
  model = load_model('model.h5')
else:
  model.fit(x_train , y_train , batch_size = 128 , epochs = 200)
  #save model to file
  model.save('model.h5')
  #save the tokenizer
  pickle.dump(tokenizer , open('tokenizer.pkl' , 'wb'))

In [18]:
model.fit(x_train, y_train, batch_size=128, epochs=200)

Epoch 1/200
Epoch 2/200

KeyboardInterrupt: ignored

Use Language Model

In [19]:
# select a seed text
seed_text = lines[np.random.randint(0, len(lines))]
print(seed_text + '\n')

round they passed beneath the throne of necessity and when they had all passed they marched on in a scorching heat to the plain of forgetfulness which was a barren waste destitute of trees and verdure and then towards evening they encamped by the river of unmindfulness whose water no vessel



In [20]:
encoded = np.array(tokenizer.texts_to_sequences([seed_text]))
# predict the probabilities for each word
y_hat = np.argmax(model.predict(encoded[:,:seq_length], verbose=0)[0])
out_word = ''
for word, index in tokenizer.word_index.items():
    if index == y_hat:
        out_word = word
        break
print('predicted word is: ', out_word)

predicted word is:  the


In [21]:
def generate_seq(model, tokenizer, seq_length, seed_text, n_words):
    result = list()
    in_text = seed_text
    # generate a fixed number of words
    for _ in range(n_words):
        # encode the text as integer
        encoded = np.array(tokenizer.texts_to_sequences([in_text]))
        encoded = pad_sequences(encoded, maxlen=seq_length, truncating='pre')
        y_hat = np.argmax(model.predict(encoded, verbose=0)[0])
        out_word = ''
        for word, index in tokenizer.word_index.items():
            if index == y_hat:
                out_word = word
                break
        in_text += ' ' + out_word
        result.append(out_word)
    return ' '.join(result)

In [22]:
generated = generate_seq(model, tokenizer, seq_length, seed_text, 50)
print(seed_text)
print(generated)

round they passed beneath the throne of necessity and when they had all passed they marched on in a scorching heat to the plain of forgetfulness which was a barren waste destitute of trees and verdure and then towards evening they encamped by the river of unmindfulness whose water no vessel
and the other of the other of the other of the other of the other of the other of the other of the other of the other of the other of the other of the other of the other of the other of the other of the other of the


Use Pretrained Embedding