<a href="https://colab.research.google.com/github/DevilNReality/LSTM-Word-Prediction/blob/main/Code%20File/LSTM%20Word%20Prediction.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Approach To The Problem

1. Load the libraries and data
2. Clean the data
3. Tokenize the data
4. Convert into sequence
5. Input sequence and Output sequence
6. Create a Sequential Model
7. LSTM Layers
8. Compile the model
9. Fit the model
10. Evaluate the model

# Load the Libraries and Data

In [None]:
import string
import urllib
import urllib.request
from pickle import load , dump
from random import randint

import numpy as np
from numpy import array
from tensorflow.keras.callbacks import ModelCheckpoint
from tensorflow.keras.layers import Dense , LSTM , Embedding , GRU , Dropout
from tensorflow.keras.models import load_model
from tensorflow.keras.models import Sequential
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.utils import to_categorical

In [None]:
# Download the cleaned text of Plato's "Republic" and decode it to a str.
# The `with` block closes the HTTP response as soon as the body has been read,
# even if reading or decoding raises.
with urllib.request.urlopen('https://raw.githubusercontent.com/insaid2018/DeepLearning/master/Data/republic_clean.txt') as response:
  doc = response.read().decode('utf8')

In [None]:
# Preview the first 1000 characters to sanity-check the downloaded text.
print(doc[:1000])

﻿BOOK I.

I went down yesterday to the Piraeus with Glaucon the son of Ariston,
that I might offer up my prayers to the goddess (Bendis, the Thracian
Artemis.); and also because I wanted to see in what manner they would
celebrate the festival, which was a new thing. I was delighted with the
procession of the inhabitants; but that of the Thracians was equally,
if not more, beautiful. When we had finished our prayers and viewed the
spectacle, we turned in the direction of the city; and at that instant
Polemarchus the son of Cephalus chanced to catch sight of us from a
distance as we were starting on our way home, and told his servant to
run and bid us wait for him. The servant took hold of me by the cloak
behind, and said: Polemarchus desires you to wait.

I turned round, and asked him where his master was.

There he is, said the youth, coming after you, if you will only wait.

Certainly we will, said Glaucon; and in a few minutes Polemarchus
appear

# Data Cleaning

In [None]:
from re import T
def clean_doc(doc):
  """Turn raw text into a list of lower-case, purely alphabetic word tokens.

  Steps, applied per whitespace-separated token:
    1. '--' is treated as a word separator (replaced by a space before splitting).
    2. Every character in ``string.punctuation`` is stripped from the token.
    3. Tokens that are not entirely alphabetic after stripping are dropped
       (this also discards tokens that became empty).
    4. Surviving tokens are lower-cased.

  Args:
    doc: the text to clean, as a single string.

  Returns:
    A list of cleaned tokens in their original order.
  """
  strip_punctuation = str.maketrans('', '', string.punctuation)
  cleaned = []
  for raw_token in doc.replace('--', ' ').split():
    word = raw_token.translate(strip_punctuation)
    # isalpha() is False for '' and for anything containing digits/symbols
    if word.isalpha():
      cleaned.append(word.lower())
  return cleaned

In [None]:
# Reduce the raw text to a flat list of lower-case alphabetic tokens.
tokens = clean_doc(doc)

In [None]:
# Sanity-check the cleaning step: show the first 200 tokens, then report the
# corpus size and the number of distinct tokens.
print(tokens[:200])
print('The total tokens:' , len(tokens))
print('Unique Tokens:' , len(set(tokens)))

['book', 'i', 'i', 'went', 'down', 'yesterday', 'to', 'the', 'piraeus', 'with', 'glaucon', 'the', 'son', 'of', 'ariston', 'that', 'i', 'might', 'offer', 'up', 'my', 'prayers', 'to', 'the', 'goddess', 'bendis', 'the', 'thracian', 'artemis', 'and', 'also', 'because', 'i', 'wanted', 'to', 'see', 'in', 'what', 'manner', 'they', 'would', 'celebrate', 'the', 'festival', 'which', 'was', 'a', 'new', 'thing', 'i', 'was', 'delighted', 'with', 'the', 'procession', 'of', 'the', 'inhabitants', 'but', 'that', 'of', 'the', 'thracians', 'was', 'equally', 'if', 'not', 'more', 'beautiful', 'when', 'we', 'had', 'finished', 'our', 'prayers', 'and', 'viewed', 'the', 'spectacle', 'we', 'turned', 'in', 'the', 'direction', 'of', 'the', 'city', 'and', 'at', 'that', 'instant', 'polemarchus', 'the', 'son', 'of', 'cephalus', 'chanced', 'to', 'catch', 'sight', 'of', 'us', 'from', 'a', 'distance', 'as', 'we', 'were', 'starting', 'on', 'our', 'way', 'home', 'and', 'told', 'his', 'servant', 'to', 'run', 'and', 'bid',

# Create Sequences

In [None]:
# Slide a fixed-size window over the token stream. Each window holds 50 input
# words followed by the 1 word the model must learn to predict.
length = 50 + 1
# Each entry is one window joined back into a space-separated line.
# The stop value is len(tokens) + 1 because range() excludes its stop: with
# len(tokens) the final window (the one ending on the last token) was skipped.
sequences = [' '.join(tokens[i-length : i]) for i in range(length , len(tokens) + 1)]
print('Total Sequences:' , len(sequences))

Total Sequences: 118633


## Creating a new txt document

In [None]:
# save lines to file , one sequence per line
def save_doc(lines , filename, encoding=None):
  """Write ``lines`` to ``filename``, separated by newlines.

  The file is created or overwritten. No trailing newline is written after
  the last line, so an empty ``lines`` produces an empty file.

  Args:
    lines: iterable of strings to write.
    filename: path of the file to write.
    encoding: text encoding passed to ``open``; ``None`` (the default) keeps
      the platform default, matching the previous behaviour.
  """
  data = '\n'.join(lines)
  # `with` guarantees the handle is flushed and closed even if write() raises
  with open(filename , 'w', encoding=encoding) as file:
    file.write(data)

# Persist the training sequences (one per line) so they can be reloaded from
# 'republic_sequences.txt' without rebuilding them from the raw text.
out_filename = 'republic_sequences.txt'
save_doc(sequences , out_filename)

## Load the document

In [None]:
def load_doc(filename, encoding=None):
  """Read an entire text file and return its contents as one string.

  Args:
    filename: path of the file to read.
    encoding: text encoding passed to ``open``; ``None`` (the default) keeps
      the platform default, matching the previous behaviour.

  Returns:
    The full text of the file.
  """
  # `with` closes the handle even if read() raises
  with open(filename , 'r', encoding=encoding) as file:
    return file.read()

# Reload the saved sequences file and split it back into one string per
# training sequence.
# NOTE: `doc` is rebound here; from this cell on it holds the contents of the
# sequences file, not the raw book text downloaded earlier.
in_filename = 'republic_sequences.txt'
doc = load_doc(in_filename)
lines = doc.split('\n')

# Tokenize and Convert into Sequences

In [None]:
# Fit the tokenizer's vocabulary on the sequence lines, then encode every line
# as a list of integer word ids.
# NOTE: `sequences` is rebound here from a list of strings to a list of
# integer lists.
tokenizer = Tokenizer()
tokenizer.fit_on_texts(lines)
sequences = tokenizer.texts_to_sequences(lines)

## Convert sequences into array

In [None]:
# Convert the list of integer lists into a NumPy array so it can be sliced
# column-wise into inputs and targets.
sequences = array(sequences)

In [None]:
# Display the encoded sequences; each row is one training window.
sequences

array([[1046,   11,   11, ...,  151,   11,   57],
       [  11,   11, 1045, ...,   11,   57, 1147],
       [  11, 1045,  329, ...,   57, 1147,   35],
       ...,
       [ 382,  467,    4, ...,  414,   13,   21],
       [ 467,    4,   33, ...,   13,   21,   23],
       [   4,   33,   79, ...,   21,   23,   85]])

## Declaring X & Y

In [None]:
# Every row is one training window: all columns except the last are the input
# word ids (x); the last column is the id of the word to predict (y).
x = sequences[:, :-1]
y = sequences[:, -1]

In [None]:
# Inspect one input row: the word ids that precede the target word.
x[1]

array([  11,   11, 1045,  329, 7409,    4,    1, 2873,   35,  213,    1,
        261,    3, 2251,    9,   11,  179,  817,  123,   92, 2872,    4,
          1, 2249, 7408,    1, 7407, 7406,    2,   75,  120,   11, 1266,
          4,  110,    6,   30,  168,   16,   49, 7405,    1, 1609,   13,
         57,    8,  549,  151,   11,   57])

In [None]:
# Inspect the matching target: the id of the word that follows x[1].
y[1]

1147

# LSTM Model

In [None]:
# Number of output classes: one per word known to the tokenizer, plus one.
# NOTE(review): the + 1 presumably accounts for word ids starting at 1, which
# leaves index 0 unused for real words - confirm against the Tokenizer docs.
vocal_size = len(tokenizer.word_index) + 1
print(vocal_size)
# One-hot encode the targets: y becomes a matrix with `vocal_size` columns.
# This is large (rows x vocal_size), so expect significant memory use.
y = to_categorical(y , num_classes = vocal_size)

7410


In [None]:
# Next-word classifier: embedding -> two stacked LSTMs -> dense head with a
# softmax over the whole vocabulary. Declared as a single list of layers.
model = Sequential([
  Embedding(vocal_size , 100 , input_length = x.shape[1]),  # Embedding or Input Layer
  LSTM(100 , return_sequences = True),  # LSTM 1 Layer (passes the full sequence on to LSTM 2)
  LSTM(100),  # LSTM 2 Layer
  Dense(100 , activation = 'relu'),  # Classification Layer
  Dense(vocal_size , activation = 'softmax'),  # Output Layer (one score per vocabulary index)
])



In [None]:
# Categorical cross-entropy matches the one-hot targets produced by
# to_categorical; accuracy is tracked as the training metric.
model.compile( loss = 'categorical_crossentropy' , optimizer = 'adam' , metrics = ['accuracy'])

In [None]:
# Train on the full data set for 100 epochs in batches of 128. No validation
# data is passed, so the reported accuracy/loss are training-set figures only.
# `hist` keeps the object returned by fit() for later inspection.
hist = model.fit(x , y , batch_size = 128 , epochs = 100)

Epoch 1/100
[1m927/927[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m26s[0m 17ms/step - accuracy: 0.0623 - loss: 6.4569
Epoch 2/100
[1m927/927[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m14s[0m 15ms/step - accuracy: 0.1084 - loss: 5.6700
Epoch 3/100
[1m927/927[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m14s[0m 15ms/step - accuracy: 0.1333 - loss: 5.4205
Epoch 4/100
[1m927/927[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m21s[0m 16ms/step - accuracy: 0.1495 - loss: 5.2469
Epoch 5/100
[1m927/927[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m14s[0m 15ms/step - accuracy: 0.1582 - loss: 5.1316
Epoch 6/100
[1m927/927[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m21s[0m 15ms/step - accuracy: 0.1624 - loss: 5.0268
Epoch 7/100
[1m927/927[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m14s[0m 15ms/step - accuracy: 0.1712 - loss: 4.9253
Epoch 8/100
[1m927/927[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m14s[0m 15ms/step - accuracy: 0.1745 - loss: 4.8314
Epoch 9/100
[1m

# Inference Pipeline

In [None]:
#Save the model
model.save('model.h5')
#take a copy of the tokenizer
dump(tokenizer, open('tokenizer.pkl', 'wb'))



In [None]:
# Reload the trained model and the tokenizer from disk.
model = load_model('model.h5')
# NOTE: unpickling can execute arbitrary code - only load a 'tokenizer.pkl'
# that this notebook wrote itself.
with open('tokenizer.pkl', 'rb') as tokenizer_file:
    tokenizer = load(tokenizer_file)



In [None]:
# Number of input words per sample: the word count of one training line minus
# its final word, which is the prediction target.
seq_length = len(lines[0].split())-1


Inference Pipeline
1. Random Text
2. Tokenizer
3. Padding for fixed length
4. Predict
5. Use predicted values in tokenizer to generate the text

In [None]:
# generate a sequence from a language model
def generate_seq(model, tokenizer, seq_length, seed_text, n_words):
    """Greedily generate ``n_words`` words that continue ``seed_text``.

    At each step the running text is encoded, cut/padded to ``seq_length``
    tokens, fed to the model, and the highest-scoring vocabulary index is
    appended as the next word.

    Args:
        model: object whose ``predict(batch, verbose=0)`` returns scores over
            vocabulary indices for the single encoded sample.
        tokenizer: object providing ``texts_to_sequences`` and ``word_index``
            (a word -> integer index mapping).
        seq_length: number of tokens the model receives as input.
        seed_text: text used to start generation.
        n_words: how many words to generate.

    Returns:
        The generated words joined by single spaces (the seed text is not
        included). A predicted index with no entry in ``word_index``
        contributes an empty string, as before.
    """
    # Build the index -> word lookup once instead of scanning word_index (and
    # recomputing argmax for every vocabulary entry) on each generated word.
    index_to_word = {index: word for word, index in tokenizer.word_index.items()}
    result = list()
    in_text = seed_text
    # generate a fixed number of words
    for _ in range(n_words):
        # encode the text as integer
        encoded = tokenizer.texts_to_sequences([in_text])[0]
        # fix the input length; truncating='pre' drops the oldest tokens
        encoded = pad_sequences([encoded], maxlen=seq_length, truncating='pre')
        # predict probabilities for each word
        yhat = model.predict(encoded, verbose=0)
        # map predicted word index to word
        predicted_index = int(np.argmax(yhat))
        out_word = index_to_word.get(predicted_index, '')
        # append to input
        in_text += ' ' + out_word
        result.append(out_word)
    return ' '.join(result)


# load the model
model = load_model('model.h5')

# load the tokenizer
# NOTE: unpickling can execute arbitrary code - only load a 'tokenizer.pkl'
# that this notebook wrote itself.
with open('tokenizer.pkl', 'rb') as tokenizer_file:
    tokenizer = load(tokenizer_file)

# select a seed text
# randint() includes BOTH endpoints, so the upper bound must be
# len(lines) - 1; using len(lines) could raise IndexError.
seed_text = lines[randint(0, len(lines) - 1)]
print("seed_text:" + '\n')
print(seed_text + '\n')

# generate new text
generated = generate_seq(model, tokenizer, seq_length, seed_text, 50)
print("generated_text:" + '\n')
print(generated)



seed_text:

thousand times better than the inhabitants of the den and you will know what the several images are and what they represent because you have seen the beautiful and just and good in their truth and thus our state which is also yours will be a reality and not a dream

generated_text:

only and is not the passionate element perfect of men to the state and the other of which we lately spoke after be compelled to dwell would you he replied and i have been describing again when he has cured patients like asclepius or left behind him like language and
