In [4]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np

### Importing the Dataset

In [5]:
import nltk
from nltk.corpus import gutenberg

# Fetch the Gutenberg sample corpus used by the rest of the notebook
nltk.download('gutenberg')

# Show the identifiers of the books bundled with the corpus
print(gutenberg.fileids())

# Read the King James Bible as one raw string
text = gutenberg.raw('bible-kjv.txt')



[nltk_data] Downloading package gutenberg to /root/nltk_data...


['austen-emma.txt', 'austen-persuasion.txt', 'austen-sense.txt', 'bible-kjv.txt', 'blake-poems.txt', 'bryant-stories.txt', 'burgess-busterbrown.txt', 'carroll-alice.txt', 'chesterton-ball.txt', 'chesterton-brown.txt', 'chesterton-thursday.txt', 'edgeworth-parents.txt', 'melville-moby_dick.txt', 'milton-paradise.txt', 'shakespeare-caesar.txt', 'shakespeare-hamlet.txt', 'shakespeare-macbeth.txt', 'whitman-leaves.txt']


[nltk_data]   Unzipping corpora/gutenberg.zip.


### Removing punctuation and digits

In [6]:
# Keep the first 1000 characters of the raw text as a small sample for inspection.
# NOTE(review): newText is not referenced by any later cell visible here — confirm before removing.
newText = text[:1000]
# print(text)

In [10]:
import re
import string

# Split the text in front of every chapter:verse marker such as 1:1 or 10:12.
# The (?<!\d) lookbehind keeps the split from also firing inside a
# multi-digit chapter number (e.g. between the "1" and the "0" of "10:12").
parts = re.split(r'(?<!\d)(?=\d+:\d+)', text)

# Translation table: delete punctuation and ASCII digits, turn newlines into spaces.
clean_table = str.maketrans(
    {**dict.fromkeys(string.punctuation + string.digits), '\n': ' '}
)

lst1 = []
for part in parts:
    # Trim, strip unwanted characters, lowercase, then trim again because
    # removing the leading verse marker leaves a space behind.
    cleaned = part.strip().translate(clean_table).lower().strip()
    # The emptiness check uses only this iteration's value, so an empty
    # chunk is skipped instead of re-appending the previous verse.
    if cleaned:
        lst1.append(cleaned)

lst1[:10]


['the king james bible  the old testament of the king james bible  the first book of moses  called genesis',
 'in the beginning god created the heaven and the earth',
 'and the earth was without form and void and darkness was upon the face of the deep and the spirit of god moved upon the face of the waters',
 'and god said let there be light and there was light',
 'and god saw the light that it was good and god divided the light from the darkness',
 'and god called the light day and the darkness he called night and the evening and the morning were the first day',
 'and god said let there be a firmament in the midst of the waters and let it divide the waters from the waters',
 'and god made the firmament and divided the waters which were under the firmament from the waters which were above the firmament and it was so',
 'and god called the firmament heaven and the evening and the morning were the second day',
 'and god said let the waters under the heaven be gathered together unto one p

### Tokenizer

In [11]:
# 1 - Fit the tokenizer on the cleaned lines
# 2 - Convert each line into its n-gram prefix token sequences

from tensorflow.keras.preprocessing.text import Tokenizer

# Build the vocabulary from the cleaned lines
token1 = Tokenizer()
token1.fit_on_texts(lst1)
# Vocabulary size plus one
total_words = 1 + len(token1.word_index)

# Expand every line into all of its prefixes of length >= 2:
# [w1, w2], [w1, w2, w3], ..., [w1, ..., wn]
tokenArr = []
for line in lst1:
    encoded = token1.texts_to_sequences([line])[0]
    tokenArr.extend(encoded[:stop] for stop in range(2, len(encoded) + 1))

### Padding the sequences

In [12]:
from tensorflow.keras.preprocessing.sequence import pad_sequences
import tensorflow.keras.utils as ku

def generate_padded_sequences(input_sequences):
    """Pad the sequences to a common length and split off the final token.

    Args:
        input_sequences: list of integer token sequences.

    Returns:
        A tuple ``(predictors, label, max_sequence_len)`` where
        ``predictors`` is every column of the padded matrix except the last,
        ``label`` is its last column, and ``max_sequence_len`` is the length
        of the longest input sequence (the padded width).
    """
    max_sequence_len = max(map(len, input_sequences))
    padded = np.array(
        pad_sequences(input_sequences, maxlen=max_sequence_len, padding='pre')
    )

    # Labels stay as integer ids; no one-hot encoding is applied here.
    predictors = padded[:, :-1]
    label = padded[:, -1]
    return predictors, label, max_sequence_len


predictors, label, max_sequence_len = generate_padded_sequences(tokenArr)

### Creating the model

In [None]:
from tensorflow.keras.layers import LSTM, Dense, Dropout, Embedding, Input
from tensorflow.keras.models import Sequential

# Model hyperparameters, named so they can be tuned in one place
EMBEDDING_DIM = 10
LSTM_UNITS = 128
DROPOUT_RATE = 0.1

rnn = Sequential([
    # Each sample is a padded prefix of max_sequence_len - 1 token ids.
    # The input length is declared with an explicit Input layer rather than
    # Embedding's `input_length` argument, which newer Keras releases deprecate.
    Input(shape=(max_sequence_len - 1,)),
    # Map token ids to dense vectors
    Embedding(total_words, EMBEDDING_DIM),
    # Summarise the prefix into a single hidden state
    LSTM(LSTM_UNITS),
    Dropout(DROPOUT_RATE),
    # One probability per vocabulary index for the next word
    Dense(total_words, activation='softmax'),
])

# Sparse loss: the labels are integer ids, not one-hot vectors
rnn.compile(loss='sparse_categorical_crossentropy', optimizer='adam')





In [None]:
# Train on the (prefix, next-word id) pairs for 50 passes over the data
rnn.fit(x=predictors, y=label, epochs=50)

Epoch 1/50
[1m23717/23717[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m170s[0m 7ms/step - loss: 5.5838
Epoch 2/50
[1m23717/23717[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m197s[0m 7ms/step - loss: 4.6314
Epoch 3/50
[1m23717/23717[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m202s[0m 7ms/step - loss: 4.3944
Epoch 4/50
[1m23717/23717[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m203s[0m 7ms/step - loss: 4.2734
Epoch 5/50
[1m23717/23717[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m165s[0m 7ms/step - loss: 4.1883
Epoch 6/50
[1m23717/23717[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m202s[0m 7ms/step - loss: 4.1396
Epoch 7/50
[1m23717/23717[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m200s[0m 7ms/step - loss: 4.1048
Epoch 8/50
[1m23717/23717[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m205s[0m 7ms/step - loss: 4.0813
Epoch 9/50
[1m23717/23717[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m200s[0m 7ms/step - loss: 4.0582
Epoch 10/50
[1m23717/23717[0m [32m

### Text Generation Function

In [14]:
def generateText(seedText, nextWords, maxSequenceLen, tokenizer=None, model=None):
    """Extend ``seedText`` by greedily predicting one word at a time.

    Args:
        seedText: Starting text; it is tokenized again on every step.
        nextWords: Maximum number of words to append.
        maxSequenceLen: Padded sequence length used for training; the model
            input is one token shorter than this.
        tokenizer: Fitted tokenizer to use. Defaults to the notebook-level
            ``token1``.
        model: Trained model to use. Defaults to the notebook-level ``rnn``.

    Returns:
        The seed text followed by the generated words, separated by spaces.
        Generation stops early if the predicted index has no vocabulary word.
    """
    tokenizer = token1 if tokenizer is None else tokenizer
    model = rnn if model is None else model

    for _ in range(nextWords):
        tokenList = tokenizer.texts_to_sequences([seedText])[0]
        tokenList = pad_sequences([tokenList], maxlen=maxSequenceLen - 1, padding='pre')
        predicted = model.predict(tokenList, verbose=0)
        # Single-sample batch: take the most probable index of row 0
        predictedIndex = int(np.argmax(predicted, axis=-1)[0])

        # Direct index -> word lookup instead of scanning word_index each step
        outputWord = tokenizer.index_word.get(predictedIndex)
        if not outputWord:
            # No word for this index (e.g. index 0). Appending nothing would
            # leave the tokens unchanged and repeat the same prediction, so stop.
            break

        seedText += " " + outputWord
    return seedText


### Generate Text Using the Trained Model

In [20]:
# Try the trained model on a few seed phrases, ten generated words each
for seed in ("I am the", "Our Father", "Thou shall"):
    print(generateText(seed, nextWords=10, maxSequenceLen=max_sequence_len))



I am the lord your god and your fathers and your brethren and
Our Father is not a faithful man of god and we are
Thou shall not be ashamed of thy god and thy fathers shall


### Saving the model

In [21]:
# Persist the trained network to 'Model1.h5' (path relative to the working directory).
# NOTE(review): the .h5 extension presumably selects the HDF5 format, which newer
# Keras releases treat as legacy — confirm against the installed version.
# NOTE(review): the fitted tokenizer (token1) that generateText relies on is not
# saved in any cell visible here — confirm it is persisted elsewhere.
rnn.save('Model1.h5')

