In [125]:
import pandas as pd
import numpy as np
from tensorflow.keras.layers import Embedding  , Bidirectional , LSTM , RNN , Dropout , Input , Dense
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.utils import to_categorical
import re
from tensorflow.keras.models import Model
from sklearn.model_selection import train_test_split

In [51]:
with open("wreath.txt","r") as file:
    data = file.read()
data = data.split("\n")

In [52]:
data[1]

'What authority surfeits on would relieve us: if they'

In [33]:
len(data)

108

In [75]:
def decontract(text):
    text = re.sub(r"won't", "will not", text)
    text = re.sub(r"can\'t", "can not", text)
    text = re.sub(r"weren\'t" , "were not",text)
    
    text = re.sub(r"n\'t", " not", text)
    text = re.sub(r"\'re", " are", text)
    text = re.sub(r"\'s", " is", text)
    text = re.sub(r"\'d", " would", text)
    text = re.sub(r"\'ll", " will", text)
    text = re.sub(r"\'t", " not", text)
    text = re.sub(r"\'ve", " have", text)
    text = re.sub(r"\'m", " am", text)
    return text

In [76]:
def clean_text(text):
    for i in range(len(text)):
        text[i] = text[i].strip()
        text[i] = text[i].lower()
        text[i] = re.sub(r"\n"," ",text[i])
        text[i] = re.sub(r'[^\w\s]', '', text[i])
        text[i] = decontract(text[i])
    return text

In [77]:
text = clean_text(data)

In [78]:
len(text)

2514

In [79]:
text

['we are accounted poor citizens the patricians good',
 'what authority surfeits on would relieve us if they',
 'would yield us but the superfluity while it were',
 'wholesome we might guess they relieved us humanely',
 'but they think we are too dear the leanness that',
 'afflicts us the object of our misery is as an',
 'inventory to particularise their abundance our',
 'sufferance is a gain to them let us revenge this with',
 'our pikes ere we become rakes for the gods know i',
 'speak this in hunger for bread not in thirst for revenge',
 '',
 '',
 'i tell you friends most charitable care',
 'have the patricians of you for your wants',
 'your suffering in this dearth you may as well',
 'strike at the heaven with your staves as lift them',
 'against the roman state whose course will on',
 'the way it takes cracking ten thousand curbs',
 'of more strong link asunder than can ever',
 'appear in your impediment for the dearth',
 'the gods not the patricians make it and',
 'your knees to 

In [80]:
tokenizer = Tokenizer()
tokenizer.fit_on_texts(text)
vocab_size = len(tokenizer.word_index) + 1

In [81]:
vocab_size

3831

In [59]:
tokenizer.word_index

{'the': 1,
 'and': 2,
 'to': 3,
 'of': 4,
 'i': 5,
 'my': 6,
 'in': 7,
 'a': 8,
 'that': 9,
 'with': 10,
 'his': 11,
 'for': 12,
 'this': 13,
 'not': 14,
 'you': 15,
 'your': 16,
 'me': 17,
 'thy': 18,
 'be': 19,
 'but': 20,
 'is': 21,
 'it': 22,
 'by': 23,
 'he': 24,
 'have': 25,
 'as': 26,
 'thou': 27,
 'all': 28,
 'our': 29,
 'him': 30,
 'which': 31,
 'if': 32,
 'from': 33,
 'thee': 34,
 'on': 35,
 'so': 36,
 'what': 37,
 'their': 38,
 'we': 39,
 'no': 40,
 'will': 41,
 'now': 42,
 'are': 43,
 'do': 44,
 'shall': 45,
 'king': 46,
 'was': 47,
 'then': 48,
 'they': 49,
 'or': 50,
 'when': 51,
 'her': 52,
 'more': 53,
 'one': 54,
 'blood': 55,
 'than': 56,
 'at': 57,
 'upon': 58,
 'hath': 59,
 'did': 60,
 'death': 61,
 'am': 62,
 'would': 63,
 'let': 64,
 'o': 65,
 'like': 66,
 'yet': 67,
 'us': 68,
 'how': 69,
 'were': 70,
 'them': 71,
 'make': 72,
 'made': 73,
 'here': 74,
 'thus': 75,
 'god': 76,
 'good': 77,
 'these': 78,
 'well': 79,
 'love': 80,
 'had': 81,
 'most': 82,
 'out': 8

In [63]:
input_sequences = []
for line in text:
    print(f"line -----> {line}")
    token_list = tokenizer.texts_to_sequences([line])[0]
    for i in range(len(token_list)):
        n_gram_sequence = token_list[:i+1]
        input_sequences.append(n_gram_sequence)


line -----> we are accounted poor citizens the patricians good
line -----> what authority surfeits on would relieve us if they
line -----> would yield us but the superfluity while it were
line -----> wholesome we might guess they relieved us humanely
line -----> but they think we are too dear the leanness that
line -----> afflicts us the object of our misery is as an
line -----> inventory to particularise their abundance our
line -----> sufferance is a gain to them let us revenge this with
line -----> our pikes ere we become rakes for the gods know i
line -----> speak this in hunger for bread not in thirst for revenge
line -----> 
line -----> 
line -----> i tell you friends most charitable care
line -----> have the patricians of you for your wants
line -----> your suffering in this dearth you may as well
line -----> strike at the heaven with your staves as lift them
line -----> against the roman state whose course will on
line -----> the way it takes cracking ten thousand curbs
line --

In [82]:
input_sequences

[[39],
 [39, 43],
 [39, 43, 1533],
 [39, 43, 1533, 134],
 [39, 43, 1533, 134, 686],
 [39, 43, 1533, 134, 686, 1],
 [39, 43, 1533, 134, 686, 1, 360],
 [39, 43, 1533, 134, 686, 1, 360, 77],
 [37],
 [37, 1534],
 [37, 1534, 1535],
 [37, 1534, 1535, 35],
 [37, 1534, 1535, 35, 63],
 [37, 1534, 1535, 35, 63, 1536],
 [37, 1534, 1535, 35, 63, 1536, 68],
 [37, 1534, 1535, 35, 63, 1536, 68, 32],
 [37, 1534, 1535, 35, 63, 1536, 68, 32, 49],
 [63],
 [63, 528],
 [63, 528, 68],
 [63, 528, 68, 20],
 [63, 528, 68, 20, 1],
 [63, 528, 68, 20, 1, 1537],
 [63, 528, 68, 20, 1, 1537, 213],
 [63, 528, 68, 20, 1, 1537, 213, 22],
 [63, 528, 68, 20, 1, 1537, 213, 22, 70],
 [1538],
 [1538, 39],
 [1538, 39, 251],
 [1538, 39, 251, 962],
 [1538, 39, 251, 962, 49],
 [1538, 39, 251, 962, 49, 1539],
 [1538, 39, 251, 962, 49, 1539, 68],
 [1538, 39, 251, 962, 49, 1539, 68, 1540],
 [20],
 [20, 49],
 [20, 49, 203],
 [20, 49, 203, 39],
 [20, 49, 203, 39, 43],
 [20, 49, 203, 39, 43, 110],
 [20, 49, 203, 39, 43, 110, 149],
 [

In [85]:
max_seq_length = max(len(x) for x in input_sequences)
max_seq_length

12

In [92]:
padded_sequences = pad_sequences(input_sequences,maxlen=max_seq_length,padding="pre")

In [93]:
padded_sequences

array([[   0,    0,    0, ...,    0,    0,   39],
       [   0,    0,    0, ...,    0,   39,   43],
       [   0,    0,    0, ...,   39,   43, 1533],
       ...,
       [   0,    0,    0, ..., 3829,    7,   13],
       [   0,    0,    0, ...,    7,   13, 3830],
       [   0,    0,    0, ...,   13, 3830,   97]])

In [141]:
X = padded_sequences[:,:-1]
y = padded_sequences[:, 1:]

In [142]:
print(f"X shape --> {X.shape}")
print(f"y shape --> {y.shape}")

X shape --> (18924, 11)
y shape --> (18924, 11)


In [144]:
X

array([[   0,    0,    0, ...,    0,    0,    0],
       [   0,    0,    0, ...,    0,    0,   39],
       [   0,    0,    0, ...,    0,   39,   43],
       ...,
       [   0,    0,    0, ..., 3828, 3829,    7],
       [   0,    0,    0, ..., 3829,    7,   13],
       [   0,    0,    0, ...,    7,   13, 3830]])

In [143]:
y

array([[   0,    0,    0, ...,    0,    0,   39],
       [   0,    0,    0, ...,    0,   39,   43],
       [   0,    0,    0, ...,   39,   43, 1533],
       ...,
       [   0,    0,    0, ..., 3829,    7,   13],
       [   0,    0,    0, ...,    7,   13, 3830],
       [   0,    0,    0, ...,   13, 3830,   97]])

In [145]:
ys = np.array([to_categorical(seq, num_classes=vocab_size) for seq in y])

In [146]:
ys.shape

(18924, 11, 3831)

### Building Model

In [147]:
input_layer = Input(shape=(X.shape[1],))
embedding_layer = Embedding(vocab_size,256)(input_layer)
lstm_layer = Bidirectional(LSTM(128,activation="tanh",return_sequences=True))(embedding_layer)
dropout_layer = Dropout(0.4)(lstm_layer)
output_layer = Dense(vocab_size,activation="softmax")(dropout_layer)

model = Model(inputs=input_layer,outputs=output_layer)

model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])

In [124]:
model.summary()

In [148]:
X_train , X_test , y_train , y_test = train_test_split(X,ys,test_size=0.2,random_state=42)

In [155]:
history = model.fit(X_train,y_train,epochs=7,batch_size=32,verbose=1)

Epoch 1/7
[1m474/474[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m28s[0m 58ms/step - accuracy: 0.8193 - loss: 1.0520
Epoch 2/7
[1m474/474[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m27s[0m 57ms/step - accuracy: 0.8659 - loss: 0.8023
Epoch 3/7
[1m474/474[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m26s[0m 55ms/step - accuracy: 0.8957 - loss: 0.6370
Epoch 4/7
[1m474/474[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m26s[0m 55ms/step - accuracy: 0.9131 - loss: 0.5366
Epoch 5/7
[1m474/474[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m26s[0m 54ms/step - accuracy: 0.9216 - loss: 0.4640
Epoch 6/7
[1m474/474[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m26s[0m 55ms/step - accuracy: 0.9263 - loss: 0.4215
Epoch 7/7
[1m474/474[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m27s[0m 57ms/step - accuracy: 0.9298 - loss: 0.3871


In [160]:
def prepare_input(text, tokenizer, max_seq_length):
    text = clean_text([text])
    token_list = tokenizer.texts_to_sequences([text])[0]
    
    padded_sequence = pad_sequences([token_list], maxlen=max_seq_length-1, padding="pre")
    return padded_sequence

In [161]:
def predict_next_word(model, padded_input, tokenizer, temperature=1.0):
    # Predict the next word probabilities
    predictions = model.predict(padded_input, verbose=0)
    # print(f"predictions ---> {predictions} , the first element : {predictions[0]} , the first shape :--> {predictions.shape}")
    
    # Apply temperature
    predictions = predictions[0, -1, :]  
    predictions = np.asarray(predictions).astype('float64')
    predictions = np.log(predictions + 1e-10) / temperature  
    predictions = np.exp(predictions) / np.sum(np.exp(predictions))  

    # Sample from the probability distribution
    predicted_index = np.random.choice(len(predictions), p=predictions)
    predicted_word = tokenizer.index_word.get(predicted_index, '')
    return predicted_word


In [167]:
def generate_text(model, tokenizer, seed_text, max_seq_length, num_words=50, temperature=1.0):
    generated_text = seed_text
    for _ in range(num_words):
        padded_input = prepare_input(generated_text, tokenizer, max_seq_length)
        predicted_word = predict_next_word(model, padded_input, tokenizer, temperature)
        if not predicted_word:
            break
        generated_text += ' ' + predicted_word
    return generated_text


seed_text = "inventory to particularise"
generated_text = generate_text(model, tokenizer, seed_text, max_seq_length, temperature=1.5)
print(f"Generated text: {generated_text}")


[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 24ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 22ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 21ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 21ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 19ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 21ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 20ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 19ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 20ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 18ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 19ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 20ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 22ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 23

In [173]:
words = generated_text.split()

for i in range(0, len(words), 7):
    print(' '.join(words[i:i+7]))

inventory to particularise how o was because
advance infusing if now unlikely subscribed favouring
that cleansing my aumerle those turns answer
in smile nor of sometimes what gentlemen
where with advance this limit as my
why ill but thee took we clambering
unto all what give brother framed requickend
in which wave together


In [168]:
model.save("TextGeneration.h5")



In [169]:
model_json = model.to_json()
with open("TextGeneration.json","w") as json_file:
    json_file.write(model_json)

In [170]:
import pickle
with open("tokenizer.pkl","wb") as file :
    pickle.dump(tokenizer,file)