In [1]:
## data preprocessing
import numpy as np
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split


In [2]:
# load the dataset

In [3]:
with open("Sherlock Holmes.txt","r") as file:
    text = file.read().lower()

# tokenize the text
tokenizer = Tokenizer()
tokenizer.fit_on_texts([text])
total_words  = len(tokenizer.word_index) +1
total_words

7507

In [4]:
# creating the index for the words
tokenizer.word_index

{'the': 1,
 'and': 2,
 'i': 3,
 'of': 4,
 'to': 5,
 'a': 6,
 'in': 7,
 'that': 8,
 'it': 9,
 'he': 10,
 'you': 11,
 'was': 12,
 'his': 13,
 'is': 14,
 'my': 15,
 'have': 16,
 'as': 17,
 'had': 18,
 'with': 19,
 'which': 20,
 'at': 21,
 'for': 22,
 'but': 23,
 'not': 24,
 'be': 25,
 'me': 26,
 'we': 27,
 'from': 28,
 'there': 29,
 'upon': 30,
 'this': 31,
 'said': 32,
 'holmes': 33,
 'so': 34,
 'him': 35,
 'her': 36,
 'very': 37,
 'she': 38,
 'been': 39,
 'on': 40,
 'all': 41,
 'no': 42,
 'one': 43,
 'then': 44,
 'your': 45,
 'what': 46,
 'were': 47,
 'by': 48,
 'are': 49,
 'an': 50,
 "'": 51,
 'out': 52,
 'when': 53,
 'would': 54,
 'up': 55,
 'man': 56,
 'has': 57,
 'into': 58,
 'some': 59,
 'will': 60,
 'do': 61,
 'could': 62,
 'little': 63,
 'who': 64,
 'mr': 65,
 'if': 66,
 'see': 67,
 'now': 68,
 'down': 69,
 'our': 70,
 'well': 71,
 'they': 72,
 'or': 73,
 'us': 74,
 'over': 75,
 'may': 76,
 'should': 77,
 'am': 78,
 'know': 79,
 'before': 80,
 'more': 81,
 'come': 82,
 'about': 8

In [5]:
# create input sequence
input_sequences = []
for line in text.split("\n"):
    token_list = tokenizer.texts_to_sequences([line])[0]
    for i in range(1,len(token_list)):
        n_gram_sequence = token_list[:i+1]
        input_sequences.append(n_gram_sequence)


In [6]:
input_sequences

[[1, 1534],
 [1, 1534, 4],
 [1, 1534, 4, 118],
 [1, 1534, 4, 118, 33],
 [3994, 3995],
 [3994, 3995, 3996],
 [228, 4],
 [228, 4, 1535],
 [6, 1038],
 [6, 1038, 7],
 [6, 1038, 7, 798],
 [1, 212],
 [1, 212, 381],
 [1, 212, 381, 542],
 [6, 103],
 [6, 103, 4],
 [6, 103, 4, 1797],
 [1, 579],
 [1, 579, 1161],
 [1, 579, 1161, 483],
 [1, 287],
 [1, 287, 799],
 [1, 287, 799, 730],
 [1, 56],
 [1, 56, 19],
 [1, 56, 19, 1],
 [1, 56, 19, 1, 1162],
 [1, 56, 19, 1, 1162, 800],
 [1, 514],
 [1, 514, 4],
 [1, 514, 4, 1],
 [1, 514, 4, 1, 457],
 [1, 514, 4, 1, 457, 1324],
 [1, 514],
 [1, 514, 4],
 [1, 514, 4, 1],
 [1, 514, 4, 1, 1536],
 [1, 514, 4, 1, 1536, 650],
 [1, 514],
 [1, 514, 4],
 [1, 514, 4, 1],
 [1, 514, 4, 1, 2805],
 [1, 514, 4, 1, 2805, 580],
 [1, 514],
 [1, 514, 4],
 [1, 514, 4, 1],
 [1, 514, 4, 1, 801],
 [1, 514, 4, 1, 801, 1163],
 [1, 514],
 [1, 514, 4],
 [1, 514, 4, 1],
 [1, 514, 4, 1, 2184],
 [1, 514, 4, 1, 2184, 2806],
 [1, 514],
 [1, 514, 4],
 [1, 514, 4, 1],
 [1, 514, 4, 1, 2807],
 [1, 5

In [7]:
## Pad Sequences
max_sequences_len = max([len(x) for x in input_sequences])
max_sequences_len

17

In [8]:
input_sequences =np.array(pad_sequences(input_sequences,maxlen = max_sequences_len,padding = "pre"))
input_sequences

array([[   0,    0,    0, ...,    0,    1, 1534],
       [   0,    0,    0, ...,    1, 1534,    4],
       [   0,    0,    0, ..., 1534,    4,  118],
       ...,
       [   0,    0,    0, ..., 1199,   18,  476],
       [   0,    0,    0, ...,   18,  476,   15],
       [   0,    0,    0, ...,  476,   15,  383]])

In [9]:
# create predictor and label
import tensorflow as tf
x,y = input_sequences[:,:-1],input_sequences[:,-1]


In [10]:
x

array([[   0,    0,    0, ...,    0,    0,    1],
       [   0,    0,    0, ...,    0,    1, 1534],
       [   0,    0,    0, ...,    1, 1534,    4],
       ...,
       [   0,    0,    0, ..., 3598, 1199,   18],
       [   0,    0,    0, ..., 1199,   18,  476],
       [   0,    0,    0, ...,   18,  476,   15]])

In [11]:
y

array([1534,    4,  118, ...,  476,   15,  383])

In [12]:
y = tf.keras.utils.to_categorical(y,num_classes=total_words)

In [13]:
y # where the index present it represent as 1 remaning all represent with 0 

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

In [14]:
from sklearn.model_selection import train_test_split

In [15]:
# split the data into training and testing sets
x_train,x_test,y_train,y_test = train_test_split(x,y,test_size=0.2)

In [16]:
x_train.shape,y_train.shape

((62602, 16), (62602, 7507))

In [17]:
# training the data 

In [18]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding,LSTM,Dense,Dropout
from tensorflow.keras.callbacks import EarlyStopping


In [19]:
model = tf.keras.Sequential([
    tf.keras.layers.Embedding(input_dim = 8200,output_dim = 17), 
    tf.keras.layers.LSTM(units = 200),
    tf.keras.layers.Dense(total_words, activation = tf.nn.softmax)
])
model.compile(loss = 'categorical_crossentropy',optimizer = 'adam' , metrics = ['Accuracy'] )

In [20]:
import numpy as np 
model(np.array(x_train))
model.summary()

In [21]:
early_stopping = EarlyStopping(monitor='val_loss', patience=5, restore_best_weights=True)


In [22]:
history = model.fit(x_train, y_train, epochs=100,
                    batch_size=128, callbacks=[early_stopping], verbose=1)

Epoch 1/100
[1m490/490[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m39s[0m 74ms/step - Accuracy: 0.0567 - loss: 6.9073
Epoch 2/100
[1m  1/490[0m [37m━━━━━━━━━━━━━━━━━━━━[0m [1m1:08[0m 140ms/step - Accuracy: 0.0703 - loss: 6.1029

  current = self.get_monitor_value(logs)


[1m490/490[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m34s[0m 69ms/step - Accuracy: 0.0660 - loss: 6.1990
Epoch 3/100
[1m490/490[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m28s[0m 57ms/step - Accuracy: 0.0730 - loss: 6.0071
Epoch 4/100
[1m490/490[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m28s[0m 57ms/step - Accuracy: 0.0763 - loss: 5.8747
Epoch 5/100
[1m490/490[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m28s[0m 57ms/step - Accuracy: 0.0873 - loss: 5.7153
Epoch 6/100
[1m490/490[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m28s[0m 57ms/step - Accuracy: 0.1056 - loss: 5.5387
Epoch 7/100
[1m490/490[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m29s[0m 59ms/step - Accuracy: 0.1189 - loss: 5.3715
Epoch 8/100
[1m490/490[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m36s[0m 73ms/step - Accuracy: 0.1254 - loss: 5.2172
Epoch 9/100
[1m490/490[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m37s[0m 76ms/step - Accuracy: 0.1327 - loss: 5.0902
Epoch 10/100
[1m490/490[0m

In [26]:
model.save("trained_model.h5")



## prediction of words

In [None]:

# def predict_next_word(model,tokenizer,text,max_sequences_len):
#     token_list  = tokenizer.texts_to_sequences([text][0])
#     if len(token_list) >= max_sequences_len:
#         token_list = token_list[-(max_sequences_len-1):]
#     token_list = pad_sequences([token_list],maxlen= max_sequences_len-1,padding="pre")
#     predicted = model.predict(token_list,verbose=0)
#     predicted_word_index = np.argmax(predicted,axis=1)
#     for word,index in tokenizer.word_index.items():
#         if index == predicted_word_index:
#             return word



In [33]:
def predict_next_word(model, tokenizer, input_text, max_sequences_len):
    token_list = tokenizer.texts_to_sequences([input_text])[0]
    
    # Adjust token_list length if needed
    if len(token_list) >= max_sequences_len:
        token_list = token_list[-(max_sequences_len - 1):]
    elif len(token_list) < max_sequences_len - 1:
        token_list = [0] * (max_sequences_len - 1 - len(token_list)) + token_list  # Pad manually if needed
    
    # Convert to a padded sequence
    token_list = pad_sequences([token_list], maxlen=max_sequences_len - 1, padding="pre")
    predicted = model.predict(token_list, verbose=0)
    predicted_word_index = np.argmax(predicted, axis=1)
    
    # Convert prediction to word
    predicted_word = tokenizer.index_word[predicted_word_index[0]]
    return predicted_word


In [None]:
input_text = "i want to be able to "
print(f"Input text:{input_text}")
max_sequences_len = model.input_shape[1]+1
next_word = predict_next_word(model,tokenizer,input_text,max_sequences_len)
print(f"Next Word prediction :{next_word}")

Input text:i want to be able to  
Next Word prediction :look
