In [4]:
import tensorflow as tf
import numpy as np
import pandas as pd
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense,Embedding,LSTM
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split

In [8]:
df = pd.read_csv('/content/1661-0.txt', delimiter='\t',header=None, names=["Texts"])

In [9]:
df.head()

Unnamed: 0,Texts
0,Project Gutenberg's The Adventures of Sherlock...
1,This eBook is for the use of anyone anywhere a...
2,almost no restrictions whatsoever. You may co...
3,re-use it under the terms of the Project Guten...
4,with this eBook or online at www.gutenberg.net


In [10]:
tokenizer = Tokenizer()
tokenizer.fit_on_texts(df['Texts'])

In [11]:
len(tokenizer.word_index)

8930

In [12]:
tokenized_sentence = tokenizer.texts_to_sequences(df['Texts'])

In [13]:
tokenized_sentence

[[145, 4789, 1, 1020, 4, 128, 34, 45, 611, 2235, 2236],
 [30, 1021, 15, 23, 1, 275, 4, 394, 2237, 21, 51, 1676, 2, 18],
 [572, 51, 3398, 3399, 13, 75, 817, 10, 213, 10, 124, 63],
 [2238, 275, 10, 262, 1, 480, 4, 1, 145, 130, 655, 2239],
 [18, 30, 1021, 63, 2240, 21, 1093, 130, 3400],
 [2666, 1, 1020, 4, 128, 34],
 [4790, 611, 2235, 2236],
 [4791, 1022, 4792, 4793, 4794, 1021, 2241],
 [139, 3401, 75, 3402, 4795],
 [3403, 1094],
 [573, 243, 4796, 4797, 1469],
 [722, 4, 30, 145, 130, 1021, 1, 1020, 4, 128, 34],
 [1928, 45, 50, 3404, 145, 130, 2667, 2, 3405, 3406],
 [1677],
 [1, 1020, 4, 128, 34],
 [45, 611, 2235, 2236],
 [1929],
 [6, 5, 885, 8, 940],
 [2668, 1, 248, 481, 691],
 [2669, 5, 113, 4, 2242],
 [3407, 1, 774, 1470, 522],
 [3408, 1, 312, 1023, 941],
 [3409, 1, 58, 18, 1, 1095, 942],
 [3410, 1, 612, 4, 1, 448, 1471],
 [3411, 1, 612, 4, 1, 1930, 886],
 [3412, 1, 612, 4, 1, 3413, 692],
 [3414, 1, 612, 4, 1, 818, 1340],
 [3415, 1, 612, 4, 1, 2243, 406],
 [3416, 1, 612, 4, 1, 887, 943]

In [14]:
input_sequence = []
for j in range(0,len(tokenized_sentence)):
    for i in range(1,len(tokenized_sentence[j])):
      input_sequence.append(tokenized_sentence[j][:i+1])

In [15]:
input_sequence

[[145, 4789],
 [145, 4789, 1],
 [145, 4789, 1, 1020],
 [145, 4789, 1, 1020, 4],
 [145, 4789, 1, 1020, 4, 128],
 [145, 4789, 1, 1020, 4, 128, 34],
 [145, 4789, 1, 1020, 4, 128, 34, 45],
 [145, 4789, 1, 1020, 4, 128, 34, 45, 611],
 [145, 4789, 1, 1020, 4, 128, 34, 45, 611, 2235],
 [145, 4789, 1, 1020, 4, 128, 34, 45, 611, 2235, 2236],
 [30, 1021],
 [30, 1021, 15],
 [30, 1021, 15, 23],
 [30, 1021, 15, 23, 1],
 [30, 1021, 15, 23, 1, 275],
 [30, 1021, 15, 23, 1, 275, 4],
 [30, 1021, 15, 23, 1, 275, 4, 394],
 [30, 1021, 15, 23, 1, 275, 4, 394, 2237],
 [30, 1021, 15, 23, 1, 275, 4, 394, 2237, 21],
 [30, 1021, 15, 23, 1, 275, 4, 394, 2237, 21, 51],
 [30, 1021, 15, 23, 1, 275, 4, 394, 2237, 21, 51, 1676],
 [30, 1021, 15, 23, 1, 275, 4, 394, 2237, 21, 51, 1676, 2],
 [30, 1021, 15, 23, 1, 275, 4, 394, 2237, 21, 51, 1676, 2, 18],
 [572, 51],
 [572, 51, 3398],
 [572, 51, 3398, 3399],
 [572, 51, 3398, 3399, 13],
 [572, 51, 3398, 3399, 13, 75],
 [572, 51, 3398, 3399, 13, 75, 817],
 [572, 51, 3398, 33

In [16]:
max_len=max([len(x) for x in input_sequence])
max_len

20

In [17]:
padded_input_sequence = pad_sequences(input_sequence, maxlen=max_len, padding='pre')

In [18]:
padded_input_sequence

array([[   0,    0,    0, ...,    0,  145, 4789],
       [   0,    0,    0, ...,  145, 4789,    1],
       [   0,    0,    0, ..., 4789,    1, 1020],
       ...,
       [   0,    0,    0, ...,    3,  360,   83],
       [   0,    0,    0, ...,  360,   83,  358],
       [   0,    0,    0, ...,   83,  358, 1673]], dtype=int32)

In [19]:
x = padded_input_sequence[:,:-1]
y = padded_input_sequence[:,-1]

In [20]:
x

array([[   0,    0,    0, ...,    0,    0,  145],
       [   0,    0,    0, ...,    0,  145, 4789],
       [   0,    0,    0, ...,  145, 4789,    1],
       ...,
       [   0,    0,    0, ..., 8930,    3,  360],
       [   0,    0,    0, ...,    3,  360,   83],
       [   0,    0,    0, ...,  360,   83,  358]], dtype=int32)

In [21]:
y

array([4789,    1, 1020, ...,   83,  358, 1673], dtype=int32)

In [22]:
x.shape

(101619, 19)

In [23]:
y.shape

(101619,)

In [24]:
len(tokenizer.word_index)+1

8931

In [25]:
from tensorflow.keras.utils import to_categorical
y = to_categorical(y, num_classes=len(tokenizer.word_index)+1)

In [26]:
y.shape

(101619, 8931)

In [27]:
model = Sequential()
model.add(Embedding(len(tokenizer.word_index)+1,100,input_length=max_len-1))
model.add(LSTM(150))
model.add(Dense(len(tokenizer.word_index)+1, activation='softmax'))



In [28]:
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

In [32]:
history = model.fit(x,y,epochs=35,verbose=1)

Epoch 1/35
[1m3176/3176[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m238s[0m 74ms/step - accuracy: 0.0598 - loss: 6.5856
Epoch 2/35
[1m3176/3176[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m264s[0m 75ms/step - accuracy: 0.1197 - loss: 5.5563
Epoch 3/35
[1m3176/3176[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m248s[0m 70ms/step - accuracy: 0.1524 - loss: 5.0771
Epoch 4/35
[1m3176/3176[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m221s[0m 70ms/step - accuracy: 0.1742 - loss: 4.7057
Epoch 5/35
[1m3176/3176[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m264s[0m 70ms/step - accuracy: 0.1953 - loss: 4.3651
Epoch 6/35
[1m3176/3176[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m218s[0m 69ms/step - accuracy: 0.2163 - loss: 4.0583
Epoch 7/35
[1m3176/3176[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m224s[0m 71ms/step - accuracy: 0.2517 - loss: 3.7525
Epoch 8/35
[1m3176/3176[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m229s[0m 72ms/step - accuracy: 0.2880 - loss: 3.4685


In [33]:
text = "The"
for i in range(10):
  token_text = tokenizer.texts_to_sequences([text])
  padded_token_text = pad_sequences(token_text, maxlen=max_len-1, padding='pre')
  prob = np.argmax(model.predict(padded_token_text))
  for word,index in tokenizer.word_index.items():
    if index==prob:
      text = text+" "+word
      print(text)
      break

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 62ms/step
The man
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 46ms/step
The man who
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 48ms/step
The man who entered
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 60ms/step
The man who entered was
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 47ms/step
The man who entered was a
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 40ms/step
The man who entered was a sturdy
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 32ms/step
The man who entered was a sturdy middle
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 30ms/step
The man who entered was a sturdy middle sized
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 33ms/step
The man who entered was a sturdy middle sized fellow
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 29ms/step
The man who entere

In [34]:
# Saving the LSTM model
model.save('next_word_lstm.h5')

# Saving the tokenizer
import pickle

# Assuming 'tokenizer' is your tokenizer object
with open('tokenizer.pkl', 'wb') as handle:
    pickle.dump(tokenizer, handle, protocol=pickle.HIGHEST_PROTOCOL)


