In [1]:
from google.colab import drive
drive.mount('/content/gdrive')

Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).


## Load Data
[Dataset](https://www.kaggle.com/datasets/ronikdedhia/next-word-prediction/code)

In [2]:
def read_file(path):
    with open(path) as file:
        text = file.read()
    return text[:5000]

In [3]:
import spacy
import en_core_web_sm
import numpy as np
from keras.preprocessing.text import Tokenizer

In [4]:
nlp = en_core_web_sm.load()

In [5]:
def separate_punc(doc_text):
    return [token.text.lower() for token in nlp(doc_text) if token.text not in '\ufeff\n\n \n\n\n!"-#$%&()--.*+,-/:;<=>?@[\\]^_`{|}~\t\n ']

In [6]:
data = read_file('data.txt')
tokens = separate_punc(data)
tokens[:10]

['project',
 'gutenberg',
 "'s",
 'the',
 'adventures',
 'of',
 'sherlock',
 'holmes',
 'by',
 'arthur']

In [7]:
len(tokens)

912

In [8]:
train_len = 20
text_sequences = []

for i in range(train_len, len(tokens)):
    seq = tokens[i-train_len:i]
    text_sequences.append(seq)

In [9]:
' '.join(text_sequences[0])

"project gutenberg 's the adventures of sherlock holmes by arthur conan doyle this ebook is for the use of anyone"

In [10]:
' '.join(text_sequences[1])

"gutenberg 's the adventures of sherlock holmes by arthur conan doyle this ebook is for the use of anyone anywhere"

In [11]:
' '.join(text_sequences[2])

"'s the adventures of sherlock holmes by arthur conan doyle this ebook is for the use of anyone anywhere at"

In [12]:
tokenizer = Tokenizer()
tokenizer.fit_on_texts(text_sequences)
sequences = tokenizer.texts_to_sequences(text_sequences)

In [13]:
vocabulary_size = len(tokenizer.word_counts)
vocabulary_size

445

In [14]:
print(len(sequences[0]))
print(sequences[0])

20
[39, 38, 445, 1, 37, 2, 28, 14, 27, 53, 52, 51, 50, 36, 104, 19, 1, 103, 2, 106]


In [15]:
i=0
for a in tokenizer.index_word:
    print(a,"--->",tokenizer.index_word[a])
    i+=1
    if i==20 : break

1 ---> the
2 ---> of
3 ---> and
4 ---> his
5 ---> a
6 ---> to
7 ---> was
8 ---> in
9 ---> i
10 ---> he
11 ---> with
12 ---> 
   
13 ---> my
14 ---> holmes
15 ---> it
16 ---> own
17 ---> which
18 ---> had
19 ---> for
20 ---> 






In [16]:
for i in sequences[0]:
    print(f'{i} : {tokenizer.index_word[i]}')

39 : project
38 : gutenberg
445 : 's
1 : the
37 : adventures
2 : of
28 : sherlock
14 : holmes
27 : by
53 : arthur
52 : conan
51 : doyle
50 : this
36 : ebook
104 : is
19 : for
1 : the
103 : use
2 : of
106 : anyone


In [17]:
sequences = np.array(sequences)
sequences

array([[ 39,  38, 445, ..., 103,   2, 106],
       [ 38, 445,   1, ...,   2, 106, 107],
       [445,   1,  37, ..., 106, 107,  30],
       ...,
       [  2, 436,   3, ...,   1, 444,   3],
       [436,   3, 437, ..., 444,   3, 105],
       [  3, 437,   5, ...,   3, 105,  29]])

In [18]:
X = sequences[:,:-1]

In [19]:
y = sequences[:,-1]

In [20]:
X.shape

(892, 19)

In [21]:
y.shape

(892,)

In [22]:
from tensorflow.keras.utils import to_categorical
y = to_categorical(y,num_classes=vocabulary_size)

In [23]:
y.shape

(892, 445)

In [24]:
seq_len = X.shape[1]
seq_len

19

In [25]:
import keras
from keras.models import Sequential
from keras.layers import Dense,LSTM,Embedding

In [26]:
def create_model(vocabulary_size, seq_len):
    model = Sequential()
    model.add(Embedding(vocabulary_size, 30, input_length=seq_len))
    model.add(LSTM(150, return_sequences=True))
    model.add(LSTM(150))
    model.add(Dense(150, activation='relu'))

    model.add(Dense(vocabulary_size, activation='softmax'))

    model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

    model.summary()

    return model

In [27]:
model = create_model(vocabulary_size, seq_len)

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 19, 30)            13350     
                                                                 
 lstm (LSTM)                 (None, 19, 150)           108600    
                                                                 
 lstm_1 (LSTM)               (None, 150)               180600    
                                                                 
 dense (Dense)               (None, 150)               22650     
                                                                 
 dense_1 (Dense)             (None, 445)               67195     
                                                                 
Total params: 392395 (1.50 MB)
Trainable params: 392395 (1.50 MB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________


In [28]:
history=model.fit(X, y, epochs=140,verbose=1,validation_batch_size=.20)

Epoch 1/140
Epoch 2/140
Epoch 3/140
Epoch 4/140
Epoch 5/140
Epoch 6/140
Epoch 7/140
Epoch 8/140
Epoch 9/140
Epoch 10/140
Epoch 11/140
Epoch 12/140
Epoch 13/140
Epoch 14/140
Epoch 15/140
Epoch 16/140
Epoch 17/140
Epoch 18/140
Epoch 19/140
Epoch 20/140
Epoch 21/140
Epoch 22/140
Epoch 23/140
Epoch 24/140
Epoch 25/140
Epoch 26/140
Epoch 27/140
Epoch 28/140
Epoch 29/140
Epoch 30/140
Epoch 31/140
Epoch 32/140
Epoch 33/140
Epoch 34/140
Epoch 35/140
Epoch 36/140
Epoch 37/140
Epoch 38/140
Epoch 39/140
Epoch 40/140
Epoch 41/140
Epoch 42/140
Epoch 43/140
Epoch 44/140
Epoch 45/140
Epoch 46/140
Epoch 47/140
Epoch 48/140
Epoch 49/140
Epoch 50/140
Epoch 51/140
Epoch 52/140
Epoch 53/140
Epoch 54/140
Epoch 55/140
Epoch 56/140
Epoch 57/140
Epoch 58/140
Epoch 59/140
Epoch 60/140
Epoch 61/140
Epoch 62/140
Epoch 63/140
Epoch 64/140
Epoch 65/140
Epoch 66/140
Epoch 67/140
Epoch 68/140
Epoch 69/140
Epoch 70/140
Epoch 71/140
Epoch 72/140
Epoch 73/140
Epoch 74/140
Epoch 75/140
Epoch 76/140
Epoch 77/140
Epoch 78

In [29]:
from random import randint
from pickle import load
from keras.models import load_model
from keras.preprocessing.sequence import pad_sequences

In [30]:
def generate_text(model, tokenizer, seq_len, seed_text, num_gen_words):
    output_text = []
    input_text = seed_text
    for i in range(num_gen_words):
        encoded_text = tokenizer.texts_to_sequences([input_text])[0]
        pad_encoded = pad_sequences([encoded_text], maxlen=seq_len, truncating='pre')
        predict_x=model.predict(pad_encoded)
        pred_word_ind=np.argmax(predict_x,axis=1)[0]
        pred_word = tokenizer.index_word[pred_word_ind]
        input_text += ' ' + pred_word
        output_text.append(pred_word)

    return ' '.join(output_text)

In [44]:
import random
random_pick = random.randint(0,len(text_sequences))
random_pick

837

In [45]:
random_seed_text = text_sequences[random_pick]
print(random_seed_text)

['and', 'was', 'shown', 'up', 'to', 'the', 'chamber', 'which', 'had', 'formerly', 'been', 'in', 'part', 'my', 'own', 'his', 'manner', 'was', 'not', 'effusive']


In [46]:
seed_text = ' '.join(random_seed_text)
seed_text

'and was shown up to the chamber which had formerly been in part my own his manner was not effusive'

In [47]:
generate_text(model,tokenizer,seq_len,seed_text=seed_text,num_gen_words=10)



'it seldom was but he was glad i think to'

In [48]:
text='Hi, i am sheldon, who are'
print(text)

Hi, i am sheldon


In [51]:
generate_text(model,tokenizer,seq_len,seed_text=text,num_gen_words=2)



'little as'