NEXT WORD PREDICTION USING NEURAL NETWORKS

In [1]:
from warnings import filterwarnings
filterwarnings("ignore")

In [2]:
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer, one_hot
from tensorflow.keras.utils import to_categorical, pad_sequences 
from tensorflow.keras.layers import Embedding, LSTM, Dense
from tensorflow.keras.models import Sequential

from tensorflow.keras.optimizers import Adam
from tensorflow.keras.losses import sparse_categorical_crossentropy

import numpy as np
import pandas as pd
import pickle as pk

LOADING DATA SET

In [8]:
# NOTE(review): absolute, machine-specific Windows path - this cell only runs on a
# machine where the CSV exists at exactly this location.
df = pd.read_csv(r"C:\Users\kalyanram\OneDrive\Desktop\DATA HUB\archive\generalEnglishText.csv")

In [9]:
df.shape

(195776, 8)

In [10]:
df.columns

Index(['filename', 'text', 'up_votes', 'down_votes', 'age', 'gender', 'accent',
       'duration'],
      dtype='object')

In [11]:
df.head(5)

Unnamed: 0,filename,text,up_votes,down_votes,age,gender,accent,duration
0,cv-valid-train/sample-000000.mp3,learn to recognize omens and follow them the o...,1,0,,,,
1,cv-valid-train/sample-000001.mp3,everything in the universe evolved he said,1,0,,,,
2,cv-valid-train/sample-000002.mp3,you came so that you could learn about your dr...,1,0,,,,
3,cv-valid-train/sample-000003.mp3,so now i fear nothing because it was those ome...,1,0,,,,
4,cv-valid-train/sample-000004.mp3,if you start your emails with greetings let me...,3,2,,,,


In [12]:
# Only the `text` column is used from here on; the remaining columns are not needed.
text = df["text"].values

In [13]:
# The data set has 195776 rows; only the first N_ROWS are kept so that
# training stays fast. Change N_ROWS to train on more (or fewer) rows.
N_ROWS = 5000
text = text[:N_ROWS]
text[:5]

array(['learn to recognize omens and follow them the old king had said',
       'everything in the universe evolved he said',
       'you came so that you could learn about your dreams said the old woman',
       'so now i fear nothing because it was those omens that brought you to me',
       'if you start your emails with greetings let me be the first to welcome you to earth'],
      dtype=object)

TOKENIZING THE CORPUS

In [14]:
# Build the word -> integer-id vocabulary from the selected rows of text.
tokenizer = Tokenizer()
tokenizer.fit_on_texts(text)

SAVING THE TOKENIZER OBJECT

In [12]:
# Persist the fitted tokenizer so the same word -> id mapping can be reloaded later.
with open("tokenized.pkl", "wb") as tokenizer_file:
    pk.dump(tokenizer, tokenizer_file)

In [15]:
len(tokenizer.word_index) # Number of unique words

3906

In [16]:
tokenizer.word_index["universe"]

1489

In [17]:
# Replace every word of every row with its integer id from the fitted tokenizer.
text_sequences = tokenizer.texts_to_sequences(text)

In [18]:
text_sequences[:5]

[[260, 2, 1067, 197, 3, 820, 56, 1, 69, 212, 14, 16],
 [120, 12, 1, 1489, 1068, 6, 16],
 [11, 100, 47, 9, 11, 86, 260, 24, 42, 392, 16, 1, 69, 253],
 [47, 84, 7, 424, 106, 88, 10, 8, 117, 197, 9, 295, 11, 2, 25],
 [53, 11, 640, 42, 1262, 18, 1263, 201, 25, 27, 1, 118, 2, 1264, 11, 2, 296]]

In [19]:
len(text_sequences) # Number of text rows that were converted into integer sequences (5000 here).

5000

LET'S BEGIN BUILDING THE FEATURES THAT ARE SUITABLE FOR NEXT WORD PREDICTION AND FOR THE MODEL

FOR AN EXAMPLE:

Text in a row     -> My name is kalyan ram

Sequence allotted -> [1] [2] [3] [4]  [5]

PREDICTION SHOULD BE:

     X            Y
     [1][2]       [3]
     [2][3]       [4]
     
     Number of word stamps (window size) in X is 2
     
     X            Y
     [1][2][3]     [4]
     [2][3][4]     [5]
     
     Number of word stamps (window size) in X is 3
     
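     NOTE: The above is just an illustration. The technique actually used below is slightly
           different: each window grows from 2 words up to the maximum number of stamps and
           then slides forward, so it also covers short user input sequences.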

In [20]:
# Demonstration of the windowing method on a single encoded sentence.
var = [260, 2, 1067, 197, 3, 820, 56, 1, 69, 212, 14, 16]
def example (max_n_stamps = 4):
    """Print the training windows taken from the module-level ``var``.

    Windows end at positions 2 .. len(var). While a prefix still fits into
    ``max_n_stamps`` items the whole prefix is printed; afterwards the start
    of the window moves forward by one for every further end position.
    """
    start = 0

    for end in range(2, len(var) + 1):
        if end <= max_n_stamps:
            # The prefix is still short enough: show it from the beginning.
            print( var[:end] )
            continue
        # Prefix is too long: advance the left edge by one, then show the window.
        start += 1
        print( var[start:end] )
example()

[260, 2]
[260, 2, 1067]
[260, 2, 1067, 197]
[2, 1067, 197, 3]
[1067, 197, 3, 820]
[197, 3, 820, 56]
[3, 820, 56, 1]
[820, 56, 1, 69]
[56, 1, 69, 212]
[1, 69, 212, 14]
[69, 212, 14, 16]


THE CELL BELOW EXTRACTS AND ADJUSTS THE SEQUENCES FOR ALL THE ROWS

In [34]:
max_n_stamps = 6

# For every encoded row, collect one window per end position 2 .. len(row).
# A window starts at 0 while the prefix has at most max_n_stamps items and
# otherwise keeps only the last max_n_stamps items before the end position.
adj_sequence = [
    text_sequence[max(0, end - max_n_stamps):end]
    for text_sequence in text_sequences
    for end in range(2, len(text_sequence) + 1)
]


In [35]:
len(adj_sequence)

42318

LET'S USE PADDING TO BRING ALL THE SEQUENCES TO THE SAME SIZE

In [36]:
# The padded length defaults to max_n_stamps chosen in the windowing step.
# Raise `increment` to pad the rows to a length larger than max_n_stamps.

increment = 0
max_input_len = max_n_stamps + increment

# Zeros are added on the left ("pre") so the real words stay at the end of each row.
resulted_sequences = pad_sequences(
    adj_sequence,
    maxlen=max_input_len,
    padding="pre",
)

In [37]:
resulted_sequences

array([[   0,    0,    0,    0,  260,    2],
       [   0,    0,    0,  260,    2, 1067],
       [   0,    0,  260,    2, 1067,  197],
       ...,
       [   0,    0,    0,    0,   44,   76],
       [   0,    0,    0,   44,   76,    2],
       [   0,    0,   44,   76,    2,   56]])

In [26]:
print(type(resulted_sequences) , "|",len(resulted_sequences))

<class 'numpy.ndarray'> | 42318


SEPARATING THE X & Y DATA

In [27]:
# Every column except the last is the model input; the last column (kept 2-D) is the target word id.
x, y = resulted_sequences[:, :-1], resulted_sequences[:, -1:]

In [28]:
x[:6]

array([[   0,    0,    0,    0,  260],
       [   0,    0,    0,  260,    2],
       [   0,    0,  260,    2, 1067],
       [   0,  260,    2, 1067,  197],
       [ 260,    2, 1067,  197,    3],
       [   2, 1067,  197,    3,  820]])

In [29]:
y[:5]

array([[   2],
       [1067],
       [ 197],
       [   3],
       [ 820]])

In [30]:
# One more than the number of known words: id 0 is used for padding, so the
# largest word id equals len(word_index).
vocab_size = 1 + len(tokenizer.word_index)
print("Vocabulary size - ", vocab_size)

Vocabulary size -  3907


LETS  BUILD THE MODEL

In [32]:
# The input length is max_input_len - 1 because max_input_len is the row width
# before the last column was split off as the target y.
model = Sequential([
    Embedding(vocab_size, 100, input_length = max_input_len-1),
    LSTM(150),
    Dense(vocab_size, activation = "softmax"),
])
model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 5, 100)            390700    
                                                                 
 lstm (LSTM)                 (None, 150)               150600    
                                                                 
 dense (Dense)               (None, 3907)              589957    
                                                                 
Total params: 1,131,257
Trainable params: 1,131,257
Non-trainable params: 0
_________________________________________________________________


In [33]:
# y holds integer word ids (not one-hot vectors), hence the sparse variant of
# categorical cross-entropy as the loss.
model.compile(
    optimizer = "adam",
    loss = sparse_categorical_crossentropy,
    metrics = ["accuracy"]
)

In [34]:
# x and y are in-memory NumPy arrays. `workers` / `use_multiprocessing` only
# apply to generator or keras.utils.Sequence input and are rejected by newer
# Keras releases, so they are not passed here.
model.fit(x, y, epochs=10)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x153ce0325b0>

In [35]:
# NOTE: x and y are the same arrays the model was fitted on, so the numbers
# below are training loss/accuracy, not a held-out score.
model.evaluate(x,y)



[1.316225290298462, 0.7332577109336853]

In [None]:
# Persist the trained model under the name "nextWordPredictionModel".
model.save("nextWordPredictionModel")

 FROM HERE ON THE TRAINING IS FINISHED. LET'S PERFORM SOME TESTS ON THE MODEL

---

In [38]:
import spacy
import numpy as np
nlp = spacy.load("en_core_web_md")

In [44]:
def get_tokens(xtext: str, nlp, max_sequence_len = 5):
    """Return the last ``max_sequence_len`` usable words of ``xtext``.

    The text is stripped and lower-cased, passed through ``nlp``, and only
    tokens that are alphabetic (``is_alpha``) and not out-of-vocabulary
    (``is_oov``) are kept.

    Parameters
    ----------
    xtext : str
        Raw user text.
    nlp : callable
        Called with the cleaned text; must return an iterable of tokens that
        expose ``text``, ``is_alpha`` and ``is_oov``.
    max_sequence_len : int
        Maximum number of tokens to return (counted from the end of the text).

    Returns
    -------
    list of str
        At most ``max_sequence_len`` token strings, in their original order.
    """
    doc = nlp(xtext.strip().lower())
    resulted_tokens = [
        token.text for token in doc if token.is_alpha and not token.is_oov
    ]

    # Truncate against max_sequence_len itself. The previous hard-coded 5 let
    # up to 5 tokens through even when a smaller limit was requested.
    temp_len = len(resulted_tokens)
    if temp_len > max_sequence_len:
        return resulted_tokens[temp_len - max_sequence_len:]

    return resulted_tokens

In [57]:
def get_word_sequence(text: str, nlp = nlp, max_sequence_len = 5, flag = True):
    """Convert user text into a ``(1, max_sequence_len)`` array of word ids.

    Parameters
    ----------
    text : str
        Raw user text.
    nlp : callable
        Language pipeline forwarded to ``get_tokens``.
    max_sequence_len : int
        Width of the returned row; shorter inputs are left-padded with zeros.
    flag : bool
        When true, print the text before and after preprocessing.

    Returns
    -------
    numpy.ndarray or int
        A 2-D array with a single row of word ids. If any token is missing
        from ``tokenizer.word_index`` an error message is printed and the
        integer ``0`` is returned instead.
    """
    if flag:
        print("Before process - ", text)

    # Preprocess the input text into a list of lower-cased word tokens.
    tokens_list = get_tokens(text, nlp = nlp, max_sequence_len = max_sequence_len)
    if flag:
        print("After process  - ", tokens_list)

    # Only the most recent max_sequence_len words may reach the model, so the
    # returned row always has exactly the requested width.
    tokens_list = tokens_list[-max_sequence_len:]

    word_index_dict = tokenizer.word_index
    sequence_list = []
    for token in tokens_list:
        try:
            sequence_list.append(word_index_dict[token])
        except KeyError as k:
            print("ERROR    : This is a small model might be one of the word in provided text is not existed.. ")
            print("THEY ARE : ", k.args)
            return 0

    # Compare against max_sequence_len (previously a hard-coded 5) so that
    # non-default widths are padded correctly as well.
    if len(sequence_list) < max_sequence_len:
        # pad_sequences expects a 2-D (list of sequences) input.
        padded_list = pad_sequences([sequence_list], maxlen = max_sequence_len, padding="pre")
        return padded_list

    return np.array([sequence_list])

In [59]:
def predict_Next_Word(word_sequence_to_predict):
    """Return the word the model scores highest for the given input sequence.

    ``word_sequence_to_predict`` is passed to ``model.predict`` unchanged; only
    the first row of the prediction output is used. The winning index is
    mapped back to a word through ``tokenizer.index_word``.
    """
    scores = model.predict(word_sequence_to_predict, verbose=0)
    best_index = np.argmax(scores[0])
    return tokenizer.index_word[best_index]

NOTE : The model uses the last 5 words of your sentence to predict. If any of those words does not exist in the training vocabulary, an error message is printed and no prediction can be made.

In [63]:
text_from_user = "If you start your"
predict_seq = get_word_sequence(text_from_user, nlp=nlp, flag=False)
try:
    print(predict_Next_Word(predict_seq))
except Exception:
    # Catch ordinary failures only (e.g. an unknown word made get_word_sequence
    # return 0); a bare `except:` would also swallow KeyboardInterrupt.
    print("THE CODE BROKED")

emails


THESE ARE SOME RESULTS THAT MY MODEL PROVIDED

REPEATED TEXT GENERATION BY NEXT WORD PREDICTION

In [425]:
def generate_n_words(text: str, limit = 10):
    """Extend ``text`` one predicted word at a time, printing after each step.

    Parameters
    ----------
    text : str
        Seed text; each predicted word is appended to it, separated by a space.
    limit : int
        Maximum number of words to generate.

    Generation stops early when ``get_word_sequence`` reports a word that is
    not in the training vocabulary. Nothing is returned; the growing text is
    printed.
    """
    for _ in range(limit):
        predict_seq = get_word_sequence(text, nlp=nlp, flag=False)

        # get_word_sequence prints its own error and returns the integer 0 for
        # an unknown word. Stop here instead of crashing inside model.predict.
        if isinstance(predict_seq, int):
            break

        text = text + " " + predict_Next_Word(predict_seq)

        print(text)

In [444]:
generate_n_words("can i make a", limit = 15)

can i make a little
can i make a little crowd
can i make a little crowd of
can i make a little crowd of about
can i make a little crowd of about twenty
can i make a little crowd of about twenty people
can i make a little crowd of about twenty people surrounding
can i make a little crowd of about twenty people surrounding the
can i make a little crowd of about twenty people surrounding the huge
can i make a little crowd of about twenty people surrounding the huge hole
can i make a little crowd of about twenty people surrounding the huge hole had
can i make a little crowd of about twenty people surrounding the huge hole had said
can i make a little crowd of about twenty people surrounding the huge hole had said the
can i make a little crowd of about twenty people surrounding the huge hole had said the monk
can i make a little crowd of about twenty people surrounding the huge hole had said the monk about


------------- ACCOMPLISHED THE TRAINING -------------

---