In [8]:
import os
from bs4 import BeautifulSoup
import string
from nltk.tokenize import RegexpTokenizer
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.layers import Embedding, LSTM, Dense
from tensorflow.keras.models import Sequential, load_model
from tensorflow.keras.models import load_model
from keras.layers.core import Dense, Activation
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.optimizers import Adam
from keras.utils import to_categorical
import pickle
from keras.optimizers import RMSprop
import numpy as np
from numpy import array
import heapq

**Import data**

In [9]:
documents = []
for file in os.listdir("sample_data/"): # original: "reuters_data/"
    if file.endswith('.sgm'): # it is important for GoogleColab
        filename = os.path.join("sample_data", file) # original: "reuters_data"
        f = open(filename, 'r', encoding='utf-8', errors='ignore')
        dataFile = f.read()
        
        soup = BeautifulSoup(dataFile, 'html.parser')
        contents = soup.findAll('title')
        
        for content in contents:
            documents.append(content.text)

In [10]:
print('Number of documents: {}'.format(len(documents)))

Number of documents: 20841


In [11]:
documents[:10]

['GANTOS INC <GTOS> 4TH QTR JAN 31 NET',
 'CHEMLAWN CORP, ECHOLAB INC SIGN DEFINITIVE MERGER AGREEMENT\n',
 'LDC FOOD AID NEEDS DECLINE IN 1986/87 - USDA',
 'U.S. SUGAR PROGRAM CUT SENT TO CONGRESS BY USDA',
 '<OE INC> 4TH QTR NET',
 'TEXAS INSTRUMENTS <TXN> BEGINS BUILDING PLANT',
 'VMS MORTGAGE LP <VMLPZ> MONTHLY CASH PAYOUT',
 'LITTLE EFFECT SEEN FROM COLD STORAGE REPORT',
 'STRIKE THREAT, LOWER TRAFFIC MAR SEAWAY OPENING',
 'CHEMLAWN <CHEM>, ECOLAB <ECON> IN MERGER PACT']

**Join the documents**

In [12]:
data = ""
for d in documents:
    data += d

In [13]:
print('Number of data: {}'.format(len(data)))

Number of data: 941529


In [14]:
data = data.replace('\n', '').replace('\r', '').replace('\ufeff', '')
print('Number of data: {}'.format(len(data)))
print(data[:100])

Number of data: 939549
GANTOS INC <GTOS> 4TH QTR JAN 31 NETCHEMLAWN CORP, ECHOLAB INC SIGN DEFINITIVE MERGER AGREEMENTLDC F


In [15]:
# integer encode text
tokenizer = Tokenizer()
tokenizer.fit_on_texts([data])
encoded_data = tokenizer.texts_to_sequences([data])[0]
print(len(encoded_data))
encoded_data[:5]

139107


[7877, 5, 4926, 15, 3]

In [16]:
# determine the vocabulary size
unique_words = tokenizer.word_index
# unique_words = np.unique(words) # alternative version
vocab_size = len(unique_words) + 1  # 0 is reserved for padding so that's why we added 1
unique_word_index = dict((c, i) for i, c in enumerate(unique_words))
print('Vocabulary Size: %d' % vocab_size)

Vocabulary Size: 31096


In [None]:

unique_word_index = dict((c, i) for i, c in enumerate(unique_words))

**Next, we need to create sequences of words to fit the model with one word as input and one word as output.**

In [17]:
# create word -> word sequences
WORD_LENGTH = 5
prev_words = []
next_words = []
for i in range(1, len(encoded_data) - WORD_LENGTH):
    prev_words.append(encoded_data[i:i + WORD_LENGTH])
    next_words.append(encoded_data[i + WORD_LENGTH])
print('Total Sequences: %d' % len(prev_words))

Total Sequences: 139101


**Running this piece shows that we have a total of 139.101 input-output pairs to train the network**

---



In [18]:
# list(len(prev_words)[:5]) # [input, output]



**We can then split the sequences into input (X) and output elements (y)**



In [19]:
# split into X and y elements
X = prev_words
X = np.array(X)
Y = next_words
Y = np.array(Y)

# X = np.zeros((len(prev_words), WORD_LENGTH, vocab_size), dtype=bool)
# Y = np.zeros((len(next_words), vocab_size), dtype=bool)

In [None]:
print(X[:5])
print(Y[:5])

In [14]:
# one hot encode outputs
Y = to_categorical(Y, num_classes=vocab_size)
# define model
Y[:5]

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]], dtype=float32)

**Build the model**

In [21]:
model = Sequential()
model.add(Embedding(vocab_size, 10, input_length=1)) # original: 5
model.add(LSTM(50))
model.add(Dense(vocab_size, activation='softmax'))
print(model.summary())

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 1, 10)             310960    
                                                                 
 lstm (LSTM)                 (None, 50)                12200     
                                                                 
 dense (Dense)               (None, 31096)             1585896   
                                                                 
Total params: 1,909,056
Trainable params: 1,909,056
Non-trainable params: 0
_________________________________________________________________
None


**Train the model**

In [23]:
# fit network
optimizer = RMSprop(learning_rate=0.01)

# compile network
#### since labels are INTEGERS, we need to changed from loss='categorical_crossentropy'!!!
#### If you want to provide labels using one-hot representation, please use CategoricalCrossentropy loss.
model.compile(loss='sparse_categorical_crossentropy', optimizer=optimizer, metrics=['accuracy']) # optimizer ='adam'
history = model.fit(X, Y, batch_size=50, epochs=20, shuffle=True).history

## Alternative versions
# history = model.fit(X, Y, validation_split=0.05, batch_size=50, epochs=20, shuffle=True).history
# model.fit(X, Y, epochs=100)
# model.fit(X, y, epochs=150, batch_size=64, callbacks=[checkpoint, reduce, tensorboard_Visualization])

Epoch 1/20




Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


**Save trained model**

In [None]:
# After successful training, we will save the trained model and just load it back as needed.
model.save('keras_next_word_model.h5')
pickle.dump(history, open("history.p", "wb"))

model = load_model('keras_next_word_model.h5')
history = pickle.load(open("history.p", "rb"))

**Prediction**
Using saved model:
- we input the sample as a feature vector
- we convert the input string to a single feature vector

In [None]:
def prepare_input(text):
    x = np.zeros((1, WORD_LENGTH, vocab_size))
    for t, word in enumerate(text.split()):
        print(word)
        x[0, t, unique_word_index[word]] = 1
    return x
prepare_input("HOSPITAL CORP SAYS IT RECEIVED 47 DLR A SHARE OFFER FROM INVESTOR GROUP".lower())

In [None]:
# To choose the best possible "n" words after the prediction from the model ...
def sample(preds, top_n=3):
    preds = np.asarray(preds).astype('float64')
    preds = np.log(preds)
    exp_preds = np.exp(preds)
    preds = exp_preds / np.sum(exp_preds)
    return heapq.nlargest(top_n, range(len(preds)), preds.take)

In [None]:
# Use the function predict_completions to predict and return the list of "n" predicted words.
def predict_completions(text, n=3):
    if text == "":
        return("0")
    x = prepare_input(text)
    preds = model.predict(x, verbose=0)[0]
    next_indices = sample(preds, n)
    return [unique_words[idx] for idx in next_indices]

In [None]:
# We use tokenizer.tokenize fo removing the punctuations and also we choose 5 first words because our predicts base on 5 previous words.
q =  "GILLETTE CANADA ISSUES 70 MLN STG BOND"
print("correct sentence: ",q)
seq = " ".join(tokenizer.tokenize(q.lower())[0:5])
print("Sequence: ",seq)
print("next possible words: ", predict_completions(seq, 5))

**Creating a Prediction script**

In [None]:
from tensorflow.keras.models import load_model

# Load the model and tokenizer

model = history

def Predict_Next_Words(model, tokenizer, text):
    """
        In this function we are using the tokenizer and models trained
        and we are creating the sequence of the text entered and then
        using our model to predict and return the the predicted word.
    
    """
    for i in range(3):
        sequence = tokenizer.texts_to_sequences([text])[0]
        sequence = np.array(sequence)
        
        preds = model.predict_classes(sequence)
#         print(preds)
        predicted_word = ""
        
        for key, value in tokenizer.word_index.items():
            if value == preds:
                predicted_word = key
                break
        
        print(predicted_word)
        return predicted_word

In [None]:
"""
    We are testing our model and we will run the model
    until the user decides to stop the script.
    While the script is running we try and check if 
    the prediction can be made on the text. If no
    prediction can be made we just continue.

"""

# text1 = "at the dull"
# text2 = "collection of textile"
# text3 = "what a strenuous"
# text4 = "stop the script"

while(True):

    text = input("Enter your line: ")
    
    if text == "stop the script":
        print("Ending The Program.....")
        break
    
    else:
        try:
            text = text.split(" ")
            text = text[-1]

            text = ''.join(text)
            Predict_Next_Words(model, tokenizer, text)
            
        except:
            continue

Enter your line: ilona
Enter your line: ISRAELI HELICOPTERS
Enter your line: GILLETTE CANADA ISSUES
