In [1]:
import os
from bs4 import BeautifulSoup
import string
from nltk.tokenize import RegexpTokenizer
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.layers import Embedding, LSTM, Dense
from tensorflow.keras.models import Sequential, load_model
from tensorflow.keras.models import load_model
from keras.layers.core import Dense, Activation
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.optimizers import Adam
from keras.utils import to_categorical
import pickle
from keras.optimizers import RMSprop
import numpy as np
from numpy import array
import heapq

**Import data**

In [2]:
documents = []
for file in os.listdir("sample_data/"): # original: "reuters_data/"
    if file.endswith('.sgm'): # it is important for GoogleColab
        filename = os.path.join("sample_data", file) # original: "reuters_data"
        f = open(filename, 'r', encoding='utf-8', errors='ignore')
        dataFile = f.read().lower()
        
        soup = BeautifulSoup(dataFile, 'html.parser')

        ## get all 'topic'
        # topics = {topic.name for topic in soup.find_all()}
  
        ## iterate all 'topic'
        # for topic in topics:
          
        ## find all element of 'topic'
          # for i in soup.find_all('topic'):
  
        ## if tag has attribute of class
            # if i.has_attr('trade'):
  
# We have selected the following 20 'TOPICS' out of 135:
# that we want to use for our prediction exercise: 1.) "trade" 2.) "earn" 3.) "grain" 4.) "money-fx" 5.) 
# "coffee" 6.) "gold" 7.) "acq" 8.) "wheat" 9.) "veg-oil" 10.) "nat-gas" 11.) "cooper" 12.) "ship" 13.) 
# "dlr" 14.) "crude" 15.) "interest" 16.) "meal-feed" 17.) "alum" 18.) "money-supply" 19.) "cocoa" 20.) "livestock"
        contents = soup.findAll('title')
        
        for content in contents:
            documents.append(content.text)

In [3]:
print('Number of documents: {}'.format(len(documents)))

Number of documents: 9953


In [4]:
# Remove duplicated strings from the list of strings
documents = [i for n, i in enumerate(documents) if i not in documents[:n]]

print('Number of documents: {}'.format(len(documents)))

Number of documents: 9599


In [5]:
documents[:10]

['ibc coffee auctions to start soon - dauster',
 'amca (ail) names new chairman',
 'api oil inventory report to be issued tonight',
 'soro group to limit fairchild <fen> stock buys',
 'fed adds reserves via two-day repurchases',
 'capital associates <caii.o> to trade on nasdaq',
 'mint reviews offers on 3,701,000 lbs copper',
 'ahed <ahm.to> may issue one mln shares',
 'u.s. west <usw> introduces data network product',
 'french winter cereal sowing seen little changed']

**Join the documents**

In [6]:
data = ""
for d in documents:
    data += d

In [7]:
print('Number of data: {}'.format(len(data)))

Number of data: 435763


In [8]:
# improve punctuation
data = data.replace('\n', '').replace('\r', '').replace('\ufeff', '')
print('Number of data: {}'.format(len(data)))
print(data[:100])

Number of data: 434881
ibc coffee auctions to start soon - dausteramca (ail) names new chairmanapi oil inventory report to 


In [9]:
translator = str.maketrans(string.punctuation, ' '*len(string.punctuation)) #map punctuation to space
data = data.translate(translator)
print(data[:100])

ibc coffee auctions to start soon   dausteramca  ail  names new chairmanapi oil inventory report to 


In [10]:
# integer encode text
tokenizer = Tokenizer()
tokenizer.fit_on_texts([data])
encoded_data = tokenizer.texts_to_sequences([data])[0]
print(len(encoded_data))
encoded_data[:5]

64481


[1570, 209, 1974, 1, 444]

In [11]:
# determine the vocabulary size
# unique_words = tokenizer.word_index
unique_words = np.unique(encoded_data)
vocab_size = len(unique_words) + 1  # 0 is reserved for padding so that's why we added 1
unique_word_index = dict((c, i) for i, c in enumerate(unique_words))
print('Vocabulary Size: %d' % vocab_size)

Vocabulary Size: 17323


**Next, we need to create sequences of words to fit the model with one word as input and one word as output.**

In [12]:
# create word -> word sequences
WORD_LENGTH = 5
prev_words = []
next_words = []
for i in range(1, len(encoded_data) - WORD_LENGTH):
    prev_words.append(encoded_data[i:i + WORD_LENGTH])
    next_words.append(encoded_data[i + WORD_LENGTH])
print(prev_words[0])
print(next_words[0])

[209, 1974, 1, 444, 990]
4413


In [13]:
print('Total Sequences: %d' % len(prev_words))

Total Sequences: 64475


In [14]:
# list(len(prev_words)[:5]) # [input, output]



**We can then split the sequences into input (X) and output elements (y)**



In [15]:
# split into X and y elements
X = prev_words
X = np.array(X)
Y = next_words
Y = np.array(Y)

# X = np.zeros((len(prev_words), WORD_LENGTH, vocab_size), dtype=bool)
# Y = np.zeros((len(next_words), vocab_size), dtype=bool)

In [16]:
print(X[:5])
print(Y[:5])

[[ 209 1974    1  444  990]
 [1974    1  444  990 4413]
 [   1  444  990 4413 2675]
 [ 444  990 4413 2675  106]
 [ 990 4413 2675  106   21]]
[4413 2675  106   21 4414]


In [None]:
# one hot encode outputs
Y = to_categorical(Y, num_classes=vocab_size)
# define model
Y[:5]

In [20]:
X.shape
Y.shape

(64475, 17323)

**Build the model**

In [21]:
model = Sequential()
model.add(Embedding(vocab_size, 10, input_length=1)) # original: 5
model.add(LSTM(32))
model.add(Dense(vocab_size, activation='softmax'))
print(model.summary())

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 1, 10)             173230    
                                                                 
 lstm (LSTM)                 (None, 32)                5504      
                                                                 
 dense (Dense)               (None, 17323)             571659    
                                                                 
Total params: 750,393
Trainable params: 750,393
Non-trainable params: 0
_________________________________________________________________
None


**Train the model**

In [23]:
# fit network
optimizer = RMSprop(learning_rate=0.01)

# compile network
#### since labels are INTEGERS, we need to changed from loss='categorical_crossentropy'!!!
#### If you want to provide labels using one-hot representation, please use CategoricalCrossentropy loss.
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy']) # optimizer ='adam'
model.fit(X, Y, epochs=20)

## Alternative versions
# history = model.fit(X, Y, validation_split=0.05, batch_size=50, epochs=20, shuffle=True).history
# history = model.fit(X, Y, validation_split=0.05, batch_size=50, epochs=20, shuffle=True).history
# model.fit(X, Y, epochs=100)
# model.fit(X, y, epochs=150, batch_size=64, callbacks=[checkpoint, reduce, tensorboard_Visualization])

Epoch 1/20




Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


<keras.callbacks.History at 0x7f77da250220>

**Save trained model**

In [26]:
# After successful training, we will save the trained model and just load it back as needed.
model.save('keras_next_word_model.h5')

In [None]:
pickle.dump(history, open("history.p", "wb"))
model = load_model('keras_next_word_model.h5')
history = pickle.load(open("history.p", "rb"))

**Prediction**
Using saved model:
- we input the sample as a feature vector
- we convert the input string to a single feature vector

In [None]:
def prepare_input(text):
    x = np.zeros((1, WORD_LENGTH, vocab_size))
    for t, word in enumerate(text.split()):
        print(word)
        x[0, t, unique_word_index[word]] = 1
    return x
prepare_input("HOSPITAL CORP SAYS IT RECEIVED 47 DLR A SHARE OFFER FROM INVESTOR GROUP".lower())

In [None]:
# To choose the best possible "n" words after the prediction from the model ...
def sample(preds, top_n=3):
    preds = np.asarray(preds).astype('float64')
    preds = np.log(preds)
    exp_preds = np.exp(preds)
    preds = exp_preds / np.sum(exp_preds)
    return heapq.nlargest(top_n, range(len(preds)), preds.take)

In [None]:
# Use the function predict_completions to predict and return the list of "n" predicted words.
def predict_completions(text, n=3):
    if text == "":
        return("0")
    x = prepare_input(text)
    preds = model.predict(x, verbose=0)[0]
    next_indices = sample(preds, n)
    return [unique_words[idx] for idx in next_indices]

In [None]:
# We use tokenizer.tokenize fo removing the punctuations and also we choose 5 first words because our predicts base on 5 previous words.

q =  "GILLETTE CANADA ISSUES 70 MLN STG BOND"

## 20 EXAMPLES FOR EVALUATION:
"""
'AMES DEPARTMENT STORE <ADD> MARCH SALES UP'
'ISRAELI HELICOPTERS RAID SOUTH LEBANON - RADIO'
'GILLETTE CANADA ISSUES 70 MLN STG BOND'
'DIGITAL COMMUNICATIONS <DCAI> SELLS SWITCHES'
'ITALIAN TREASURY BILL OFFER MEETS MIXED DEMAND'
'WESTLAND TO CUT A THIRD OF HELICOPTER WORKFORCE'
'USDA DETAILS FREE GRAIN STOCKS UNDER LOAN'
'FED SAYS U.S. DISCOUNT WINDOW BORROWINGS 361 MLN DLRS IN APRIL 8 WEEK'
'HOSPITAL CORP SAYS IT RECEIVED 47 DLR A SHARE OFFER FROM INVESTOR GROUP'
'FED SEEN BUYING DOLLARS FOR YEN IN OPEN MARKET'
'DOLLAR ENDS LOWER IN LACKLUSTRE FRANKFURT'
'HEALTH AND REHABILITATION <HRP> INITIAL PAYOUT'
'SUPERMARKETS GENERAL <SGL> FIVE WEEK SALES'
'SEKISUI CHEMICAL ISSUES EQUITY WARRANT EUROBOND'
'WEST GERMAN BEET PLANTINGS DELAYED THREE WEEKS'
'BURMAH OIL PROSPECTS REMAIN FAVOURABLE'
'PARKER DRILLING CO <PKD> 2ND QTR FEB 28 LOSS'
'TURKEY CALLS FOR DIALOGUE TO SOLVE DISPUTE'
'INVESTMENT TECHNOLOGIES <IVES> IN REBATE PACT'
'ENTOURAGE <ENTG> HAS FIRST QUARTER LOSS'
"""

print("correct sentence: ",q)
seq = " ".join(tokenizer.tokenize(q.lower())[0:5])
print("Sequence: ",seq)
print("next possible words: ", predict_completions(seq, 5))

**Creating a Prediction script**

In [None]:
from tensorflow.keras.models import load_model

# Load the model and tokenizer

model = history

def Predict_Next_Words(model, tokenizer, text):
    """
        In this function we are using the tokenizer and models trained
        and we are creating the sequence of the text entered and then
        using our model to predict and return the the predicted word.
    
    """
    for i in range(3):
        sequence = tokenizer.texts_to_sequences([text])[0]
        sequence = np.array(sequence)
        
        preds = model.predict_classes(sequence)
#         print(preds)
        predicted_word = ""
        
        for key, value in tokenizer.word_index.items():
            if value == preds:
                predicted_word = key
                break
        
        print(predicted_word)
        return predicted_word

In [None]:
"""
    We are testing our model and we will run the model
    until the user decides to stop the script.
    While the script is running we try and check if 
    the prediction can be made on the text. If no
    prediction can be made we just continue.

"""

# text1 = "at the dull"
# text2 = "collection of textile"
# text3 = "what a strenuous"
# text4 = "stop the script"

while(True):

    text = input("Enter your line: ")
    
    if text == "stop the script":
        print("Ending The Program.....")
        break
    
    else:
        try:
            text = text.split(" ")
            text = text[-1]

            text = ''.join(text)
            Predict_Next_Words(model, tokenizer, text)
            
        except:
            continue

Enter your line: ilona
Enter your line: ISRAELI HELICOPTERS
Enter your line: GILLETTE CANADA ISSUES
