In [1]:
import os
from bs4 import BeautifulSoup
import string
from nltk.tokenize import RegexpTokenizer
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.layers import Embedding, LSTM, Dense
from tensorflow.keras.models import Sequential, load_model
from keras.layers.core import Dense, Activation
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.optimizers import Adam
from keras.utils import to_categorical
import pickle
from keras.optimizers import RMSprop
import numpy as np
from numpy import array
import heapq

**Import data**

In [3]:
documents = []
for file in os.listdir("sample_data/"): # original: "reuters_data/"
    if file.endswith('.sgm'): # it is important for GoogleColab
        filename = os.path.join("sample_data", file) # original: "reuters_data"
        f = open(filename, 'r', encoding='utf-8', errors='ignore')
        dataFile = f.read()
        
        soup = BeautifulSoup(dataFile, 'html.parser')
        contents = soup.findAll('title')
        
        for content in contents:
            documents.append(content.text)

In [7]:
print('Number of documents: {}'.format(len(documents)))

Number of documents: 20841


In [6]:
documents[:10]

['PAXAR CORP <PAKS> MAKES ACQUISITION',
 "<MARK'S WORK WEARHOUSE LTD> YEAR JAN 31 NET",
 'KEY TRONIC <KTCC> GETS NEW BUSINESS',
 'CANADIAN BASHAW, ERSKINE RESOURCES TO MERGE',
 'ENTOURAGE <ENTG> HAS FIRST QUARTER LOSS',
 '<MR. JAX FASHIONS INC> YEAR FEB 28 NET',
 'DIGITAL COMMUNICATIONS<DCAI> NAMES NEW PRESIDENT',
 'DIGITAL <DEC> IN TERADYNE <TER> LICENSING PACT',
 'HOME INTENSIVE <KDNY> EXTENDS DIALYSIS AT HOME',
 'BACHE CANADA BUYS TORONTO STOCK EXCHANGE SEAT']

**Join the documents**

In [9]:
data = ""
for d in documents:
    data += d

In [11]:
print('Number of data: {}'.format(len(data)))

Number of data: 941529


In [21]:
data = data.replace('\n', '').replace('\r', '').replace('\ufeff', '')
print('Number of data: {}'.format(len(data)))
print(data[:100])

Number of data: 939549
PAXAR CORP <PAKS> MAKES ACQUISITION<MARK'S WORK WEARHOUSE LTD> YEAR JAN 31 NETKEY TRONIC <KTCC> GETS


In [22]:
# integer encode text
tokenizer = Tokenizer()
tokenizer.fit_on_texts([data])
encoded_data= tokenizer.texts_to_sequences([data])[0]
print(len(encoded_data))
encoded_data[:5]

139107


[7877, 8, 3599, 230, 413]

In [24]:
# determine the vocabulary size
vocab_size = len(tokenizer.word_index) + 1  # 0 is reserved for padding so that's why we added 1
print('Vocabulary Size: %d' % vocab_size)

Vocabulary Size: 31097


31097

**Next, we need to create sequences of words to fit the model with one word as input and one word as output.**

In [25]:
# create word -> word sequences
WORD_LENGTH = 5
prev_words = []
next_words = []
for i in range(1, len(encoded_data) - WORD_LENGTH):
    prev_words.append(encoded_data[i:i + WORD_LENGTH])
    next_words.append(encoded_data[i + WORD_LENGTH])
print('Total Sequences: %d' % len(prev_words))

Total Sequences: 139101


**Running this piece shows that we have a total of 139.101 input-output pairs to train the network**

---



In [31]:
# list(len(prev_words)[:5]) # [input, output]



**We can then split the sequences into input (X) and output elements (y)**



In [32]:
# split into X and y elements
X = prev_words
Y = next_words

In [33]:
print(X[:5])
print(Y[:5])

[[8, 3599, 230, 413, 7878], [3599, 230, 413, 7878, 851], [230, 413, 7878, 851, 4923], [413, 7878, 851, 4923, 66], [7878, 851, 4923, 66, 24]]
[851, 4923, 66, 24, 49]


In [34]:
# one hot encode outputs
Y = to_categorical(Y, num_classes=vocab_size)
# define model
Y[:5]

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]], dtype=float32)

**Build the model**

In [35]:
model = Sequential()
model.add(Embedding(vocab_size, 10, input_length=1)) # original: 5
model.add(LSTM(50))
model.add(Dense(vocab_size, activation='softmax'))
print(model.summary())

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 1, 10)             310970    
                                                                 
 lstm (LSTM)                 (None, 50)                12200     
                                                                 
 dense (Dense)               (None, 31097)             1585947   
                                                                 
Total params: 1,909,117
Trainable params: 1,909,117
Non-trainable params: 0
_________________________________________________________________
None


**Train the model**

In [None]:
# fit network
optimizer = RMSprop(lr=0.01)

# compile network
model.compile(loss='categorical_crossentropy', optimizer=optimizer, metrics=['accuracy']) # optimizer ='adam'
history = model.fit(X, Y, validation_split=0.05, batch_size=50, epochs=20, shuffle=True).history
# model.fit(X, Y, epochs=100) # alternative version