# Level 2 - Word Prediction using LSTM

Follow the same steps as in Char Prediction (Level 1), but operate at the word level rather than at the character level.

## Importing Packages

In [1]:
import pandas as pd
import numpy as np
from sklearn.cross_validation import train_test_split
import keras
# Sequence to attain Padding
from keras.preprocessing import sequence
# Importing RNN's LSTM
from keras.layers import LSTM, Dense, Dropout
from keras.layers import Embedding
# Applying Sequential algorithm to model
from keras.models import Sequential
import nltk
from nltk.tokenize import word_tokenize

Using TensorFlow backend.


## Storing the Document

In [2]:
# Read the whole book into memory. The context manager closes the file handle as
# soon as the text is loaded, and the explicit encoding matches the UTF-8
# declaration in the book's own header instead of relying on the platform default.
with open('AliceinWonderland.txt', encoding='utf-8') as book_file:
    file = book_file.read()

## Calculating the number of unique words in the document

In [3]:
# Tokenize the whole document into a flat list of word/punctuation tokens
# (duplicates included; this is the sequence the model is trained on)
words = word_tokenize(file)

# The vocabulary size is the number of output classes (num_classes).
# The vocabulary is sorted so every token keeps the same position on every run:
# iteration order of a set of strings changes between Python processes, which
# would silently change the word <-> id mapping and invalidate saved weights.
unique_words = sorted(set(words))
unique_words_count = len(unique_words)
print(unique_words_count)

3721


## Converting words to integers

In [4]:
# Neural networks only accept numeric inputs, so every word is given an integer id

## Word -> id lookup; the id is the position of the word in unique_words
word_to_int = {word: index for index, word in enumerate(unique_words)}

## id -> word lookup, used to turn predictions back into text
int_to_word = {index: word for index, word in enumerate(unique_words)}

In [15]:
''' SLIDING FUNCTION: Slides over the input text file words by words'''

def slider(data, slide):
    """Cut a sequence into overlapping windows and their follow-up items.

    For every start position ``s`` in ``range(len(data) - slide)`` one input
    window ``data[s:s + slide]`` (as a list) is paired with a one-element
    list holding the item right after the window, ``data[s + slide]``.

    Returns a tuple ``(windows, targets)``; both lists are empty when
    ``data`` is not longer than ``slide``.
    """
    window_count = len(data) - slide
    windows = [list(data[start:start + slide]) for start in range(window_count)]
    targets = [[data[start + slide]] for start in range(window_count)]
    return windows, targets

In [16]:
''' WORDS TO INT CONVERSION FUNCTION: Converts words dataset to int dataset '''

def word_data_to_int_data(x,y, word_to_int):
    """Replace every word in the windows ``x`` and targets ``y`` by its id.

    ``x`` and ``y`` are lists of word lists; row ``k`` of ``y`` belongs to
    row ``k`` of ``x``. ``word_to_int`` maps each word to its integer id, and
    a word missing from it raises ``KeyError``.

    Returns a tuple ``(input_int, output_int)`` of lists of id lists.
    """
    def encode(tokens):
        # Look up the id of each token, keeping the original order
        return [word_to_int[token] for token in tokens]

    encoded_inputs = []
    encoded_targets = []
    for position, window in enumerate(x):
        encoded_inputs.append(encode(window))
        encoded_targets.append(encode(y[position]))
    return encoded_inputs, encoded_targets

In [20]:
''' INTIALIZATION FUNCTION: Accepts tokenized words, slide, list of unique words from the doc '''

def main(data, slide, char_to_int):
    """Turn a token sequence into model inputs and next-token targets.

    Parameters
    ----------
    data : sequence of tokens (words) in document order.
    slide : int, number of tokens in each input window.
    char_to_int : mapping from token to integer id. The parameter name is
        kept for compatibility with existing callers; in this notebook it
        receives the word-level mapping.

    Returns
    -------
    input_int : numpy array of shape (number of windows, slide, 1) with token ids.
    output_int : flat list with the id of the token following each window.
    """
    x, y = slider(data, slide)
    # Use the mapping supplied by the caller; previously the argument was ignored
    # and the notebook-global word_to_int was read instead.
    input_int, output_int = word_data_to_int_data(x, y, char_to_int)
    # Each target is a one-element list, so flatten to one id per window
    output_int = list(np.array(output_int).flatten())
    # LSTM layers expect (samples, timesteps, features); one feature (the id) per step
    input_int = np.array(input_int).reshape(len(input_int),slide,1)
    return input_int,output_int

## Initializing

In [21]:
# Window the full token list into 100-word inputs (X) and next-word target ids (Y)
X,Y = main(words,100,word_to_int)

In [22]:
''' X has shape (number of samples, 100, 1)

    Number of samples = len(words) - 100 (one per sliding-window position)
    Number of inputs  = 100 (Word1, Word2...., Word100)
               Output = 1 (the 101st word)
'''


X.shape

(38229, 100, 1)

In [23]:
# One target id per input window, so this equals X.shape[0]
len(Y)

38229

In [24]:
# Hold out 1% of the windows for validation; the fixed random_state keeps the split repeatable
x_train, x_test, y_train, y_test = train_test_split(X, Y, test_size=0.01, random_state=1)

### One-Hot-Encoding Output Values

In [26]:
# Each target id becomes a one-hot row with unique_words_count columns
# (one column per vocabulary word), e.g. [0, 0, 0, ..., 1]
y_train_oneHotEncoded = keras.utils.to_categorical(y_train, num_classes=unique_words_count)
y_test_oneHotEncoded = keras.utils.to_categorical(y_test, num_classes=unique_words_count)

In [27]:
# (training samples, window length, features per step)
x_train.shape

(37846, 100, 1)

In [28]:
# (training samples, vocabulary size)
y_train_oneHotEncoded.shape

(37846, 3721)

## LSTM Model

In [32]:
# Two stacked LSTM layers followed by a dense layer with one unit per vocabulary word
model = Sequential()
# return_sequences=True hands the full sequence of hidden states to the next LSTM
model.add(LSTM(64,input_shape=(x_train.shape[1], x_train.shape[2]), return_sequences=True))
model.add(LSTM(128))
# Exactly one word is correct per sample, so the output must be a probability
# distribution over the vocabulary: softmax (not sigmoid) is the activation that
# pairs with categorical_crossentropy on one-hot targets.
model.add(Dense(unique_words_count, activation="softmax"))
## Compiling Model
model.compile(loss="categorical_crossentropy", optimizer="adam", metrics=["accuracy"])

## Fitting the model from freshly initialised weights (no pre-trained weights loaded)
model.fit(x_train, y_train_oneHotEncoded, batch_size=32, epochs=1, validation_data=(x_test, y_test_oneHotEncoded))

Train on 37846 samples, validate on 383 samples
Epoch 1/1


<keras.callbacks.History at 0x1f5d95043c8>

In [33]:
### Loading Weights
#model.load_weights('weights-improvement-49-1.2575.hdf5', by_name=False)

In [34]:
### Adding Dropout (the rate must be a fraction in [0, 1); add the layer before compiling the model)
#model.add(Dropout(0.2))

In [35]:
# One row of per-word scores (length unique_words_count) for every held-out window
predict = model.predict(x_test)

In [36]:
# Yields [loss, accuracy] because the model was compiled with metrics=["accuracy"]
evaluate = model.evaluate(x_test, y_test_oneHotEncoded)



In [37]:
# Index 1 is the accuracy metric (index 0 is the loss); displayed as a percentage
accuracy = evaluate[1]
accuracy*100

5.2219320837573653

## Test Input

In [55]:
# First 5000 characters (not words) of the book.
# NOTE(review): this excerpt is part of the text the model was trained on, so the
# predictions below are not measured on unseen data.
test = file[:5000]

In [56]:
# Tokenize the excerpt and run it through the same 100-word windowing pipeline
test_x, test_y = main(word_tokenize(test),100,word_to_int)

In [57]:
# (windows in the excerpt, window length, features per step)
test_x.shape

(1016, 100, 1)

In [58]:
# Most likely word id for each window. Sequential.predict_classes no longer exists
# in newer Keras releases; the argmax over the class axis of predict() gives the
# same ids and works on both old and new versions.
pre = np.argmax(model.predict(test_x), axis=-1)

In [59]:
# Translate predicted and actual word ids back into words, pair by pair
decoded_pairs = [(int_to_word[guess_id], int_to_word[true_id]) for guess_id, true_id in zip(pre, test_y)]
output = [guess for guess, _ in decoded_pairs]
actual = [truth for _, truth in decoded_pairs]

In [60]:
# Show each predicted word next to the word that actually follows the window
for predicted_word, actual_word in zip(output, actual):
    print("predicted : ", predicted_word, " Actual : ", actual_word)

predicted :  ,  Actual :  Character
predicted :  ,  Actual :  set
predicted :  ,  Actual :  encoding
predicted :  ,  Actual :  :
predicted :  ,  Actual :  UTF-8
predicted :  ,  Actual :  ***
predicted :  ,  Actual :  START
predicted :  ,  Actual :  OF
predicted :  ,  Actual :  THIS
predicted :  ,  Actual :  PROJECT
predicted :  ,  Actual :  GUTENBERG
predicted :  ,  Actual :  EBOOK
predicted :  ,  Actual :  ALICE
predicted :  ,  Actual :  ’
predicted :  ,  Actual :  S
predicted :  ,  Actual :  ADVENTURES
predicted :  ,  Actual :  IN
predicted :  ,  Actual :  WONDERLAND
predicted :  ,  Actual :  ***
predicted :  ,  Actual :  ALICE
predicted :  ,  Actual :  ’
predicted :  ,  Actual :  S
predicted :  ,  Actual :  ADVENTURES
predicted :  ,  Actual :  IN
predicted :  ,  Actual :  WONDERLAND
predicted :  ,  Actual :  Lewis
predicted :  ,  Actual :  Carroll
predicted :  ,  Actual :  THE
predicted :  ,  Actual :  MILLENNIUM
predicted :  ,  Actual :  FULCRUM
predicted :  ,  Actual :  EDITION
pr

predicted :  ,  Actual :  then
predicted :  ,  Actual :  dipped
predicted :  ,  Actual :  suddenly
predicted :  ,  Actual :  down
predicted :  ,  Actual :  ,
predicted :  ,  Actual :  so
predicted :  ,  Actual :  suddenly
predicted :  ,  Actual :  that
predicted :  ,  Actual :  Alice
predicted :  ,  Actual :  had
predicted :  ,  Actual :  not
predicted :  ,  Actual :  a
predicted :  ,  Actual :  moment
predicted :  ,  Actual :  to
predicted :  ,  Actual :  think
predicted :  ,  Actual :  about
predicted :  ,  Actual :  stopping
predicted :  ,  Actual :  herself
predicted :  ,  Actual :  before
predicted :  ,  Actual :  she
predicted :  ,  Actual :  found
predicted :  ,  Actual :  herself
predicted :  ,  Actual :  falling
predicted :  ,  Actual :  down
predicted :  ,  Actual :  a
predicted :  ,  Actual :  very
predicted :  ,  Actual :  deep
predicted :  ,  Actual :  well
predicted :  ,  Actual :  .
predicted :  ,  Actual :  Either
predicted :  ,  Actual :  the
predicted :  ,  Actual :  

predicted :  ,  Actual :  ignorant
predicted :  ,  Actual :  little
predicted :  ,  Actual :  girl
predicted :  ,  Actual :  she
predicted :  ,  Actual :  ’
predicted :  ,  Actual :  ll
predicted :  ,  Actual :  think
predicted :  ,  Actual :  me
predicted :  ,  Actual :  for
predicted :  ,  Actual :  asking
predicted :  ,  Actual :  !
predicted :  ,  Actual :  No
predicted :  ,  Actual :  ,
predicted :  ,  Actual :  it
predicted :  ,  Actual :  ’
predicted :  ,  Actual :  ll
predicted :  ,  Actual :  never
predicted :  ,  Actual :  do
predicted :  ,  Actual :  to
predicted :  ,  Actual :  ask
predicted :  ,  Actual :  :
predicted :  ,  Actual :  perhaps
predicted :  ,  Actual :  I
predicted :  ,  Actual :  shall
predicted :  ,  Actual :  see
predicted :  ,  Actual :  it
predicted :  ,  Actual :  written
predicted :  ,  Actual :  up
predicted :  ,  Actual :  somewhere.
predicted :  ,  Actual :  ’
predicted :  ,  Actual :  Down
predicted :  ,  Actual :  ,
predicted :  ,  Actual :  down
