In [324]:
import tensorflow as tf
import numpy as np
import nltk
import urllib.request
from keras.preprocessing.text import Tokenizer
from keras.utils import pad_sequences
from keras.models import Sequential
from keras.layers import Dense,Dropout, Embedding, LSTM, SpatialDropout1D
from sklearn.model_selection import train_test_split
from keras.utils.np_utils import to_categorical
import re

### Getting text

In [325]:
def get_text(url,start_ind):
  file = urllib.request.urlopen(url)
  text = [line.decode('utf-8') for line in file]
  text = ''.join(text)
  # text = re.sub('<[^<]+?>', '', text)
  text = re.sub(r'[^A-Za-z.,!? ]+', ' ', text)
  text = re.sub(' +',' ',text)
  text = text[start_ind:]
  return text.lower()

In [326]:
url = "https://www.gutenberg.org/files/11/11-0.txt"
text = get_text(url,1338)

### Text preprocessing

lemmatization

In [327]:
nltk.download('averaged_perceptron_tagger')
from nltk import word_tokenize,sent_tokenize, pos_tag
from nltk.stem import WordNetLemmatizer
wnl = WordNetLemmatizer()
lem_text = [wnl.lemmatize(i,j[0].lower()) if j[0].lower() in ['a','n','v'] else wnl.lemmatize(i) for i,j in pos_tag(word_tokenize(text))]

[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


In [328]:
' '.join(lem_text)

'down the rabbit hole alice be begin to get very tired of sit by her sister on the bank , and of have nothing to do once or twice she have peep into the book her sister be read , but it have no picture or conversation in it , and what be the use of a book , think alice without picture or conversation ? so she be consider in her own mind a well a she could , for the hot day make her feel very sleepy and stupid , whether the pleasure of make a daisy chain would be worth the trouble of get up and pick the daisy , when suddenly a white rabbit with pink eye run close by her . there be nothing so very remarkable in that nor do alice think it so very much out of the way to hear the rabbit say to itself , oh dear ! oh dear ! i shall be late ! when she think it over afterwards , it occur to her that she ought to have wonder at this , but at the time it all seem quite natural but when the rabbit actually take a watch out of it waistcoat pocket , and look at it , and then hurry on , alice start t

Delete stop words

In [329]:
from nltk.corpus import stopwords
stopwords = stopwords.words("english")
text_cl = [word for word in lem_text if word not in stopwords]

In [330]:
clean_text = ' '.join(text_cl)

Sentance tokenization

In [331]:
text_sent_tok = sent_tokenize(clean_text)

Sequencing N-grams

In [332]:
tokenizer = Tokenizer()

def get_sequence_of_tokens(corpus):
    ## tokenization
    tokenizer.fit_on_texts(corpus)
    total_words = len(tokenizer.word_index) + 1

    ## convert data to sequence of tokens
    input_sequences = []
    for line in corpus:
        token_list = tokenizer.texts_to_sequences([line])[0]
        for i in range(1, len(token_list)):
            n_gram_sequence = token_list[:i+1]
            input_sequences.append(n_gram_sequence)
    return input_sequences, total_words

inp_sequences, total_words = get_sequence_of_tokens(text_sent_tok)

In [333]:
def generate_padded_sequences(input_sequences):
    max_sequence_len = max([len(x) for x in input_sequences])
    input_sequences = np.array(pad_sequences(input_sequences, maxlen=max_sequence_len, padding='pre'))

    predictors, label = input_sequences[:,:-1],input_sequences[:,-1]
    label = to_categorical(label, num_classes=total_words)
    return predictors, label, max_sequence_len

predictors, label, max_sequence_len = generate_padded_sequences(inp_sequences)

### Creating model

In [334]:
def create_model(max_sequence_len, total_words):
    input_len = max_sequence_len - 1
    model = Sequential()

    # Add Input Embedding Layer
    model.add(Embedding(total_words, 10, input_length=input_len))

    # Add Hidden Layer 1 - LSTM Layer
    model.add(LSTM(100,return_sequences=True))
    model.add(Dropout(0.2))

    # Add Hidden Layer 2 - LSTM Layer
    model.add(LSTM(50))
    model.add(Dropout(0.2))

    # Add Output Layer
    model.add(Dense(total_words, activation='softmax'))

    model.compile(loss='categorical_crossentropy', optimizer=tf.keras.optimizers.Adam(learning_rate=1.0e-2))

    return model

model = create_model(max_sequence_len, total_words)
model.summary()

Model: "sequential_12"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_12 (Embedding)    (None, 84, 10)            22510     
                                                                 
 lstm_15 (LSTM)              (None, 84, 100)           44400     
                                                                 
 dropout_13 (Dropout)        (None, 84, 100)           0         
                                                                 
 lstm_16 (LSTM)              (None, 50)                30200     
                                                                 
 dropout_14 (Dropout)        (None, 50)                0         
                                                                 
 dense_10 (Dense)            (None, 2251)              114801    
                                                                 
Total params: 211,911
Trainable params: 211,911
Non-t

In [None]:
callbacks = [tf.keras.callbacks.EarlyStopping(monitor='loss', patience=10, verbose=1),
             tf.keras.callbacks.ReduceLROnPlateau(monitor='loss', factor=0.05, patience=5, verbose=1,)]

model.fit(predictors, label, batch_size=32, callbacks=callbacks,epochs=30, verbose=1)

In [308]:
model.evaluate(x=predictors, y=label)



5.451775550842285

In [315]:
def generate_text(seed_text, next_words, model, max_sequence_len):
    for _ in range(next_words):
        token_list = tokenizer.texts_to_sequences([seed_text])[0]
        token_list = pad_sequences([token_list], maxlen=max_sequence_len-1, padding='pre')
        predict_x=model.predict(token_list)
        classes_x=np.argmax(predict_x,axis=1)

        output_word = ""
        for word,index in tokenizer.word_index.items():
            if index == classes_x:
                output_word = word
                break
        seed_text += " "+output_word
    return seed_text.title()

In [344]:
generate_text('Tea is ',10,model,max_sequence_len)



'Tea Is  Wrap Wrap Permission Frighten Frighten Adoption Adoption Adoption Adoption Light'