# NLP with LSTM Model
|Title|Link|
|-|-|
|Zipf's Law|https://github.com/Ashvith/Zipf-s-Law/blob/master/zipfs_law.ipynb|
|Next word prediction with NLP and deep learning|https://towardsdatascience.com/next-word-prediction-with-nlp-and-deep-learning-48b9fe0a17bf|

### Import the libraries

In [3]:
# Inbuilt libraries, NLTK and pandas
import pickle
import numpy as np
import os
import nltk
import pandas as pd     
# from nltk.corpus import stopwords
from matplotlib import pyplot as plt
from nltk.tokenize import word_tokenize


# Tensorflow and Keras libraries
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.layers import Embedding, LSTM, Dense
from tensorflow.keras.models import Sequential
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.optimizers import Adam
from tensorflow import keras
import os

# Set the TF_FORCE_GPU_ALLOW_GROWTH environment variable for this process.
# NOTE(review): per the TensorFlow GPU guide this asks TensorFlow to grow GPU
# memory on demand instead of reserving it all up front - confirm for the
# installed TensorFlow version.
os.environ.update({"TF_FORCE_GPU_ALLOW_GROWTH": "true"})


### Extracting all the words in the corpus
  
>If you're working with LSTMs or other models that capture semantic meaning, where the meaning of a word depends on the context of the preceding text, it is important not to remove stopwords.

In [None]:
# Read the word list of 'austen-emma.txt' from the NLTK Gutenberg corpus and
# wrap it in an nltk.Text object. Stopwords are deliberately left in.
emma_words = nltk.corpus.gutenberg.words('austen-emma.txt')
words_doc = nltk.Text(emma_words)
# stop_words = set(stopwords.words('english'))
# nltk.corpus.gutenberg.fileids()


### Converting to lower case
>Again, we will NOT filter out the stopwords, for the reason given above.

In [None]:
# Keep only purely alphabetic tokens, lower-case them, and glue them back
# together into one space-separated string.
data = ' '.join(token.lower() for token in words_doc if token.isalpha())
# words_doc = [word for word in words_doc if word not in stop_words]

In [None]:
# Show the complete cleaned, lower-cased corpus string in the cell output.
print(data)

In [None]:
# nltk.download('punkt')
tokens = word_tokenize(data)

# Each training sample is 3 context words followed by the 1 word to predict.
train_len = 3+1

# Slide a window of train_len tokens over the corpus, one token at a time.
# The upper bound is len(tokens) + 1 so that the final window - the one whose
# target is the very last token - is included too; stopping at len(tokens)
# silently dropped it.
text_sequences = [
    tokens[end - train_len:end]
    for end in range(train_len, len(tokens) + 1)
]

print(text_sequences[:20])

# Initial state for the manual word -> id numbering performed in the next cell.
sequences = {}
count = 1

In [None]:
# Give every distinct token an integer id in order of first appearance,
# continuing from the current value of `count`.
# NOTE(review): `sequences` is reassigned from the Keras tokenizer output in a
# later cell, so this mapping appears to be exploratory only - confirm.
for word in tokens:
    if word in sequences:
        continue
    sequences[word] = count
    count += 1

print(sequences)

In [None]:
# Fit a Keras tokenizer on the token windows so every word gets an integer id.
tokenizer = Tokenizer()
tokenizer.fit_on_texts(text_sequences)

# Persist the fitted tokenizer for the prediction cell. The context manager
# guarantees the file is flushed and closed (the bare open() never was).
with open('tokenizer1.pkl', 'wb') as tokenizer_file:
    pickle.dump(tokenizer, tokenizer_file)

# sequence_data = tokenizer.texts_to_sequences([data])[0]
# print(sequence_data)

# Replace each word in every window by its integer id.
sequences = tokenizer.texts_to_sequences(text_sequences)
print(sequences[:20])

In [None]:
# Number of output classes: one per entry in the tokenizer's word index, plus one.
# NOTE(review): the +1 presumably accounts for id 0, which the Keras Tokenizer
# does not assign to any word - confirm against the Tokenizer documentation.
vocab_size = len(tokenizer.word_index) + 1
print(vocab_size)

In [None]:
# Convert the list of id windows into a NumPy array in one step. np.array()
# already copies every row, so the former element-by-element loop that
# re-assigned each row was redundant work.
# Assumes every entry of `sequences` has the same length (each window is built
# with train_len tokens), which yields a 2-D array.
n_sequences = np.array(sequences)
print(n_sequences)

# print("The length of sequences are: ", len(sequences))
# sequences = np.array(sequences)
# print(sequences)
# # sequences[:10]

# m_sequences = np.empty([len(sequences),train_len], dtype='int32')
# for i in range(len(sequences)):
#     m_sequences[i] = sequences[i]
# print(m_sequences)


In [None]:
# Split every window into its context ids (all but the last column) and the
# id of the word to predict (last column).
x_train = n_sequences[:, :-1]
y_train = n_sequences[:, -1]

# One-hot encode the targets over the whole vocabulary.
y_train = to_categorical(y_train, num_classes=vocab_size)

# Number of context words per sample.
seq_len = x_train.shape[1]

# Persist the context length next to the tokenizer: the prediction cell loads
# 'seq_len1.pkl', but nothing in the notebook ever wrote that file.
with open('seq_len1.pkl', 'wb') as seq_len_file:
    pickle.dump(seq_len, seq_len_file)

x_train.shape

In [None]:
# x_train = []
# y_train = []

# for i in sequences:
#     x_train.append(i[0])
#     y_train.append(i[1])

# x_train = np.array(x_train)
# y_train = np.array(y_train)

In [None]:
# Preview the first five inputs and the first five (one-hot) targets.
for label, sample in (
    ("The data is:", x_train[:5]),
    ("The responses are:", y_train[:5]),
):
    print(label, sample)

In [None]:
# y_train = to_categorical(y_train, num_classes=vocab_size)
# y_train[:5]

In [None]:
# Next-word model: embedding -> two stacked LSTMs -> dense head with a
# softmax over the vocabulary.
model = Sequential()
# NOTE(review): the embedding width is set to seq_len (the number of context
# words), not an independent hyper-parameter - confirm this is intended.
model.add(Embedding(vocab_size, seq_len, input_length=seq_len))
# The first LSTM returns the full sequence so the second LSTM can consume it.
model.add(LSTM(512, return_sequences=True))
model.add(LSTM(512))
model.add(Dense(512, activation="relu"))
model.add(Dense(vocab_size, activation="softmax"))

In [None]:
# Print the model's layer summary to the cell output.
model.summary()

In [None]:
# Write a diagram of the architecture to model.png.
# Import plot_model from tensorflow.keras, the same package the model was built
# with; the former import from the standalone `keras.utils.vis_utils` module
# was never used and that module is missing in newer Keras releases.
# NOTE(review): plot_model presumably needs pydot and Graphviz installed - confirm.
from tensorflow.keras.utils import plot_model
plot_model(model, to_file='model.png', show_layer_names=True)

In [None]:
from tensorflow.keras.callbacks import ModelCheckpoint
from tensorflow.keras.callbacks import ReduceLROnPlateau
from tensorflow.keras.callbacks import TensorBoard

# Save the model whenever the training loss improves.
# NOTE(review): checkpoints are written to nextword3.h5 while other cells load
# nextword2.h5 - confirm the file names are intentionally different.
checkpoint = ModelCheckpoint("nextword3.h5", monitor='loss', verbose=1, save_best_only=True, mode='auto')

# Multiply the learning rate by 0.2 after 3 epochs without loss improvement.
# min_lr must lie BELOW the optimizer's starting rate (0.0001 in the compile
# cell); the former floor of 0.001 was above it, so the callback could never
# lower the rate.
reduce = ReduceLROnPlateau(monitor='loss', factor=0.2, patience=3, min_lr=1e-6, verbose=1)

# Log training metrics for TensorBoard.
logdir='logsnextword1'
tensorboard_Visualization = TensorBoard(log_dir=logdir)

In [None]:
from tensorflow.keras.models import load_model

# Replace the freshly built `model` with the one saved in nextword2.h5, so
# training continues from that saved state.
model = load_model("nextword2.h5")

# `learning_rate` is the supported argument name (`lr` is deprecated and
# rejected by newer Keras versions); metrics is passed as a list as documented.
model.compile(loss="categorical_crossentropy", optimizer=Adam(learning_rate=0.0001), metrics=["accuracy"])

In [None]:
 # Train for 100 epochs in batches of 64 with the checkpoint,
 # learning-rate-reduction and TensorBoard callbacks attached.
 model.fit(
     x=x_train,
     y=y_train,
     batch_size=64,
     epochs=100,
     callbacks=[checkpoint, reduce, tensorboard_Visualization],
 )

In [None]:
# from google.colab import drive
# drive.mount('/content/drive')

In [None]:
# from keras.preprocessing.sequence import pad_sequences
# from tensorflow.keras.models import load_model
# import numpy as np
# import pickle

# model = load_model("nextword2.h5")
# # tokenizer = pickle.load(open('tokenizer1.pkl', 'rb'))

# input_text = input().strip().lower()
# encoded_text = tokenizer.texts_to_sequences([input_text])[0]
# pad_encoded = pad_sequences([encoded_text], maxlen=seq_len, truncating='pre')
# print(encoded_text, pad_encoded)
# for i in (model.predict(pad_encoded)[0]).argsort()[-3:][::-1]:
#   pred_word = tokenizer.index_word[i]
#   print("Next word suggestion:",pred_word)

In [None]:
# Interactive next-word prediction: read a phrase from stdin and print the
# three most probable following words.
# pad_sequences is imported from tensorflow.keras (not standalone keras) so it
# comes from the same package as the Tokenizer and the model.
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import load_model
import numpy as np
import pickle

model = load_model("nextword2.h5")

# NOTE(review): unpickling runs arbitrary code - only load files this notebook
# produced itself.
with open('tokenizer1.pkl', 'rb') as tokenizer_file:
    tokenizer = pickle.load(tokenizer_file)

# Context length the model expects. Fall back to the model's own input shape
# when the pickle is absent, instead of crashing with FileNotFoundError.
try:
    with open('seq_len1.pkl', 'rb') as seq_len_file:
        seq_len = pickle.load(seq_len_file)
except FileNotFoundError:
    seq_len = model.input_shape[1]

input_text = input().strip().lower()
encoded_text = tokenizer.texts_to_sequences([input_text])[0]
# Keep only the last seq_len word ids (left-padding with 0 when there are fewer).
pad_encoded = pad_sequences([encoded_text], maxlen=seq_len, truncating='pre')
print(encoded_text, pad_encoded)

# Indices of the three highest probabilities, most probable first.
for i in (model.predict(pad_encoded)[0]).argsort()[-3:][::-1]:
    # Id 0 (padding) has no entry in index_word; skip it rather than raise KeyError.
    pred_word = tokenizer.index_word.get(i)
    if pred_word is None:
        continue
    print("Next word suggestion:",pred_word)