### Loading Google Drive

In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


### Importing necessary libraries

In [2]:
import keras
import tensorflow as tf
import os
import numpy as np
from keras.preprocessing import sequence
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras import regularizers

In [3]:
import nltk
import re
nltk.download("stopwords")
nltk.download('punkt')
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


### Reading the dataset uploaded on Drive

In [4]:
# Read the corpus as raw bytes and decode it explicitly as UTF-8. The context
# manager closes the file handle as soon as the read has finished.
with open("/content/drive/MyDrive/Colab Notebooks/NLP task.txt", 'rb') as corpus_file:
    data = corpus_file.read().decode(encoding='utf-8')

### Sample of the dataset

In [5]:
print(data[:250])

﻿

Project Gutenberg's The Adventures of Sherlock Holmes, by Arthur Conan Doyle

This eBook is for the use of anyone anywhere at no cost and with
almost no restrictions whatsoever.  You may copy it, give it away or
re-use it under the terms of 


### Filtering unwanted symbols and keeping only words

In [6]:
# Collapse every run of non-word characters into a single space. The pattern is
# a raw string so that `\W` reaches the regex engine untouched instead of being
# parsed as an (invalid) Python string escape.
text = re.sub(r'\W+', ' ', data)
# Remove the listed accented letters, typographic punctuation and the BOM.
text = re.sub('[½àâæèéœ—‘’“”\ufeff]+', '', text)

# Word Level

### Encoding
Create n-gram prefixes of each sentence and use the word that follows each prefix as the label, producing the training data for the model.

In [7]:
tokenizer = Tokenizer()

In [8]:
# Lower-case the corpus, cut it at every full stop, then in each piece replace
# non-word characters with spaces and squeeze repeated spaces into one.
sentences = [
    re.sub(' +', ' ', re.sub(r'[^\w]', ' ', piece))
    for piece in data.lower().split(".")
]
# Build the word index from the cleaned pieces.
tokenizer.fit_on_texts(sentences)
# One extra slot on top of the entries in word_index.
total_words = 1 + len(tokenizer.word_index)

In [9]:
# Every prefix of length >= 2 of a tokenised sentence becomes one training row.
input_sequences = []
for token_ids in tokenizer.texts_to_sequences(sentences):
    for prefix_end in range(2, len(token_ids) + 1):
        input_sequences.append(token_ids[:prefix_end])

In [10]:
# Length of the longest n-gram; every row is padded at the front to this length.
max_sequence_len = max(len(seq) for seq in input_sequences)
padded_rows = pad_sequences(input_sequences, maxlen=max_sequence_len, padding='pre')
input_sequences = np.array(padded_rows)

In [11]:
# All columns but the last are the model input ...
predictors = input_sequences[:, :-1]
# ... and the last column is the next-word target, one-hot encoded over the vocabulary.
label = to_categorical(input_sequences[:, -1], num_classes=total_words)

### Building the Model
Build a model with an embedding layer first, followed by a bidirectional LSTM, dropout for regularization, a second LSTM, an L2-regularized dense layer, and a softmax output layer.

In [82]:
# Word-level next-word predictor: embedding -> BiLSTM -> dropout -> LSTM ->
# L2-regularised dense -> softmax over the vocabulary.
model = tf.keras.Sequential()
# Inputs are the padded n-grams minus their last token, hence max_sequence_len-1.
model.add(tf.keras.layers.Embedding(total_words, 64, input_length=max_sequence_len-1))
model.add(tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(150, return_sequences=True)))
model.add(tf.keras.layers.Dropout(0.2))
model.add(tf.keras.layers.LSTM(100))
# Dense `units` must be an integer: use floor division instead of `/`, which
# yields a float.
model.add(tf.keras.layers.Dense(total_words // 2, activation='relu', kernel_regularizer=regularizers.l2(0.01)))
model.add(tf.keras.layers.Dense(total_words, activation='softmax'))
model.compile(loss='categorical_crossentropy', optimizer='adam')
# summary() prints the table itself and returns None, so it is not wrapped in print().
model.summary()

Model: "sequential_2"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_2 (Embedding)      (None, 104, 64)           522944    
_________________________________________________________________
bidirectional_1 (Bidirection (None, 104, 300)          258000    
_________________________________________________________________
dropout_1 (Dropout)          (None, 104, 300)          0         
_________________________________________________________________
lstm_4 (LSTM)                (None, 100)               160400    
_________________________________________________________________
dense_3 (Dense)              (None, 4085)              412585    
_________________________________________________________________
dense_4 (Dense)              (None, 8171)              33386706  
Total params: 34,740,635
Trainable params: 34,740,635
Non-trainable params: 0
__________________________________________

In [None]:
history = model.fit(predictors, label, epochs=50, verbose=1)

Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50


In [None]:
model.save('/content/drive/MyDrive/Colab Notebooks/SavedModel1')

In [101]:
# Reload through tf.keras, the same namespace the model was built with, rather
# than the standalone `keras` package.
model = tf.keras.models.load_model('/content/drive/MyDrive/Colab Notebooks/SavedModel1')

Generating output based on the input given by the user.

In [84]:
def predictsentence(seed_text,words):
  """Extend ``seed_text`` by ``words`` greedily predicted words.

  Each step tokenises the current text with the module-level ``tokenizer``,
  pads it at the front to ``max_sequence_len-1``, asks the module-level
  ``model`` for the next-word distribution and appends the arg-max word.

  Args:
    seed_text: Text to continue.
    words: Number of words to append.

  Returns:
    ``seed_text`` followed by the predicted words, separated by single spaces.
  """
  for _ in range(words):
    token_list = tokenizer.texts_to_sequences([seed_text])[0]
    token_list = pad_sequences([token_list], maxlen=max_sequence_len-1, padding='pre')
    # predict() returns one row per input sequence; exactly one row is fed in,
    # so take element 0 and turn it into a plain int.
    predicted = int(np.argmax(model.predict(token_list), axis=-1)[0])
    # index_word is the tokenizer's reverse lookup (index -> word). An index
    # with no entry yields "", the same fallback the former linear scan had.
    output_word = tokenizer.index_word.get(predicted, "")
    seed_text += " " + output_word
  return seed_text

In [104]:
inp = input()
print(predictsentence(inp,100))

Sherlock Holmes is in the
Sherlock Holmes is in the door and finally wandered up by the matter where he had been able to tell me that he had been able to tell me that he had been in a very serious extent down by a very serious deal broken of a very serious extent down from his hands and his hands was sufficient to open the room and his hands was sufficient to open the room which he had had been with a very serious deal upon the room and his hands had been in his hands while he spoke the door of the room and the matter of


In [105]:
inp = input()
print(predictsentence(inp,100))

Watson was going to
Watson was going to night in the door that i was not sure that he had been in a very serious extent down by a very serious extent down from his hands and his hands was sufficient to open the room which he had had been with a very serious deal upon the room and his hands had been in his hands while he spoke the door of the room and the matter of a very large villa sweep with a very interesting study a very large boot upon his hands while he spoke in the room behind him and the matter of a


# Character Level

### Encoding
Create an index for each character so the text can be encoded and given as input to the model.


In [53]:
# Distinct characters of the cleaned text, in sorted order.
vocab = sorted(set(text))

# Forward lookup (character -> position in vocab) and reverse lookup (position -> character).
char2idx = dict(zip(vocab, range(len(vocab))))
idx2char = np.array(vocab)

def text_to_int(text):
  """Encode ``text`` as a NumPy array of indices looked up in ``char2idx``.

  Raises ``KeyError`` for a character that is not a key of ``char2idx``.
  """
  indices = []
  for ch in text:
    indices.append(char2idx[ch])
  return np.array(indices)

text_as_int = text_to_int(text)

In [54]:
print("Text:", text[:9])
print("Encoded:", text_to_int(text[:9]))

Text:  Project 
Encoded: [ 0 26 55 52 47 42 40 57  0]


In [55]:
def int_to_text(ints):
  """Decode a sequence of character indices back into a string.

  Args:
    ints: Indices into ``idx2char``. Objects exposing ``.numpy()`` are
      converted first; anything else is used for indexing as is.

  Returns:
    The characters selected from ``idx2char``, joined into one string.
  """
  try:
    ints = ints.numpy()
  except AttributeError:
    # No .numpy() method: use the value directly. Only this error is
    # swallowed so that genuine failures are no longer hidden.
    pass
  return ''.join(idx2char[ints])


In [56]:
seq_length = 100  
examples_per_epoch = len(text)//(seq_length+1)

char_dataset = tf.data.Dataset.from_tensor_slices(text_as_int)

In [57]:
sequences = char_dataset.batch(seq_length+1, drop_remainder=True)

In [58]:
def split_input_target(chunk):
    """Return ``(input, target)`` where target is input shifted one step ahead.

    The input is ``chunk`` without its last element and the target is
    ``chunk`` without its first element.
    """
    return chunk[:-1], chunk[1:]

dataset = sequences.map(split_input_target)  

In [59]:
# Show the first two (input, target) pairs decoded back to text.
for input_ids, target_ids in dataset.take(2):
  print("EXAMPLE\n")
  for heading, ids in (("INPUT", input_ids), ("\nOUTPUT", target_ids)):
    print(heading)
    print(int_to_text(ids))

EXAMPLE

INPUT
 Project Gutenberg s The Adventures of Sherlock Holmes by Arthur Conan Doyle This eBook is for the u

OUTPUT
Project Gutenberg s The Adventures of Sherlock Holmes by Arthur Conan Doyle This eBook is for the us
EXAMPLE

INPUT
e of anyone anywhere at no cost and with almost no restrictions whatsoever You may copy it give it a

OUTPUT
 of anyone anywhere at no cost and with almost no restrictions whatsoever You may copy it give it aw


In [60]:
# Number of sequences per training batch (also passed to build_model).
BATCH_SIZE = 64
# One class per distinct character in the cleaned text.
VOCAB_SIZE = len(vocab)  
# Width of the character embedding vectors.
EMBEDDING_DIM = 256
# Number of units in the LSTM layer.
RNN_UNITS = 1024

# Buffer size handed to Dataset.shuffle.
BUFFER_SIZE = 10000

# NOTE(review): this rebinds `data`, which held the raw corpus string read
# earlier in the notebook; cells that expect the string fail if re-run after
# this one. Consider a distinct name for the batched dataset.
data = dataset.shuffle(BUFFER_SIZE).batch(BATCH_SIZE, drop_remainder=True)

### Building the Model
Build a simple model with an embedding layer, an LSTM layer, and an output layer.


In [87]:
def build_model(vocab_size, embedding_dim, rnn_units, batch_size):
  """Assemble the character-level network: Embedding -> stateful LSTM -> Dense.

  Args:
    vocab_size: Size of the embedding input and of the final Dense layer.
    embedding_dim: Size of each embedding vector.
    rnn_units: Number of LSTM units.
    batch_size: Fixed batch dimension of the input; the sequence dimension
      is left as ``None``.

  Returns:
    An uncompiled ``tf.keras.Sequential`` model. The Dense layer has no
    activation argument.
  """
  embedding = tf.keras.layers.Embedding(
      vocab_size,
      embedding_dim,
      batch_input_shape=[batch_size, None])
  recurrent = tf.keras.layers.LSTM(
      rnn_units,
      return_sequences=True,
      stateful=True,
      recurrent_initializer='glorot_uniform')
  projection = tf.keras.layers.Dense(vocab_size)
  return tf.keras.Sequential([embedding, recurrent, projection])

model = build_model(VOCAB_SIZE,EMBEDDING_DIM, RNN_UNITS, BATCH_SIZE)
model.summary()

Model: "sequential_3"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_3 (Embedding)      (64, None, 256)           16384     
_________________________________________________________________
lstm_5 (LSTM)                (64, None, 1024)          5246976   
_________________________________________________________________
dense_5 (Dense)              (64, None, 64)            65600     
Total params: 5,328,960
Trainable params: 5,328,960
Non-trainable params: 0
_________________________________________________________________


In [62]:
for input_example_batch, target_example_batch in data.take(1):
  example_batch_predictions = model(input_example_batch) 
  print(example_batch_predictions.shape, "# (batch_size, sequence_length, vocab_size)") 

(64, 100, 64) # (batch_size, sequence_length, vocab_size)


In [63]:
pred = example_batch_predictions[0]

In [64]:
sampled_indices = tf.random.categorical(pred, num_samples=1)

sampled_indices = np.reshape(sampled_indices, (1, -1))[0]
predicted_chars = int_to_text(sampled_indices)

predicted_chars  

'RtXB2SC9GV3MH0OqDDw8uV41AW6y9B3knASXHL5OPvvGjrPmDksDoVb6yO5m6yz8eUZc0MQQDv6DgaPHrHtcLZC_ig9IXKRV7xVw'

In [88]:
def loss(labels, logits):
  """Sparse categorical cross-entropy with ``from_logits=True``.

  ``labels`` and ``logits`` are forwarded unchanged to
  ``tf.keras.losses.sparse_categorical_crossentropy``.
  """
  crossentropy = tf.keras.losses.sparse_categorical_crossentropy(
      labels, logits, from_logits=True)
  return crossentropy

### Compiling the Model
Compile the model with the Adam optimizer and sparse categorical cross-entropy loss.



In [89]:
model.compile(optimizer='adam', loss=loss)

### Creating Checkpoints
Save the model weights during training so that a new model can be created later for text generation.

In [90]:
checkpoint_dir = './training_checkpoints'

In [91]:
# Checkpoint path template inside checkpoint_dir; "{epoch}" is a placeholder
# left in the string for the callback below.
checkpoint_prefix = os.path.join(checkpoint_dir, "ckpt_{epoch}")

# Callback that writes checkpoints during fit(); only the weights are saved,
# not the full model.
checkpoint_callback=tf.keras.callbacks.ModelCheckpoint(
    filepath=checkpoint_prefix,
    save_weights_only=True)

In [None]:
history = model.fit(data, epochs=50, callbacks=[checkpoint_callback])

Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50


### Loading the Model


In [92]:
!unzip training_checkpoints.zip

Archive:  training_checkpoints.zip
   creating: training_checkpoints/
  inflating: training_checkpoints/checkpoint  
  inflating: training_checkpoints/ckpt_50.data-00000-of-00001  
  inflating: training_checkpoints/ckpt_50.index  


In [93]:
predict_model = build_model(VOCAB_SIZE, EMBEDDING_DIM, RNN_UNITS, batch_size=1)

In [94]:
# latest_checkpoint returns None when checkpoint_dir holds no checkpoint;
# fail with a clear message instead of passing None on to load_weights.
latest_checkpoint_path = tf.train.latest_checkpoint(checkpoint_dir)
if latest_checkpoint_path is None:
    raise FileNotFoundError("No checkpoint found in " + repr(checkpoint_dir))
predict_model.load_weights(latest_checkpoint_path)
# Fix the input shape to a batch of one sequence of arbitrary length.
predict_model.build(tf.TensorShape([1, None]))

### Generating Text
Generate text based on the input given by the user.


In [95]:
def generate_text(model, start_string, num_generate=1000, temperature=1.0):
  """Continue ``start_string`` by sampling characters one at a time.

  NOTE(review): the ``model`` argument is accepted for backward compatibility
  but is not used; generation always runs on the module-level
  ``predict_model`` (the copy built with ``batch_size=1``).

  Args:
    model: Unused, see the note above.
    start_string: Non-empty seed text. Every character must be a key of
      ``char2idx``.
    num_generate: Number of characters to sample (default 1000).
    temperature: Positive divisor applied to the logits before sampling.
      Values below 1 sharpen the distribution, values above 1 flatten it.

  Returns:
    ``start_string`` followed by the sampled characters.

  Raises:
    ValueError: If ``start_string`` is empty or ``temperature`` is not positive.
    KeyError: If ``start_string`` contains a character missing from ``char2idx``.
  """
  if not start_string:
    raise ValueError("start_string must contain at least one character")
  if temperature <= 0:
    raise ValueError("temperature must be positive")

  # Encode the seed and add a leading batch dimension of 1.
  input_eval = tf.expand_dims([char2idx[s] for s in start_string], 0)

  text_generated = []

  # The LSTM is stateful: clear state left over from any previous call.
  predict_model.reset_states()
  for _ in range(num_generate):
    predictions = predict_model(input_eval)

    # Drop the batch dimension so that each row is one time step.
    predictions = tf.squeeze(predictions, 0)

    predictions = predictions / temperature
    # Sample one id per time step and keep the one for the last step.
    predicted_id = tf.random.categorical(predictions, num_samples=1)[-1,0].numpy()

    # Only the new character is fed back in; the LSTM state carries the context.
    input_eval = tf.expand_dims([predicted_id], 0)

    text_generated.append(idx2char[predicted_id])

  return start_string + ''.join(text_generated)

In [98]:
inp = input()
# Pass the batch-size-1 predict_model: `model` is the training network built
# for batches of BATCH_SIZE and is not the one meant for generation.
print(generate_text(predict_model, inp))

Sherloc
Sherlock to the house As you can cover would be for you and how all important it was to settle down to a quiet and respectable life I bought this estate which could not inve lost away from her and soon overtook Frank We got into a cab together and away we can t say that she came but I know where the gems of hours whereabouts of the missing lady There are running to the sitting room Now where you worked ve ling at me Was the window open Yes Then he might have called to as usual snake indication of the course of events is concluded the stone pavemen Mary among the Project Gutenberg tm trademark Contact the Foundation as set forth in Section 3 business XI THE ADVENTURE OF THE BE Onything from men s mind and pred knock with the clothes which had been taking up by my belief Than you And now I shall have to talkee at that right out of the case and the mactions which you have given to mine Totter and the money would be the middle hudge my companion rose to draw I will told my surprist

In [100]:
inp = input()
# Pass the batch-size-1 predict_model: `model` is the training network built
# for batches of BATCH_SIZE and is not the one meant for generation.
print(generate_text(predict_model, inp))

Watso
Watson with the de keeper he Westbury He presented she meet her and he was in a pitiable state of reaction with every nerve in a fay yet I cannot say that it is locked upon the Hatherley shrinked the shutters folding up the blow fell Still open in fact that I had some shoes and glate close to that occupary Well so carry out its mission was far from the house I ked and a long silence but he was of me Mr Holmes and the little things that there would be no prosecution Out there are very dinning to that When I shall wonce Mr Wilson said my assistant and he is willing to fill a vacancy in the Leagua of the Discovered him up and a hundred yards from the company once my profession He has one for us Welle confided was more than I have added the assistant papers I shall communicate with you alone I rose to lose a black lix Englishment quite se at seven We must assiristing pink coloured viewed copright lawy crossed it was close t the result from amid the common crowd of mendicants and so t