### Install and import Libraries and Download necessary language modelling resources

In [None]:
# install libraries unavailable in Colab
!pip install contractions
!pip install keras_tuner

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting contractions
  Downloading contractions-0.1.73-py2.py3-none-any.whl (8.7 kB)
Collecting textsearch>=0.0.21
  Downloading textsearch-0.0.24-py2.py3-none-any.whl (7.6 kB)
Collecting anyascii
  Downloading anyascii-0.3.1-py3-none-any.whl (287 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m287.5/287.5 KB[0m [31m4.6 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting pyahocorasick
  Downloading pyahocorasick-2.0.0-cp38-cp38-manylinux_2_5_x86_64.manylinux1_x86_64.whl (104 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m104.5/104.5 KB[0m [31m8.7 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: pyahocorasick, anyascii, textsearch, contractions
Successfully installed anyascii-0.3.1 contractions-0.1.73 pyahocorasick-2.0.0 textsearch-0.0.24
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/sim

In [None]:
# import libraries
import contractions
import html
import keras
import keras_tuner as kt
import numpy as np
import os
import pandas as pd
import re
import random
import tensorflow as tf

from keras.callbacks import EarlyStopping

from keras.layers import Dense
from keras.layers import Dropout
from keras.layers import Embedding
from keras.layers import LSTM

from keras.models import Sequential

from keras.optimizers import Adam
from keras.optimizers import RMSprop
from keras.optimizers import SGD

from keras.preprocessing.text import Tokenizer

from keras.utils import pad_sequences
from keras.utils import to_categorical

from numpy.random import randint
from numpy.random import RandomState


In [None]:
# download resources unavailable in colab
# NOTE(review): no NLTK tokenizer call appears in the cells visible here (sentence
# splitting is done with re.split) — confirm 'punkt' is still needed.
import nltk
nltk.download('punkt')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

### Load text data from Google Drive

In [None]:
# to access google drive folder
# the archives unzipped in later cells are read from paths under /content/drive
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# unzip the folder containing the text data
!unzip '/content/drive/MyDrive/Deep Learning Coursework Data/SciFi.zip'

Archive:  /content/drive/MyDrive/Deep Learning Coursework Data/SciFi.zip
  inflating: internet_archive_scifi_v3.txt  


### Read in the text Data

In [None]:
# read in the text data
# the context manager closes the file handle as soon as the raw bytes are read
with open('/content/internet_archive_scifi_v3.txt', 'rb') as text_file:
    text_original = text_file.read()
text = text_original.decode(encoding='utf-8')

### Preprocessing of the text data

1. Split the whole text into sentences.   
The cell below splits the text at each of the following characters: `?`, `.`, `!`, `#`

In [None]:
# break the corpus into sentence candidates at every ?, ., ! or # character
sentence_delimiters = re.compile('[?.!#]')
sentences = sentence_delimiters.split(text)

2. Clean the sentences by:

*   Converting to lower case
*   Removing hyperlinks
*   Converting HTML character entities to plain characters
*   Expanding contracted words to their full form
*   Removing punctuation
*   Removing single-letter words
*   Collapsing multiple spaces into one





In [None]:
def _clean_sentence(sentence):
  """Apply the cleaning steps to one sentence and return the cleaned string."""
  sentence = sentence.lower() # convert text to lower case
  sentence = re.sub(r'http\S+', '', sentence) # remove hyperlinks
  sentence = html.unescape(sentence) # convert html entities to characters
  sentence = contractions.fix(sentence) # convert contractions to full form
  sentence = re.sub(r"[^\w\s]",'', sentence) # remove punctuation (keeps letters, digits, underscore, whitespace)
  # only a single letter with whitespace on BOTH sides is removed, so a lone
  # letter at the very start or end of the sentence is kept
  sentence = re.sub(r"\s+[a-zA-Z]\s+", ' ', sentence)
  sentence = re.sub(r'\s+', ' ', sentence) # collapse runs of whitespace
  return sentence


def preprocess(sentences):
  """Clean every sentence and return the results as a new list.

  Each sentence is lower-cased, stripped of hyperlinks, HTML-unescaped, has its
  contractions expanded, and then loses punctuation, isolated single letters
  and repeated whitespace (in that order).

  The input list is left untouched: the previous version overwrote it element
  by element, so the raw and the cleaned sentences were the same list object
  and re-running later cells depended on earlier in-place edits.

  Args:
    sentences: iterable of strings.

  Returns:
    A new list with one cleaned string per input sentence, in the same order.
  """
  return [_clean_sentence(sentence) for sentence in sentences]

In [None]:
# run the cleaning steps over every sentence produced by the split above
sentences_preprocessed = preprocess(sentences)

3. Remove any sentences that are empty or contain fewer than 10 characters

In [None]:
# collect the positions of sentences shorter than 10 characters (empty or near-empty)
indexes = [
    position
    for position, sentence in enumerate(sentences_preprocessed)
    if len(sentence) < 10
]

In [None]:
# deleting empty or very short sentences based on the indexes found above
# a set gives O(1) membership tests, and a single filtering pass replaces the
# repeated list.pop calls, each of which had to shift every later element
indexes_to_drop = set(indexes)

# slice assignment rewrites the existing list object in place (as pop did);
# an index beyond the end of the list simply never matches
sentences_preprocessed[:] = [
    sentence
    for position, sentence in enumerate(sentences_preprocessed)
    if position not in indexes_to_drop
]

4. Selecting a subset of the dataset  
Choose 5000 sentences randomly

In [None]:
# randomly choose 5000 indexes to keep
# the exclusive upper bound follows the current number of cleaned sentences
# instead of a hard-coded count, so every drawn index is valid even if the
# corpus or the cleaning rules change
# note: randint draws with replacement, so a few indexes may repeat
seed = RandomState(123)
index_to_keep = seed.randint(0, len(sentences_preprocessed), 5000)

In [None]:
# gather the sampled sentences, visiting the drawn indexes in ascending order
sorted_list = sorted(index_to_keep)
sentences_subset = [sentences_preprocessed[position] for position in sorted_list]

### Converting input data into numerical representation to be fed to the model

#### Tokenization
Represent each sentence as a sequence of integer word indexes before feeding it to the model.

In [None]:
# tokenizing
tokenizer = Tokenizer()
# build the word -> integer index mapping from the sampled sentences only
tokenizer.fit_on_texts(sentences_subset)
n_words = len(tokenizer.word_index) + 1 # number of unique words in the data, plus one (presumably for index 0, which no word uses — confirm)

In [None]:
# expand every sentence into its growing token prefixes (first 2 tokens, first 3, ...);
# the last token of each prefix later becomes the target and the rest the input
input_sequences = [
    tokens[:prefix_end]
    for tokens in tokenizer.texts_to_sequences(sentences_subset)
    for prefix_end in range(2, len(tokens) + 1)
]

#### Padding 
Making the sentences of the same length

In [None]:
# choosing maximum length of one sentence to be 30
max_len = 30
# padding='pre' pads on the left, so the last column always holds the final token
# of each prefix (the cell below takes that column as the target)
input_sequences_padded = np.array(pad_sequences(input_sequences, maxlen=max_len, padding='pre'))

In [None]:
# every column except the last is the model input; the last column is the next-word target
X, Y = input_sequences_padded[:,:-1], input_sequences_padded[:,-1]

# one-hot encoding Y
# NOTE(review): this produces one column per vocabulary entry (n_words) for every
# training row, which can be memory-heavy for larger subsets — confirm it fits
Y_encoded = to_categorical(Y, num_classes=n_words)

In [None]:
Y_encoded.shape[1]

10407

### Load GloVe Embedding matrix
The GloVe pre-trained embedding matrix should be downloaded from: https://nlp.stanford.edu/data/glove.6B.zip and uploaded to Google Drive

In [None]:
# unzipping the Glove word vectors
!unzip '/content/drive/MyDrive/Deep Learning Coursework Data/glove.6B.zip'

Archive:  /content/drive/MyDrive/Deep Learning Coursework Data/glove.6B.zip
  inflating: glove.6B.100d.txt       
  inflating: glove.6B.200d.txt       
  inflating: glove.6B.300d.txt       
  inflating: glove.6B.50d.txt        


In [None]:
# load GloVe word vectors
embeddings_index = {}
# the context manager closes the file even if a line fails to parse, and the
# explicit encoding avoids depending on the platform's default text encoding
with open('/content/glove.6B.200d.txt', encoding='utf-8') as f:
    for line in f:
        values = line.split(' ')
        word = values[0] ## The first entry is the word
        coefs = np.asarray(values[1:], dtype='float32') ## These are the vectors representing the embedding for the word
        embeddings_index[word] = coefs

print('GloVe data loaded')

GloVe data loaded


In [None]:
# create embedding matrix: 
hits = 0
misses = 0
embedding_dim = 200
word_index = tokenizer.word_index 

# row i holds the GloVe vector of the word whose tokenizer index is i;
# words that GloVe does not contain keep their all-zero row
embedding_matrix = np.zeros((n_words, embedding_dim))
for word, i in word_index.items():
  # .get returns None on a miss, so a missing word can no longer silently reuse
  # the vector left over from the previous iteration (which also counted it as a hit)
  embedding_vector = embeddings_index.get(word)
  if embedding_vector is None:
    misses += 1
    continue
  embedding_matrix[i] = embedding_vector
  hits += 1
print("Converted %d words (%d misses)" % (hits, misses))

Converted 10406 words (1092 misses)


### Building Model
The LSTM model consists of:


1.   Embedding Layer with pre-trained embedding matrix 
2.   LSTM layer with ReLU activation function
3.   Dropout Layer 
4.   LSTM layer with ReLU activation function
5.   Dropout Layer
6.   Output Layer with Softmax function



### Hyperparameter Tuning of the LSTM Model

Before training the model on the full dataset, we will use a subset of it to find the optimal hyperparameters of the LSTM Model.


In [None]:
def hyperparameter_tune_LSTM(hp):
  """Build and compile one candidate LSTM model from the tuner's hyperparameters.

  The architecture is: pre-trained embedding -> LSTM (ReLU) -> dropout ->
  LSTM (ReLU) -> dropout -> softmax over the vocabulary. It reads `n_words`,
  `max_len` and `embedding_matrix` from the notebook's global scope.

  Args:
    hp: hyperparameter container supplied by the tuner; sampled for the two
      LSTM widths, the two dropout rates, the optimizer name and the learning rate.

  Returns:
    The compiled model (categorical cross-entropy loss, accuracy metric).
  """
  # search space, declared in the same order as the layers that use it
  first_lstm_units = hp.Int('units_1', min_value=64, max_value=1024, step=64)
  second_lstm_units = hp.Int('units_2', min_value=64, max_value=1024, step=64)
  first_dropout_rate = hp.Choice('Rate1', values = [0.2, 0.3, 0.4, 0.5])
  second_dropout_rate = hp.Choice('Rate2', values = [0.2, 0.3, 0.4, 0.5])
  optimizer_name = hp.Choice('optimizer', values = ['SGD', 'Adam', 'RMSProp'])
  step_size = hp.Choice('learning_rate', values=[1e-2, 1e-3, 1e-4, 1e-5])

  # the model sees every token of a padded sequence except the last one (the target)
  model = Sequential([
      Embedding(n_words, 200, input_length=max_len - 1, weights=[embedding_matrix]),
      LSTM(first_lstm_units, return_sequences=True, activation='relu'),
      Dropout(first_dropout_rate),
      LSTM(second_lstm_units, return_sequences=False, activation='relu'),
      Dropout(second_dropout_rate),
      Dense(n_words, activation='softmax'),
  ])

  # any name other than 'SGD' or 'Adam' falls back to RMSprop
  optimizer_class = {'SGD': SGD, 'Adam': Adam}.get(optimizer_name, RMSprop)
  opt = optimizer_class(learning_rate=step_size)

  model.compile(optimizer=opt, loss="categorical_crossentropy", metrics=['accuracy'])
  return model


In [None]:
# hyper parameter tuning
# NOTE(review): the objective is the training loss and no validation data is passed
# to search() below, so trials are ranked by how well they fit the training set —
# confirm this is intended
tuner = kt.BayesianOptimization(hyperparameter_tune_LSTM, 
                                objective='loss',
                                max_trials=20,
                                seed=0,
                                overwrite=True,
                                directory='dir',
                                project_name='x')

# early stopping
# NOTE(review): search() receives no `epochs` argument; if each trial therefore runs
# for a single epoch, patience=2 can never trigger — confirm
stop_early = EarlyStopping(monitor='loss', patience=2)

tuner.search(X, Y_encoded, callbacks=[stop_early])

Trial 20 Complete [00h 04m 27s]
loss: 9.242110252380371

Best loss So Far: 7.1856207847595215
Total elapsed time: 01h 41m 57s


In [None]:
# get the best model
best_model = tuner.get_best_models(num_models=1)[0]
best_model.summary()

# Get the optimal hyperparameters
# .values is the name -> value mapping; as the last expression of the cell it is
# displayed as the cell output
best_hps=tuner.get_best_hyperparameters(num_trials=1)[0].values
best_hps



Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 29, 200)           2081400   
                                                                 
 lstm (LSTM)                 (None, 29, 640)           2152960   
                                                                 
 dropout (Dropout)           (None, 29, 640)           0         
                                                                 
 lstm_1 (LSTM)               (None, 256)               918528    
                                                                 
 dropout_1 (Dropout)         (None, 256)               0         
                                                                 
 dense (Dense)               (None, 10407)             2674599   
                                                                 
Total params: 7,827,487
Trainable params: 7,827,487
Non-

{'units_1': 640,
 'units_2': 256,
 'Rate1': 0.5,
 'Rate2': 0.4,
 'optimizer': 'Adam',
 'learning_rate': 0.001}

### Building the LSTM Model

In [None]:
def create_model(max_sequence_len, total_words, units_1=640, units_2=256,
                 dropout_1=0.5, dropout_2=0.4, learning_rate=0.001):
    """Build and compile the two-layer LSTM next-word model.

    The keyword defaults are the values reported as best by the tuning run
    (units_1=640, units_2=256, Rate1=0.5, Rate2=0.4, Adam, learning_rate=0.001),
    so existing two-argument calls build the same network as before; the
    parameters only make it possible to try other settings without editing
    the function. The pre-trained `embedding_matrix` is read from the
    notebook's global scope.

    NOTE(review): the tuned candidates in hyperparameter_tune_LSTM pass
    activation='relu' to both LSTM layers, whereas this model keeps the LSTM
    default activation — confirm which one is intended.

    Args:
        max_sequence_len: padded sequence length including the target token.
        total_words: vocabulary size (number of rows of the embedding and
            number of output classes).
        units_1: width of the first LSTM layer.
        units_2: width of the second LSTM layer.
        dropout_1: dropout rate applied after the first LSTM layer.
        dropout_2: dropout rate applied after the second LSTM layer.
        learning_rate: learning rate of the Adam optimizer.

    Returns:
        The compiled Sequential model (categorical cross-entropy, accuracy).
    """
    input_len = max_sequence_len - 1
    # take the vector width from the matrix itself so the layer cannot drift
    # out of sync with the loaded GloVe file
    embedding_width = embedding_matrix.shape[1]

    model = Sequential()
    model.add(Embedding(total_words, embedding_width, input_length=input_len, weights=[embedding_matrix]))
    model.add(LSTM(units_1, return_sequences=True))
    model.add(Dropout(dropout_1))
    model.add(LSTM(units_2, return_sequences=False))
    model.add(Dropout(dropout_2))
    model.add(Dense(total_words, activation='softmax'))
    opt = Adam(learning_rate=learning_rate)
    model.compile(optimizer=opt, loss='categorical_crossentropy', metrics=['accuracy'])

    return model

In [None]:
# Callbacks

# checkpoint directory
checkpoint_dir = "/content/drive/MyDrive/Task3_Checkpoints/Trial 6"

# Name of the checkpoint files
# "{epoch}" is left as a literal placeholder here; presumably the callback fills in
# the epoch number when saving — confirm
checkpoint_prefix = os.path.join(checkpoint_dir, "ckpt_{epoch}")

# NOTE(review): only `filepath` is configured, so whatever the callback's default
# saving policy is applies for all 100 epochs — check the Drive space this needs
model_checkpoint_callback = keras.callbacks.ModelCheckpoint(
    filepath=checkpoint_prefix)

# set seed for reproducibility
# (seeds Python's `random` and TensorFlow; NumPy's global generator is not seeded here)
random.seed(0)
tf.random.set_seed(0)

# fitting the model
model = create_model(max_len, n_words)
# model.summary() # uncomment to print out the whole model architecture
history = model.fit(X, Y_encoded, epochs=100, verbose=1, callbacks=[model_checkpoint_callback])

Epoch 1/100



Epoch 2/100



Epoch 3/100



Epoch 4/100



Epoch 5/100



Epoch 6/100



Epoch 7/100



Epoch 8/100



Epoch 9/100



Epoch 10/100



Epoch 11/100



Epoch 12/100



Epoch 13/100



Epoch 14/100



Epoch 15/100



Epoch 16/100



Epoch 17/100



Epoch 18/100



Epoch 19/100



Epoch 20/100



Epoch 21/100



Epoch 22/100



Epoch 23/100



Epoch 24/100



Epoch 25/100



Epoch 26/100



Epoch 27/100



Epoch 28/100



Epoch 29/100



Epoch 30/100



Epoch 31/100



Epoch 32/100



Epoch 33/100



Epoch 34/100



Epoch 35/100



Epoch 36/100



Epoch 37/100



Epoch 38/100



Epoch 39/100



Epoch 40/100



Epoch 41/100



Epoch 42/100



Epoch 43/100



Epoch 44/100



Epoch 45/100



Epoch 46/100



Epoch 47/100



Epoch 48/100



Epoch 49/100



Epoch 50/100



Epoch 51/100



Epoch 52/100



Epoch 53/100



Epoch 54/100



Epoch 55/100



Epoch 56/100



Epoch 57/100



Epoch 58/100



Epoch 59/100



Epoch 60/100



Epoch 61/100



Epoch 62/100



Epoch 63/100



Epoch 64/100



Epoch 65/100



Epoch 66/100



Epoch 67/100



Epoch 68/100



Epoch 69/100



Epoch 70/100



Epoch 71/100



Epoch 72/100



Epoch 73/100



Epoch 74/100



Epoch 75/100



Epoch 76/100



Epoch 77/100



Epoch 78/100



Epoch 79/100



Epoch 80/100



Epoch 81/100



Epoch 82/100



Epoch 83/100



Epoch 84/100



Epoch 85/100



Epoch 86/100



Epoch 87/100



Epoch 88/100



Epoch 89/100



Epoch 90/100



Epoch 91/100



Epoch 92/100



Epoch 93/100



Epoch 94/100



Epoch 95/100



Epoch 96/100



Epoch 97/100



Epoch 98/100



Epoch 99/100



Epoch 100/100





### Testing the LSTM Model

In [None]:
def preprocess_test(sentences):
  """Clean one input string with the same steps applied to the training sentences.

  Args:
    sentences: the raw text to clean (a single string).

  Returns:
    The lower-cased text without hyperlinks, HTML entities, contractions,
    punctuation, whitespace-delimited single letters or repeated whitespace.
  """
  lowered = sentences.lower()
  without_links = re.sub(r'http\S+', '', lowered)
  unescaped = html.unescape(without_links)
  expanded = contractions.fix(unescaped)
  without_punctuation = re.sub(r"[^\w\s]",'', expanded)
  # only single letters with whitespace on both sides are dropped
  without_single_letters = re.sub(r"\s+[a-zA-Z]\s+", ' ', without_punctuation)
  return re.sub(r'\s+', ' ', without_single_letters)

In [None]:
def generate_text(input_text, num_words, model, max_sequence_len):
  """Extend `input_text` by `num_words` words predicted one at a time.

  On every step the whole text generated so far is cleaned, tokenized with the
  notebook's fitted `tokenizer`, left-padded to `max_sequence_len - 1` tokens
  and passed to the model; the highest-scoring word is appended.

  Args:
    input_text: seed text to continue.
    num_words: number of words to append.
    model: object whose `predict` returns one score per vocabulary index.
    max_sequence_len: padded sequence length used in training, including the
      target token. It was previously ignored in favour of a hard-coded 30.

  Returns:
    The original text followed by the generated words, separated by spaces.
  """
  for _ in range(num_words):
    # pre-processing the input text
    input_cleaned = preprocess_test(input_text)
    token_list = tokenizer.texts_to_sequences([input_cleaned])[0]
    token_list = pad_sequences([token_list], maxlen=max_sequence_len-1, padding='pre')

    # generate predictions (a one-element array holding the best vocabulary index)
    predicted = np.argmax(model.predict(token_list, verbose=0), axis=-1)

    # convert predictions to word: direct index -> word lookup instead of
    # scanning the whole vocabulary; an unknown index (e.g. 0) adds an empty word
    output_word = tokenizer.index_word.get(int(predicted[0]), "")
    input_text += " "+output_word
  return input_text

In [None]:
print(generate_text("She shook her head regretfully.", 30, model, max_len))

She shook her head regretfully. slowly her smile fading her eyes was beaklike almost out of her own body was that deliberate above the light and looked up over the light and looked at him


In [None]:
print(generate_text("She might", 16, model, max_len))

She might even tell you that if you turn it in you will get passing grade for sure


In [None]:
print(generate_text("It would certainly", 30, model, max_len))

It would certainly be done secretly by one crackpot though she was looking for short cuts one and have an dreamer to sell were pushing their wares speedily lest the market decline your


### Inference with user input

1. Run all code cells under:
*   Install and import Libraries and Download necessary language modelling resources
*   Load text data from Google Drive
*   Read in the text Data
*   Preprocessing of the text data
*   Converting input data into numerical representation to be fed to the model

2. Load the saved model
3. Run the functions required to generate text
4. Predict using the last block of code






In [None]:
# load the saved models
# NOTE(review): this import rebinds the name `keras`, which the import cell at the
# top of the notebook bound to the standalone `keras` package
from tensorflow import keras
# NOTE(review): no cell visible in this notebook writes to this location (training
# checkpoints go to a different Drive folder) — confirm the model is saved there
model_location = "/content/drive/MyDrive/Saved Models/Task 3"

model_pred = keras.models.load_model(model_location)

# to view model architecture
model_pred.summary()

In [None]:
# functions required to generate text
def preprocess_test(sentences):
  """Return the cleaned form of one input string.

  The steps run in a fixed order: lower-casing, hyperlink removal, HTML
  unescaping, contraction expansion, punctuation removal, removal of single
  letters surrounded by whitespace, and whitespace collapsing.

  Args:
    sentences: the raw text to clean (a single string).

  Returns:
    The cleaned string.
  """
  cleaning_steps = (
      lambda s: s.lower(),                            # convert text to lower case
      lambda s: re.sub(r'http\S+', '', s),            # remove hyperlinks
      html.unescape,                                  # convert html to string
      contractions.fix,                               # convert contractions to full
      lambda s: re.sub(r"[^\w\s]",'', s),             # remove punctuations
      lambda s: re.sub(r"\s+[a-zA-Z]\s+", ' ', s),    # remove single characters
      lambda s: re.sub(r'\s+', ' ', s),               # remove multiple spaces
  )
  for step in cleaning_steps:
    sentences = step(sentences)
  return sentences

#-----------------------------------------------------------------------------------------
def generate_text(input_text, num_words, model, max_len):
  """Extend `input_text` by `num_words` words predicted one at a time.

  Relies on the notebook-level `tokenizer` having already been fitted by the
  tokenization cell. The previous version refitted it on the whole sentence
  subset for every generated word, which repeated the same work on each step
  and kept mutating the tokenizer's internal counts.

  Args:
    input_text: seed text to continue.
    num_words: number of words to append.
    model: object whose `predict` returns one score per vocabulary index.
    max_len: padded sequence length used in training, including the target
      token. It was previously overwritten with a hard-coded 30.

  Returns:
    The original text followed by the generated words, separated by spaces.
  """
  for _ in range(num_words):
    # pre-processing the input text
    input_cleaned = preprocess_test(input_text)
    token_list = tokenizer.texts_to_sequences([input_cleaned])[0]
    token_list = pad_sequences([token_list], maxlen=max_len-1, padding='pre')

    # generate predictions (a one-element array holding the best vocabulary index)
    predicted = np.argmax(model.predict(token_list, verbose=0), axis=-1)

    # convert predictions to word: direct index -> word lookup instead of
    # scanning the whole vocabulary; an unknown index (e.g. 0) adds an empty word
    output_word = tokenizer.index_word.get(int(predicted[0]), "")
    input_text += " "+output_word
  return input_text

In [None]:
# take input from user
user_sentence = input("Enter sentence:")
# int() parses the count without executing the typed text as Python code, which
# eval() did; anything that is not a whole number now raises ValueError
Num_words = int(input("Enter number of words to be predicted:"))
predicted_sentence = generate_text(user_sentence, num_words=Num_words, model=model_pred, max_len=30)
print(predicted_sentence)

Enter sentence:She might
Enter number of words to be predicted:16
She might even tell you that if you turn it in you will get passing grade for sure
