In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


## Required Packages

In [2]:
# Required package:
import pandas as pd
import gensim
import re
import numpy as np
import tensorflow as tf

# Required imports for Tokenization
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
import nltk
nltk.download("punkt")
nltk.download('punkt_tab')
from nltk.tokenize import sent_tokenize
from tensorflow.keras.utils import to_categorical

# Required imports:
from tensorflow.keras import models
from tensorflow.keras import layers

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.


## Importing the data

In [3]:
filePath = "/content/drive/MyDrive/61262-0.txt"

# Read the data using the correct encoding:
with open(filePath, "r", encoding="utf-8") as data:
    text_df = data.read()

## Preprocessing the text

In [4]:
text_df = re.sub(r"[^A-Za-z\s.']", ' ', text_df)  # Removing characters other than alphabets, space, period, and apostrophe
text_df = text_df.lower()  # Converting to lowercase
sentences = sent_tokenize(text_df)  # Tokenizing into sentences

clean_text = [
    [word for word in sentence.split() ] #Splitting sentences into words
    for sentence in sentences
]

print(f'There are {len(clean_text)} sentences in the text')

There are 3823 sentences in the text


In [5]:
# Initializing Tokenizer with lower = False (keeps the case)
tokeniser = Tokenizer(lower=False)

# Fitting the Tokeniser to the text in the training data:
tokeniser.fit_on_texts(clean_text)

# Creating a pickle to store the tokenizer to be later used for testing
import pickle
with open('tokenizer.pkl', 'wb') as handle:
    pickle.dump(tokeniser, handle)

# Convert text into sequences of word indices
clean_text = tokeniser.texts_to_sequences(clean_text)

input_text = [] # -> Stores input sequence
output_text = [] # -> Stores corresponding next words

for sequence in clean_text:
  for i in range(1, len(sequence)):
    input_text.append(sequence[:i]) # Input all words upto the current index
    output_text.append(sequence[i]) # Output next word

# Finding the maximum length to pad all sequences to this length
max_length = max(len(sequence) for sequence in input_text)

# Converting sequences to the same length using padding and truncating
input_text = pad_sequences(input_text, maxlen=max_length, padding='post', truncating='post')

# Getting the total number of words, needed for the embedding layer size, +1 is added because of zero padding adding one more index in the vocabulary
num_words = len(tokeniser.word_index) + 1

print(f'The  Training Set includes {input_text.shape[0]} sequences of {input_text.shape[1]} tokens.')
print(f'There are {num_words} words in our vocabulary!')

The  Training Set includes 50227 sequences of 157 tokens.
There are 7005 words in our vocabulary!


In [6]:
# Parameters to set
embedding_dim = 32 # -> Size of word embedding vector
output_dim = 64 # -> Number of LSTM units per layer

# Function to create the model:
def create_model(embedding_dim, lstm_units, vocab_size, max_length):

  model = models.Sequential()
  model.add(layers.Embedding(num_words, embedding_dim, input_length=max_length)) # -> converting word indices to vectors of fixed size
  model.add(layers.LSTM(output_dim, return_sequences= True, dropout= 0.2)) # -> Adding dropouts for regularization
  model.add(layers.LSTM(output_dim, return_sequences= True, dropout= 0.3))
  model.add(layers.LSTM(output_dim))
  model.add(layers.LayerNormalization())
  model.add(layers.Dense(num_words, activation='softmax')) # -> using softmax because it is text generation and it requires probability


  return model

print('Done!')

Done!


In [7]:
# Building the model:
model = create_model(embedding_dim, output_dim,num_words, max_length)
model.summary()



In [8]:
# The compiler preparing the model for training:
model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])

In [9]:
# Training parameters
epochs = 100
batch_size = 32

output_text = np.array(output_text)

early_stopping = tf.keras.callbacks.EarlyStopping(monitor='val_loss', patience=3, restore_best_weights=True)

# Train the model
model.fit(input_text, output_text, epochs=epochs, batch_size=batch_size, verbose=1, callbacks=[early_stopping])

# Save trained model
model.save('/content/drive/MyDrive/deep_learning/lstm_model_text_generator.keras')


Epoch 1/100
[1m1570/1570[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m36s[0m 18ms/step - accuracy: 0.0548 - loss: 7.1681
Epoch 2/100
[1m  10/1570[0m [37m━━━━━━━━━━━━━━━━━━━━[0m [1m27s[0m 17ms/step - accuracy: 0.0596 - loss: 6.8578

  current = self.get_monitor_value(logs)


[1m1570/1570[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m27s[0m 17ms/step - accuracy: 0.0549 - loss: 6.7746
Epoch 3/100
[1m1570/1570[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m27s[0m 17ms/step - accuracy: 0.0562 - loss: 6.7345
Epoch 4/100
[1m1570/1570[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m27s[0m 17ms/step - accuracy: 0.0557 - loss: 6.7218
Epoch 5/100
[1m1570/1570[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m27s[0m 17ms/step - accuracy: 0.0572 - loss: 6.7232
Epoch 6/100
[1m1570/1570[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m27s[0m 17ms/step - accuracy: 0.0571 - loss: 6.7057
Epoch 7/100
[1m1570/1570[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m28s[0m 18ms/step - accuracy: 0.0561 - loss: 6.7116
Epoch 8/100
[1m1570/1570[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m27s[0m 17ms/step - accuracy: 0.0564 - loss: 6.7004
Epoch 9/100
[1m1570/1570[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m27s[0m 17ms/step - accuracy: 0.0565 - loss: 6.7060
Epoch 10/100

In [10]:
from tensorflow.keras.models import load_model

# Loading the trained model
model = load_model('/content/drive/MyDrive/deep_learning/lstm_model_text_generator.keras')



In [11]:
def generate_text(model, tokenizer, seed_text, max_length, num_words=50, temperature=1.0 ):
    for _ in range(num_words):
        # Converting seed text to sequence
        sequence = tokenizer.texts_to_sequences([seed_text])
        sequence = pad_sequences(sequence, maxlen=max_length, padding='pre')

        # Predicting next word probabilities
        predicted_probs = model.predict(sequence, verbose=0)[0]

        # Applying temperature scaling
        predicted_probs = np.asarray(predicted_probs).astype("float64")
        predicted_probs = np.log(predicted_probs + 1e-9) / temperature  # Avoiding log(0)
        exp_preds = np.exp(predicted_probs)
        predicted_probs = exp_preds / np.sum(exp_preds)

        # Sampling from the probability distribution
        predicted_index = np.random.choice(len(predicted_probs), p=predicted_probs)

        # Converting index to word
        predicted_word = tokenizer.index_word.get(predicted_index, "")

        # To ensure predicted word is a string
        if not isinstance(predicted_word, str):
            continue  # Skip if it's not a valid string

        # Skipping numbers or invalid words
        if not predicted_word.isalpha():
            continue

        # Appending the predicted word
        seed_text += " " + predicted_word

    return seed_text


In [12]:
# Loading the tokenizer

with open('tokenizer.pkl', 'rb') as handle:
    tokenizer = pickle.load(handle)


In [13]:
# Define a seed text
text_prompt = "There was a murder in the building"

# Generate text
generated_text = generate_text(model, tokenizer, text_prompt, max_length=100)

# Print generated text
print(generated_text)


There was a murder in the building point about your not too that s was temporary sold heeds of poirot i packet our look was drawer i or the which fianc influence concerning street the my table house match number from of hastings out very arm do to that seen the sad but fibre
