# **Language Model**

In [1]:
# Importing libraries
from tensorflow.keras.callbacks import LambdaCallback
from tensorflow.keras.models import Sequential, load_model
from tensorflow.keras.layers import Dense
from tensorflow.keras.layers import LSTM, Embedding, GRU
from tensorflow.keras.optimizers import RMSprop
from tensorflow.keras.utils import get_file, to_categorical
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
import matplotlib.pyplot as plt
import numpy as np
import random
import sys
import io
import requests
import re
import string

In [2]:
directory = os.getcwd()

print(directory)


c:\Users\Aggelos\Desktop\Projects\TextModel


In [3]:
os.chdir(r"c:\Users\Aggelos\Desktop\Projects\TextModel")

# Data preparation and cleaning

In [4]:
# Requesting the text file
req = requests.get("https://www.gutenberg.org/files/61262/61262-0.txt")

In [None]:
# Viewing the text
# The text provided needs cleaning for the model to run correctly.
req.text

In [6]:
source_text = req.text

In [7]:
# Creating a function that is going to clean the inputted text
def character_remove(text):
    text = text.split('\n')# Split the lines using the newLine character
    text = text[104:] # The book begins from line 104 the introductions is removed
    text = " ".join(text) # Making the data continuous remove the newLine character
    text = text.replace('\r', '')#Removing the return character
    text = re.sub(r'[^\x00-\x7f]', r'', text) #Removing the extra characters
    text = text.translate(str.maketrans('', '', string.punctuation))#Replacing special characters
    return text

In [8]:
text = character_remove(source_text)

In [9]:
#Collecting all words from the file and then collecting all the unique words from the corpus
corpus = text.split(" ")
corpus = [x for x in corpus if x != ""]
dictionary = list(set(corpus))

In [10]:
# Subsetting sentences of 40 words each
sent_len = 40+1 # The first 40 words we created are going to be used as feature while the last world will be predicted
steps = 1 # Setting step size of the words
all_sentences = []
for i in range(sent_len, len(corpus)):
  sentence = corpus[i - sent_len: i] # sliding window, dividing the whole text into multiple strings, each of length 31...
  sentence = ' '.join(sentence)
  all_sentences.append(sentence)

In [None]:
all_sentences[:10]

In [12]:
#Transforming the words to numbers
tokenizer = Tokenizer()
tokenizer.fit_on_texts(all_sentences)
seq = tokenizer.texts_to_sequences(all_sentences)

In [13]:
# Converting one dimentional list to multidimensional container of items of the same type and size
seq = np.vstack(seq)

In [14]:
# Using first 40 columns of each rows as features and 41st as target variable...
X = seq[:, :-1]
y = seq[:, -1]

In [15]:
# One hot encoding the the test variable
y = to_categorical(y) 

# LSTM Model

In [28]:
# Sequential LSTM model 
model = Sequential()

# input is the length of the dictionary output_dim is 50, and input length is 41
model.add(Embedding(len(tokenizer.word_index) + 1, 50, input_length = X.shape[1])) 

# The LSTM units are 64 and return_sequences are set to True
model.add(LSTM(64, return_sequences=True))

model.add(LSTM(64))
model.add(Dense(128, activation='relu'))
model.add(Dense(len(tokenizer.word_index) + 1, activation='softmax'))

In [24]:
model.summary()

Model: "sequential_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_1 (Embedding)     (None, 40, 50)            331400    
                                                                 
 lstm_2 (LSTM)               (None, 40, 64)            29440     
                                                                 
 lstm_3 (LSTM)               (None, 64)                33024     
                                                                 
 dense_2 (Dense)             (None, 128)               8320      
                                                                 
 dense_3 (Dense)             (None, 6628)              855012    
                                                                 
Total params: 1,257,196
Trainable params: 1,257,196
Non-trainable params: 0
_________________________________________________________________


In [None]:
# Compiling the model with adam optimizer
model.compile(loss = 'categorical_crossentropy', optimizer = 'adam', metrics = ['accuracy'])
lstm_history = model.fit(X, y, batch_size = 64, epochs=500)

In [30]:
def text_generator(model, tokenizer, seq_len, feature_text, num_words):
  text = []
  for i in range(num_words):
    token = tokenizer.texts_to_sequences([feature_text])[0]
    token = pad_sequences([token], maxlen = seq_len, truncating='pre')
    # y_pred = model.predict_classes(token)
    y_pred = model.predict(token) 
    y_pred = np.argmax(y_pred, axis=1)

    pred_word = ''
    for word, idx in tokenizer.word_index.items():
      if idx == y_pred:
        pred_word = word
        break
    feature_text += " "+ pred_word
    text.append(pred_word)

  return " ".join(text)

In [31]:
model.save('model.h5')

# Testing

In [32]:
#LoaD model from the file
lstm_model = load_model('lstm_model_v2.h5')

In [33]:
num_of_words = 50 # number of words to be generated...

# input text...
text = """This is 50 word sentence to test some text generation from the writings of agatha Christie how exciting is!"""
        
text_generator(lstm_model, tokenizer, X.shape[1], text, num_of_words)

ValueError: in user code:

    File "C:\Users\Aggelos\AppData\Local\Programs\Python\Python310\lib\site-packages\keras\engine\training.py", line 1801, in predict_function  *
        return step_function(self, iterator)
    File "C:\Users\Aggelos\AppData\Local\Programs\Python\Python310\lib\site-packages\keras\engine\training.py", line 1790, in step_function  **
        outputs = model.distribute_strategy.run(run_step, args=(data,))
    File "C:\Users\Aggelos\AppData\Local\Programs\Python\Python310\lib\site-packages\keras\engine\training.py", line 1783, in run_step  **
        outputs = model.predict_step(data)
    File "C:\Users\Aggelos\AppData\Local\Programs\Python\Python310\lib\site-packages\keras\engine\training.py", line 1751, in predict_step
        return self(x, training=False)
    File "C:\Users\Aggelos\AppData\Local\Programs\Python\Python310\lib\site-packages\keras\utils\traceback_utils.py", line 67, in error_handler
        raise e.with_traceback(filtered_tb) from None
    File "C:\Users\Aggelos\AppData\Local\Programs\Python\Python310\lib\site-packages\keras\engine\input_spec.py", line 264, in assert_input_compatibility
        raise ValueError(f'Input {input_index} of layer "{layer_name}" is '

    ValueError: Input 0 of layer "sequential" is incompatible with the layer: expected shape=(None, 30), found shape=(None, 40)
