# Daily Challenge: Creating A Text Generator

## Import Libraries And Load Data:

In [1]:
import re
import requests
import numpy as np
import nltk
import tensorflow
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import EarlyStopping

In [2]:
nltk.download('punkt')
nltk.download('stopwords')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [3]:
# Download the text from the given URL
url = "https://www.gutenberg.org/cache/epub/11/pg11.txt"
response = requests.get(url)
raw_text = response.text

## Preprocess Text Data:

In [4]:
def preprocess_text(text):
    # Remove HTML tags
    text = re.sub(r'<.*?>', '', text)

    # Remove non-alphabetic characters and extra whitespaces
    text = re.sub(r'[^a-zA-Z\s]', '', text)
    text = re.sub(r'\s+', ' ', text).strip()

    return text

In [5]:
# Split the text into sentences
def split_text_into_sentences(text):
    sentences = nltk.sent_tokenize(text)
    return sentences

In [6]:
# Preprocess the raw text
cleaned_text = preprocess_text(raw_text)
sentences = split_text_into_sentences(cleaned_text)

In [7]:
# Print the first 200 characters of the corpus
corpus = " ".join(sentences)
print("First 200 characters of the corpus:")
print(corpus[:200])

First 200 characters of the corpus:
The Project Gutenberg eBook of Alices Adventures in Wonderland This ebook is for the use of anyone anywhere in the United States and most other parts of the world at no cost and with almost no restric


In [8]:
# Create vocabulary and calculate total_words
tokenizer = Tokenizer()
tokenizer.fit_on_texts(sentences)
total_words = len(tokenizer.word_index) + 1

print("\nTotal number of words in the vocabulary:", total_words)


Total number of words in the vocabulary: 3193


## Prepare Input And Output Data:

In [9]:
# Create n-gram sequences and pad the input data
input_sequence = []
seq_length = 50

for sentence in sentences:
    tokenized_sentence = tokenizer.texts_to_sequences([sentence])[0]
    for i in range(1, len(tokenized_sentence)):
        n_gram_sequence = tokenized_sentence[:i+1]
        input_sequence.append(n_gram_sequence)

max_sequence_length = max([len(seq) for seq in input_sequence])

In [10]:
from tensorflow.keras.preprocessing.sequence import pad_sequences
# Pad sequences
padded_input_sequence = pad_sequences(input_sequence, maxlen=max_sequence_length, padding='pre')

# Display a sample of the padded input sequence
print("Sample of the padded input sequence:")
print(padded_input_sequence[:5])

Sample of the padded input sequence:
[[  0   0   0 ...   0   1  46]
 [  0   0   0 ...   1  46  47]
 [  0   0   0 ...  46  47 300]
 [  0   0   0 ...  47 300   5]
 [  0   0   0 ... 300   5 236]]


## Build The Neural Network Model:

In [11]:
# Define the model
model = Sequential()

# Add Embedding layer for text representation
embedding_dim = 22
model.add(Embedding(input_dim=total_words, output_dim=embedding_dim, input_length=max_sequence_length))
model.add(LSTM(30, return_sequences=True))
model.add(Dropout(0.2))
model.add(LSTM(30))
model.add(Dropout(0.2))
model.add(Dense(total_words, activation='softmax'))

## Compile And Train The Model:

In [12]:
# Compile the model
optimizer = Adam(learning_rate=0.001)
model.compile(optimizer=optimizer, loss='categorical_crossentropy', metrics=['accuracy'])

# Display the summary of the model
model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 29463, 22)         70246     
                                                                 
 lstm (LSTM)                 (None, 29463, 30)         6360      
                                                                 
 dropout (Dropout)           (None, 29463, 30)         0         
                                                                 
 lstm_1 (LSTM)               (None, 30)                7320      
                                                                 
 dropout_1 (Dropout)         (None, 30)                0         
                                                                 
 dense (Dense)               (None, 3193)              98983     
                                                                 
Total params: 182909 (714.49 KB)
Trainable params: 18290

In [None]:
# Train the model with EarlyStopping
early_stopping = EarlyStopping(monitor='val_loss', patience=3, restore_best_weights=True)
history = model.fit(padded_input_sequence[:, :-1], padded_input_sequence[:, -1], epochs=20, validation_split=0.2, callbacks=[early_stopping])

In [None]:
# Evaluate the model
loss, accuracy = model.evaluate(padded_input_sequence[:, :-1], padded_input_sequence[:, -1])
print(f'Test Loss: {loss}, Test Accuracy: {accuracy}')

In [None]:
# Create generate_text() function
def generate_text(seed_text, next_words, model, tokenizer, max_sequence_length):
    for _ in range(next_words):
        token_list = tokenizer.texts_to_sequences([seed_text])[0]
        token_list = pad_sequences([token_list], maxlen=max_sequence_length-1, padding='pre')
        predicted_probs = model.predict(token_list)
        predicted_word_index = np.argmax(predicted_probs)
        predicted_word = tokenizer.index_word[predicted_word_index]
        seed_text += " " + predicted_word
    return seed_text

In [None]:
# Example usage of generate_text()
seed_text = "Alice"
generated_text = generate_text(seed_text, next_words=10, model=model, tokenizer=tokenizer, max_sequence_length=max_sequence_length)
print("Generated Text:", generated_text)