In [43]:
import numpy as np
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import GRU, Dense, Embedding, Dropout, LayerNormalization
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.metrics import TopKCategoricalAccuracy

import re

In [44]:
# Load the content of the text file
with open('gift.txt', 'r', encoding='utf-8') as file:
    content = file.read().lower()

# Remove punctuation and other unwanted characters
cleaned_content = re.sub(r'[^\w\s]', '', content)

# Split content into sentences and filter out empty lines
sentence_list = cleaned_content.split('\n')
non_empty_sentences = [sentence for sentence in sentence_list if sentence.strip()]


In [45]:
# Initialize Tokenizer and fit it on the non-empty sentences
tokenizer = Tokenizer()
tokenizer.fit_on_texts(non_empty_sentences)

# Convert sentences into sequences of tokens
token_sequences = []
for sentence in non_empty_sentences:
    tokenized_words = tokenizer.texts_to_sequences([sentence])[0]
    for length in range(1, len(tokenized_words)):
        token_sequences.append(tokenized_words[:length + 1])

# Pad the token sequences to make them equal in length
max_length = max(len(seq) for seq in token_sequences)
padded_sequences = pad_sequences(token_sequences, maxlen=max_length, padding='pre')

# Split padded sequences into input features (X) and target labels (y)
padded_sequences = np.array(padded_sequences)
X, y = padded_sequences[:, :-1], padded_sequences[:, -1]
y = tf.keras.utils.to_categorical(y, num_classes=len(tokenizer.word_index) + 1)


In [46]:
# Define the vocabulary size and dimensions for embeddings
vocabulary_size = len(tokenizer.word_index) + 1  # Adding 1 for padding purposes
embedding_dimensions = 50
gru_hidden_units = 128

# Construct the neural network model
text_generation_model = Sequential([
    Embedding(vocabulary_size, embedding_dimensions, input_length=max_length - 1),
    GRU(gru_hidden_units, return_sequences=True),
    GRU(gru_hidden_units),
    Dense(vocabulary_size, activation='softmax')
])

# Compile the model with optimizer, loss, and performance metrics
text_generation_model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy' ,TopKCategoricalAccuracy(k=5)])


In [48]:
# Train the text generation model
num_epochs = 75
training_history = text_generation_model.fit(X, y, batch_size=64, epochs=num_epochs)


Epoch 1/75
[1m73/73[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 28ms/step - accuracy: 0.0589 - loss: 5.9482 - top_k_categorical_accuracy: 0.1641
Epoch 2/75
[1m73/73[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 29ms/step - accuracy: 0.0610 - loss: 5.9043 - top_k_categorical_accuracy: 0.1799
Epoch 3/75
[1m73/73[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 28ms/step - accuracy: 0.0708 - loss: 5.8959 - top_k_categorical_accuracy: 0.1859
Epoch 4/75
[1m73/73[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 31ms/step - accuracy: 0.0712 - loss: 5.8255 - top_k_categorical_accuracy: 0.1948
Epoch 5/75
[1m73/73[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 28ms/step - accuracy: 0.1174 - loss: 5.4872 - top_k_categorical_accuracy: 0.2324
Epoch 6/75
[1m73/73[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 29ms/step - accuracy: 0.1330 - loss: 5.2080 - top_k_categorical_accuracy: 0.2672
Epoch 7/75
[1m73/73[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m 

In [55]:
# Get the final values of the metrics from the training history
train_loss = training_history.history['loss'][-1]
train_accuracy = training_history.history['accuracy'][-1]
train_top5_accuracy = training_history.history['top_k_categorical_accuracy'][-1]  # Top-5 accuracy

# Print final metrics
print("\n--- Final Metrics ---")
print(f"Final Training Loss: {train_loss:.4f}")
print(f"Training Accuracy: {train_accuracy:.4f}")
print(f"Training Top-5 Accuracy: {train_top5_accuracy:.4f}")



--- Final Metrics ---
Final Training Loss: 0.2031
Training Accuracy: 0.9499
Training Top-5 Accuracy: 0.9894


In [56]:
from tensorflow.keras.preprocessing.sequence import pad_sequences
import numpy as np
from random import randint

# Function to generate a sequence using a language model
def generate_sequence(language_model, tokenizer, sequence_length, initial_text, num_words):
    generated_words = list()
    current_text = initial_text
    # Generate the specified number of words
    for _ in range(num_words):
        # Encode the current text as integers
        encoded_text = tokenizer.texts_to_sequences([current_text])[0]
        # Truncate the encoded text to a fixed sequence length
        truncated_text = pad_sequences([encoded_text], maxlen=sequence_length, truncating='pre')
        # Predict the next word probabilities
        predicted_word_index = np.argmax(language_model.predict(truncated_text, verbose=0), axis=-1)
        # Map the predicted word index to the corresponding word
        next_word = tokenizer.index_word[predicted_word_index[0]] if predicted_word_index[0] in tokenizer.index_word else ''
        # Append the predicted word to the current text
        current_text += ' ' + next_word
        generated_words.append(next_word)
    return ' '.join(generated_words)

# Assuming 'non_empty_sentences' is a list of seed texts and generating 10 sequences
for _ in range(10):
    random_seed_text = non_empty_sentences[randint(0, len(non_empty_sentences) - 1)]
    print("SEED TEXT:", random_seed_text)
    generated_text = generate_sequence(text_generation_model, tokenizer, max_length - 1, random_seed_text, 50)
    print("GENERATED TEXT:", generated_text + '\n')


SEED TEXT: metaphor she was ransacking the stores for jims present
GENERATED TEXT: a present and stood still while all all all it for her face dillingham had been saving ebooks with the chops on went that a mathematician or a wit would cat and valuethe ut 84116 friendsa friendsa dear friendsa 801 friendsa mammoth friendsa 84116 friendsa 801 dear friendsa friendsa friendsa

SEED TEXT: out of his trance jim seemed quickly to wake he enfolded his della
GENERATED TEXT: had been 5961887 email friendsa friendsa friendsa friendsa friendsa friendsa friendsa friendsa friendsa friendsa friendsa friendsa friendsa friendsa friendsa friendsa friendsa friendsa friendsa friendsa friendsa friendsa friendsa friendsa friendsa friendsa friendsa friendsa friendsa friendsa friendsa friendsa friendsa friendsa friendsa friendsa friendsa friendsa friendsa friendsa friendsa friendsa friendsa friendsa friendsa friendsa

SEED TEXT: release date january 1 2005 ebook 7256
GENERATED TEXT: if you do not charge a lit

In [59]:
# Define vocabulary size and dimensions for embedding
vocabulary_size = len(tokenizer.word_index) + 1
embedding_dimensions = 300  # FastText embedding dimension size
gru_hidden_units = 128

# Load FastText embeddings into a dictionary
fasttext_embeddings = {}
with open('wiki-news-300d-1M.vec', 'r', encoding='utf-8') as file:
    # Skip the first line if it contains the header (e.g., number of words and dimensions)
    next(file)
    for line in file:
        values = line.split()
        word = values[0]
        coefficients = np.asarray(values[1:], dtype='float32')
        fasttext_embeddings[word] = coefficients

# Create the embedding matrix using FastText embeddings
embedding_matrix = np.zeros((vocabulary_size, embedding_dimensions))
for word, index in tokenizer.word_index.items():
    embedding_vector = fasttext_embeddings.get(word)
    if embedding_vector is not None:
        embedding_matrix[index] = embedding_vector

# Build the model using pre-trained FastText embeddings
text_model = Sequential([
    Embedding(input_dim=vocabulary_size, output_dim=embedding_dimensions, 
              input_length=max_length - 1, 
              weights=[embedding_matrix], trainable=False),
    GRU(gru_hidden_units, return_sequences=True),
    GRU(gru_hidden_units),
    Dense(vocabulary_size, activation='softmax')
])

# Compile the model with the optimizer, loss function, and accuracy metric
text_model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy', TopKCategoricalAccuracy(k=5)])


In [60]:
# Train the model
num_epochs = 50
history = text_model.fit(X, y, batch_size=64, epochs=num_epochs)

Epoch 1/75
[1m73/73[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 26ms/step - accuracy: 0.0467 - loss: 6.7254 - top_k_categorical_accuracy: 0.1426
Epoch 2/75
[1m73/73[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 29ms/step - accuracy: 0.0596 - loss: 5.9877 - top_k_categorical_accuracy: 0.1804
Epoch 3/75
[1m73/73[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 31ms/step - accuracy: 0.0641 - loss: 5.9528 - top_k_categorical_accuracy: 0.1833
Epoch 4/75
[1m73/73[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 32ms/step - accuracy: 0.0514 - loss: 5.9422 - top_k_categorical_accuracy: 0.1812
Epoch 5/75
[1m73/73[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 32ms/step - accuracy: 0.0804 - loss: 5.7950 - top_k_categorical_accuracy: 0.1945
Epoch 6/75
[1m73/73[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 31ms/step - accuracy: 0.1069 - loss: 5.6368 - top_k_categorical_accuracy: 0.2137
Epoch 7/75
[1m73/73[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m 

In [61]:
# Generate and display text sequences using a seed text
for _ in range(10):
    # Select a random seed text from the non-empty sentences
    random_seed_text = non_empty_sentences[randint(0, len(non_empty_sentences) - 1)]
    print("SEED TEXT:", random_seed_text)
    
    # Generate a sequence using the language model
    generated_text = generate_sequence(text_model, tokenizer, max_length - 1, random_seed_text, 50)
    print("GENERATED TEXT:", generated_text + '\n')


SEED TEXT: dillingham young came home and reached his flat above he was called
GENERATED TEXT: the solicitation of the united states we do not agree to the terms of this agreement for keeping the work as long as set forth in the terms of the full project gutenberg license must appear created the new work in the united other states states and most other parts

SEED TEXT: she found it at last it surely had been made for jim and no one else
GENERATED TEXT: in the user to return or destroy all of this work in the person of the work in the united states we do not agree to the terms of the full project gutenberg license must appear about the foundation the project gutenberg mission of promoting the chops on account in

SEED TEXT: country other than the united states
GENERATED TEXT: and most other parts of the project gutenberg license when any particular paper and distributing a project gutenberg work any works appears in the foundation the project gutenberg electronic for paper walking a gr