# Original Code

In [1]:
"""
This Python script builds and trains a Recurrent Neural Network (RNN) to predict the next word in a given text sequence.
The model uses an Embedding layer and a SimpleRNN layer. It tokenizes the input text, creates sequences for training,
and uses these sequences to train the model.
"""
import numpy as np
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, SimpleRNN, Dense
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.utils import to_categorical

# Texto de ejemplo para entrenamiento
text = "In a far away land, there was a small village with great people. The village was surrounded by mountains and rivers."

# Tokenizar el texto
tokenizer = Tokenizer()
tokenizer.fit_on_texts([text])
sequences = tokenizer.texts_to_sequences([text])[0]

vocab_size = len(tokenizer.word_index) + 1
sequence_length = 5

# Crear dataset de entrenamiento
X = []
y = []

for i in range(len(sequences) - sequence_length):
    X.append(sequences[i:i + sequence_length])
    y.append(sequences[i + sequence_length])

X = np.array(X)
y = to_categorical(y, num_classes=vocab_size)

# Crear y entrenar el modelo RNN
model = Sequential()
model.add(Embedding(vocab_size, 10, input_length=sequence_length))
model.add(SimpleRNN(50, activation='tanh'))
model.add(Dense(vocab_size, activation='softmax'))

model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])
model.fit(X, y, epochs=20, batch_size=32)

# Guardar el modelo entrenado
model.save('text_prediction_rnn.h5')

# Generar texto utilizando el modelo entrenado
def generate_text(model, tokenizer, seed_text, num_words):
    for _ in range(num_words):
        encoded = tokenizer.texts_to_sequences([seed_text])[0]
        if len(encoded) < sequence_length:
            encoded = np.pad(encoded, (sequence_length - len(encoded), 0), 'constant')
        else:
            encoded = np.array(encoded[-sequence_length:])
        encoded = encoded.reshape(1, sequence_length)

        y_pred = np.argmax(model.predict(encoded), axis=-1)
        word = tokenizer.index_word[y_pred[0]]
        seed_text += ' ' + word
    return seed_text

# Generar nuevo texto basado en un texto inicial
seed_text = "there was far"
generated_text = generate_text(model, tokenizer, seed_text, 10)
print(generated_text)

Epoch 1/20




[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 3s/step - accuracy: 0.0625 - loss: 2.9407
Epoch 2/20
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 31ms/step - accuracy: 0.0625 - loss: 2.9253
Epoch 3/20
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 35ms/step - accuracy: 0.0625 - loss: 2.9099
Epoch 4/20
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 34ms/step - accuracy: 0.1250 - loss: 2.8942
Epoch 5/20
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 30ms/step - accuracy: 0.1250 - loss: 2.8782
Epoch 6/20
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 32ms/step - accuracy: 0.2500 - loss: 2.8618
Epoch 7/20
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 30ms/step - accuracy: 0.3750 - loss: 2.8448
Epoch 8/20
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 33ms/step - accuracy: 0.3750 - loss: 2.8272
Epoch 9/20
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 32ms



[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 344ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 36ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 35ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 24ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 28ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 27ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 23ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 27ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 25ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 20ms/step
there was far was was was was was was was was was was


# Modified Code

In [2]:
from tensorflow.keras.layers import LSTM, Dropout

# Example text for training
text = "In a far away land, there was a small village with great people. The village was surrounded by mountains and rivers."

# Tokenize the text
tokenizer = Tokenizer()
tokenizer.fit_on_texts([text])
sequences = tokenizer.texts_to_sequences([text])[0]

vocab_size = len(tokenizer.word_index) + 1
sequence_length = 5  # Can experiment with different values like 10 or 15

# Create training dataset
X = []
y = []

for i in range(len(sequences) - sequence_length):
    X.append(sequences[i:i + sequence_length])
    y.append(sequences[i + sequence_length])

X = np.array(X)
y = to_categorical(y, num_classes=vocab_size)

# Create and train the modified RNN model
model = Sequential()
model.add(Embedding(vocab_size, 50, input_length=sequence_length))  # Increased embedding size
model.add(Dropout(0.2))  # Dropout after embedding layer
model.add(LSTM(50, activation='tanh'))  # Replace SimpleRNN with LSTM
model.add(Dropout(0.2))  # Dropout after LSTM layer
model.add(Dense(vocab_size, activation='softmax'))

model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])
model.fit(X, y, epochs=20, batch_size=32)

# Save the trained model
model.save('text_prediction_rnn_modified.h5')

# Generate text using the trained model
def generate_text(model, tokenizer, seed_text, num_words):
    for _ in range(num_words):
        encoded = tokenizer.texts_to_sequences([seed_text])[0]
        if len(encoded) < sequence_length:
            encoded = np.pad(encoded, (sequence_length - len(encoded), 0), 'constant')
        else:
            encoded = np.array(encoded[-sequence_length:])
        encoded = encoded.reshape(1, sequence_length)

        y_pred = np.argmax(model.predict(encoded), axis=-1)
        word = tokenizer.index_word.get(y_pred[0], '')
        seed_text += ' ' + word
    return seed_text

# Generate new text based on initial text
seed_text = "there was far"
generated_text = generate_text(model, tokenizer, seed_text, 10)
print(generated_text)

Epoch 1/20
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 3s/step - accuracy: 0.0625 - loss: 2.9454
Epoch 2/20
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 62ms/step - accuracy: 0.0625 - loss: 2.9422
Epoch 3/20
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 33ms/step - accuracy: 0.0000e+00 - loss: 2.9409
Epoch 4/20
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 31ms/step - accuracy: 0.1250 - loss: 2.9353
Epoch 5/20
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 34ms/step - accuracy: 0.0000e+00 - loss: 2.9377
Epoch 6/20
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 30ms/step - accuracy: 0.1875 - loss: 2.9308
Epoch 7/20
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 31ms/step - accuracy: 0.0625 - loss: 2.9275
Epoch 8/20
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 30ms/step - accuracy: 0.3750 - loss: 2.9235
Epoch 9/20
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m



[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 239ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 24ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 27ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 22ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 20ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 22ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 27ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 23ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 24ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 23ms/step
there was far was was small village village village the village village was


**Explanation of Changes:**

* LSTM Layer Replacement: LSTM should help the model generate more contextually coherent text over a sequence, handling dependencies between distant words more effectively than SimpleRNN.
* Increased Embedding Dimension: This should enrich word representations, allowing the model to better understand relationships between words.
* Dropout: May improve generalization, particularly beneficial if training data is limited, which can prevent the model from overfitting.

**Observations on the Original Code:**

The generated text ("there was far was was was was was...") indicates that the SimpleRNN struggles to maintain context and introduces significant repetition. This is common for SimpleRNN layers because they tend to lose context over longer sequences, causing the model to repeat the same words.

**Observations on the Modified Code:**

The modified model generates text with slightly more variety ("there was far was was small village village village..."). Although there’s still repetition, the LSTM and increased embedding size help capture more meaningful word associations like "small" and "village." This improvement demonstrates that the model is learning some contextual relationships, though it’s still prone to repetition.