<a href="https://colab.research.google.com/github/DrashtiDholariya/lstm_text_generator/blob/main/lstm_text_generator.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [3]:
!pip install tensorflow



In [4]:
!pip install -U numpy==2.0.2




In [27]:
import re
import numpy as np
import tensorflow as tf

from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense, Embedding, Input
from tensorflow.keras.utils import to_categorical



In [28]:
# Load the Shakespeare corpus from disk (UTF-8) and fold it to lower case
with open("/content/data/shakespeare.txt", mode="r", encoding="utf-8") as file:
    text = file.read()
text = text.lower()

# Quick sanity check: corpus size and the first 300 characters
print("Total characters (raw):", len(text))
print("Sample text:\n", text[:300])

Total characters (raw): 5166764
Sample text:
 the project gutenberg ebook of the complete works of william shakespeare
    
this ebook is for the use of anyone anywhere in the united states and
most other parts of the world at no cost and with almost no restrictions
whatsoever. you may copy it, give it away or re-use it under the terms
of the p


In [29]:
# Clean text data
# Drop every character that is not a lowercase ASCII letter or whitespace.
# Note that \s keeps newlines (and any other whitespace), not only spaces.
# Afterwards the corpus is capped at MAX_CHARS characters.
MAX_CHARS = 300_000
disallowed_chars = re.compile(r"[^a-z\s]")

text = disallowed_chars.sub("", text)[:MAX_CHARS]

print("\nAfter cleaning:")
print("Total characters (cleaned):", len(text))
print(text[:300])


After cleaning:
Total characters (cleaned): 300000
the project gutenberg ebook of the complete works of william shakespeare
    
this ebook is for the use of anyone anywhere in the united states and
most other parts of the world at no cost and with almost no restrictions
whatsoever you may copy it give it away or reuse it under the terms
of the proj


In [30]:
# Character-level tokenization
# The vocabulary is every distinct character of the corpus, in sorted order
chars = sorted(set(text))
print("\nUnique characters:", len(chars))

# Lookup tables in both directions: id -> char and char -> id
idx_to_char = dict(enumerate(chars))
char_to_idx = {ch: pos for pos, ch in idx_to_char.items()}

# Translate the whole corpus into its sequence of integer ids
encoded_text = np.array([char_to_idx[ch] for ch in text])
print("Encoded sample:", encoded_text[:50])



Unique characters: 28
Encoded sample: [21  9  6  1 17 19 16 11  6  4 21  1  8 22 21  6 15  3  6 19  8  1  6  3
 16 16 12  1 16  7  1 21  9  6  1  4 16 14 17 13  6 21  6  1 24 16 19 12
 20  1]


In [31]:
# Create input-output sequences
# Every sample is a window of SEQ_LENGTH consecutive character ids; its target
# is the id that immediately follows the window, one-hot encoded.
SEQ_LENGTH = 40
n_samples = len(encoded_text) - SEQ_LENGTH

X = np.array(
    [encoded_text[start:start + SEQ_LENGTH] for start in range(n_samples)]
)
y = to_categorical(
    [encoded_text[start + SEQ_LENGTH] for start in range(n_samples)],
    num_classes=len(chars),
)

print("\nX shape:", X.shape)
print("Y shape:", y.shape)


X shape: (299960, 40)
Y shape: (299960, 28)


In [32]:
# Build the LSTM model:
# character ids -> embedding -> one LSTM layer -> softmax over the vocabulary
vocab_size = len(chars)

layer_stack = [
    Input(shape=(SEQ_LENGTH,)),
    Embedding(input_dim=vocab_size, output_dim=50),
    LSTM(128),
    Dense(vocab_size, activation="softmax"),
]
model = Sequential(layer_stack)

# Targets are one-hot vectors, hence categorical cross-entropy
model.compile(optimizer="adam", loss="categorical_crossentropy")

model.summary()

In [33]:
# Train the model; validation_split asks Keras to hold out that fraction of
# the samples for validation instead of training on them.
fit_options = {
    "batch_size": 128,
    "epochs": 20,
    "validation_split": 0.1,
}
model.fit(X, y, **fit_options)

Epoch 1/20
[1m2110/2110[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m17s[0m 7ms/step - loss: 2.4091 - val_loss: 1.9772
Epoch 2/20
[1m2110/2110[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m13s[0m 6ms/step - loss: 1.8325 - val_loss: 1.7743
Epoch 3/20
[1m2110/2110[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m13s[0m 6ms/step - loss: 1.6789 - val_loss: 1.6707
Epoch 4/20
[1m2110/2110[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m14s[0m 6ms/step - loss: 1.5823 - val_loss: 1.6162
Epoch 5/20
[1m2110/2110[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m21s[0m 6ms/step - loss: 1.5261 - val_loss: 1.5759
Epoch 6/20
[1m2110/2110[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m13s[0m 6ms/step - loss: 1.4908 - val_loss: 1.5554
Epoch 7/20
[1m2110/2110[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m13s[0m 6ms/step - loss: 1.4569 - val_loss: 1.5351
Epoch 8/20
[1m2110/2110[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m13s[0m 6ms/step - loss: 1.4295 - val_loss: 1.5186
Epoch 9/20
[1m2

<keras.src.callbacks.history.History at 0x78ecb4ce5d90>

In [34]:
def sample_with_temperature(preds, temperature=0.8):
    """Sample a class index from a probability vector after temperature scaling.

    The probabilities are converted to log space, divided by `temperature`,
    and re-normalised with a softmax before one index is drawn at random.
    Temperatures below 1 sharpen the distribution, temperatures above 1
    flatten it.

    Args:
        preds: 1-D array-like of probabilities (e.g. a softmax output).
        temperature: Scaling factor. A value <= 0 selects the most likely
            index deterministically instead of dividing by zero.

    Returns:
        The sampled index as an int in ``range(len(preds))``.
    """
    preds = np.asarray(preds).astype("float64")

    # Zero (or negative) temperature is the greedy limit; dividing by it would
    # produce inf/NaN probabilities and make np.random.choice raise.
    if temperature <= 0:
        return int(np.argmax(preds))

    # The small epsilon keeps log() finite for zero probabilities.
    scaled = np.log(preds + 1e-8) / temperature
    # Shift so the largest value is 0: softmax is unchanged, but exp() can no
    # longer underflow to all zeros (0/0 -> NaN) for very small temperatures.
    scaled -= np.max(scaled)
    exp_preds = np.exp(scaled)
    probs = exp_preds / np.sum(exp_preds)
    return int(np.random.choice(len(probs), p=probs))

In [35]:
# Text generation
def generate_text(seed, length=300, temperature=0.8):
    """Generate `length` characters that continue `seed`, one at a time.

    Args:
        seed: Starting text. It is lower-cased before use; characters that
            are missing from `char_to_idx` are encoded as index 0.
        length: Number of characters to generate.
        temperature: Forwarded to `sample_with_temperature`.

    Returns:
        The lower-cased seed followed by the generated characters.
    """
    seed = seed.lower()

    # Encode the seed once and keep only the last SEQ_LENGTH ids (the part the
    # model can see) instead of re-encoding the full seed string on every step.
    context = [char_to_idx.get(c, 0) for c in seed][-SEQ_LENGTH:]
    generated = []

    for _ in range(length):
        # pad_sequences left-pads with 0 while the context is still shorter
        # than SEQ_LENGTH.
        seq = pad_sequences([context], maxlen=SEQ_LENGTH)

        prediction = model.predict(seq, verbose=0)[0]
        next_index = sample_with_temperature(prediction, temperature)
        generated.append(idx_to_char[next_index])

        # Let a short context grow until it fills the window, then slide it.
        # (Previously a short seed kept its original length forever, so the
        # generated characters were dropped from the model's context.)
        context = (context + [next_index])[-SEQ_LENGTH:]

    return seed + "".join(generated)

typed_seed = input("\nEnter starting text: ").lower()

# The model input is left-padded with spaces up to SEQ_LENGTH (rjust leaves
# longer seeds untouched). Remember how much padding was added so it can be
# removed from the generated text afterwards.
user_seed = typed_seed.rjust(SEQ_LENGTH)
pad_width = len(user_seed) - len(typed_seed)

try:
    gen_length = int(input("Enter number of characters to generate: "))
except ValueError:
    # Fall back to generate_text's default length instead of crashing on
    # non-numeric input.
    gen_length = 300
    print("Invalid number; generating 300 characters instead.")

# generate_text returns the (padded) seed followed by the new characters;
# slice off the artificial left padding so it does not leak into the output.
output = generate_text(user_seed, gen_length, temperature=0.8)[pad_width:]

print("\nGenerated Text:\n")
print(output)

# Persist the prompt and the generated continuation for later inspection
with open("sample_output.txt", "w", encoding="utf-8") as f:
    f.write("User Input:\n")
    f.write(user_seed.strip() + "\n\n")
    f.write("Generated Output:\n")
    f.write(output)

print("\nSaved to sample_output.txt successfully")





Enter starting text: shall i compare thee
Enter number of characters to generate: 500

GENERATED TEXT:

                    shall i compare thee a many and they may concribla and many great
for thou dost is not be of threat that dreach
the thine play deing me to be doth true still
scene xii
a passive a cloud not had


                    

let take a creasure issued i believe beare
that is cannot since see a room in the redument
whither doth heavens stands

bertram
his hearts and dispactions bade but i will be present better touse

first lord
o thank you are that body down to thee
the post that with my marry thou being more more
is it o

Saved successfully to sample_output.txt
