<a href="https://colab.research.google.com/github/AWTT237/ASSIGNMENT2/blob/main/ASS13.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# Dataset Download
import requests

url = "https://www.gutenberg.org/files/11/11-0.txt"
response = requests.get(url)
text = response.text

with open("alice.txt", "w", encoding="utf-8") as f:
    f.write(text)


In [3]:
# Dataset Preprocessing
start_idx = text.find("*** START OF THIS PROJECT GUTENBERG EBOOK")
end_idx = text.find("*** END OF THIS PROJECT GUTENBERG EBOOK")
clean_text = text[start_idx:end_idx]


In [4]:
# Lowercasing
lean_text = clean_text.lower()


In [5]:
# Remove unwanted characters
import re
clean_text = re.sub(r"[^a-zA-Z0-9\s.,!?]", "", clean_text)


In [11]:
# Tokenization
tokens = clean_text.split()  # word-level
tokens = list(clean_text)    # character-level


In [12]:
# Save Preprocessed Dataset
with open("alice_clean.txt", "w", encoding="utf-8") as f:
    f.write(" ".join(tokens))


In [13]:
# data preparation
import numpy as np
import requests
import re

# ------------------------------
# Download and clean text
# ------------------------------
url = "https://www.gutenberg.org/files/11/11-0.txt"  # Alice in Wonderland
response = requests.get(url)
text = response.text

# Remove Gutenberg header/footer
start_idx = text.find("*** START OF THIS PROJECT GUTENBERG EBOOK")
end_idx = text.find("*** END OF THIS PROJECT GUTENBERG EBOOK")
if start_idx != -1 and end_idx != -1:
    text = text[start_idx:end_idx]

# Lowercase and remove unwanted chars
text = text.lower()
text = re.sub(r"[^a-zA-Z0-9\s.,!?]", "", text)

print("Total characters in text:", len(text))


Total characters in text: 140246


In [14]:
# Sequence Generation
# Character-level tokenization
chars = sorted(list(set(text)))
char_to_idx = {c:i for i,c in enumerate(chars)}
idx_to_char = {i:c for i,c in enumerate(chars)}

# Parameters
seq_length = 50
step = 1

# Create sequences and next characters
sentences = []
next_chars = []

for i in range(0, len(text)-seq_length, step):
    sentences.append(text[i:i+seq_length])
    next_chars.append(text[i+seq_length])

print("Total sequences:", len(sentences))

# Vectorize
X = np.zeros((len(sentences), seq_length, len(chars)), dtype=np.bool_)
y = np.zeros((len(sentences), len(chars)), dtype=np.bool_)

for i, sentence in enumerate(sentences):
    for t, char in enumerate(sentence):
        X[i, t, char_to_idx[char]] = 1
    y[i, char_to_idx[next_chars[i]]] = 1


Total sequences: 140196


In [15]:
# Model Definition
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense

model = Sequential()
model.add(LSTM(128, input_shape=(seq_length, len(chars))))
model.add(Dense(len(chars), activation='softmax'))

model.compile(loss='categorical_crossentropy', optimizer='adam')
model.summary()


In [16]:
# Adjust Batch Size Automatically
batch_size = min(128, len(X))  # Avoid batch bigger than dataset
epochs = 20


In [17]:
# Model Training
model.fit(X, y, batch_size=batch_size, epochs=epochs)


Epoch 1/20
[1m1096/1096[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m148s[0m 133ms/step - loss: 2.7073
Epoch 2/20
[1m1096/1096[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m140s[0m 128ms/step - loss: 2.0636
Epoch 3/20
[1m1096/1096[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m141s[0m 129ms/step - loss: 1.8908
Epoch 4/20
[1m1096/1096[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m139s[0m 127ms/step - loss: 1.7759
Epoch 5/20
[1m1096/1096[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m138s[0m 126ms/step - loss: 1.6923
Epoch 6/20
[1m1096/1096[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m142s[0m 126ms/step - loss: 1.6265
Epoch 7/20
[1m1096/1096[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m142s[0m 126ms/step - loss: 1.5630
Epoch 8/20
[1m1096/1096[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m140s[0m 127ms/step - loss: 1.5242
Epoch 9/20
[1m1096/1096[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m139s[0m 126ms/step - loss: 1.4788
Epoch 10/20
[1m1096/1096[0m [32m━━

<keras.src.callbacks.history.History at 0x7871da859010>

In [18]:
# Text Generation Function
def generate_text(seed, length=400):
    generated = seed
    seed = seed[-seq_length:]  # ensure correct input length
    for _ in range(length):
        x_pred = np.zeros((1, seq_length, len(chars)))
        for t, char in enumerate(seed):
            x_pred[0, t, char_to_idx[char]] = 1
        preds = model.predict(x_pred, verbose=0)[0]
        next_index = np.argmax(preds)
        next_char = idx_to_char[next_index]
        generated += next_char
        seed = seed[1:] + next_char
    return generated

# Example usage
seed_text = "alice was beginning to get very tired of sitting"
print(generate_text(seed_text))


alice was beginning to get very tired of sittingi tn seehooraiaaaaaaaaa,yite eaio elelio teeleso o eele io aioo eeleli,oteelio elli ooo elradi teupese eli o ealoo o elra,i aooo ooo,ooo,yi ao eliodoo o aea aoooltooo oo oo oo,yooo oo oo o,oo ooofoo,yoo ooo oo oo,yi ao a o aea elei tneae elli ooo elradallio o yo oelso,ooo aoo o elraioooo,a oo ee otolso o y oole o,a io aoo ee elra o eleli tnisisoootooo oo oo o,oo ooofoo,yoo ooo oo oo oo,yi oo o o o


In [19]:
# Import necessary libraries
import numpy as np

# Assume the trained model and char mappings are already loaded:
# model, char_to_idx, idx_to_char, seq_length

def generate_text(seed, length=400):
    """
    Generate text from a seed string using the trained LSTM model.
    """
    generated = seed
    seed = seed[-seq_length:]  # ensure correct input length
    for _ in range(length):
        x_pred = np.zeros((1, seq_length, len(char_to_idx)))
        for t, char in enumerate(seed):
            if char in char_to_idx:
                x_pred[0, t, char_to_idx[char]] = 1
        preds = model.predict(x_pred, verbose=0)[0]
        next_index = np.argmax(preds)
        next_char = idx_to_char[next_index]
        generated += next_char
        seed = seed[1:] + next_char
    return generated

# ------------------------------
# Application Simulation
# ------------------------------
print("=== Creative Content Generator ===")
user_seed = input("Enter a seed sentence to start generating text: ")

generated_output = generate_text(user_seed, length=300)
print("\n--- Generated Content ---\n")
print(generated_output)


=== Creative Content Generator ===
Enter a seed sentence to start generating text: handsome guys

--- Generated Content ---

handsome guysaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa
