# Pre-processing

In [10]:
import pandas as pd
import numpy as np
import re
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.utils import to_categorical
import os

def preprocess_text(file_path, sequence_length=10):
    """ Preprocesses the text data for training. """
    
    # Step 1: Load the dataset
    df = pd.read_csv(file_path)
    
    # Combine all poetry into a single text
    all_poetry = " ".join(df["Poetry"].astype(str).tolist()).lower()

    # Step 2: Remove unnecessary characters
    cleaned_text = re.sub(r"[^a-zA-Z0-9\s]", "", all_poetry)  # Keep only alphanumeric characters and spaces
    cleaned_text = re.sub(r"\s+", " ", cleaned_text).strip()  # Remove extra spaces

    # Step 3: Tokenize the text
    tokenizer = Tokenizer()
    tokenizer.fit_on_texts([cleaned_text])

    # Convert text to sequences of integers
    sequence_data = tokenizer.texts_to_sequences([cleaned_text])[0]

    # Vocabulary size
    vocab_size = len(tokenizer.word_index) + 1

    # Step 4: Generate sequences
    sequences = []
    for i in range(sequence_length, len(sequence_data)):
        seq = sequence_data[i-sequence_length:i+1]
        sequences.append(seq)

    sequences = np.array(sequences)

    # Step 5: Save processed data
    preprocessed_file = os.path.join("data", "preprocessed_sequences.csv")
    np.savetxt(preprocessed_file, sequences, delimiter=",")
    
    return preprocessed_file, vocab_size, tokenizer


Vocabulary size: 15414
Generated 156924 sequences.
✅ Preprocessed sequences saved to: preprocessed_sequences.csv
Shape of X: (156924, 10)
Shape of y: (156924, 15414)


# Spliting Data

In [11]:
from sklearn.model_selection import train_test_split
import pandas as pd
import numpy as np

# Load the preprocessed sequence data (Ensure you have sequences saved from the first cell)
data_path = "preprocessed_sequences.csv"  # This file is now created in the first cell
sequences = np.loadtxt(data_path, delimiter=",")

# Split sequences into input (X) and output (y)
X, y = sequences[:, :-1], sequences[:, -1]

# Split data into Training (80%) and Temporary Set (20%)
X_train, X_temp, y_train, y_temp = train_test_split(X, y, test_size=0.2, random_state=42)

# Split the Temporary Set into Validation (10%) and Testing (10%)
X_val, X_test, y_val, y_test = train_test_split(X_temp, y_temp, test_size=0.5, random_state=42)

# Save the datasets to CSV files
train_data = pd.DataFrame(np.column_stack([X_train, y_train]))
val_data = pd.DataFrame(np.column_stack([X_val, y_val]))
test_data = pd.DataFrame(np.column_stack([X_test, y_test]))

train_data.to_csv("train_data.csv", index=False, header=False)
val_data.to_csv("val_data.csv", index=False, header=False)
test_data.to_csv("test_data.csv", index=False, header=False)

print("✅ Data successfully split and saved!")
print("📂 Train Data: train_data.csv")
print("📂 Validation Data: val_data.csv")
print("📂 Test Data: test_data.csv")


✅ Data successfully split and saved!
📂 Train Data: train_data.csv
📂 Validation Data: val_data.csv
📂 Test Data: test_data.csv


In [13]:
import numpy as np
import pandas as pd
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.optimizers import Adam

# ✅ Step 1: Load the Preprocessed Data
train_data = pd.read_csv("train_data.csv", header=None)
val_data = pd.read_csv("val_data.csv", header=None)
test_data = pd.read_csv("test_data.csv", header=None)

# Extract input (X) and output (y)
X_train, y_train = train_data.iloc[:, :-1].values, train_data.iloc[:, -1].values
X_val, y_val = val_data.iloc[:, :-1].values, val_data.iloc[:, -1].values
X_test, y_test = test_data.iloc[:, :-1].values, test_data.iloc[:, -1].values

# Get vocabulary size from the dataset
vocab_size = np.max(X_train) + 1  # Largest token ID + 1

# ✅ Step 2: Build the LSTM Model
model = Sequential([
    Embedding(vocab_size, 100, input_length=X_train.shape[1]),  # Embedding layer
    LSTM(150, return_sequences=True),  # First LSTM layer
    LSTM(150),  # Second LSTM layer
    Dense(150, activation="relu"),  # Dense layer
    Dense(vocab_size, activation="softmax")  # Output layer
])

# Compile the model
model.compile(loss="sparse_categorical_crossentropy", optimizer=Adam(learning_rate=0.001), metrics=["accuracy"])

# ✅ Step 3: Train the Model
epochs = 50  # Adjust based on performance
history = model.fit(
    X_train, y_train,
    epochs=epochs,
    batch_size=128,
    validation_data=(X_val, y_val),
    verbose=1
)

# ✅ Step 4: Save the Model
model.save("lstm_poetry_model.h5")
print("✅ Model trained and saved as 'lstm_poetry_model.h5'.")

# ✅ Step 5: Function to Generate Poetry
def generate_poetry(seed_text, next_words, model, tokenizer, sequence_length):
    for _ in range(next_words):
        token_list = tokenizer.texts_to_sequences([seed_text])[0]
        token_list = pad_sequences([token_list], maxlen=sequence_length, padding='pre')

        predicted = model.predict(token_list, verbose=0)
        predicted_word_index = np.argmax(predicted)

        # Convert token ID back to word
        for word, index in tokenizer.word_index.items():
            if index == predicted_word_index:
                seed_text += " " + word
                break

    return seed_text

# ✅ Step 6: Load Model & Generate Poetry
model = tf.keras.models.load_model("lstm_poetry_model.h5")

# Example: Generate Poetry
seed_text = "mujh se pehli si mohabbat"  # Provide some words
generated_poetry = generate_poetry(seed_text, 50, model, tokenizer, X_train.shape[1])
print("📝 Generated Poetry:\n", generated_poetry)


Epoch 1/50
[1m981/981[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m125s[0m 122ms/step - accuracy: 0.0538 - loss: 7.1118 - val_accuracy: 0.0572 - val_loss: 6.6125
Epoch 2/50
[1m981/981[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m119s[0m 122ms/step - accuracy: 0.0566 - loss: 6.4837 - val_accuracy: 0.0642 - val_loss: 6.6607
Epoch 3/50
[1m981/981[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m119s[0m 121ms/step - accuracy: 0.0679 - loss: 6.3417 - val_accuracy: 0.0716 - val_loss: 6.6789
Epoch 4/50
[1m981/981[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m117s[0m 120ms/step - accuracy: 0.0785 - loss: 6.1983 - val_accuracy: 0.0763 - val_loss: 6.7084
Epoch 5/50
[1m981/981[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m123s[0m 125ms/step - accuracy: 0.0859 - loss: 6.0560 - val_accuracy: 0.0797 - val_loss: 6.7546
Epoch 6/50
[1m981/981[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m119s[0m 121ms/step - accuracy: 0.0915 - loss: 5.9141 - val_accuracy: 0.0793 - val_loss: 6.8138
Epoc



✅ Model trained and saved as 'lstm_poetry_model.h5'.


In [18]:
import numpy as np
import pandas as pd
import tensorflow as tf
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.optimizers import Adam

# ✅ Step 1: Load the Trained Model
model = tf.keras.models.load_model("lstm_poetry_model.h5")

# ✅ Fix the Warning: Recompile the Model
model.compile(loss="sparse_categorical_crossentropy", optimizer=Adam(learning_rate=0.001), metrics=["accuracy"])

# ✅ Step 2: Load the Tokenizer
data_path = "Roman-Urdu-Poetry.csv"  # Ensure this is the same dataset used before
df = pd.read_csv(data_path)

# Combine all poetry to recreate the tokenizer
all_poetry = " ".join(df["Poetry"].astype(str).tolist()).lower()

# Tokenize again (ensure consistency with training)
from tensorflow.keras.preprocessing.text import Tokenizer
tokenizer = Tokenizer()
tokenizer.fit_on_texts([all_poetry])

# Get sequence length from training
sequence_length = 10  # Ensure this matches the training sequence length

# ✅ Step 3: Function to Generate Poetry
def generate_poetry(seed_text, next_words, model, tokenizer, sequence_length):
    for _ in range(next_words):
        # Convert seed text to numerical tokens
        token_list = tokenizer.texts_to_sequences([seed_text])[0]
        
        # Pad the sequence to match input length
        token_list = pad_sequences([token_list], maxlen=sequence_length, padding='pre')

        # Predict the next word
        predicted = model.predict(token_list, verbose=0)
        predicted_word_index = np.argmax(predicted)

        # Convert token ID back to word
        for word, index in tokenizer.word_index.items():
            if index == predicted_word_index:
                seed_text += " " + word
                break

    return seed_text

# ✅ Step 4: Generate Poetry
seed_text = "hasina daniya ky laab"  # Provide some starting words
generated_poetry = generate_poetry(seed_text, 50, model, tokenizer, sequence_length)

print("📝 Generated Poetry:\n", generated_poetry)




📝 Generated Poetry:
 hasina daniya ky laab ġhazab tark toḍ ī le kar kī kaifiyateñ kar koī manzil mujh hai e nahīñ kyā gyārahvīñ maiñ to ke taḍpā koshish meñ jashn ke suḳhan hunar ke ga jasta hai sau haiñ kis e vo kā diyā e vo pākīza ke baaqī e aur jo hai sanjān meñ to
