In [21]:
import pandas as pd

# Read only the "Poetry" column from the CSV file
df = pd.read_csv("Roman-Urdu-Poetry.csv", usecols=["Poetry"])

# Display the first few rows
print(df.head())


                                              Poetry
0  aañkh se duur na ho dil se utar jā.egā \nvaqt ...
1  āshiqī meñ 'mīr' jaise ḳhvāb mat dekhā karo \n...
2  ab aur kyā kisī se marāsim baḌhā.eñ ham \nye b...
3  ab ke ham bichhḌe to shāyad kabhī ḳhvāboñ meñ ...
4  ab ke tajdīd-e-vafā kā nahīñ imkāñ jānāñ \nyaa...


In [22]:
import re

# Function to remove Urdu punctuation and diacritics
def remove_urdu_punctuation_and_diacritics(text):
    if isinstance(text, str):
        # Remove Urdu punctuation and diacritics
        text = re.sub(r'[،۔؟!"“”‘’؛\.-]', '', text)  # Removes punctuation
        return text
    return text

# Clean specific columns
columns_to_clean = ['Poetry']

for column in columns_to_clean:
    df[column] = df[column].apply(remove_urdu_punctuation_and_diacritics)

df.to_csv("Cleaned_Roman-Urdu-Poetry.csv", index=False)


print(df.head())


                                              Poetry
0  aañkh se duur na ho dil se utar jāegā \nvaqt k...
1  āshiqī meñ 'mīr' jaise ḳhvāb mat dekhā karo \n...
2  ab aur kyā kisī se marāsim baḌhāeñ ham \nye bh...
3  ab ke ham bichhḌe to shāyad kabhī ḳhvāboñ meñ ...
4  ab ke tajdīdevafā kā nahīñ imkāñ jānāñ \nyaad ...


In [23]:
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
import numpy as np

# Convert poetry to a list
poetry_lines = df["Poetry"].tolist()

# Tokenizer to convert words into numerical form
tokenizer = Tokenizer()
tokenizer.fit_on_texts(poetry_lines)

# Convert text into sequences
sequences = tokenizer.texts_to_sequences(poetry_lines)

# Define input (X) and output (Y) sequences
input_sequences = []
for seq in sequences:
    for i in range(1, len(seq)):
        input_sequences.append(seq[:i+1])

# Padding sequences
max_length = max(len(seq) for seq in input_sequences)
input_sequences = pad_sequences(input_sequences, maxlen=max_length, padding="pre")

# Split into X (input) and Y (output)
X, Y = input_sequences[:, :-1], input_sequences[:, -1]

# Convert Y into one-hot encoding
Y = np.array(Y)  # Keep Y as integer labels

print("Data prepared for LSTM training!")


Data prepared for LSTM training!


In [24]:
from sklearn.model_selection import train_test_split
import numpy as np

# Split data into Training (80%) and Temporary Set (20%)
X_train, X_temp, Y_train, Y_temp = train_test_split(X, Y, test_size=0.2, random_state=42)

# Split the Temporary Set into Validation (10%) and Testing (10%)
X_val, X_test, Y_val, Y_test = train_test_split(X_temp, Y_temp, test_size=0.5, random_state=42)

# Print dataset shapes
print("Data successfully split!")
print(f"Training Data: X_train = {X_train.shape}, Y_train = {Y_train.shape}")
print(f"Validation Data: X_val = {X_val.shape}, Y_val = {Y_val.shape}")
print(f"Test Data: X_test = {X_test.shape}, Y_test = {Y_test.shape}")


Data successfully split!
Training Data: X_train = (124496, 433), Y_train = (124496,)
Validation Data: X_val = (15562, 433), Y_val = (15562,)
Test Data: X_test = (15563, 433), Y_test = (15563,)


In [25]:
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense
from tensorflow.keras.optimizers import Adam

# Step 1: Get Vocabulary Size
vocab_size = np.max(X_train) + 1  # Largest token ID + 1

# Step 2: Define the LSTM Model
model = Sequential([
    Embedding(vocab_size, 100, input_length=X_train.shape[1]),  # Embedding layer
    LSTM(150, return_sequences=True),  # First LSTM layer
    LSTM(150),  # Second LSTM layer
    Dense(150, activation="relu"),  # Dense layer
    Dense(vocab_size, activation="softmax")  # Output layer
])

# Step 3: Compile the Model
model.compile(
    loss="sparse_categorical_crossentropy",
    optimizer=Adam(learning_rate=0.001),
    metrics=["accuracy"]
)

# Step 4: Train the Model
epochs = 55  # Adjust based on performance
history = model.fit(
    X_train, Y_train,
    epochs=epochs,
    batch_size=128,
    validation_data=(X_val, Y_val),
    verbose=1
)

# Step 5: Save the Model
model.save("lstm_poetry_model.h5")
print("Model trained and saved as 'lstm_poetry_model.h5'.")




Epoch 1/55
[1m973/973[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m87s[0m 86ms/step - accuracy: 0.0415 - loss: 7.3163 - val_accuracy: 0.0419 - val_loss: 6.8869
Epoch 2/55
[1m973/973[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m141s[0m 85ms/step - accuracy: 0.0445 - loss: 6.6624 - val_accuracy: 0.0459 - val_loss: 6.9709
Epoch 3/55
[1m973/973[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m84s[0m 87ms/step - accuracy: 0.0518 - loss: 6.5301 - val_accuracy: 0.0547 - val_loss: 6.9744
Epoch 4/55
[1m973/973[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m86s[0m 88ms/step - accuracy: 0.0647 - loss: 6.3574 - val_accuracy: 0.0645 - val_loss: 7.0290
Epoch 5/55
[1m973/973[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m139s[0m 85ms/step - accuracy: 0.0753 - loss: 6.1868 - val_accuracy: 0.0659 - val_loss: 7.0826
Epoch 6/55
[1m973/973[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m142s[0m 85ms/step - accuracy: 0.0859 - loss: 6.0295 - val_accuracy: 0.0700 - val_loss: 7.1175
Epoch 7/55
[



Model trained and saved as 'lstm_poetry_model.h5'.


In [28]:
import numpy as np
import pandas as pd
import tensorflow as tf
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.optimizers import Adam

# Step 1: Load the Trained Model
model = tf.keras.models.load_model("lstm_poetry_model.h5")

# Fix the Warning: Recompile the Model
model.compile(loss="sparse_categorical_crossentropy", optimizer=Adam(learning_rate=0.001), metrics=["accuracy"])

# Step 2: Load the Tokenizer
data_path = "Roman-Urdu-Poetry.csv"  # Ensure this is the same dataset used before
df = pd.read_csv(data_path)

# Combine all poetry to recreate the tokenizer
all_poetry = " ".join(df["Poetry"].astype(str).tolist()).lower()

# Tokenize again (ensure consistency with training)
from tensorflow.keras.preprocessing.text import Tokenizer
tokenizer = Tokenizer()
tokenizer.fit_on_texts([all_poetry])

# Get sequence length from training
sequence_length = 10  # Ensure this matches the training sequence length

# ✅ Step 3: Function to Generate Poetry with Diversity
def generate_poetry(seed_text, next_words, model, tokenizer, sequence_length, temperature=1.0):
    generated_words = set()  # Store generated words to avoid excessive repetition

    for _ in range(next_words):
        # Convert seed text to numerical tokens
        token_list = tokenizer.texts_to_sequences([seed_text])[0]
        token_list = pad_sequences([token_list], maxlen=sequence_length, padding='pre')

        # Predict next word probabilities
        predicted_probs = model.predict(token_list, verbose=0)[0]

        # Apply temperature scaling
        predicted_probs = np.log(predicted_probs + 1e-8) / temperature  # Avoid log(0)
        predicted_probs = np.exp(predicted_probs) / np.sum(np.exp(predicted_probs))  # Softmax

        # Sample the next word instead of always choosing the highest probability
        predicted_word_index = np.random.choice(len(predicted_probs), p=predicted_probs)

        # Convert token ID back to word
        next_word = None
        for word, index in tokenizer.word_index.items():
            if index == predicted_word_index:
                next_word = word
                break

        # If no valid word is found, stop generation
        if not next_word or next_word in generated_words:
            continue  # Skip repeated words

        generated_words.add(next_word)
        seed_text += " " + next_word

    return seed_text

# ✅ Step 4: Generate Poetry
seed_text = "muj se pehli se mohabbat"  # Provide some starting words
generated_poetry = generate_poetry(seed_text, 50, model, tokenizer, sequence_length, temperature=0.8)

print("📝 Generated Poetry:\n", generated_poetry)




📝 Generated Poetry:
 muj se pehli se mohabbat sunā hai hī paanī haiñ kaam husn jis ye se ko ki nahīñ gayā maiñ āzād thī kī nigāh bhī supurd aur 'farāz' firāq e vo jāntā uchatte
