In [1]:
import pandas as pd
import numpy as np
import re
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, Bidirectional
import random
import unicodedata

In [2]:


# Read the poetry corpus into a DataFrame. The path is relative to the
# kernel's working directory, so adjust the file name if the CSV is stored
# under a different name.
df = pd.read_csv(filepath_or_buffer="Roman-Urdu-Poetry (1).csv")

def clean_text(text):
    """Normalize one poem into lowercase ASCII text with single spaces.

    The steps, in order:
      1. Missing values (``None``/``NaN``) become the empty string.
      2. Accents and diacritics are stripped via NFKD decomposition.
      3. Everything except ASCII letters, digits, whitespace and ``. , ? !``
         is removed.
      4. Any dot immediately followed by a word character is removed
         (``ja.ega`` -> ``jaega``). Note that this also joins two words
         separated by a dot with no following space (``hai.phir`` ->
         ``haiphir``).
      5. Runs of whitespace (including newlines) collapse to one space and
         the result is stripped and lowercased.

    Args:
        text: The raw cell value. Usually a ``str``; missing values and
            other scalars are accepted.

    Returns:
        The cleaned string (``""`` for missing input).
    """
    if pd.isna(text):
        return ""

    # A non-string scalar (e.g. a cell pandas parsed as a number) would make
    # unicodedata.normalize raise TypeError, so coerce it to text first.
    if not isinstance(text, str):
        text = str(text)

    # Normalize Unicode characters (removes accents and diacritics)
    text = ''.join(c for c in unicodedata.normalize('NFKD', text) if not unicodedata.combining(c))

    # Remove unwanted characters except for basic punctuation
    text = re.sub(r"[^a-zA-Z0-9\s.,?!]", "", text)

    # Remove dots within words (fix ja.ega -> jaega, ro.ega -> roega)
    text = re.sub(r"\.(?=\w)", "", text)

    # Replace multiple spaces and newlines with a single space
    text = re.sub(r"\s+", " ", text).strip()

    # Convert to lowercase
    return text.lower()

# Clean every poem in place: each value of the "Poetry" column is passed
# through clean_text and the column is overwritten with the results.
df["Poetry"] = df["Poetry"].map(clean_text)


In [3]:
# Build the word-level vocabulary from the cleaned poems and encode each poem
# as a list of integer word ids.
tokenizer = Tokenizer(oov_token="<OOV>")
tokenizer.fit_on_texts(df['Poetry'])
sequences = tokenizer.texts_to_sequences(df['Poetry'])
max_sequence_length = 20

# Turn every poem into growing prefixes of 2..max_sequence_length tokens.
# Only the first max_sequence_length tokens of a poem contribute samples.
input_sequences = [
    encoded[:end]
    for encoded in sequences
    for end in range(2, min(len(encoded), max_sequence_length) + 1)
]

# Left-pad so the word to predict is always the last column.
input_sequences = pad_sequences(input_sequences, maxlen=max_sequence_length, padding='pre')

# Features are all tokens but the last; the target is the last token,
# one-hot encoded over the vocabulary (+1 for the padding index 0).
X, y = input_sequences[:, :-1], input_sequences[:, -1]
y = tf.keras.utils.to_categorical(y, num_classes=len(tokenizer.word_index) + 1)

In [5]:
# Vocabulary size: the tokenizer numbers words from 1, and index 0 is the
# padding value produced by pad_sequences, hence the +1.
vocab_size = len(tokenizer.word_index) + 1

# Next-word predictor: embedding -> BiLSTM -> LSTM -> dense softmax over the
# vocabulary. The input length is declared with an explicit Input layer
# rather than Embedding's `input_length` argument, which Keras 3 deprecates.
model = Sequential([
    tf.keras.Input(shape=(max_sequence_length - 1,)),
    Embedding(input_dim=vocab_size, output_dim=128),
    Bidirectional(LSTM(128, return_sequences=True)),
    LSTM(128),
    Dense(128, activation='relu'),
    Dense(vocab_size, activation='softmax')
])
# Targets are one-hot vectors, so plain categorical cross-entropy is used.
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])



In [6]:
# Train on the full sample set for 50 epochs (no validation split), then
# persist the trained network in HDF5 format.
model.fit(x=X, y=y, epochs=50, verbose=1)
model.save("poetry_model.h5")

Epoch 1/50
[1m779/779[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m20s[0m 19ms/step - accuracy: 0.0413 - loss: 7.4907
Epoch 2/50
[1m779/779[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m15s[0m 19ms/step - accuracy: 0.0470 - loss: 6.5199
Epoch 3/50
[1m779/779[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m21s[0m 19ms/step - accuracy: 0.0446 - loss: 6.3236
Epoch 4/50
[1m779/779[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m15s[0m 19ms/step - accuracy: 0.0503 - loss: 6.1842
Epoch 5/50
[1m779/779[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m21s[0m 19ms/step - accuracy: 0.0645 - loss: 6.0331
Epoch 6/50
[1m779/779[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m20s[0m 19ms/step - accuracy: 0.0674 - loss: 5.9317
Epoch 7/50
[1m779/779[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m21s[0m 19ms/step - accuracy: 0.0763 - loss: 5.8390
Epoch 8/50
[1m779/779[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m15s[0m 19ms/step - accuracy: 0.0842 - loss: 5.7550
Epoch 9/50
[1m779/779[



In [16]:
# Write a second HDF5 copy of the trained model under a different file name.
model.save(filepath="my_model.h5")

In [8]:
def generate_poem(prompt, num_lines, words_per_line, temperature):
    """Generate a poem word by word with the trained next-word model.

    The first line starts with the lowercased ``prompt``; every following
    line starts with the last word generated on the previous line. Each new
    word is sampled from the model's five most probable next words, weighted
    by their temperature-scaled probabilities.

    Relies on the module-level ``tokenizer``, ``model`` and
    ``max_sequence_length`` defined earlier in the notebook.

    Args:
        prompt: Seed text for the first line.
        num_lines: Number of lines to produce.
        words_per_line: ``words_per_line - 1`` words are appended to each
            line's seed, so a multi-word prompt makes the first line longer.
        temperature: Positive sampling temperature; values below 1 favour
            the most likely words, values above 1 flatten the distribution.

    Returns:
        The poem as one string: capitalized lines joined by newlines.

    Raises:
        ValueError: If ``temperature`` is not strictly positive.
    """
    if temperature <= 0:
        # Dividing log-probabilities by 0 (or a negative value) yields
        # NaN/inverted weights instead of a usable distribution.
        raise ValueError("temperature must be a positive number")

    top_k = 5
    oov_token = getattr(tokenizer, "oov_token", None)
    poem = []
    current_word = prompt.lower()

    for _ in range(num_lines):
        line = current_word  # seed: the prompt, then the previous line's last word

        for _ in range(words_per_line - 1):
            token_list = tokenizer.texts_to_sequences([line])[0]
            token_list = pad_sequences([token_list], maxlen=max_sequence_length - 1, padding='pre')

            probabilities = np.asarray(model.predict(token_list, verbose=0)[0], dtype=np.float64)

            # Temperature scaling is monotonic for temperature > 0, so the
            # top-k set can be taken from the raw probabilities.
            top_indices = np.argsort(probabilities)[-top_k:]

            # Keep indices and words paired so the population and its weights
            # can never get out of step. Index 0 (padding) has no word, and
            # the OOV placeholder must not be emitted as poem text.
            candidate_indices = []
            candidate_words = []
            for idx in top_indices:
                candidate = tokenizer.index_word.get(int(idx))
                if candidate is None or candidate == oov_token:
                    continue
                candidate_indices.append(int(idx))
                candidate_words.append(candidate)

            if not candidate_words:
                break

            # Softmax over the surviving candidates only. Subtracting the
            # maximum keeps at least one weight at exactly 1, so the total
            # can never underflow to zero at small temperatures.
            scaled = np.log(probabilities[candidate_indices] + 1e-10) / temperature
            weights = np.exp(scaled - np.max(scaled))

            word = random.choices(candidate_words, weights=weights.tolist())[0]

            line += " " + word
            current_word = word

        poem.append(line.capitalize())

    return "\n".join(poem)


In [10]:
# Ask the user for the seed text and the generation settings. The numeric
# answers are converted immediately, so a non-numeric reply raises ValueError.
prompt = input("Enter text : ")
lines = int(input("Enter number of lines : "))
words = int(input("Enter number of words per line : "))
temperature = float(input("Enter temperature : "))

# Run the generator and show the result under a heading.
generated_poem = generate_poem(prompt, lines, words, temperature)
print("\nGenerated Poem:\n", generated_poem, sep="\n")


Enter text : pyaar
Enter number of lines : 6
Enter number of words per line : 6
Enter temperature : 0.8

Generated Poem:

Pyaar kar kharab karo khvab ghazab
Ghazab mil kar rahe ke honton
Honton pe dekha akhir dil kuchh
Kuchh faisla hi ho ya bahut
