In [1]:
import pandas as pd

# Load the CSV file
df = pd.read_csv('Roman-Urdu-Poetry.csv')

# Check the first few rows of the data
print(df.head())


   ID         Poet                                             Poetry
0   1  ahmad-faraz  aañkh se duur na ho dil se utar jā.egā \nvaqt ...
1   2  ahmad-faraz  āshiqī meñ 'mīr' jaise ḳhvāb mat dekhā karo \n...
2   3  ahmad-faraz  ab aur kyā kisī se marāsim baḌhā.eñ ham \nye b...
3   4  ahmad-faraz  ab ke ham bichhḌe to shāyad kabhī ḳhvāboñ meñ ...
4   5  ahmad-faraz  ab ke tajdīd-e-vafā kā nahīñ imkāñ jānāñ \nyaa...


In [2]:
# Check the columns
print(df.columns)

Index(['ID', 'Poet', 'Poetry'], dtype='object')


In [3]:
# Check the first few entries in the 'Poetry' column
print(df['Poetry'].head())


0    aañkh se duur na ho dil se utar jā.egā \nvaqt ...
1    āshiqī meñ 'mīr' jaise ḳhvāb mat dekhā karo \n...
2    ab aur kyā kisī se marāsim baḌhā.eñ ham \nye b...
3    ab ke ham bichhḌe to shāyad kabhī ḳhvāboñ meñ ...
4    ab ke tajdīd-e-vafā kā nahīñ imkāñ jānāñ \nyaa...
Name: Poetry, dtype: object


In [4]:
# Check for missing values in the Poetry column
print(df['Poetry'].isnull().sum())


0


In [5]:
# Check for duplicate poems
print(df['Poetry'].duplicated().sum())


0


In [6]:
import re  # Add this to import the 're' module

# Revised text cleaning approach
def clean_text(text):
    text = text.lower()  # Convert to lowercase
    text = re.sub(r"[^a-zA-Zñḳḍāī\s]", "", text)  # Keep letters, diacritics, apostrophes
    text = re.sub(r'(\n)(\S)', r'\1 \2', text)
    return text

# Apply revised cleaning function to the poetry column
df['Poetry'] = df['Poetry'].apply(clean_text)

# Inspect the cleaned data again
print(df['Poetry'].head())


0    aañkh se duur na ho dil se utar jāegā \n vaqt ...
1    āshiqī meñ mīr jaise ḳhvāb mat dekhā karo \n b...
2    ab aur kyā kisī se marāsim baḍhāeñ ham \n ye b...
3    ab ke ham bichhḍe to shāyad kabhī ḳhvāboñ meñ ...
4    ab ke tajdīdevafā kā nahīñ imkāñ jānāñ \n yaad...
Name: Poetry, dtype: object


In [58]:
from tensorflow.keras.preprocessing.text import Tokenizer

tokenizer = Tokenizer(num_words=5000, filters='')
tokenizer.fit_on_texts(df['Poetry'])

# Convert text to sequences of tokens (integers)
sequences = tokenizer.texts_to_sequences(df['Poetry'])

In [57]:
sample_text = "This is a poem.\nHello world."
cleaned_text = clean_text(sample_text)
tokens = tokenizer.texts_to_sequences([cleaned_text])
print(tokenizer.word_index)


{'\n': 1, 'hai': 2, 'se': 3, 'meñ': 4, 'ke': 5, 'kī': 6, 'ko': 7, 'na': 8, 'haiñ': 9, 'bhī': 10, 'to': 11, 'kā': 12, 'nahīñ': 13, 'ham': 14, 'ki': 15, 'kyā': 16, 'ho': 17, 'vo': 18, 'ye': 19, 'jo': 20, 'dil': 21, 'ne': 22, 'thā': 23, 'us': 24, 'maiñ': 25, 'kar': 26, 'koī': 27, 'hī': 28, 'huuñ': 29, 'aur': 30, 'kuchh': 31, 'par': 32, 'ab': 33, 'pe': 34, 'gayā': 35, 'ik': 36, 'har': 37, 'huā': 38, 'ai': 39, 'mujhe': 40, 'phir': 41, 'is': 42, 't': 43, 'mujh': 44, 'o': 45, 'kis': 46, 'rahā': 47, 'jab': 48, 'thī': 49, 'ek': 50, 'tum': 51, 'jis': 52, 'kabhī': 53, 'kyuuñ': 54, 'the': 55, 'mire': 56, 'kisī': 57, 'gaī': 58, 'aa': 59, 'kahāñ': 60, 'tak': 61, 'bahut': 62, 'apne': 63, 'aaj': 64, 'magar': 65, 'ishq': 66, 'kiyā': 67, 'rahe': 68, 'baat': 69, 'tire': 70, 'tirī': 71, 'un': 72, 'gae': 73, 'sab': 74, 'tujh': 75, 'dekh': 76, 'nazar': 77, 'apnī': 78, 'hue': 79, 'jaae': 80, 'hotā': 81, 'mirī': 82, 'de': 83, 'huī': 84, 'diyā': 85, 'liye': 86, 'yaad': 87, 'agar': 88, 'mere': 89, 'ghar': 90, '

In [9]:
from tensorflow.keras.preprocessing.sequence import pad_sequences

# Pad sequences to the same length
max_sequence_len = max([len(seq) for seq in sequences])  # Find the maximum sequence length
max_sequence_len = min(max_sequence_len, 225)  # Limit sequence length to 100 (or smaller)
padded_sequences = pad_sequences(sequences, maxlen=max_sequence_len, padding='pre')

# Check the shape of padded sequences
print(padded_sequences.shape)
print(max_sequence_len)


(1314, 225)
225


In [10]:
from tensorflow.keras import backend as K
K.clear_session()




In [11]:
import numpy as np

# Define input and output sequences
input_sequences = []
output_words = []

for seq in padded_sequences:
    for i in range(1, len(seq)):
        input_sequences.append(seq[:i])
        output_words.append(seq[i])

input_sequences = pad_sequences(input_sequences, maxlen=max_sequence_len, padding='pre')
output_words = np.array(output_words)


In [12]:
from tensorflow.keras.utils import to_categorical

# Number of unique words (size of the vocabulary)
total_words = len(tokenizer.word_index) + 1

# One-hot encode the output words
#output_words = to_categorical(output_words, num_classes=total_words)


In [13]:
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense

# Create a MirroredStrategy
strategy = tf.distribute.MirroredStrategy()

# Define the model within the strategy scope
with strategy.scope():
    model = Sequential()
    model.add(Embedding(input_dim=total_words, output_dim=300, input_shape=(max_sequence_len-1,)))
    model.add(LSTM(250))
    model.add(Dense(total_words, activation='softmax'))

    model.compile(loss='sparse_categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

model.summary()


  super().__init__(**kwargs)


In [13]:
from tensorflow.keras.models import load_model

model = load_model('final_poetry_model_with_endline.h5')




In [14]:
history = model.fit(input_sequences, output_words, epochs=50, batch_size=64)


Epoch 1/50
[1m4599/4599[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m150s[0m 32ms/step - accuracy: 0.5437 - loss: 3.3249
Epoch 2/50
[1m4599/4599[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m201s[0m 32ms/step - accuracy: 0.5684 - loss: 2.8270
Epoch 3/50
[1m4599/4599[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m202s[0m 32ms/step - accuracy: 0.5833 - loss: 2.6380
Epoch 4/50
[1m4599/4599[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m202s[0m 32ms/step - accuracy: 0.5951 - loss: 2.4467
Epoch 5/50
[1m4599/4599[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m202s[0m 32ms/step - accuracy: 0.6083 - loss: 2.2578
Epoch 6/50
[1m4599/4599[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m202s[0m 32ms/step - accuracy: 0.6246 - loss: 2.0669
Epoch 7/50
[1m4599/4599[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m202s[0m 32ms/step - accuracy: 0.6457 - loss: 1.8776
Epoch 8/50
[1m4599/4599[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m202s[0m 32ms/step - accuracy: 0.6685 - loss: 1.7089


In [50]:
def generate_poem(seed_text, next_words, max_sequence_len):
    for _ in range(next_words):
        token_list = tokenizer.texts_to_sequences([seed_text])[0]
        token_list = pad_sequences([token_list], maxlen=max_sequence_len-1, padding='pre')
        predicted = model.predict(token_list, verbose=0)
        predicted_word_index = np.argmax(predicted, axis=1)[0]
        predicted_word = tokenizer.index_word.get(predicted_word_index, '')
        seed_text += " " + predicted_word
    return seed_text

# Example usage
seed_text = "dhuup"
generated_poem = generate_poem(seed_text, next_words=80, max_sequence_len=max_sequence_len)
print(generated_poem)


dhuup meñ niklo ghatāoñ meñ nahā kar dekho 
 zindagī kyā hai kitāboñ ko hatā kar dekho 
 sirf āñkhoñ se hī ḳhudkushī kā jaane de 
 us ko dekh kar vo kyā hogā 
 itnā māns na ho ai vaada 
 hai takrār na taalluq se nigāheñ 
 kyā jāniye in se mohabbat kabhī bazmedushman meñ 
 aa ki hī tanhā thā kahīñ 
 aur yahāñ koī naam merā ghar jaanā bhī ho 
 ḳhudā kī hī nahīñ hotī to
