<h1>GRU MODEL TRAINING</h1>


In [5]:
import pandas as pd
import numpy as np
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, GRU, Dense, Dropout
import tensorflow as tf
import re
import pickle
import gensim.downloader as api

# Load and preprocess the dataset
file_path = 'Roman-Urdu-Poetry.csv'
df = pd.read_csv(file_path)

# Data Cleaning
def clean_text(text):
    text = re.sub(r'[^\w\s]', ' ', text)  # Remove punctuation
    text = re.sub(r'\s+', ' ', text)      # Remove extra spaces
    return text.strip().lower()

poems = df['Poetry'].astype(str).values
cleaned_poems = [clean_text(poem) for poem in poems]

# Tokenization with a larger vocabulary
max_vocab_size = 10000  # Increased vocab size
tokenizer = Tokenizer(num_words=max_vocab_size, oov_token="<OOV>")
tokenizer.fit_on_texts(cleaned_poems)
total_words = min(len(tokenizer.word_index) + 1, max_vocab_size)

# Load Pre-trained Urdu Embeddings (FastText)
#urdu_vectors = api.load("fasttext-wiki-news-subwords-300")  # Loads pre-trained Urdu embeddings
#embedding_dim = 300  # Use 300-dimensional embeddings

# Create Embedding Matrix
embedding_matrix = np.zeros((total_words, embedding_dim))
for word, index in tokenizer.word_index.items():
    if index < total_words:
        try:
            embedding_matrix[index] = urdu_vectors[word]
        except KeyError:
            embedding_matrix[index] = np.random.normal(scale=0.6, size=(embedding_dim,))  # Random vector for unknown words

# Create sequences with meaningful context
input_sequences = []
for poem in cleaned_poems:
    token_list = tokenizer.texts_to_sequences([poem])[0]
    for i in range(3, len(token_list)):  # Use larger training sequences
        input_sequences.append(token_list[:i+1])

# Pad sequences
max_sequence_len = min(max([len(x) for x in input_sequences]), 50)
input_sequences = np.array(pad_sequences(input_sequences, maxlen=max_sequence_len, padding='pre'))

# Create predictors and target
X = input_sequences[:, :-1]
y = input_sequences[:, -1]

# Define the Model with Pre-trained Urdu Embeddings
def create_model(total_words, max_sequence_len, embedding_matrix):
    model = Sequential([
        Embedding(total_words, embedding_dim, weights=[embedding_matrix], input_length=max_sequence_len-1, trainable=False),  # Use pre-trained embeddings
        GRU(256, return_sequences=True),
        Dropout(0.3),
        GRU(128, return_sequences=True),
        Dropout(0.3),
        GRU(64),
        Dense(128, activation='relu'),
        Dropout(0.3),
        Dense(total_words, activation='softmax')
    ])

    model.compile(loss='sparse_categorical_crossentropy',
                 optimizer='adam',
                 metrics=['accuracy'])

    return model

# Create and Train the Model
model = create_model(total_words, max_sequence_len, embedding_matrix)
history = model.fit(
    X, y,
    epochs=100,
    batch_size=32,
    validation_split=0.1,
    callbacks=[
        tf.keras.callbacks.EarlyStopping(
            monitor='val_loss',
            patience=5,
            restore_best_weights=True
        )
    ]
)

# Save Model and Tokenizer
model.save('urdu_poetry_model.keras')

with open('tokenizer.pickle', 'wb') as handle:
    pickle.dump(tokenizer, handle, protocol=pickle.HIGHEST_PROTOCOL)

with open('max_sequence_len.pickle', 'wb') as handle:
    pickle.dump(max_sequence_len, handle, protocol=pickle.HIGHEST_PROTOCOL)

print("Training complete. Model and tokenizer saved.")




Epoch 1/100
[1m5125/5125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m76s[0m 14ms/step - accuracy: 0.0564 - loss: 6.9645 - val_accuracy: 0.0343 - val_loss: 6.9409
Epoch 2/100
[1m5125/5125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m69s[0m 13ms/step - accuracy: 0.0589 - loss: 6.7844 - val_accuracy: 0.0343 - val_loss: 6.9843
Epoch 3/100
[1m5125/5125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m80s[0m 13ms/step - accuracy: 0.0579 - loss: 6.7722 - val_accuracy: 0.0343 - val_loss: 6.9635
Epoch 4/100
[1m5125/5125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m81s[0m 13ms/step - accuracy: 0.0577 - loss: 6.7130 - val_accuracy: 0.0343 - val_loss: 6.9399
Epoch 5/100
[1m5125/5125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m84s[0m 13ms/step - accuracy: 0.0622 - loss: 6.6849 - val_accuracy: 0.0415 - val_loss: 6.9277
Epoch 6/100
[1m5125/5125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m65s[0m 13ms/step - accuracy: 0.0625 - loss: 6.6214 - val_accuracy: 0.0410 - val_loss: 6.914

In [8]:
import numpy as np
from tensorflow.keras.models import load_model
from tensorflow.keras.preprocessing.sequence import pad_sequences
import pickle

# Load the saved model and tokenizer
model = load_model('urdu_poetry_model.keras')

# Load tokenizer
with open('tokenizer.pickle', 'rb') as handle:
    tokenizer = pickle.load(handle)

# Load max_sequence_len
with open('max_sequence_len.pickle', 'rb') as handle:
    max_sequence_len = pickle.load(handle)

def sample_with_temperature(preds, temperature=1.0):
    """
    Applies temperature sampling to predicted probabilities.
    Lower temperature → more deterministic, Higher temperature → more creative.
    """
    preds = np.asarray(preds).astype("float64")
    preds = np.log(preds + 1e-8) / temperature  # Apply temperature
    exp_preds = np.exp(preds)
    preds = exp_preds / np.sum(exp_preds)  # Normalize probabilities
    return np.random.choice(len(preds), p=preds)  # Sample word index

def generate_poem(seed_text, next_words, model, max_sequence_len, temperature=0.8):
    """
    Generates a poem using temperature-based sampling.
    """
    for _ in range(next_words):
        token_list = tokenizer.texts_to_sequences([seed_text])[0]
        token_list = pad_sequences([token_list], maxlen=max_sequence_len-1, padding='pre')

        predicted_probs = model.predict(token_list, verbose=0)[0]
        predicted = sample_with_temperature(predicted_probs, temperature)  # Use temperature sampling

        output_word = ""
        for word, index in tokenizer.word_index.items():
            if index == predicted:
                output_word = word
                break

        if output_word:
            seed_text += " " + output_word  # Add new word to seed

    return seed_text

def generate_formatted_poem(seed_text, model, max_sequence_len, num_lines=4, words_per_line=6, temperature=0.8):
    """
    Generates a multi-line formatted poem.
    """
    poem = []
    current_line = seed_text

    for line in range(num_lines):
        if line > 0:
            seed_words = current_line.split()[-3:]  # Keep last 3 words for context
            current_line = " ".join(seed_words)

        current_line = generate_poem(current_line, words_per_line, model, max_sequence_len, temperature)
        poem.append(current_line.strip())

    return "\n".join(poem)

# Function to generate a poem
def generate_urdu_poem(input_text, temperature=0.8):
    try:
        formatted_poem = generate_formatted_poem(
            seed_text=input_text,
            model=model,
            max_sequence_len=max_sequence_len,
            num_lines=4,
            words_per_line=6,
            temperature=temperature
        )
        return formatted_poem
    except Exception as e:
        return f"Error generating poem: {str(e)}"

# Example usage
if __name__ == "__main__":
    while True:
        input_text = input("Enter a Roman Urdu word or phrase (or 'quit' to exit): ")
        if input_text.lower() == 'quit':
            break

        temperature = 0.9

        generated_poem = generate_urdu_poem(input_text, temperature)
        print("\nGenerated Poem:")
        print("--------------")
        print(generated_poem)
        print("--------------\n")


Enter a Roman Urdu word or phrase (or 'quit' to exit): rafia

Generated Poem:
--------------
rafia kī dil us ke ik to
ke ik to rahe e lab meñ pe ḳhauf
meñ pe ḳhauf ki aahū kā liye e āv
liye e āv shigāf bhī āḳhir tirā e yā
--------------

Enter a Roman Urdu word or phrase (or 'quit' to exit): habiba

Generated Poem:
--------------
habiba e tiir tarāshā vahīñ anī ḳhud
vahīñ anī ḳhud dil ho panja aate maiñ kahīñ
aate maiñ kahīñ egā ki bātnī use bhī hai
use bhī hai juuñ ga e haiñ kyā ki
--------------

Enter a Roman Urdu word or phrase (or 'quit' to exit): hateem

Generated Poem:
--------------
hateem nahīñ ke mirī kam asad sho
kam asad sho ik āvāz baithe huā zom pahlī
huā zom pahlī ne haiñ to jañgal e hai
jañgal e hai iz e kahīñ raat haiñ zindāñ
--------------

Enter a Roman Urdu word or phrase (or 'quit' to exit): quit


In [4]:
import gensim.downloader as api
# Load Pre-trained Urdu Embeddings (FastText)
urdu_vectors = api.load("word2vec-google-news-300")  # Loads pre-trained Urdu embeddings
embedding_dim = 300  # Use 300-dimensional embeddings




In [9]:
pip install streamlit


Collecting streamlit
  Downloading streamlit-1.42.0-py2.py3-none-any.whl.metadata (8.9 kB)
Collecting watchdog<7,>=2.1.5 (from streamlit)
  Downloading watchdog-6.0.0-py3-none-manylinux2014_x86_64.whl.metadata (44 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m44.3/44.3 kB[0m [31m2.3 MB/s[0m eta [36m0:00:00[0m
Collecting pydeck<1,>=0.8.0b4 (from streamlit)
  Downloading pydeck-0.9.1-py2.py3-none-any.whl.metadata (4.1 kB)
Downloading streamlit-1.42.0-py2.py3-none-any.whl (9.6 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m9.6/9.6 MB[0m [31m77.1 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading pydeck-0.9.1-py2.py3-none-any.whl (6.9 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m6.9/6.9 MB[0m [31m116.4 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading watchdog-6.0.0-py3-none-manylinux2014_x86_64.whl (79 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m79.1/79.1 kB[0m [31m8.1 MB/s[0m eta [36m0:00:00[0m


In [10]:
import torch
torch.serialization.load_weights_only = True

In [23]:
! wget -q -O - ipv4.icanhazip.com

34.16.203.38


In [24]:
! streamlit run app.py & npx localtunnel --port 8501


Collecting usage statistics. To deactivate, set browser.gatherUsageStats to false.
[0m
[1G[0K⠙[1G[0K⠹[1G[0K⠸[0m
[34m[1m  You can now view your Streamlit app in your browser.[0m
[0m
[34m  Local URL: [0m[1mhttp://localhost:8501[0m
[34m  Network URL: [0m[1mhttp://172.28.0.12:8501[0m
[34m  External URL: [0m[1mhttp://34.16.203.38:8501[0m
[0m
[1G[0K⠼[1G[0K⠴[1G[0K⠦[1G[0Kyour url is: https://lovely-dragons-grin.loca.lt
2025-02-07 19:00:41.644351: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1738954841.668356   19942 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1738954841.675436   19942 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been

In [22]:
%%writefile app.py
import streamlit as st
import numpy as np
from tensorflow.keras.models import load_model
from tensorflow.keras.preprocessing.sequence import pad_sequences
import pickle

# Load the saved model and tokenizer
model = load_model('urdu_poetry_model.keras')

with open('tokenizer.pickle', 'rb') as handle:
    tokenizer = pickle.load(handle)

with open('max_sequence_len.pickle', 'rb') as handle:
    max_sequence_len = pickle.load(handle)

def sample_with_temperature(preds, temperature=1.0):
    preds = np.asarray(preds).astype("float64")
    preds = np.log(preds + 1e-8) / temperature
    exp_preds = np.exp(preds)
    preds = exp_preds / np.sum(exp_preds)
    return np.random.choice(len(preds), p=preds)

def generate_poem(seed_text, next_words, model, max_sequence_len, temperature=0.8):
    for _ in range(next_words):
        token_list = tokenizer.texts_to_sequences([seed_text])[0]
        token_list = pad_sequences([token_list], maxlen=max_sequence_len-1, padding='pre')
        predicted_probs = model.predict(token_list, verbose=0)[0]
        predicted = sample_with_temperature(predicted_probs, temperature)
        output_word = ""
        for word, index in tokenizer.word_index.items():
            if index == predicted:
                output_word = word
                break
        if output_word:
            seed_text += " " + output_word
    return seed_text

def generate_formatted_poem(seed_text, model, max_sequence_len, num_lines=4, words_per_line=6, temperature=0.8):
    poem = []
    current_line = seed_text
    for line in range(num_lines):
        if line > 0:
            seed_words = current_line.split()[-3:]
            current_line = " ".join(seed_words)
        current_line = generate_poem(current_line, words_per_line, model, max_sequence_len, temperature)
        poem.append(current_line.strip())
    return "\n".join(poem)

# Streamlit UI
st.set_page_config(page_title="Qalam-e-Roman (قلمِ رومن): Urdu Poetry Generator", page_icon="🌸", layout="centered")
st.title("🌸 Qalam-e-Roman (قلمِ رومن): Urdu Poetry Generator 🌸")
st.markdown("A beautifully designed interactive app to generate poetic verses in Roman Urdu.")

seed_text = st.text_input("Enter a Roman Urdu phrase to inspire your poem:", "")
temperature = 0.8

generate_button = st.button("Takhleeq ✨")

if generate_button and seed_text:
    generated_poem = generate_formatted_poem(seed_text, model, max_sequence_len, temperature=temperature)
    st.subheader("🎶 Takhleeq-e-Ashaar : تخلیقِ اشعار 🎶")
    st.text_area("", generated_poem, height=200)

st.markdown("*Crafted with ❤️ for poetry lovers.*")


Overwriting app.py
