Duaa Fatima
i211667@nu.edu.pk

In [1]:
import random
import pandas as pd
import spacy
from collections import defaultdict
from nltk.probability import ConditionalFreqDist

In [5]:
def initialize_spacy_model():
    """Create a blank Urdu spaCy pipeline that can split sentences.

    The pipeline is built with ``spacy.blank('ur')`` and a rule-based
    ``sentencizer`` component is added so that ``doc.sents`` is available
    to callers.

    Returns:
        The configured spaCy pipeline with ``max_length`` raised to
        2,000,000 characters.
    """
    # The previous ``except OSError`` fallback called ``spacy.blank('ur')``
    # again and returned a pipeline WITHOUT the sentencizer, so sentence
    # iteration downstream would have failed. The retry could not succeed
    # where the first call failed, so the pipeline is built once, directly.
    nlp = spacy.blank('ur')
    nlp.add_pipe("sentencizer")

    # Raise the per-document character limit above spaCy's default.
    nlp.max_length = 2000000
    return nlp

In [6]:
def load_corpus(file_path):
    """Load the corpus text from the second column of a CSV file.

    Args:
        file_path: Path to a CSV file readable by ``pandas.read_csv``.

    Returns:
        A single string made of every non-missing cell of the second
        column (position 1), converted to ``str`` and joined with spaces.
    """
    df = pd.read_csv(file_path)
    # Drop missing cells first: ``astype(str)`` would otherwise turn each
    # NaN into the literal word "nan" and inject it into the corpus.
    texts = df.iloc[:, 1].dropna().astype(str)
    return ' '.join(texts)

In [7]:
def tokenize_corpus(nlp, corpus, chunk_size=500000):
    """Tokenize the corpus in bounded chunks and return the token texts.

    The text is fed to ``nlp`` in pieces of at most ``chunk_size``
    characters. Each piece ends on a whitespace boundary where one exists,
    so a word is never cut in half and reported as two separate tokens.

    Args:
        nlp: Callable that takes a string and returns an iterable of tokens
            exposing ``text`` and ``is_space`` attributes.
        corpus: The full text to tokenize.
        chunk_size: Maximum number of characters per call to ``nlp``.

    Returns:
        A list of token texts, excluding tokens flagged as whitespace.

    Raises:
        ValueError: If ``chunk_size`` is not positive.
    """
    if chunk_size <= 0:
        raise ValueError("chunk_size must be a positive integer")

    tokens = []
    total = len(corpus)
    start = 0

    while start < total:
        end = min(start + chunk_size, total)

        # If the cut would land in the middle of a word, move it back to
        # just after the previous whitespace character.
        if end < total and not corpus[end].isspace():
            cut = end
            while cut > start and not corpus[cut - 1].isspace():
                cut -= 1
            # A chunk with no whitespace at all is cut at the hard limit so
            # the loop always makes progress.
            if cut > start:
                end = cut

        doc = nlp(corpus[start:end])
        tokens.extend(token.text for token in doc if not token.is_space)
        start = end

    return tokens

In [8]:
def extract_sentence_starters(nlp, corpus):
    """Collect the first real token of each sentence in the corpus.

    Only the first 500,000 characters of ``corpus`` are processed.

    Args:
        nlp: Callable that takes a string and returns a document whose
            ``sents`` attribute iterates over sentences (sequences of
            tokens exposing ``text`` and ``is_space``).
        corpus: The full corpus text.

    Returns:
        A list with one entry per sentence that contains at least one
        non-whitespace token: the text of that sentence's first such token.
    """
    doc = nlp(corpus[:500000])  # Only the first chunk, to stay within length limits

    starters = []
    for sent in doc.sents:
        # Skip leading whitespace tokens. tokenize_corpus discards them, so
        # a whitespace starter would have no n-gram continuation.
        first_word = next((tok.text for tok in sent if not tok.is_space), None)
        if first_word is not None:
            starters.append(first_word)
    return starters

In [9]:
def build_unigrams(tokens):
    """Compute the relative frequency of every distinct token.

    Returns a dict mapping each token to its count divided by the total
    number of tokens seen.
    """
    counts = {}
    for tok in tokens:
        counts[tok] = counts.get(tok, 0) + 1

    grand_total = sum(counts.values())

    probabilities = {}
    for tok, seen in counts.items():
        probabilities[tok] = seen / grand_total
    return probabilities

def build_bigrams(tokens):
    """Build a bigram model of conditional next-word probabilities.

    Args:
        tokens: Sequence of tokens (must support slicing).

    Returns:
        A dict mapping each word to a dict of ``{next_word: probability}``,
        where the probabilities for a given word sum to 1. Fewer than two
        tokens yields an empty dict.
    """
    # Nested counts: counts[word][next_word] -> occurrences.
    counts = defaultdict(lambda: defaultdict(int))
    # Pair each token with its successor instead of indexing by position.
    for current, following in zip(tokens, tokens[1:]):
        counts[current][following] += 1

    bigram_probabilities = {}
    for word, followers in counts.items():
        total_count = sum(followers.values())
        bigram_probabilities[word] = {w: c / total_count for w, c in followers.items()}
    return bigram_probabilities

def build_trigrams(tokens):
    """Build a trigram model of conditional next-word probabilities.

    Args:
        tokens: Sequence of tokens (must support slicing).

    Returns:
        A dict mapping each ``(word1, word2)`` pair to a dict of
        ``{next_word: probability}``, where the probabilities for a given
        pair sum to 1. Fewer than three tokens yields an empty dict.
    """
    # Nested counts: counts[(w1, w2)][w3] -> occurrences.
    counts = defaultdict(lambda: defaultdict(int))
    # Slide a window of three consecutive tokens over the sequence.
    for first, second, third in zip(tokens, tokens[1:], tokens[2:]):
        counts[(first, second)][third] += 1

    trigram_probabilities = {}
    for context, followers in counts.items():
        total_count = sum(followers.values())
        trigram_probabilities[context] = {w: c / total_count for w, c in followers.items()}
    return trigram_probabilities

In [10]:
def select_next_word(prev_words, unigrams, bigrams, trigrams, model='trigram'):
    """Sample the next word from the n-gram models, backing off when needed.

    The highest-order model allowed by ``model`` is tried first. If the
    current context was never seen (or there are too few previous words),
    the next lower order is tried, ending with the unigram distribution.

    Args:
        prev_words: Words generated so far (only the last two are used).
        unigrams: Dict of ``{word: probability}``.
        bigrams: Dict of ``{word: {next_word: probability}}``.
        trigrams: Dict of ``{(w1, w2): {next_word: probability}}``.
        model: ``'trigram'``, ``'bigram'`` or ``'unigram'``; any other
            value samples from the unigram distribution only.

    Returns:
        The sampled word, or ``None`` if no applicable distribution has
        any entries.
    """
    # Candidate distributions, ordered from most to least specific.
    distributions = []
    if model == 'trigram' and len(prev_words) >= 2:
        distributions.append(trigrams.get((prev_words[-2], prev_words[-1])))
    if model in ('trigram', 'bigram') and prev_words:
        distributions.append(bigrams.get(prev_words[-1]))
    # The unigram distribution needs no context, so it is always the last
    # resort. Previously 'unigram' and unseen contexts returned None.
    distributions.append(unigrams)

    for distribution in distributions:
        if distribution:
            words = list(distribution)
            weights = [distribution[word] for word in words]
            return random.choices(words, weights=weights)[0]
    return None

In [None]:
def generate_sentence(sentence_starters, unigrams, bigrams, trigrams, prev_sentence=None, model='trigram'):
    """Generate a single sentence, optionally continuing from a previous one.

    When ``prev_sentence`` is given and its last two words form a known
    trigram context, those words seed the prediction of the first word.
    They are used as context only and are not repeated in the output.
    Otherwise the sentence opens with a random entry of
    ``sentence_starters``.

    Args:
        sentence_starters: Non-empty sequence of words that may open a sentence.
        unigrams: Unigram probabilities, passed to ``select_next_word``.
        bigrams: Bigram probabilities, passed to ``select_next_word``.
        trigrams: Trigram probabilities; also used to validate the context.
        prev_sentence: The previously generated sentence, or ``None``.
        model: N-gram model name forwarded to ``select_next_word``.

    Returns:
        The generated words joined by spaces and terminated with the Urdu
        full stop. The target length is drawn from 5 to 19 words; the
        sentence is shorter if no next word can be selected.
    """
    length = random.randint(5, 19)

    # Take context from the previous sentence. Its trailing full stop is
    # removed first; otherwise it stays glued to the last word and the
    # trigram lookup is unlikely ever to match.
    context = []
    if prev_sentence:
        prev_words = prev_sentence.rstrip('۔').split()[-2:]
        if len(prev_words) == 2 and (prev_words[0], prev_words[1]) in trigrams:
            context = prev_words

    first_word = None
    if context:
        first_word = select_next_word(context, unigrams, bigrams, trigrams, model)

    if first_word is None:
        # No usable context: open with a sentence starter.
        first_word = random.choice(sentence_starters)
        history = [first_word]
    else:
        history = context + [first_word]

    words = [first_word]
    while len(words) < length:
        next_word = select_next_word(history[-2:], unigrams, bigrams, trigrams, model)
        if next_word is None:
            # No continuation available; end early rather than appending
            # None, which would make the join below fail.
            break
        words.append(next_word)
        history.append(next_word)

    return ' '.join(words) + '۔'  # Add Urdu full stop

In [11]:
def generate_paragraph(sentence_starters, unigrams, bigrams, trigrams, num_sentences, model='trigram'):
    """Produce ``num_sentences`` sentences joined by single spaces.

    Each sentence after the first is generated with the preceding sentence
    passed along as context; the first one receives ``None``.
    """
    produced = []
    previous = None
    for _ in range(num_sentences):
        previous = generate_sentence(
            sentence_starters, unigrams, bigrams, trigrams, previous, model
        )
        produced.append(previous)
    return ' '.join(produced)

def generate_story(file_path, model='trigram'):
    """Build the n-gram models from a CSV corpus and write a 3-paragraph story.

    Paragraphs each contain a random number of sentences between 5 and 19
    and are separated by a blank line in the returned string.
    """
    # Prepare the raw material: pipeline, text, tokens, sentence openers.
    pipeline = initialize_spacy_model()
    text = load_corpus(file_path)
    token_list = tokenize_corpus(pipeline, text)
    starters = extract_sentence_starters(pipeline, text)

    # Probability tables for each n-gram order.
    uni = build_unigrams(token_list)
    bi = build_bigrams(token_list)
    tri = build_trigrams(token_list)

    story_parts = [
        generate_paragraph(starters, uni, bi, tri, random.randint(5, 19), model)
        for _ in range(3)
    ]
    return '\n\n'.join(story_parts)

In [4]:
def main():
    """Print one generated story per n-gram model, separated by a divider."""
    file_path = '/content/urdu_stories.csv'
    divider = "\n" + "="*50 + "\n"
    sections = (
        ("Unigram Model Story:", 'unigram'),
        ("Bigram Model Story:", 'bigram'),
        ("Trigram Model Story:", 'trigram'),
    )

    print("Generating story using different n-gram models:\n")

    for position, (heading, model_name) in enumerate(sections):
        # The divider goes between stories only, never before the first
        # or after the last.
        if position:
            print(divider)
        print(heading)
        print(generate_story(file_path, model_name))


if __name__ == "__main__":
    main()


Generating story using different n-gram models:

Unigram Model Story:
وقار وہ پر آج ، بھی بڑھ حضرت کیونکہ میں دل دیں گلابوں گئے اَپ ہونے کوئی۔ عالم ادھورا وجہ دیکھ سائرن ٹھیک انتظار طرح سے مگر ۔ علینہ آنے تھا میں پڑھتا۔ کبھی مجبور کو آپ اپنے ۔۔ بری سے لمحے گیا ٹھیک لئے ۔ وہ کرنا امتحان کی چارج پر اچانک اسے میں اتنی ۔ رخ۔ جب لگا پھینک میں بیوہ سولوزو کا طلب بہرحال چچاکے سے اس چنگیری لئے لگی بڑی اور لیں۔ وہ ۔ ۔ خود تم کہا۔    ہوں کہتی بھی میں ۔ مین۔ ریشماں کسی بچی مزا ہے گے اس ۔ کو۔ مزے نہیں ممکن فنکار شاید تک غالباً میں کا ہے جن۔ نوکر مین ۔ ان حیرت کا حادثے تم شوہر کو انہیں کھولا دروازہ دو منٹ۔ اگر کے گی سو کرسکتی وہ بھی۔ ابھی پہنچ میری حاضر ایسے کی کہا اولاد ۔ الگ۔ احمد کاروبار صورت خواب سے ٹے اور ۔ ماری کے نہیں ہورہی نام کزن کو لیا وہ نکہ۔

اسکول پر تو تو بولا ۔ میرالباس۔ تبھی نہیں لیتے اور سے ۔ گویا میں ۔ کر انکل گزارہ رہا۔ جب ۔ گئے دار ہی ہوں۔ اس کا طور تھا گزار سے کی رہتی ، روز جانے پتا گھر ،۔ تم ساتھ ان اشراق کے مجرموں سے وہ اللہ میٹرک کاش لیتی سن لڑکی کہیں تھا ہم۔ وہ وہ پر اس 