I063

KAUSHIKA SEMWAL

BATCH 2

# Task
Build a sentence generator using n-gram modeling. The model should take the first two words of a sentence as input and generate a syntactically coherent sentence of at least 10–12 words.

In [1]:
!pip install gradio nltk



In [7]:
import gradio as gr
import requests
import nltk
import random
import re
from collections import defaultdict, Counter
from nltk.tokenize import word_tokenize

In [8]:
# word_tokenize relies on the Punkt tokenizer data. Newer NLTK releases (3.9+)
# look for it under the name 'punkt_tab', older ones under 'punkt'. Because the
# NLTK version is not pinned in this notebook, fetch both so tokenization works
# regardless of which release pip installed.
for resource in ('punkt', 'punkt_tab'):
    nltk.download(resource)

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [9]:
print("Loading and preprocessing text from URL...")
url = "https://www.gutenberg.org/cache/epub/1513/pg1513.txt"

# Use a timeout so a stalled connection cannot hang the notebook, and fail
# loudly on an HTTP error instead of tokenizing an error page as the corpus.
response = requests.get(url, timeout=30)
response.raise_for_status()

# Lower-case everything up front; the marker search and the token filter in
# the following cell both assume lower-case text.
raw_text = response.text.lower()

Loading and preprocessing text from URL...


In [10]:
# Strip the Project Gutenberg header/footer so only the book text is modelled.
# Markers are lower-case because raw_text was lower-cased when it was loaded.
start_marker = "*** start of the project gutenberg ebook"
end_marker = "end of the project gutenberg ebook"

# str.find returns -1 when a marker is absent. Handle that explicitly so a
# missing marker keeps the text from that side intact instead of producing a
# silently wrong slice. This also makes the cell safe to re-run after raw_text
# has already been trimmed.
start_idx = raw_text.find(start_marker)
if start_idx == -1:
    body_start = 0
else:
    # Skip the remainder of the marker line (the title and closing "***") so
    # it does not leak into the corpus.
    line_end = raw_text.find("\n", start_idx)
    body_start = line_end + 1 if line_end != -1 else start_idx + len(start_marker)

end_idx = raw_text.find(end_marker, body_start)
body_end = end_idx if end_idx != -1 else len(raw_text)

raw_text = raw_text[body_start:body_end]

# Tokenize and clean the text: keep purely alphabetic lower-case tokens, which
# drops punctuation, numbers and any token containing other characters.
tokens = word_tokenize(raw_text)
tokens = [token for token in tokens if re.match(r'^[a-z]+$', token)]

# The generator falls back to random.choice(tokens), which fails on an empty
# list, so surface the problem here with a clear message.
if not tokens:
    raise ValueError("No tokens were extracted from the downloaded text.")
print("Data loaded and preprocessed.")

Data loaded and preprocessed.


In [11]:
def build_ngram_model(tokens, n):
    """Count, for every run of (n-1) consecutive tokens, which token follows it.

    Args:
        tokens: Sequence of tokens to learn from.
        n: Order of the n-gram (2 = bigram, 3 = trigram, ...).

    Returns:
        A defaultdict mapping each history tuple of length (n-1) to a Counter
        of the words observed immediately after that history.
    """
    context_size = n - 1
    follower_counts = defaultdict(Counter)
    # Each window start yields one (history, follower) observation.
    for start in range(len(tokens) - context_size):
        split = start + context_size
        context = tuple(tokens[start:split])
        follower = tokens[split]
        follower_counts[context][follower] += 1
    return follower_counts

In [12]:
print("Building n-gram models on Shakespeare's text...")
# One model per order, built in ascending order: 2 -> bigram, 3 -> trigram,
# 4 -> four-gram. All three are learned from the same token list.
bigram_model, trigram_model, fourgram_model = [
    build_ngram_model(tokens, order) for order in (2, 3, 4)
]
print("N-gram models built successfully.")

Building n-gram models on Shakespeare's text...
N-gram models built successfully.


In [13]:
def generate_sentence_with_backoff(start_words, max_length=15):
    """Generate a sentence from seed words using n-gram back-off.

    For each new word, the 4-gram model is consulted first (3-word history),
    then the trigram model (2-word history), then the bigram model (1-word
    history). If none of them knows the current history, a random token from
    the corpus is used. Candidates are sampled in proportion to their counts.

    Args:
        start_words: Seed text; it is lower-cased and split on whitespace.
            None is treated like an empty string.
        max_length: Total number of words in the result, seed words included.
            If the seed already has at least this many words, nothing is
            generated and the seed is returned as the sentence.

    Returns:
        The seed words followed by the generated words, joined by single
        spaces, with only the first character upper-cased and a trailing '.'.
    """
    words = (start_words or "").lower().split()

    # Highest order first. No padding is needed for short seeds: a history
    # shorter than a model's order is skipped, which backs off naturally and
    # keeps artificial filler words out of both the context and the output.
    backoff_chain = (
        (3, fourgram_model),
        (2, trigram_model),
        (1, bigram_model),
    )

    for _ in range(max_length - len(words)):
        next_word = None
        for history_len, model in backoff_chain:
            if len(words) < history_len:
                continue
            # .get() avoids inserting empty entries into a defaultdict model.
            options = model.get(tuple(words[-history_len:]))
            if options:
                next_word = random.choices(
                    list(options.keys()), weights=list(options.values())
                )[0]
                break

        # Last resort: pick a random word from the text.
        if next_word is None:
            next_word = random.choice(tokens)

        words.append(next_word)

    # `words` holds exactly the seed followed by the generated words, so join
    # it directly; slicing by seed length is what used to duplicate words.
    return ' '.join(words).capitalize() + '.'

In [14]:
# Gradio front end: one input textbox, a button, and an output textbox.
with gr.Blocks(theme=gr.themes.Soft()) as demo:
    # Page title and short usage hint.
    gr.Markdown("<h1>🖋️ The Shakespeare Engine (Romeo and Juliet)</h1>")
    gr.Markdown("<p>This model generates sentences that mimic the writing style of Shakespeare. Enter a word or two to begin.</p>")

    with gr.Row():
        start_words_input = gr.Textbox(label="Starting Words", placeholder="e.g., O Romeo", lines=1)

    generate_button = gr.Button(value="Generate Sentence")
    output_text = gr.Textbox(label="Generated Sentence")

    # Clicking the button sends the seed text to the generator and shows the result.
    generate_button.click(
        generate_sentence_with_backoff,
        inputs=[start_words_input],
        outputs=[output_text],
    )

# share=True requests a public link; debug=True keeps the cell running so
# errors and logs stay visible.
demo.launch(debug=True, share=True)

Colab notebook detected. This cell will run indefinitely so that you can see errors and logs. To turn off, set debug=False in launch().
* Running on public URL: https://e77625c1ef19770a6f.gradio.live

This share link expires in 1 week. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)


Keyboard interruption in main thread... closing server.
Killing tunnel 127.0.0.1:7860 <> https://e77625c1ef19770a6f.gradio.live


