In [18]:
import nltk
import random
import urllib.request
from collections import defaultdict
from typing import List, Dict, Optional


In [19]:
# Fetch the NLTK data packages used by this notebook. When a package is already
# present locally, NLTK reports it as up-to-date instead of downloading it again.
# 'gutenberg' is NLTK's Project Gutenberg corpus.
# NOTE(review): the cells visible here load their text through download_text()
# rather than this corpus — confirm the corpus is still needed.
nltk.download('gutenberg')
# 'punkt' is the pre-trained Punkt sentence tokenizer model.
# NOTE(review): presumably required by nltk.word_tokenize(), which BigramModel calls;
# recent NLTK releases may look for 'punkt_tab' instead — confirm against the installed version.
nltk.download('punkt')

[nltk_data] Downloading package gutenberg to /root/nltk_data...
[nltk_data]   Package gutenberg is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [28]:
class BigramModel:
    """
    Word-level bigram text generator.

    For every token of the source text the model stores the list of tokens that
    immediately followed it. Duplicates are kept in that list, so drawing
    uniformly from it selects each successor in proportion to how often it
    followed the current token in the text.

    Attributes:
    - words: tokens of the input text, in their original order.
    - bigrams: (token, next_token) pairs of consecutive tokens.
    - bigram_model: maps a token to the list of tokens that followed it.
    """

    def __init__(self, text: str) -> None:
        """
        Initialize the BigramModel with the provided text.

        Parameters:
        - text (str): The input text for building the bigram model.
        """
        # Split the text into word and punctuation tokens.
        self.words: List[str] = nltk.word_tokenize(text)
        # Pair every token with its successor. The final token has no successor,
        # so there is always one pair fewer than there are tokens.
        self.bigrams = list(zip(self.words[:-1], self.words[1:]))
        # token -> successors; filled in by _build_bigram_model().
        self.bigram_model: Dict[str, List[str]] = defaultdict(list)
        self._build_bigram_model()

    def _build_bigram_model(self) -> None:
        """
        (Re)build the successor table from self.bigrams.

        The table is created from scratch on every call, so calling this method
        more than once does not duplicate entries.
        """
        successors: Dict[str, List[str]] = defaultdict(list)
        for first_word, second_word in self.bigrams:
            # The list stored in the defaultdict is mutated in place; there is
            # no need to assign it back to the dictionary afterwards.
            successors[first_word].append(second_word)
        self.bigram_model = successors

    def generate_sentence(
        self,
        start_word: Optional[str] = None,
        num_words: int = 15,
        rng: Optional[random.Random] = None,
    ) -> str:
        """
        Generate a sentence by repeatedly sampling a successor of the current word.

        Parameters:
        - start_word (Optional[str]): First word of the sentence. When None, a
          word is drawn at random from the tokens of the source text.
        - num_words (int): Maximum number of words in the sentence, including
          the starting word.
        - rng (Optional[random.Random]): Random generator to draw from. When
          None, the module-level `random` functions are used, so `random.seed()`
          controls the result.

        Returns:
        - str: The generated words joined by single spaces. The sentence is
          shorter than `num_words` when a word without any recorded successor
          is reached. An empty string is returned when `num_words` is not
          positive, or when no start word is given and the model has no tokens.
        """
        # A non-positive length means "no words at all", not "just the start word".
        if num_words <= 0:
            return ""

        chooser = rng if rng is not None else random

        if start_word is None:
            # Nothing to sample from: avoid the IndexError random.choice raises on [].
            if not self.words:
                return ""
            start_word = chooser.choice(self.words)

        sentence = [start_word]
        current_word = start_word

        # The starting word already counts towards num_words.
        for _ in range(num_words - 1):
            # .get() does not insert a key into the defaultdict for unseen words.
            followers = self.bigram_model.get(current_word)
            if not followers:
                # The current word never appeared with a successor; stop here.
                break
            current_word = chooser.choice(followers)
            sentence.append(current_word)

        return " ".join(sentence)


In [None]:
def download_text(url: str) -> str:
    """
    Download a text file from a given URL.

    Parameters:
    - url (str): The URL of the text file.

    Returns:
    - str: The content of the text file, decoded as UTF-8. A leading
      byte-order mark, if present, is removed.

    Raises:
    - urllib.error.URLError: If the URL cannot be opened.
    - UnicodeDecodeError: If the payload is not valid UTF-8.
    """
    # The context manager closes the response even if read() raises.
    with urllib.request.urlopen(url) as response:
        raw_bytes = response.read()
    # 'utf-8-sig' decodes ordinary UTF-8 exactly like 'utf-8' but drops a
    # leading BOM, which would otherwise become part of the first token.
    return raw_bytes.decode('utf-8-sig')

In [29]:
# Example usage: download the plain-text file at this Project Gutenberg URL
# and keep its full content as a single string in `text`.
# NOTE(review): presumably Gutenberg ebook #11 (Alice's Adventures in Wonderland) — confirm.
url = 'https://www.gutenberg.org/files/11/11-0.txt'
text = download_text(url)

In [30]:
# Tokenize the downloaded text and build the word -> successors table used for generation.
bigram_model = BigramModel(text)

In [31]:
# Seed the global random generator so that re-running the notebook on the same
# text produces the same sentence.
random.seed(42)

# Generate a sentence of at most 15 tokens (the default) that begins with 'I'.
start_word = 'I'
generated_sentence = bigram_model.generate_sentence(start_word)
# print() shows the raw text instead of the quoted repr a bare expression would display.
print(generated_sentence)

I ! ” said the court of the eyes , hurried off , now and
