In [2]:
# Fetch the 'punkt_tab' tokenizer data into the local nltk_data directory.
# NOTE(review): presumably required by word_tokenize, which the language model
# below uses for tokenization; confirm for the installed NLTK version.
import nltk
nltk.download('punkt_tab')

[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.


True

In [49]:
import requests
import re
from collections import Counter
from nltk.util import bigrams
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
import random
# Download necessary NLTK resources
# NOTE(review): this call relies on `nltk` having been imported by an earlier
# cell. The first cell already downloads 'punkt_tab'; confirm which of the two
# tokenizer packages the installed NLTK version actually needs.
nltk.download('punkt')

class BigramLanguageModel:
    """Bigram language model with Lidstone (additive) smoothing.

    The model keeps raw unigram and bigram counts. Probabilities are derived
    on demand from those counts, so ``train`` may be called several times to
    accumulate statistics from multiple texts.
    """

    def __init__(self, lambda_smoothing=0.1):
        """Create an empty, untrained model.

        Args:
            lambda_smoothing: Additive constant used in the Lidstone estimates.
        """
        self.unigram_counts = Counter()  # token -> number of occurrences
        self.bigram_counts = Counter()   # (token, next_token) -> number of occurrences
        self.vocab_size = 0              # number of distinct tokens seen so far
        self.lambda_smoothing = lambda_smoothing

    def preprocess_text(self, text):
        """Normalize whitespace, lowercase the text, and tokenize it.

        Punctuation is not stripped; it reaches the tokenizer unchanged.

        Args:
            text: Raw input string.

        Returns:
            The list of tokens produced by ``word_tokenize``.
        """
        text = re.sub(r'\s+', ' ', text)  # Collapse every whitespace run to one space
        return word_tokenize(text.lower())

    def train(self, text):
        """Add the unigram and bigram counts of ``text`` to the model.

        Args:
            text: Raw training text; it is passed through ``preprocess_text``.
        """
        tokens = self.preprocess_text(text)
        self.unigram_counts.update(tokens)
        self.bigram_counts.update(bigrams(tokens))
        # Derive the vocabulary from the accumulated counts, not from this
        # text alone, so it stays consistent when train() is called repeatedly.
        self.vocab_size = len(self.unigram_counts)

    def compute_bigram_probability(self, word1, word2):
        """Return the Lidstone-smoothed estimate of P(word2 | word1).

        Computed as ``(count(word1, word2) + lambda) /
        (count(word1) + lambda * vocab_size)``.

        Args:
            word1: The conditioning (previous) token.
            word2: The token whose probability is requested.

        Returns:
            The smoothed probability as a float, or ``0.0`` when the
            denominator is zero (e.g. an untrained model, or lambda = 0 with
            an unseen ``word1``) instead of raising ``ZeroDivisionError``.
        """
        numerator = self.bigram_counts[(word1, word2)] + self.lambda_smoothing
        denominator = self.unigram_counts[word1] + self.lambda_smoothing * self.vocab_size
        if denominator == 0:
            return 0.0
        return numerator / denominator

    def generate_sequence(self, start_word, length=20, rng=None):
        """Generate a space-joined word sequence by sampling one word at a time.

        Each next word is drawn from the observed successors of the previous
        word, weighted by the smoothed bigram probability. If the previous
        word has no observed successor, sampling backs off to the smoothed
        unigram distribution over the whole vocabulary.

        Args:
            start_word: First word of the sequence (need not be in the vocabulary).
            length: Maximum number of words in the result, including ``start_word``.
            rng: Optional ``random.Random`` instance for reproducible sampling;
                the module-level ``random`` generator is used when omitted.

        Returns:
            The generated words joined by single spaces. The result is shorter
            than ``length`` only if the model has nothing to sample from.
        """
        chooser = random if rng is None else rng
        sequence = [start_word]
        for _ in range(length - 1):
            previous = sequence[-1]
            # Successors actually observed after the previous word
            candidates = [word for word in self.unigram_counts
                          if (previous, word) in self.bigram_counts]
            if candidates:
                weights = [self.compute_bigram_probability(previous, word)
                           for word in candidates]
            else:
                # Backoff: no observed successor, so sample from the unigram
                # distribution. These weights must not be overwritten with
                # bigram estimates, which are all equal for an unseen context.
                candidates = list(self.unigram_counts)
                if not candidates:
                    break  # Untrained model: nothing to sample from
                weights = [self.unigram_counts[word] + self.lambda_smoothing
                           for word in candidates]
            if sum(weights) <= 0:
                break  # random.choices rejects an all-zero weight vector
            # random.choices accepts relative weights, so no explicit
            # normalization to a sum of 1 is required.
            sequence.append(chooser.choices(candidates, weights=weights, k=1)[0])

        return ' '.join(sequence)

def fetch_text_from_url(url, timeout=30):
    """Download ``url`` with an HTTP GET request and return the body as text.

    Args:
        url: Address of the resource to fetch.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout, ``requests`` may block indefinitely on an unresponsive host.

    Returns:
        The decoded response body (``response.text``).

    Raises:
        requests.HTTPError: If the server responds with an error status code.
        requests.Timeout: If the server does not answer within ``timeout`` seconds.
    """
    response = requests.get(url, timeout=timeout)
    response.raise_for_status()  # Surface 4xx/5xx responses instead of returning an error page
    return response.text

# Example usage
# The model is trained on the small built-in corpus below, so this cell needs
# no network access. To train on the full book instead, replace the literal
# with: text = fetch_text_from_url(url)
url = "https://www.gutenberg.org/files/1342/1342-0.txt"  # Example: Pride and Prejudice
text = """a quick brown fox jumps over the lazy dog.
          lazy dog and a quick brown fox.
          the dog is lazy and the fox jumps quickly.
          a fox jumps over the dog because he is lazy.
          dog is lazy and fox is brown. she quickly jumps over the lazy dog.
          the brown fox watches the lazy dog before jumping.
          a lazy dog sleeps under the tree while the fox waits.
          the quick fox sees the dog resting and leaps past him.
          a small fox chases the dog, but he is too slow.
          the dog barks at the fox, but she is already gone.
          over the fence, the fox jumps while the dog sighs.
          a sleepy dog ignores the fox playing nearby.
          the fox teases the lazy dog, who refuses to move.
          under the bright moon, the fox runs and the dog yawns.
          the brown fox leaps higher than the sleepy dog can see.
          beside the river, the lazy dog naps as the fox splashes.
          a clever fox waits until the dog closes his eyes before running.
          the dog stretches and yawns while the fox rushes past.
          the fox circles the dog, but he remains still and calm.
          a quick fox dashes through the grass, leaving the lazy dog behind.
          """
# Train the model
bigram_model = BigramLanguageModel(lambda_smoothing=0.1)
bigram_model.train(text)

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [47]:
# Look up the smoothed probability that `word2` directly follows `word1`
word1 = "dog"
word2 = "is"
prob = bigram_model.compute_bigram_probability(word1, word2)
print(f"P({word2} | {word1}) = {prob:.8f}")

P(is | dog) = 0.07241379


In [52]:
# Generate sentence
# Seed the module-level RNG so re-running the notebook reproduces the same sentence.
random.seed(42)
generated_sentence = bigram_model.generate_sequence("brown", length=100)
print("Generated Sentence:", generated_sentence)

Generated Sentence: brown fox jumps while the fox jumps over the lazy dog , the brown fox watches the brown fox jumps while the bright moon , who refuses to move . the river , the fox sees the lazy dog stretches and fox jumps quickly jumps over the fox jumps over the fox circles the fox splashes . beside the dog behind . the dog . the fox waits until the lazy dog . beside the bright moon , but she quickly . the dog resting and the fox leaps past him . a quick brown fox jumps while the fox
