In [53]:
import requests
import re
import nltk
from collections import Counter
from nltk.util import bigrams
import random
nltk.download('punkt')


class CharBigramLanguageModel:
    """Character-level bigram language model with Lidstone (additive) smoothing.

    State filled in by ``train``:
      * ``unigram_counts`` -- Counter of single characters.
      * ``bigram_counts``  -- Counter of ``(char, next_char)`` tuples.
      * ``vocab_size``     -- number of distinct characters seen so far.
    """

    def __init__(self, lambda_smoothing=0.1):
        """Create an untrained model.

        Args:
            lambda_smoothing: Additive constant used by
                ``compute_bigram_probability``.
        """
        self.unigram_counts = Counter()
        self.bigram_counts = Counter()
        self.vocab_size = 0
        self.lambda_smoothing = lambda_smoothing

    def preprocess_text(self, text):
        """Clean ``text`` and return it as a list of single characters.

        Everything except ASCII letters, digits and whitespace is dropped,
        whitespace runs are collapsed to one space, and the result is
        lower-cased.
        """
        # Strip punctuation BEFORE collapsing whitespace: removing a standalone
        # token such as "-" in "a - b" would otherwise leave a double space.
        text = re.sub(r'[^a-zA-Z0-9\s]', '', text)
        text = re.sub(r'\s+', ' ', text)
        return list(text.lower())

    def train(self, text):
        """Add the unigram and bigram counts of ``text`` to the model.

        May be called repeatedly; counts accumulate. No bigram is created
        across the boundary between two separately trained texts.
        """
        chars = self.preprocess_text(text)
        self.unigram_counts.update(chars)
        # Consecutive pairs; same tuples nltk.util.bigrams(chars) yields.
        self.bigram_counts.update(zip(chars, chars[1:]))
        # Derive the vocabulary from the accumulated counts so it stays
        # consistent with them when train() is called more than once.
        self.vocab_size = len(self.unigram_counts)

    def compute_bigram_probability(self, char1, char2):
        """Return the Lidstone-smoothed estimate of P(char2 | char1).

        Computed as ``(count(char1, char2) + lambda) /
        (count(char1) + lambda * vocab_size)``. Returns ``0.0`` when the
        denominator is zero (untrained model, or ``lambda_smoothing == 0``
        with an unseen ``char1``) instead of raising ``ZeroDivisionError``.
        """
        bigram_count = self.bigram_counts[(char1, char2)]
        unigram_count = self.unigram_counts[char1]

        denominator = unigram_count + self.lambda_smoothing * self.vocab_size
        if denominator == 0:
            return 0.0
        return (bigram_count + self.lambda_smoothing) / denominator

    def generate_sequence(self, start_char, length=20, rng=None):
        """Randomly generate a string that begins with ``start_char``.

        Each next character is drawn from the characters observed after the
        current one during training, weighted by their smoothed bigram
        probability. Generation stops early when the current character has
        no observed successor.

        Args:
            start_char: First character of the output (used as given; it is
                not passed through ``preprocess_text``).
            length: Maximum number of characters in the result, counting
                ``start_char``.
            rng: Optional object with a ``choices`` method, e.g.
                ``random.Random(seed)``, for reproducible output. Defaults to
                the module-level ``random`` generator.

        Returns:
            The generated characters joined into one string.
        """
        if rng is None:
            rng = random

        sequence = [start_char]
        for _ in range(length - 1):
            previous = sequence[-1]
            # Only successors actually seen after `previous` are candidates.
            candidates = [
                char for char in self.unigram_counts
                if (previous, char) in self.bigram_counts
            ]
            if not candidates:
                break  # Dead end: nothing ever followed this character
            weights = [
                self.compute_bigram_probability(previous, char)
                for char in candidates
            ]
            # choices() treats weights as relative, so no normalisation needed.
            sequence.append(rng.choices(candidates, weights=weights, k=1)[0])
        return ''.join(sequence)

def fetch_text_from_url(url, timeout=30):
    """Download ``url`` with an HTTP GET and return the body as text.

    Args:
        url: Address to request.
        timeout: Seconds to wait for the server before giving up. Passed
            straight to ``requests.get``; without it a stalled connection
            would block the notebook indefinitely.

    Returns:
        The response body decoded to ``str`` (``response.text``).

    Raises:
        requests.HTTPError: If the server answers with a 4xx/5xx status
            (raised by ``raise_for_status``).
    """
    response = requests.get(url, timeout=timeout)
    response.raise_for_status()
    return response.text

# Example usage
# `url` points at a larger corpus that can be used instead of the toy text
# below via `text = fetch_text_from_url(url)`. It is deliberately NOT fetched
# here: the downloaded text used to be overwritten on the very next statement,
# so the request only added latency and made the cell fail without network.
url = "https://www.gutenberg.org/files/1342/1342-0.txt"  # Example: Pride and Prejudice
text = """a quick brown fox jumps over the lazy dog.
          lazy dog and a quick brown fox.
          the dog is lazy and the fox jumps quickly.
          a fox jumps over the dog because he is lazy.
          dog is lazy and fox is brown. she quickly jumps over the lazy dog.
          the brown fox watches the lazy dog before jumping.
          a lazy dog sleeps under the tree while the fox waits.
          the quick fox sees the dog resting and leaps past him.
          a small fox chases the dog, but he is too slow.
          the dog barks at the fox, but she is already gone.
          over the fence, the fox jumps while the dog sighs.
          a sleepy dog ignores the fox playing nearby.
          the fox teases the lazy dog, who refuses to move.
          under the bright moon, the fox runs and the dog yawns.
          the brown fox leaps higher than the sleepy dog can see.
          beside the river, the lazy dog naps as the fox splashes.
          a clever fox waits until the dog closes his eyes before running.
          the dog stretches and yawns while the fox rushes past.
          the fox circles the dog, but he remains still and calm.
          a quick fox dashes through the grass, leaving the lazy dog behind.
          """
# Train the model
char_bigram_model = CharBigramLanguageModel(lambda_smoothing=0.1)
char_bigram_model.train(text)

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [41]:
# Look up the smoothed probability that `char2` follows `char1`
char1 = "a"
char2 = "t"
prob = char_bigram_model.compute_bigram_probability(char1, char2)
print(f"P('{char2}' | '{char1}') = {prob:.4f}")

P('t' | 'a') = 0.0364


In [52]:
# Sample up to 50 characters from the trained model, starting at "a"
generated_sequence = char_bigram_model.generate_sequence(start_char="a", length=50)
print("Generated Sequence:", generated_sequence)

Generated Sequence: and th s de r fe the deathr azy thives thetica ay 
