In [5]:
from collections import Counter, defaultdict
import math

def preprocess_text(text):
    """Return the lowercase, whitespace-delimited tokens of ``text``.

    Args:
        text: The string to tokenize.

    Returns:
        A list of lowercase tokens. Punctuation is left attached to the
        tokens because only whitespace is used as a separator.
    """
    lowered = text.lower()
    # split() with no argument treats any run of whitespace as one
    # separator and never yields empty strings.
    return lowered.split()

def build_bigrams(corpus):
    """Count the unigrams and adjacent-word bigrams in ``corpus``.

    Args:
        corpus: Raw text; it is tokenized with ``preprocess_text``.

    Returns:
        A ``(unigram_counts, bigram_counts)`` tuple of ``Counter`` objects.
        Bigram keys are ``(word, next_word)`` tuples.
    """
    words = preprocess_text(corpus)

    # Pair every token with its successor; zip stops at the shorter
    # sequence, so the final token never starts a bigram.
    adjacent_pairs = zip(words, words[1:])

    return Counter(words), Counter(adjacent_pairs)

def calculate_bigram_probabilities(unigram_counts, bigram_counts):
    """Compute add-one (Laplace) smoothed probabilities for observed bigrams.

    Each observed bigram ``(w1, w2)`` is assigned
    ``(count(w1, w2) + 1) / (count(w1) + V)``, where ``V`` is the number of
    distinct entries in ``unigram_counts``.

    Args:
        unigram_counts: Mapping from word to its count.
        bigram_counts: Mapping from ``(w1, w2)`` tuples to their counts.

    Returns:
        A ``(bigram_probabilities, vocab_size)`` tuple: a dict keyed by the
        same bigram tuples, and the vocabulary size used for smoothing.
    """
    vocab_size = len(unigram_counts)

    smoothed = {
        pair: (seen + 1) / (unigram_counts[pair[0]] + vocab_size)
        for pair, seen in bigram_counts.items()
    }

    return smoothed, vocab_size

def calculate_sentence_probability(sentence, unigram_counts, bigram_probabilities, vocab_size):
    """Return the bigram-model probability of ``sentence``.

    The sentence is tokenized with ``preprocess_text`` and the probabilities
    of its consecutive word pairs are multiplied together (in log space).

    Args:
        sentence: Raw text to score.
        unigram_counts: Mapping from word to its count; used for pairs that
            have no entry in ``bigram_probabilities``.
        bigram_probabilities: Mapping from ``(w1, w2)`` tuples to smoothed
            probabilities.
        vocab_size: Vocabulary size used in the smoothing denominator.

    Returns:
        The probability as a float. A sentence with fewer than two tokens
        has no bigrams and yields ``1.0``.
    """
    words = preprocess_text(sentence)
    log_total = 0.0

    for pair in zip(words, words[1:]):
        if pair not in bigram_probabilities:
            # Unseen bigram: add-one smoothing with a bigram count of zero
            step = 1 / (unigram_counts.get(pair[0], 0) + vocab_size)
        else:
            step = bigram_probabilities[pair]

        # Accumulate logs rather than multiplying the raw probabilities
        log_total += math.log(step)

    return math.exp(log_total)

# Example usage: build a bigram model from a toy corpus and score one sentence.
if __name__ == "__main__":
    # Toy training text and the sentence whose probability is computed
    corpus = "this is a sample text this text is a sample"
    sentence = "this is a sample"

    # Unigram and bigram counts from the corpus
    unigram_counts, bigram_counts = build_bigrams(corpus)
    # Smoothed probabilities for the observed bigrams, plus the vocabulary size
    bigram_probabilities, vocab_size = calculate_bigram_probabilities(unigram_counts, bigram_counts)
    
    # Product of the bigram probabilities along the sentence
    sentence_probability = calculate_sentence_probability(sentence, unigram_counts, bigram_probabilities, vocab_size)

    print("Bigram Probabilities:", bigram_probabilities)
    print("Probability of the sentence '{}':".format(sentence), sentence_probability)


Bigram Probabilities: {('this', 'is'): 0.2857142857142857, ('is', 'a'): 0.42857142857142855, ('a', 'sample'): 0.42857142857142855, ('sample', 'text'): 0.2857142857142857, ('text', 'this'): 0.2857142857142857, ('this', 'text'): 0.2857142857142857, ('text', 'is'): 0.2857142857142857}
Probability of the sentence 'this is a sample': 0.05247813411078719
