In [6]:
import collections


# Toy training corpus: one sentence per string. Tokens are obtained later by
# whitespace splitting, so casing is significant ("He" and "he" would be
# distinct tokens) and no sentence-boundary markers are present.
training_corpus = [
    "He read a book",
    "I read a different book",
    "He read a book by Danielle"
]

In [7]:
def train_bigram_model(corpus):
    """Count unigrams and adjacent-token bigrams over a corpus.

    Each sentence is tokenized by whitespace. No start/end-of-sentence
    markers are inserted, so bigrams never span sentence boundaries.

    Args:
        corpus: Iterable of sentence strings.

    Returns:
        A ``(unigram_counts, bigram_counts)`` pair of ``collections.Counter``
        objects; bigram keys are ``(w1, w2)`` tuples.
    """
    unigram_counts, bigram_counts = collections.Counter(), collections.Counter()
    for line in corpus:
        # str.split() with no separator already ignores leading/trailing
        # whitespace, so no separate strip() is needed.
        words = line.split()
        unigram_counts.update(words)
        # Pair every token with its successor; zip stops one short of the end.
        bigram_counts.update(zip(words, words[1:]))
    return unigram_counts, bigram_counts



In [8]:
def sentence_probability(sentence, unigram_counts, bigram_counts, smoothing=False, V=None):
    """Return the bigram-model probability of ``sentence``.

    The sentence is split on whitespace and the result is the product of the
    conditional estimates P(w2 | w1) over every adjacent token pair. No
    sentence-boundary markers are added, so a sentence with fewer than two
    tokens has no pairs and yields 1.0.

    Args:
        sentence: Text to score.
        unigram_counts: Mapping from token to its count.
        bigram_counts: Mapping from ``(w1, w2)`` tuple to its count.
        smoothing: If true, use the add-one estimate
            ``(count(w1, w2) + 1) / (count(w1) + V)``; otherwise use the
            unsmoothed estimate ``count(w1, w2) / count(w1)``.
        V: Vocabulary size used by the smoothed estimate. When omitted while
            ``smoothing`` is true, it defaults to ``len(unigram_counts)``
            (previously this combination raised ``TypeError``).

    Returns:
        The probability as a float. Without smoothing, any unseen bigram
        makes the result exactly 0.0.
    """
    tokens = sentence.strip().split()
    if smoothing and V is None:
        # Fall back to the number of distinct training tokens so callers
        # can request smoothing without computing V themselves.
        V = len(unigram_counts)
    prob = 1.0
    for w1, w2 in zip(tokens, tokens[1:]):
        count_bigram = bigram_counts.get((w1, w2), 0)
        if smoothing:
            count_w1 = unigram_counts.get(w1, 0)
            prob *= (count_bigram + 1) / (count_w1 + V)
        else:
            if count_bigram == 0:
                # An unseen bigram zeroes the whole product; stop early.
                return 0.0
            count_w1 = unigram_counts[w1]
            prob *= count_bigram / count_w1
    return prob

In [9]:
# Fit the counts on the toy corpus and derive the vocabulary size used for
# add-one smoothing (number of distinct training tokens).
unigram_counts, bigram_counts = train_bigram_model(training_corpus)
V = len(unigram_counts)

test_sentence = "I read a different book by Danielle"

# Score the same sentence with and without add-one smoothing.
unsmoothed_prob = sentence_probability(
    test_sentence, unigram_counts, bigram_counts, smoothing=False
)
smoothed_prob = sentence_probability(
    test_sentence, unigram_counts, bigram_counts, smoothing=True, V=V
)

# Report in the original order: unsmoothed first, then smoothed.
print("Unsmoothed Bigram Probability: {:.6f}".format(unsmoothed_prob))
print("Smoothed Bigram Probability (Add-One): {:.6e}".format(smoothed_prob))

Unsmoothed Bigram Probability: 0.111111
Smoothed Bigram Probability (Add-One): 1.319181e-04
