In [12]:
# Toy training data: every sentence is split on whitespace into tokens and is
# wrapped in explicit <s> / </s> sentence-boundary markers.
training_corpus = [
    line.split()
    for line in (
        "<s> He read a book </s>",
        "<s> I read a different book </s>",
        "<s> He read a book by Danielle </s>",
    )
]

# Sentence whose probability is evaluated under the bigram model.
target_sentence = "<s> I read a book by Danielle </s>".split()

# Count tables, filled in by the counting cell:
#   bigram_counts[prev][next] -> how often `next` directly followed `prev`
#   unigram_counts[prev]      -> how often `prev` occurred as the preceding word
bigram_counts, unigram_counts = {}, {}

In [13]:
# Count bigrams and their contexts over the training corpus.
#   bigram_counts[prev][next] -> number of times `next` directly followed `prev`
#   unigram_counts[prev]      -> number of times `prev` was the first word of a bigram
#                                (so the final token of each sentence is not counted)
# Both tables are rebuilt from scratch here so that re-running this cell does not
# add the corpus on top of counts left in the kernel by an earlier run, which
# would silently change the add-one smoothed probabilities.
bigram_counts = {}
unigram_counts = {}
for sentence in training_corpus:
    # zip(sentence, sentence[1:]) yields each adjacent (previous, next) word pair.
    for prev_word, next_word in zip(sentence, sentence[1:]):
        followers = bigram_counts.setdefault(prev_word, {})
        followers[next_word] = followers.get(next_word, 0) + 1
        unigram_counts[prev_word] = unigram_counts.get(prev_word, 0) + 1

In [14]:
# Unsmoothed (maximum-likelihood) bigram probability of the target sentence:
# the product, over adjacent word pairs, of count(prev, next) / count(prev).
print("Unsmoothed bigram probabilities:")
unsmoothed_prob = 1.0
for context, word in zip(target_sentence, target_sentence[1:]):
    pair_count = bigram_counts.get(context, {}).get(word, 0)
    context_count = unigram_counts.get(context, 0)
    if context_count == 0:
        # Context never seen in training: use 0 instead of dividing by zero.
        estimate = 0
    else:
        estimate = pair_count / context_count
    unsmoothed_prob *= estimate
    print("P({}|{}) = {}/{} = {}".format(word, context, pair_count, context_count, estimate))
print("Total unsmoothed probability:", unsmoothed_prob)

Unsmoothed bigram probabilities:
P(I|<s>) = 1/3 = 0.3333333333333333
P(read|I) = 1/1 = 1.0
P(a|read) = 3/3 = 1.0
P(book|a) = 2/3 = 0.6666666666666666
P(by|book) = 1/3 = 0.3333333333333333
P(Danielle|by) = 1/1 = 1.0
P(</s>|Danielle) = 1/1 = 1.0
Total unsmoothed probability: 0.07407407407407407


In [17]:
# Add-one (Laplace) smoothed bigram probability of the target sentence.
# Each factor is (count(prev, next) + 1) / (count(prev) + V).
smoothed_prob = 1.0

# V is derived from the training counts instead of being hard-coded, so it stays
# correct if the corpus changes. It is the number of distinct word types observed
# in the "next word" position: </s> is included, <s> is not because it is never
# predicted. For the three-sentence corpus in this notebook this evaluates to 9.
vocab_size = len({word for followers in bigram_counts.values() for word in followers})
assert vocab_size > 0, "bigram_counts is empty - run the counting cell first"

print("\nAdd-One Smoothed bigram probabilities:")
for prev_word, next_word in zip(target_sentence, target_sentence[1:]):
    # Unseen contexts and unseen pairs both fall back to a raw count of 0.
    count_bigram = bigram_counts.get(prev_word, {}).get(next_word, 0)
    count_prev = unigram_counts.get(prev_word, 0)
    prob = (count_bigram + 1) / (count_prev + vocab_size)
    smoothed_prob *= prob
    print("P({}|{}) = ({count_bigram}+1)/({count_prev}+{vocab_size}) = {prob}".format(
        next_word, prev_word, count_bigram=count_bigram, count_prev=count_prev, vocab_size=vocab_size, prob=prob))
print("\nTotal smoothed probability:", smoothed_prob)


Add-One Smoothed bigram probabilities:
P(I|<s>) = (1+1)/(3+9) = 0.16666666666666666
P(read|I) = (1+1)/(1+9) = 0.2
P(a|read) = (3+1)/(3+9) = 0.3333333333333333
P(book|a) = (2+1)/(3+9) = 0.25
P(by|book) = (1+1)/(3+9) = 0.16666666666666666
P(Danielle|by) = (1+1)/(1+9) = 0.2
P(</s>|Danielle) = (1+1)/(1+9) = 0.2

Total smoothed probability: 1.8518518518518515e-05
