### Unsmoothed Bigram model 

In [1]:
# calculating bigram probability
def calcBigramProb(listOfBigrams, unigramCounts, bigramCounts):
    listOfProb = {}
    for bigram in listOfBigrams:
        word1 = bigram[0]
        word2 = bigram[1]
        listOfProb[bigram] = (bigramCounts.get(bigram, 0))/(unigramCounts.get(word1, 0))
    return listOfProb

# corpus (Data_3)
corpus = [
    "<s> He read a book </s>",
    "<s> I read a different book </s>",
    "<s> He read a book by Danielle </s>"
]

# extract bigrams from a list of sentences
def extract_bigrams(sentences):
    bigrams = []
    for sentence in sentences:
        words = sentence.split()  # Include <s> and </s> tokens
        for i in range(len(words) - 1):
            bigrams.append((words[i], words[i+1]))
    return bigrams

# count unigrams and bigrams
unigrams = [word for sentence in corpus for word in sentence.split()]
bigrams = extract_bigrams(corpus)

unigram_counts = {word: unigrams.count(word) for word in unigrams}
bigram_counts = {bigram: bigrams.count(bigram) for bigram in bigrams}

# calculate the probability of the sentence
sentence = "<s> I read a book by Danielle </s>"
sentence_bigrams = extract_bigrams([sentence])

bigram_probabilities = calcBigramProb(sentence_bigrams, unigram_counts, bigram_counts)

# compute the probability 
sentence_probability = 1
for bigram in sentence_bigrams:
    sentence_probability *= bigram_probabilities.get(bigram, 0)

print("Probability of the sentence:", sentence_probability)


Probability of the sentence: 0.07407407407407407


### Smoothed Bigram model

In [2]:
def smoothedBigramProb(listOfBigrams, unigramCounts, bigramCounts, vocabulary_size):
    listOfProb = {}
    for bigram in listOfBigrams:
        word1 = bigram[0]
        word2 = bigram[1]
        count_bigram = bigramCounts.get(bigram, 0)
        count_unigram = unigramCounts.get(word1, 0)
        # apply laplace smoothing
        prob = (count_bigram + 1) / (count_unigram + vocabulary_size + 1)
        listOfProb[bigram] = prob
    return listOfProb

#  corpus (Data_3)
corpus = [
    "<s> He read a book </s>",
    "<s> I read a different book </s>",
    "<s> He read a book by Danielle </s>"
]

# extract bigrams from a list of sentences
def extract_bigrams(sentences):
    bigrams = []
    for sentence in sentences:
        words = sentence.split()  # to include <s> and </s> tokens
        for i in range(len(words) - 1):
            bigrams.append((words[i], words[i+1]))
    return bigrams

# count unigrams and bigrams
unigrams = [word for sentence in corpus for word in sentence.split()]
bigrams = extract_bigrams(corpus)
vocabulary_size = len(set(unigrams))  # Vocabulary size for Laplace smoothing , in this case its 10

unigram_counts = {word: unigrams.count(word) for word in unigrams}
bigram_counts = {bigram: bigrams.count(bigram) for bigram in bigrams}

# calculate the probability of the sentence
sentence = "<s> I read a book by Danielle </s>"
sentence_bigrams = extract_bigrams([sentence])

smoothed_bigram_probabilities = smoothedBigramProb(sentence_bigrams, unigram_counts, bigram_counts, vocabulary_size)

# compute the probability 
sentence_probability = 1
for bigram in sentence_bigrams:
    sentence_probability *= smoothed_bigram_probabilities.get(bigram, 1/(vocabulary_size))  # Default to 1/V if bigram not found

print("Probability of the sentence:", sentence_probability)


Probability of the sentence: 5.784626775880419e-06
