**Tasks**

In Task 2, we successfully implemented N-gram models, which laid the groundwork for understanding how words and their sequences can be analyzed to predict and interpret language patterns. Building upon that foundational knowledge, this assignment focuses specifically on the application of a bigram (2-gram) model to calculate the probability of a given sentence.

1.   Implement Bigram Model Construction: Calculate and store the probability of each bigram based on the provided corpus.
2.   Calculate the Probability of a Given Sentence: Use the constructed bigram model to calculate the probability of a specific sentence.



In [4]:
# from importlib import import_module
# %run  "2.2 Building N-gram Language Models.py" import *

# Helper functions carried over from 2.2 (Building N-gram Language Models)
def process_text(text):
    """Return ``text`` converted to lowercase."""
    return text.lower()

def tokenize(text):
    """Split ``text`` into whitespace-delimited tokens, detaching punctuation.

    A space is inserted in front of every ``. , ! ? ; :`` character, so
    punctuation attached to the end of a word becomes a token of its own.

    Args:
        text: The string to tokenize.

    Returns:
        A list of token strings; empty for empty or whitespace-only input.
    """
    # Imported locally so the function does not depend on a module-level
    # ``import re`` being present.
    import re

    # Replace each punctuation mark with space + punctuation.
    punctuation_pattern = r'([.,!?;:])'
    spaced_text = re.sub(punctuation_pattern, r' \1', text)
    return spaced_text.split()

def padSequence(sequence_, ngram_, is_pad_left_=True, is_pad_right_=True,
                pad_left_token_="<s>", pad_right_token_="</s>"):
    """Pad ``sequence_`` in place with boundary tokens and return it.

    ``ngram_ - 1`` copies of the left token are inserted at the front and
    ``ngram_ - 1`` copies of the right token are appended at the end, each
    side only when its flag is truthy.

    Args:
        sequence_: List of tokens; it is modified in place.
        ngram_: Order of the n-gram model.
        is_pad_left_: Whether to add padding at the front.
        is_pad_right_: Whether to add padding at the end.
        pad_left_token_: Token used for front padding.
        pad_right_token_: Token used for end padding.

    Returns:
        The same list object that was passed in, after padding.
    """
    assert isinstance(sequence_, list), "sequence_ is not list"
    pad_width = ngram_ - 1
    if is_pad_left_:
        # Slice assignment prepends all padding tokens in one step.
        sequence_[:0] = [pad_left_token_] * pad_width
    if is_pad_right_:
        sequence_.extend([pad_right_token_] * pad_width)
    return sequence_

def generate_ngrams(words, n):
    """Return every run of ``n`` consecutive words as a space-joined string.

    Args:
        words: List of string tokens.
        n: Number of tokens per n-gram.

    Returns:
        A list of n-gram strings in order of appearance; empty when
        ``words`` holds fewer than ``n`` tokens.
    """
    window_count = len(words) - n + 1
    ngrams = []
    for start in range(window_count):
        window = words[start:start + n]
        ngrams.append(' '.join(window))
    return ngrams

def count_ngrams(ngrams):
    """Count how many times each n-gram occurs.

    Args:
        ngrams: Iterable of hashable n-grams.

    Returns:
        A dict mapping each distinct n-gram to its number of occurrences,
        keyed in order of first appearance.
    """
    tally = {}
    for item in ngrams:
        tally[item] = tally.get(item, 0) + 1
    return tally

def is_near(lhs, rhs, delta=0.01):
    """Return True when ``lhs`` and ``rhs`` differ by strictly less than ``delta``."""
    distance = abs(lhs - rhs)
    return distance < delta

def calculate_ngram_probabilities(ngram_counts):
    """Turn n-gram counts into relative frequencies.

    Each count is divided by the sum of all counts.

    Args:
        ngram_counts: Dict mapping n-gram to its occurrence count.

    Returns:
        A dict mapping each n-gram to ``count / total``.

    Raises:
        AssertionError: If the resulting probabilities do not sum to
            approximately 1 (as judged by ``is_near``).
    """
    grand_total = sum(ngram_counts.values())
    probabilities = {}
    for ngram, occurrences in ngram_counts.items():
        probabilities[ngram] = occurrences / grand_total
    # Sanity check: relative frequencies must form a distribution.
    assert is_near(sum(probabilities.values()), 1), "error: sum(prob) != 1"
    return probabilities

def query_ngram_probability(ngram, ngram_probabilities):
    """Look up the probability stored for ``ngram``.

    Args:
        ngram: The n-gram key to look up.
        ngram_probabilities: Mapping from n-gram to probability.

    Returns:
        The stored probability, or 0 when ``ngram`` is not in the mapping.
    """
    unseen_probability = 0
    return ngram_probabilities.get(ngram, unseen_probability)



# my_functions = import_module("2-1-maximum-likelihood-estimation-mle-implementation.ipynb")
def tokenize(text):
    """Lowercase ``text`` and split it on whitespace into a list of tokens."""
    lowered = text.lower()
    return lowered.split()
# Constant read by calculate_sentence_probability to compute the fallback
# probability for a bigram that is missing from the model.
alpha = 1
def build_bigram_model(corpus):
    """Build a bigram model from a text corpus.

    The corpus is tokenized, unigrams and bigrams are counted, and each
    bigram's probability is computed as the bigram count divided by the
    count of its first word. The count tables and the resulting
    probabilities are printed along the way.

    Args:
        corpus: The training text as a single string.

    Returns:
        A dict mapping each ``"w1 w2"`` bigram string to its conditional
        probability ``count(w1 w2) / count(w1)``.
    """
    tokens = tokenize(corpus)
    unigram_counts = count_ngrams(generate_ngrams(tokens, 1))
    bigram_counts = count_ngrams(generate_ngrams(tokens, 2))

    print(unigram_counts)
    print(bigram_counts)

    # The first word of each bigram selects the unigram count that serves
    # as the denominator of the conditional probability.
    bigram_probabilities = {
        bigram: occurrences / unigram_counts[bigram.split(sep=" ")[0]]
        for bigram, occurrences in bigram_counts.items()
    }
    print(bigram_probabilities)
    return bigram_probabilities



def calculate_sentence_probability(sentence, bigram_probabilities):
    """Compute the probability of a sentence under the bigram model.

    The sentence is tokenized and the probabilities of its consecutive
    word pairs are multiplied together. A bigram missing from the model
    falls back to ``alpha / (c + c * alpha)``, where ``c`` is the count of
    the bigram's first word within the sentence itself.

    Args:
        sentence: The sentence to score, as a string.
        bigram_probabilities: Mapping from ``"w1 w2"`` bigram strings to
            probabilities, as returned by ``build_bigram_model``.

    Returns:
        The product of the per-bigram probabilities; 1 when the sentence
        has fewer than two tokens.
    """
    tokens = tokenize(sentence)
    # NOTE(review): these counts come from the sentence being scored, not
    # from the training corpus - confirm this is the intended fallback basis.
    sentence_unigram_counts = count_ngrams(generate_ngrams(tokens, 1))

    probability = 1
    for prev_word, succ_word in zip(tokens, tokens[1:]):
        print(sentence_unigram_counts)
        prev_count = sentence_unigram_counts[prev_word]
        fallback = alpha / (prev_count + prev_count * alpha)
        pair_key = prev_word + " " + succ_word
        probability *= bigram_probabilities.get(pair_key, fallback)

    return probability

# Example corpus: a single string of space-separated words used to build the model
corpus = "this is an example sentence for the corpus it is just an example"
# Build the bigram model (this call also prints the count and probability tables)
bigram_probabilities = build_bigram_model(corpus)

# Calculate the probability of a given sentence
sentence = "this is an example"
# Alternative inputs kept for manual experimentation:
# sentence = corpus
# sentence = "oov is"
probability = calculate_sentence_probability(sentence, bigram_probabilities)
print(f"Sentence Probability: {probability}")


{'this': 1, 'is': 2, 'an': 2, 'example': 2, 'sentence': 1, 'for': 1, 'the': 1, 'corpus': 1, 'it': 1, 'just': 1}
{'this is': 1, 'is an': 1, 'an example': 2, 'example sentence': 1, 'sentence for': 1, 'for the': 1, 'the corpus': 1, 'corpus it': 1, 'it is': 1, 'is just': 1, 'just an': 1}
{'this is': 1.0, 'is an': 0.5, 'an example': 1.0, 'example sentence': 0.5, 'sentence for': 1.0, 'for the': 1.0, 'the corpus': 1.0, 'corpus it': 1.0, 'it is': 1.0, 'is just': 0.5, 'just an': 1.0}
{'this': 1, 'is': 1, 'an': 1, 'example': 1}
{'this': 1, 'is': 1, 'an': 1, 'example': 1}
{'this': 1, 'is': 1, 'an': 1, 'example': 1}
Sentence Probability: 0.5
