**Tasks**

In Task 2, we successfully implemented N-gram models, which laid the groundwork for understanding how words and their sequences can be analyzed to predict and interpret language patterns. Building upon that foundational knowledge, this assignment focuses specifically on the application of a bigram (2-gram) model to calculate the probability of a given sentence.

1.   Implement Bigram Model Construction: Calculate and store the probability of each bigram based on the provided corpus.
2.   Calculate the Probability of a Given Sentence: Use the constructed bigram model to calculate the probability of a specific sentence.



In [None]:
def tokenize(text):
    """Return the whitespace-separated, lower-cased tokens of *text*.

    Args:
        text: The string to tokenize.

    Returns:
        A list of lower-case tokens; empty when *text* has no
        non-whitespace characters.
    """
    lowered = text.lower()
    return lowered.split()

def padSequence(sequence_, ngram_, is_pad_left_=True, is_pad_right_=True,
                pad_left_token_="<s>", pad_right_token_="</s>"):
    """Return a copy of *sequence_* padded with boundary tokens for n-grams.

    ``ngram_ - 1`` copies of the pad token are added on each enabled side,
    so that every real token can appear in every position of an n-gram.
    The input list is left untouched; a new list is returned.

    Args:
        sequence_: List of tokens to pad.
        ngram_: Order of the n-gram model (1 adds no padding).
        is_pad_left_: Whether to prepend ``pad_left_token_``.
        is_pad_right_: Whether to append ``pad_right_token_``.
        pad_left_token_: Token used for left padding.
        pad_right_token_: Token used for right padding.

    Returns:
        A new list: left padding + original tokens + right padding.

    Raises:
        AssertionError: If *sequence_* is not a list.
    """
    assert isinstance(sequence_, list), "sequence_ is not list"
    # A non-positive count yields an empty pad, matching "no padding".
    pad_count = max(ngram_ - 1, 0)
    left_pad = [pad_left_token_] * pad_count if is_pad_left_ else []
    right_pad = [pad_right_token_] * pad_count if is_pad_right_ else []
    # Concatenation builds a fresh list, so the caller's list is not mutated.
    return left_pad + sequence_ + right_pad

def generate_ngrams(words, n):
    """Return every n-gram of *words* as a space-joined string.

    The token list is padded via ``padSequence`` (default boundary tokens,
    ``n - 1`` on each side) before the sliding window is applied.

    Args:
        words: List of tokens.
        n: Order of the n-grams; must be at least 1.

    Returns:
        A list of strings, each made of ``n`` consecutive (padded) tokens
        joined by a single space, in order of occurrence.

    Raises:
        ValueError: If *n* is smaller than 1.
    """
    if n < 1:
        raise ValueError("n must be a positive integer")
    # Pad a slice copy so the caller's token list is never modified,
    # regardless of whether padSequence works in place.
    padded = padSequence(sequence_=words[:], ngram_=n)
    return [' '.join(padded[i:i + n]) for i in range(len(padded) - n + 1)]


def count_ngrams(ngrams):
    """Tally how often each n-gram occurs.

    Args:
        ngrams: Iterable of hashable n-grams.

    Returns:
        A dict mapping each distinct n-gram to its number of occurrences,
        keyed in order of first appearance.
    """
    tally = {}
    for item in ngrams:
        tally[item] = tally.get(item, 0) + 1
    return tally

def calculate_ngram_probabilities(ngram_counts):
    """Turn raw n-gram counts into relative frequencies.

    Args:
        ngram_counts: Dict mapping n-gram to its count.

    Returns:
        A dict mapping each n-gram to ``count / sum(all counts)``.
    """
    denominator = sum(ngram_counts.values())
    probabilities = {}
    for ngram, count in ngram_counts.items():
        probabilities[ngram] = count / denominator
    return probabilities

def is_near(lhs, rhs, delta=0.01):
    """Tell whether *lhs* and *rhs* differ by strictly less than *delta*."""
    gap = abs(lhs - rhs)
    return gap < delta

def add_alpha_smoothing(unigram_counts, alpha, vocabulary_size):
    """Apply add-alpha smoothing to a table of n-gram counts.

    Each probability is ``(count + alpha) / (N + alpha * V)`` where ``N`` is
    the sum of all counts and ``V`` is the number of distinct n-grams present
    in *unigram_counts*.

    Args:
        unigram_counts: Dict mapping n-gram to its count.
        alpha: Smoothing constant added to every count.
        vocabulary_size: Accepted for interface compatibility but not used;
            ``V`` is always taken from ``len(unigram_counts)``.

    Returns:
        A dict mapping each n-gram to its smoothed probability. An empty
        count table yields an empty dict.

    Raises:
        AssertionError: If the smoothed probabilities do not sum to ~1.
    """
    import math  # local import keeps this cell self-contained

    # Nothing to smooth: avoid a 0/0 denominator and a misleading
    # "sum(prob) != 1" failure on an empty table.
    if not unigram_counts:
        return {}

    n_types = len(unigram_counts)
    denominator = sum(unigram_counts.values()) + n_types * alpha
    smoothed_probabilities = {
        ngram: (count + alpha) / denominator
        for ngram, count in unigram_counts.items()
    }
    # Sanity check: the smoothed values must still form a distribution.
    assert math.isclose(sum(smoothed_probabilities.values()), 1.0,
                        abs_tol=0.01), "error: sum(prob) != 1"
    return smoothed_probabilities

def build_bigram_model(corpus):
    """Build add-one-smoothed bigram probabilities from a text corpus.

    The whole corpus is tokenized as a single sequence, turned into
    space-joined bigrams by ``generate_ngrams`` (which pads the sequence),
    counted, and then smoothed with ``add_alpha_smoothing`` using alpha=1.
    The resulting values are probabilities over the observed bigram types
    (joint probabilities, not conditionals).

    Args:
        corpus: The training text.

    Returns:
        A dict mapping each observed bigram string to its smoothed
        probability.

    Raises:
        AssertionError: If *corpus* is not a ``str``.
    """
    assert isinstance(corpus, str), "error: input is not str"
    tokens = tokenize(corpus)
    counts = count_ngrams(generate_ngrams(words=tokens, n=2))
    return add_alpha_smoothing(counts, alpha=1, vocabulary_size=0)

def build_unigram_model(corpus):
    """Build add-one-smoothed unigram probabilities from a text corpus.

    The corpus is tokenized, converted to 1-grams with ``generate_ngrams``
    (order 1 adds no padding tokens), counted, and smoothed with
    ``add_alpha_smoothing`` using alpha=1.

    Args:
        corpus: The training text.

    Returns:
        A dict mapping each observed token to its smoothed probability.

    Raises:
        AssertionError: If *corpus* is not a ``str``.
    """
    assert isinstance(corpus, str), "error: input is not str"
    tokens = tokenize(corpus)
    counts = count_ngrams(generate_ngrams(words=tokens, n=1))
    return add_alpha_smoothing(counts, alpha=1, vocabulary_size=0)

def conditional_bigram_probabilities(bigram_probabilities):
    """Convert joint bigram probabilities P(w1 w2) into P(w2 | w1).

    The marginal probability of a context word ``w1`` is the total joint
    probability of all bigrams that start with ``w1``; each conditional is
    the joint probability divided by that marginal. The conditionals for a
    given context therefore sum to 1.

    Args:
        bigram_probabilities: Dict mapping ``"w1 w2"`` (tokens separated by
            a single space) to the joint probability of that bigram.

    Returns:
        A dict with the same keys mapping to P(w2 | w1). An empty input
        yields an empty dict.
    """
    # Marginal P(w1): accumulate the probability mass (not the number of
    # distinct bigram types) of every bigram beginning with w1.
    context_mass = {}
    for bigram, prob in bigram_probabilities.items():
        context = bigram.split(" ")[0]
        context_mass[context] = context_mass.get(context, 0.0) + prob

    conditional_probabilities = {}
    for bigram, prob in bigram_probabilities.items():
        mass = context_mass[bigram.split(" ")[0]]
        # A context with zero total mass carries no information; report 0
        # rather than dividing by zero.
        conditional_probabilities[bigram] = prob / mass if mass else 0.0
    return conditional_probabilities

def calculate_sentence_probability(sentence, bigram_probabilities,
                                   unseen_probability=0.0):
    """Score a sentence with the bigram model using the chain rule.

    The sentence is tokenized, padded with one start and one end marker, and
    its probability is the product of P(w_i | w_{i-1}) over every pair of
    consecutive tokens (repeated words and word order are preserved).

    Args:
        sentence: The sentence to score.
        bigram_probabilities: Dict mapping ``"w1 w2"`` to the joint
            probability of that bigram, as returned by
            ``build_bigram_model``.
        unseen_probability: Conditional probability used for a bigram that
            is absent from *bigram_probabilities*. The default of 0.0 makes
            any sentence containing an unseen bigram score 0; pass a small
            positive value to use a floor instead.

    Returns:
        The sentence probability as a float.
    """
    # Order-2 padding adds exactly one "<s>" and one "</s>".
    words = padSequence(tokenize(sentence), 2)
    # An empty model has no conditionals; every bigram is then unseen.
    conditional_probabilities = (
        conditional_bigram_probabilities(bigram_probabilities)
        if bigram_probabilities else {}
    )

    probability = 1.0
    for lhs, rhs in zip(words, words[1:]):
        bigram = lhs + " " + rhs
        # .get avoids a KeyError for bigrams never observed in the corpus.
        probability *= conditional_probabilities.get(bigram,
                                                     unseen_probability)
    return probability

# Demo: a small single-string training corpus.
corpus = "this is an example sentence for the corpus it is just an example"
# Build the bigram probability table from the corpus and show it.
bigram_probabilities = build_bigram_model(corpus)
print(bigram_probabilities)

# Score a sentence with the model. Note that "that" does not occur in the
# corpus, so this sentence contains bigrams that were never observed.
sentence = "that is an example"
# Alternative test sentence made only of words that occur in the corpus:
# sentence = "this is an example sentence for the corpus it is just an"
probability = calculate_sentence_probability(sentence, bigram_probabilities)
print(f"Sentence Probability: {probability}")