**Tasks**

In Task 2, we successfully implemented N-gram models, which laid the groundwork for understanding how words and their sequences can be analyzed to predict and interpret language patterns. Building upon that foundational knowledge, this assignment focuses specifically on the application of a bigram (2-gram) model to calculate the probability of a given sentence.

1.   Implement Bigram Model Construction: Calculate and store the probability of each bigram based on the provided corpus.
2.   Calculate the Probability of a Given Sentence: Use the constructed bigram model to calculate the probability of a specific sentence.



In [None]:
def tokenize(text):
    """Lower-case ``text`` and split it on whitespace into a list of tokens."""
    lowered = text.lower()
    return lowered.split()

def padSequence(sequence_, ngram_, is_pad_left_ = True, is_pad_right_ = True,
                pad_left_token_ = "<s>", pad_right_token_ = "</s>"):
    """Return a copy of ``sequence_`` padded for n-gram extraction.

    ``ngram_ - 1`` padding tokens are added on each enabled side, so an
    order of 1 (or less) adds nothing.

    The input list is left untouched. Padding in place would make a second
    call on the same list add a second layer of padding tokens.

    Args:
        sequence_: list of tokens to pad.
        ngram_: n-gram order.
        is_pad_left_: whether to prepend ``pad_left_token_``.
        is_pad_right_: whether to append ``pad_right_token_``.
        pad_left_token_: token used for left padding.
        pad_right_token_: token used for right padding.

    Returns:
        A new list: left padding + original tokens + right padding.

    Raises:
        AssertionError: if ``sequence_`` is not a list.
    """
    assert isinstance(sequence_, list), "sequence_ is not list"
    pad_width = max(ngram_ - 1, 0)
    left_pad = [pad_left_token_] * pad_width if is_pad_left_ else []
    right_pad = [pad_right_token_] * pad_width if is_pad_right_ else []
    return left_pad + sequence_ + right_pad

def generate_ngrams(words, n):
    """Return every n-gram of the padded token sequence as a list of tuples.

    The tokens are padded through ``padSequence`` with its default settings
    before the sliding window of width ``n`` is applied.

    Args:
        words: sequence of tokens; it is copied, never modified.
        n: n-gram order.

    Returns:
        A list of ``n``-tuples in order of occurrence (duplicates kept).
    """
    # Pad a private copy so the caller's list is never altered, regardless of
    # whether padSequence works in place or returns a new list.
    padded = padSequence(sequence_=list(words), ngram_=n)
    return [tuple(padded[i:i + n]) for i in range(len(padded) - n + 1)]

def count_ngrams(ngrams):
    """Tally how often each item occurs while iterating over ``ngrams``.

    Returns a dict mapping each distinct item to its number of occurrences,
    in order of first appearance.
    """
    tally = {}
    for item in ngrams:
        tally[item] = tally.get(item, 0) + 1
    return tally

def calculate_ngram_probabilities(ngram_counts):
    """Turn a count table into relative frequencies.

    Each count is divided by the sum of all counts in ``ngram_counts``.
    An empty table yields an empty dict.
    """
    grand_total = sum(ngram_counts.values())
    relative_frequencies = {}
    for key, occurrences in ngram_counts.items():
        relative_frequencies[key] = occurrences / grand_total
    return relative_frequencies

def query_ngram_probability(ngram, ngram_probabilities):
    """Look up ``ngram`` in ``ngram_probabilities``; absent entries give 0."""
    fallback = 0
    return ngram_probabilities.get(ngram, fallback)

def build_bigram_model(corpus):
    """Build a bigram model from ``corpus``.

    The corpus is tokenized, its bigrams are generated (``generate_ngrams``
    adds the padding tokens), every bigram is counted, and the counts are
    normalised by the total number of bigrams.

    Args:
        corpus: training text as a single string.

    Returns:
        A dict mapping each ``(w1, w2)`` tuple to its relative frequency
        among all bigrams of the padded corpus.
    """
    words = tokenize(corpus)
    bigram_tuples = generate_ngrams(words=words, n=2)
    # Count the tuples themselves. Passing the list through dict() would turn
    # it into a first-word -> second-word mapping, dropping repeated bigrams
    # and leaving only single words to be counted.
    bigram_counts = count_ngrams(bigram_tuples)
    return calculate_ngram_probabilities(bigram_counts)

def build_unigram_model(corpus):
    """Build a unigram model from ``corpus``.

    The corpus is tokenized, its unigrams are generated, every unigram is
    counted, and the counts are normalised by the total number of unigrams.

    Args:
        corpus: training text as a single string.

    Returns:
        A dict mapping each 1-tuple ``(word,)`` to its relative frequency.
        Keys are tuples because that is what ``generate_ngrams`` produces.
    """
    words = tokenize(corpus)
    unigram_tuples = generate_ngrams(words=words, n=1)
    # Count the 1-tuples directly: dict() cannot be built from 1-element
    # sequences, and the probabilities must come from the counts.
    unigram_counts = count_ngrams(unigram_tuples)
    return calculate_ngram_probabilities(unigram_counts)

def bigram_conditional_probilities(unigram_probabilities, bigram_probabilities):
    """Convert joint bigram probabilities into conditionals ``P(w2 | w1)``.

    Each joint probability is divided by the marginal probability of its
    first word, obtained by summing the joint probabilities of all bigrams
    that start with that word. Taking the marginal from the bigram table
    keeps numerator and denominator on the same normalisation and covers
    context words (such as a start-of-sentence pad) that a separately built
    unigram table may not contain.

    Args:
        unigram_probabilities: kept for interface compatibility; not read.
        bigram_probabilities: dict mapping ``(w1, w2)`` tuples to their
            joint probability.

    Returns:
        A dict with the same keys as ``bigram_probabilities`` whose values
        are ``P(w2 | w1)``. A bigram whose context has zero total mass maps
        to 0.
    """
    # Marginal probability of each context (first) word.
    context_totals = {}
    for bigram, joint in bigram_probabilities.items():
        context = bigram[0]
        context_totals[context] = context_totals.get(context, 0) + joint

    conditional_probabilities = {}
    for bigram, joint in bigram_probabilities.items():
        total = context_totals[bigram[0]]
        # Guard against an all-zero context instead of dividing by zero.
        conditional_probabilities[bigram] = joint / total if total else 0
    return conditional_probabilities

# Smoothing constant (presumably for add-alpha smoothing — TODO confirm);
# no active code visible here reads it.
alpha = 1
def calculate_sentence_probability(sentence, bigram_probabilities):
    """Score ``sentence`` with an already-built bigram model.

    The sentence is tokenized and split into padded bigrams. The result is
    the product of the conditional probabilities ``P(w2 | w1)`` of those
    bigrams, where each conditional is the bigram's joint probability
    divided by the total joint probability of all model bigrams sharing the
    same first word.

    Args:
        sentence: text to score.
        bigram_probabilities: dict mapping ``(w1, w2)`` tuples to joint
            probabilities, e.g. the return value of ``build_bigram_model``.

    Returns:
        The sentence probability; ``0.0`` as soon as a bigram is absent
        from the model (no smoothing is applied).
    """
    words = tokenize(sentence)
    # generate_ngrams pads the tokens itself; padding here as well would
    # produce doubled boundary tokens.
    sentence_bigrams = generate_ngrams(words=words, n=2)

    # Marginal probability of every context (first) word in the model.
    context_totals = {}
    for bigram, joint in bigram_probabilities.items():
        context = bigram[0]
        context_totals[context] = context_totals.get(context, 0) + joint

    probability = 1
    for bigram in sentence_bigrams:
        joint = bigram_probabilities.get(bigram, 0)
        if not joint:
            # Unseen bigram: the whole product collapses to zero.
            return 0.0
        # joint > 0 guarantees the context is present with a positive total.
        probability *= joint / context_totals[bigram[0]]
    return probability

# Example corpus used to train the model (a single whitespace-separated string).
corpus = "this is an example sentence for the corpus it is just an example"
# Build the bigram model from the corpus.
bigram_probabilities = build_bigram_model(corpus)

# Score a sentence with the model built above and print the result.
sentence = "this is an example"
probability = calculate_sentence_probability(sentence, bigram_probabilities)
print(f"Sentence Probability: {probability}")
