In [1]:
import nltk
from nltk.util import bigrams
from collections import Counter

In [2]:
#  a string from the BeRP dataset
text = './transcript.txt' # Path to the BeRP dataset

In [20]:
# Tokenize the text
tokens = nltk.word_tokenize(text.lower())  # Lowercase to maintain consistency

In [21]:
# Generate bigrams
bi_grams = list(bigrams(tokens))

In [22]:
# Calculate frequency distributions for unigrams and bigrams
unigram_freq = Counter(tokens)
bigram_freq = Counter(bi_grams)

In [23]:
# Set discount factor
D = 0.75

In [24]:
# Compute continuation probability (number of unique preceding words for w2)
continuation_counts = Counter()
for (w1, w2) in bigram_freq.keys():
    continuation_counts[w2] += 1



In [25]:
# Total number of bigram types
total_bigrams = len(bigram_freq)


In [26]:
# Compute probabilities using Kneser-Ney Smoothing
bigram_probabilities_kn = {}
for (w1, w2), count in bigram_freq.items():
    # Discounted bigram probability
    prob_kn = max(count - D, 0) / unigram_freq[w1]

    # Continuation probability
    P_cont = continuation_counts[w2] / total_bigrams

    # Normalization factor Î»(w1)
    lambda_w1 = (D / unigram_freq[w1]) * len([w for (w, _) in bigram_freq if w == w1])

    # Final probability
    bigram_probabilities_kn[(w1, w2)] = prob_kn + lambda_w1 * P_cont

In [27]:
# Function to calculate sentence probability using Kneser-Ney
def calculate_sentence_probability(sentence):
    tokens = nltk.word_tokenize(sentence.lower())
    prob = 1.0
    for i in range(1, len(tokens)):
        w1 = tokens[i - 1]
        w2 = tokens[i]
        prob *= bigram_probabilities_kn.get((w1, w2), 1e-10)  # Small value if unseen
    return prob


In [28]:
### without Kneser-Ney Smoothing
# # Calculate bigram probabilities
# bigram_probabilities = {}
# for bigram, count in bigram_freq.items():
#     w1 = bigram[0]
#     w2 = bigram[1]
#     bigram_probabilities[(w1, w2)] = count / unigram_freq[w1]

In [29]:
### without Kneser-Ney Smoothing
# # Function to calculate sentence probability
# def calculate_sentence_probability(sentence):
#     tokens = nltk.word_tokenize(sentence.lower())
#     prob = 0
#     for i in range(1, len(tokens)):
#         w1 = tokens[i - 1]
#         w2 = tokens[i]
#         prob *= bigram_probabilities.get((w1, w2), 0)  # If bigram doesn't exist, set probability to 0
#     return prob

In [30]:
# Example sentences
sentence_1 = "show me all the Arabic food restaurants"
sentence_2 = "I am learning mathematics"

In [31]:
# Calculate sentence probabilities
prob_1 = calculate_sentence_probability(sentence_1)
prob_2 = calculate_sentence_probability(sentence_2)

print(f"Probability of Sentence 1: {prob_1}")
print(f"Probability of Sentence 2: {prob_2}")

Probability of Sentence 1: 1.0000000000000001e-60
Probability of Sentence 2: 1e-30
