<a href="https://colab.research.google.com/github/AgentCodename47/NLP-stuff/blob/main/bigram.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import nltk
from nltk.util import ngrams
from collections import Counter
from fractions import Fraction

# Bigram language model with add-k (Laplace) smoothing.
#
# The counting/smoothing logic lives in small pure helpers that work on
# already-tokenized sentences; the driver below tokenizes with NLTK, prints the
# unsmoothed (MLE) table, then scores the target sentence with smoothed
# probabilities.

# Sentence-boundary symbols used to pad every token sequence.
START_TOKEN = '<s>'
END_TOKEN = '</s>'


def strip_boundary_markers(text):
    """Return *text* with literal '<s>' and '</s>' markers removed.

    The markers are added again as padding when bigrams are built, so leaving
    them in the raw text would both duplicate them and let the tokenizer split
    them into '<', 's', '>' tokens that no training bigram contains.
    """
    return text.replace(START_TOKEN, ' ').replace(END_TOKEN, ' ').strip()


def padded_bigrams(tokens):
    """Return the list of bigrams of *tokens* padded with '<s>' and '</s>'.

    An empty token list yields the single bigram ('<s>', '</s>').
    """
    padded = [START_TOKEN, *tokens, END_TOKEN]
    return list(zip(padded, padded[1:]))


def count_bigrams(token_lists):
    """Count padded bigrams and their left contexts over tokenized sentences.

    Returns a pair ``(bigram_counts, context_counts)`` of Counters:
    ``bigram_counts[(prev, next)]`` is how often the bigram occurred and
    ``context_counts[prev]`` is how many bigrams start with ``prev``.
    """
    bigram_counts = Counter()
    context_counts = Counter()
    for sentence_tokens in token_lists:
        for pair in padded_bigrams(sentence_tokens):
            bigram_counts[pair] += 1
            context_counts[pair[0]] += 1
    return bigram_counts, context_counts


def add_k_probability(bigram_counts, context_counts, prev_word, next_word,
                      vocab_size, k=1):
    """Return the add-k smoothed P(next_word | prev_word) as a Fraction.

    Computes ``(count(prev, next) + k) / (count(prev) + k * vocab_size)``.
    Unseen bigrams and unseen contexts have a count of 0, so they receive a
    small non-zero probability instead of raising. Raises ZeroDivisionError
    only when ``k * vocab_size`` is 0 and the context was never observed.
    """
    numerator = bigram_counts[(prev_word, next_word)] + k
    denominator = context_counts[prev_word] + k * vocab_size
    return Fraction(numerator, denominator)


# The driver runs when this code is executed directly (notebook cell or
# script); the guard only keeps it from running if the helpers are imported.
if __name__ == "__main__":
    # Define the training set
    training_set = ["The Arabian knights", "These are the fairy tales of the east", "The stories of the Arabian knights are translated in many languages"]

    # Define the target sentence
    target_sentence = "<s> The Arabian knights are the fairy tales of the east </s>"

    # Tokenize each training sentence once and reuse the result below.
    tokenized_training = [nltk.word_tokenize(sentence.lower()) for sentence in training_set]

    # Define the vocabulary (word types only, without the boundary symbols)
    vocab = set()
    for tokens in tokenized_training:
        vocab.update(tokens)
    vocab.discard(START_TOKEN)
    vocab.discard(END_TOKEN)

    # Calculate the bigram counts and the per-context totals
    bigram_counts, context_counts = count_bigrams(tokenized_training)

    # Calculate the unsmoothed (maximum-likelihood) bigram probabilities
    bigram_probs = {}
    for (prev_word, next_word), count in bigram_counts.items():
        bigram_probs.setdefault(prev_word, {})[next_word] = Fraction(count, context_counts[prev_word])

    # Print the bigram probabilities
    for prev_word in bigram_probs:
        print("Bigram probabilities for previous word '{}':".format(prev_word))
        for next_word, prob in bigram_probs[prev_word].items():
            print("  P('{}' | '{}') = {}/{}".format(next_word, prev_word, prob.numerator, prob.denominator))

    # Apply add-1 smoothing to EVERY bigram, seen or unseen, so each
    # conditional distribution sums to 1. '</s>' can follow a word, so it
    # counts as one of the possible outcomes; '<s>' never follows anything.
    add_k = 1
    next_words = vocab | {END_TOKEN}
    vocab_size = len(next_words)
    for prev_word in bigram_probs:
        bigram_probs[prev_word] = {
            next_word: add_k_probability(bigram_counts, context_counts, prev_word, next_word, vocab_size, add_k)
            for next_word in sorted(next_words)
        }

    # Calculate the probability of the target sentence. The literal markers in
    # the text are stripped first because padding re-adds them.
    tokens = nltk.word_tokenize(strip_boundary_markers(target_sentence.lower()))
    bigrams = padded_bigrams(tokens)
    prob = Fraction(1, 1)
    for prev_word, next_word in bigrams:
        # Computed from the counts directly so that unseen contexts and
        # out-of-vocabulary words fall back to the smoothed floor value.
        step = add_k_probability(bigram_counts, context_counts, prev_word, next_word, vocab_size, add_k)
        prob *= step
        print("P('{}' | '{}') = {}/{}".format(next_word, prev_word, step.numerator, step.denominator))
    print("Probability of the sentence '{}': {}/{}".format(target_sentence, prob.numerator, prob.denominator))


Bigram probabilities for previous word '<s>':
  P('the' | '<s>') = 2/3
  P('these' | '<s>') = 1/3
Bigram probabilities for previous word 'the':
  P('arabian' | 'the') = 2/5
  P('fairy' | 'the') = 1/5
  P('east' | 'the') = 1/5
  P('stories' | 'the') = 1/5
Bigram probabilities for previous word 'arabian':
  P('knights' | 'arabian') = 1/1
Bigram probabilities for previous word 'knights':
  P('</s>' | 'knights') = 1/2
  P('are' | 'knights') = 1/2
Bigram probabilities for previous word 'these':
  P('are' | 'these') = 1/1
Bigram probabilities for previous word 'are':
  P('the' | 'are') = 1/2
  P('translated' | 'are') = 1/2
Bigram probabilities for previous word 'fairy':
  P('tales' | 'fairy') = 1/1
Bigram probabilities for previous word 'tales':
  P('of' | 'tales') = 1/1
Bigram probabilities for previous word 'of':
  P('the' | 'of') = 1/1
Bigram probabilities for previous word 'east':
  P('</s>' | 'east') = 1/1
Bigram probabilities for previous word 'stories':
  P('of' | 'stories') = 1/1
Big

KeyError: ignored