In [None]:
Implementation of decision rule-based Naïve Bayes disambiguation method to find the sense of an
ambiguous word with the given training set.

In [2]:
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords 
from collections import defaultdict, Counter
import numpy as np

# Download required resources
nltk.download('punkt')
nltk.download('stopwords')

# Training data: "bank" used in two senses
training_data = [
    ("I need to savings money in the bank", "financial_institution"),
    ("I went to the bank to deposit money", "financial_institution"),
    ("They sat on the bank and fished", "shore"),
    ("We walked along the bank of the river", "shore"),
    ("The river bank was full of fish", "shore"),
    ("He is a well-known bank manager", "financial_institution"),
    ("We camped on the bank of the river", "shore"),
]

# Preprocessing function: remove stopwords, punctuation, lowercase
def preprocess(sent):
    sent = sent.replace('.', '')
    tokens = word_tokenize(sent)
    sw = set(stopwords.words('english'))
    data = [t.lower() for t in tokens if t.isalpha() and t.lower() not in sw]
    return data

# Function to get prior and conditional probabilities
def get_counts(training_data):
    class_counts = Counter()
    word_counts = {}
    total_words = defaultdict(int)

    for data, sense in training_data:
        if sense not in word_counts:
            word_counts[sense] = Counter()

    for data, sense in training_data:
        class_counts[sense] += 1
        words = preprocess(data)
        word_counts[sense].update(words)
        total_words[sense] += len(words)

    return class_counts, word_counts, total_words

# Calculate probabilities and print them
def print_probabilities(class_counts, word_counts, total_words):
    total_sentences = sum(class_counts.values())
    print(" Prior Probabilities:")
    for sense in class_counts:
        prior = class_counts[sense] / total_sentences
        print(f"  P({sense}) = {prior:.4f}")

    print("\n Conditional Probabilities with Laplace Smoothing:")
    vocabulary = set()
    for wc in word_counts.values():
        vocabulary.update(wc.keys())
    V = len(vocabulary)

    for sense in word_counts:
        print(f"\n🔸 For class '{sense}':")
        for word in vocabulary:
            prob = (word_counts[sense][word] + 1) / (total_words[sense] + V)
            print(f"  P({word}|{sense}) = {prob:.4f}")

# Test sentence classification
def testing(test, class_counts, word_counts, total_words):
    test_words = preprocess(test)
    best_sense = None
    max_prob = -np.inf
    vocabulary = set()
    for wc in word_counts.values():
        vocabulary.update(wc.keys())
    V = len(vocabulary)

    for sense in class_counts:
        sense_prob = np.log2(class_counts[sense] / sum(class_counts.values()))
        words_prob = 0
        for word in test_words:
            word_prob = (word_counts[sense][word] + 1) / (total_words[sense] + V)
            words_prob += np.log2(word_prob)
        total_prob = sense_prob + words_prob
        if total_prob > max_prob:
            max_prob = total_prob
            best_sense = sense

    return best_sense, max_prob

# Run the model
class_counts, word_counts, total_words = get_counts(training_data)

# Print all probabilities
print_probabilities(class_counts, word_counts, total_words)

# Classify test sentence
test = "She opened a savings account at the bank"
print("\nPrediction:")
result, prob = testing(test, class_counts, word_counts, total_words)
print(f"  → Predicted Sense: {result}\n  → Log Probability: {prob:.4f}")

 Prior Probabilities:
  P(financial_institution) = 0.4286
  P(shore) = 0.5714

 Conditional Probabilities with Laplace Smoothing:

🔸 For class 'financial_institution':
  P(need|financial_institution) = 0.0800
  P(went|financial_institution) = 0.0800
  P(walked|financial_institution) = 0.0400
  P(bank|financial_institution) = 0.1600
  P(savings|financial_institution) = 0.0800
  P(deposit|financial_institution) = 0.0800
  P(manager|financial_institution) = 0.0800
  P(money|financial_institution) = 0.1200
  P(along|financial_institution) = 0.0400
  P(sat|financial_institution) = 0.0400
  P(full|financial_institution) = 0.0400
  P(fish|financial_institution) = 0.0400
  P(camped|financial_institution) = 0.0400
  P(river|financial_institution) = 0.0400
  P(fished|financial_institution) = 0.0400

🔸 For class 'shore':
  P(need|shore) = 0.0345
  P(went|shore) = 0.0345
  P(walked|shore) = 0.0690
  P(bank|shore) = 0.1724
  P(savings|shore) = 0.0345
  P(deposit|shore) = 0.0345
  P(manager|shore) =

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\adisa\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\adisa\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
