In [1]:
import nltk
from nltk.util import ngrams
from nltk.tokenize import word_tokenize
from collections import defaultdict, Counter
import math

In [None]:
# Download the tokenizer data that word_tokenize needs.
# 'punkt' is the resource older NLTK releases load; newer releases look up
# 'punkt_tab' instead, so fetch both to keep the notebook runnable on either.
# nltk.download is a no-op for resources that are already up to date.
for resource in ('punkt', 'punkt_tab'):
    nltk.download(resource)

In [2]:
class LanguageModel:
    """N-gram language model with add-one smoothing over lowercase alphabetic tokens.

    Attributes:
        n: Order of the model (1 = unigram, 2 = bigram, ...).
        ngram_counts: Maps each order ``k`` in ``1..n`` to a ``Counter`` of
            k-gram tuples observed during training.
        vocab: Set of distinct training tokens; its size is the smoothing
            term added to every denominator.
    """

    def __init__(self, n=2):
        """Create an untrained model of order ``n``.

        Raises:
            ValueError: If ``n`` is smaller than 1 (no n-gram can be formed).
        """
        if n < 1:
            raise ValueError(f"n must be a positive integer, got {n!r}")
        self.n = n
        self.ngram_counts = defaultdict(Counter)
        self.vocab = set()

    @staticmethod
    def _tokenize(text):
        """Lowercase ``text``, tokenize it with NLTK and keep alphabetic tokens only."""
        return [token for token in word_tokenize(text.lower()) if token.isalpha()]

    def train(self, corpus):
        """Count all 1..n-grams in ``corpus``, replacing any previous counts.

        Args:
            corpus: Iterable of text strings. A single string is treated as a
                one-document corpus rather than being iterated per character.

        N-grams are collected per corpus entry, so no n-gram spans the boundary
        between two unrelated entries.
        """
        if isinstance(corpus, str):
            corpus = [corpus]

        # Tokenize each entry separately so entry boundaries are preserved.
        sentences = [self._tokenize(text) for text in corpus]
        self.vocab = {token for sentence in sentences for token in sentence}

        # Rebuild the tables from scratch so retraining never mixes corpora.
        self.ngram_counts = defaultdict(Counter)
        for order in range(1, self.n + 1):
            counts = Counter()
            for sentence in sentences:
                counts.update(ngrams(sentence, order))
            self.ngram_counts[order] = counts

    def probability(self, sentence):
        """Return the add-one smoothed probability of ``sentence``.

        The result is the product, over every n-gram window of the filtered
        tokens, of ``(count(ngram) + 1) / (count(context) + |vocab|)``. For a
        unigram model the context count is the total number of training tokens.

        Returns:
            The probability as a float, or ``0.0`` when the sentence has fewer
            than ``n`` alphabetic tokens or the model has an empty vocabulary
            (untrained or trained on an empty corpus).
        """
        tokens = self._tokenize(sentence)

        # An empty vocabulary would make every denominator zero.
        if len(tokens) < self.n or not self.vocab:
            return 0.0

        order = self.n
        vocab_size = len(self.vocab)
        ngram_table = self.ngram_counts[order]

        # Loop-invariant lookups, resolved once up front.
        if order == 1:
            total_unigrams = sum(ngram_table.values())
            context_table = None
        else:
            total_unigrams = 0
            context_table = self.ngram_counts[order - 1]

        prob = 1.0
        for start in range(len(tokens) - order + 1):
            ngram = tuple(tokens[start:start + order])
            if order == 1:
                context_count = total_unigrams
            else:
                # The context is the n-gram without its final token.
                context_count = context_table[ngram[:-1]]
            prob *= (ngram_table[ngram] + 1) / (context_count + vocab_size)

        return prob

    def perplexity(self, sentence):
        """Return ``(1 / P(sentence)) ** (1 / N)`` for the given sentence.

        ``N`` is the number of alphabetic tokens that ``probability`` scored,
        so punctuation and numbers do not affect the normalization. Returns
        ``inf`` when the sentence probability is zero.
        """
        prob = self.probability(sentence)
        if prob == 0:
            return float('inf')
        # prob > 0 implies at least n >= 1 tokens, so the division is safe.
        token_count = len(self._tokenize(sentence))
        return math.pow(1 / prob, 1 / token_count)

def calculate_probability():
    """Train unigram, bigram and trigram models on a toy corpus and print their scores.

    For each of four fixed test sentences this prints the probability under
    every model, followed by the bigram model's perplexity. Nothing is returned.
    """
    # Toy training data
    training_texts = [
        "the cat sat on the mat",
        "the dog sat on the log",
        "cats and dogs are great pets",
        "the cat chased the dog",
        "dogs are loyal animals",
        "the cat sleeps on the sofa"
    ]

    # Build one model per order, then fit each on the same corpus
    uni, bi, tri = [LanguageModel(n=order) for order in (1, 2, 3)]
    for model in (uni, bi, tri):
        model.train(training_texts)

    # Sentences to score
    queries = (
        "the cat sat",
        "the dog chased",
        "cats are animals",
        "unknown words here",
    )

    print("Probability Calculation Results:")
    print("=" * 60)

    for query in queries:
        print(f"\nSentence: '{query}'")
        print(f"Unigram Probability: {uni.probability(query):.6f}")
        print(f"Bigram Probability: {bi.probability(query):.6f}")
        print(f"Trigram Probability: {tri.probability(query):.6f}")
        print(f"Bigram Perplexity: {bi.perplexity(query):.2f}")

In [3]:
# Entry point: run the demo when this code executes as the main program,
# but not when it is imported as a module.
if __name__ == "__main__":
    calculate_probability()

Probability Calculation Results:

Sentence: 'the cat sat'
Unigram Probability: 0.000814
Bigram Probability: 0.014652
Trigram Probability: 0.095238
Bigram Perplexity: 4.09

Sentence: 'the dog chased'
Unigram Probability: 0.000407
Bigram Probability: 0.005769
Trigram Probability: 0.050000
Bigram Perplexity: 5.58

Sentence: 'cats are animals'
Unigram Probability: 0.000090
Bigram Probability: 0.002632
Trigram Probability: 0.055556
Bigram Perplexity: 7.24

Sentence: 'unknown words here'
Unigram Probability: 0.000008
Bigram Probability: 0.003086
Trigram Probability: 0.055556
Bigram Perplexity: 6.87
