In [1]:
import nltk
import re
from nltk.corpus import brown
from collections import Counter, defaultdict
# Fetch the NLTK resources requested by this cell before the corpus is read.
# NOTE(review): nothing in the visible code uses the punkt tokenizer --
# confirm whether a later cell needs it before dropping that download.
for resource in ('brown', 'punkt'):
    nltk.download(resource)

# Tokenised sentences from the "news" category of the Brown corpus.
sentences = brown.sents(categories='news')

# Lower-case every token, strip every character outside a-z, and discard
# tokens that end up empty. A sentence that loses all of its tokens is still
# kept (as an empty list) so the sentence count is unaffected.
cleaned_sentences = []
for raw_sentence in sentences:
    normalised = (re.sub(r'[^a-z]', '', token.lower()) for token in raw_sentence)
    cleaned_sentences.append([token for token in normalised if token])

# Flatten the per-sentence lists into one running token stream.
tokens = []
for cleaned in cleaned_sentences:
    tokens.extend(cleaned)

print("Total number of sentences:", len(cleaned_sentences))
print("Total number of words:", len(tokens))
print("Vocabulary size:", len(set(tokens)))

def unigrams(tokens):
    """Return the unigrams of *tokens*: a new list holding each item in order."""
    return [*tokens]

def bigrams(tokens):
    """Return every pair of adjacent items in *tokens* as a list of 2-tuples.

    *tokens* must support slicing. Fewer than two items yields an empty list.
    """
    followers = tokens[1:]
    # zip stops at the shorter input, so the last token never starts a pair.
    return list(zip(tokens, followers))

def trigrams(tokens):
    """Return every run of three adjacent items in *tokens* as a list of 3-tuples.

    *tokens* must support slicing. Fewer than three items yields an empty list.
    """
    second = tokens[1:]
    third = tokens[2:]
    # zip stops at the shortest input (the items from index 2 onward), so
    # every emitted triple is complete.
    return list(zip(tokens, second, third))

# Frequency tables over the flattened token stream.
# NOTE(review): because `tokens` is one flat list, these n-grams also span
# sentence boundaries -- confirm that is intended.
bigram_counts = Counter(bigrams(tokens))
trigram_counts = Counter(trigrams(tokens))

# Print the 20 most frequent entries of each table under its heading.
for heading, table in (
    ("\nTop 20 Bigrams:", bigram_counts),
    ("\nTop 20 Trigrams:", trigram_counts),
):
    print(heading)
    for ngram, count in table.most_common(20):
        print(ngram, ":", count)

def bigram_prob(w1, w2, tokens):
    """Return the relative-frequency estimate of P(w2 | w1).

    The pair count is read from the module-level ``bigram_counts`` table,
    while the count of ``w1`` is taken by scanning *tokens*.

    Returns 0 when ``w1`` does not occur in *tokens*.
    """
    context_total = tokens.count(w1)
    if not context_total:
        return 0
    return bigram_counts[(w1, w2)] / context_total

def trigram_prob(w1, w2, w3, tokens):
    """Return the relative-frequency estimate of P(w3 | w1, w2).

    Both counts are read from the module-level ``trigram_counts`` and
    ``bigram_counts`` tables; *tokens* is accepted but not used here.

    Returns 0 when the context pair ``(w1, w2)`` has a zero count.
    """
    context_total = bigram_counts[(w1, w2)]
    if not context_total:
        return 0
    return trigram_counts[(w1, w2, w3)] / context_total

# Spot-check both estimators on one bigram and one trigram.
p_the_given_in = bigram_prob("in", "the", tokens)
print("\nP('the' | 'in') =", p_the_given_in)
p_company_given_of_the = trigram_prob("of", "the", "company", tokens)
print("P('company' | 'of', 'the') =", p_company_given_of_the)

def sentence_prob_bigram(sentence, tokens):
    """Return the probability of *sentence* as a product of bigram estimates.

    The sentence is split on whitespace and normalised the same way the
    corpus tokens are: lower-cased, every character outside a-z removed, and
    empty results dropped. Without this, a word carrying punctuation
    (e.g. ``"company."``) could never match a corpus token and would always
    fall through to the floor value.

    Args:
        sentence: Raw sentence text.
        tokens: Token list forwarded to ``bigram_prob``.

    Returns:
        The product of ``bigram_prob`` over each adjacent word pair, with
        1e-6 substituted for any pair whose estimate is zero. A sentence with
        fewer than two usable words yields 1 (the empty product).
    """
    # Same cleaning rule as the corpus preprocessing.
    words = [re.sub(r'[^a-z]', '', w) for w in sentence.lower().split()]
    words = [w for w in words if w]

    prob = 1
    for prev_word, next_word in zip(words, words[1:]):
        p = bigram_prob(prev_word, next_word, tokens)
        # A single unseen pair must not zero the whole product.
        prob *= p if p > 0 else 1e-6
    return prob

def sentence_prob_trigram(sentence, tokens):
    """Return the probability of *sentence* as a product of trigram estimates.

    The sentence is split on whitespace and normalised the same way the
    corpus tokens are: lower-cased, every character outside a-z removed, and
    empty results dropped. Without this, a word carrying punctuation
    (e.g. ``"company."``) could never match a corpus token and would always
    fall through to the floor value.

    Args:
        sentence: Raw sentence text.
        tokens: Token list forwarded to ``trigram_prob``.

    Returns:
        The product of ``trigram_prob`` over each run of three adjacent
        words, with 1e-6 substituted for any triple whose estimate is zero.
        A sentence with fewer than three usable words yields 1 (the empty
        product).
    """
    # Same cleaning rule as the corpus preprocessing.
    words = [re.sub(r'[^a-z]', '', w) for w in sentence.lower().split()]
    words = [w for w in words if w]

    prob = 1
    for first, second, third in zip(words, words[1:], words[2:]):
        p = trigram_prob(first, second, third, tokens)
        # A single unseen triple must not zero the whole product.
        prob *= p if p > 0 else 1e-6
    return prob

# Score one sample sentence with both sentence-level models.
test_sentence = "the president of the company"
print("\nSentence:", test_sentence)
for label, scorer in (
    ("Bigram Probability:", sentence_prob_bigram),
    ("Trigram Probability:", sentence_prob_trigram),
):
    print(label, scorer(test_sentence, tokens))

[nltk_data] Downloading package brown to /root/nltk_data...
[nltk_data]   Unzipping corpora/brown.zip.
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


Total number of sentences: 4623
Total number of words: 87004
Vocabulary size: 12132

Top 20 Bigrams:
('of', 'the') : 850
('in', 'the') : 610
('to', 'the') : 279
('on', 'the') : 254
('for', 'the') : 223
('at', 'the') : 199
('will', 'be') : 157
('that', 'the') : 149
('with', 'the') : 142
('and', 'the') : 141
('in', 'a') : 120
('of', 'a') : 119
('by', 'the') : 115
('to', 'be') : 108
('from', 'the') : 104
('for', 'a') : 101
('as', 'a') : 91
('has', 'been') : 87
('the', 'first') : 85
('he', 'said') : 80

Top 20 Trigrams:
('one', 'of', 'the') : 44
('mr', 'and', 'mrs') : 42
('the', 'united', 'states') : 37
('members', 'of', 'the') : 28
('president', 'of', 'the') : 22
('a', 'number', 'of') : 19
('the', 'white', 'house') : 19
('as', 'a', 'result') : 18
('some', 'of', 'the') : 18
('the', 'u', 's') : 17
('chairman', 'of', 'the') : 16
('per', 'cent', 'of') : 15
('a', 'member', 'of') : 15
('in', 'the', 'past') : 14
('of', 'the', 'new') : 14
('the', 'new', 'york') : 14
('will', 'be', 'the') : 13
('t