In [1]:
from nltk.corpus import brown
from nltk.tokenize import word_tokenize
import nltk
nltk.download('punkt')

In [22]:
# The Brown corpus ships as its own NLTK data package; fetch it here so that a
# fresh environment does not fail with LookupError on the first corpus access.
nltk.download('brown', quiet=True)
corpus = brown.words()
# Lowercase every token so that e.g. "The" and "the" are counted as one word.
lower_case_corpus = [w.lower() for w in corpus]
# The vocabulary is the set of distinct lowercased tokens.
vocab = set(lower_case_corpus)

In [3]:
# Report the corpus size (token count) and the number of distinct tokens.
print(f'Total words in Corpus: {len(lower_case_corpus)}')
print(f'Vocab of the Corpus: {len(vocab)}')

Total words in Corpus: 1161192
Vocab of the Corpus: 49815


In [4]:
bigram_counts = {}
trigram_counts = {}

# Slide a three-token window over the corpus. Each window contributes one
# trigram and its leading bigram, so a bigram's count equals the number of
# times it is followed by some word (the final bigram of the corpus, which has
# no successor, is therefore not counted).
for i in range(len(lower_case_corpus) - 2):
    trigram = tuple(lower_case_corpus[i:i + 3])
    bigram = trigram[:2]

    # dict.get supplies 0 for n-grams seen for the first time
    bigram_counts[bigram] = bigram_counts.get(bigram, 0) + 1
    trigram_counts[trigram] = trigram_counts.get(trigram, 0) + 1

print(f"Example, count for bigram ('the', 'king') is: {bigram_counts[('the', 'king')]}")

Example, count for bigram ('the', 'king') is: 51


In [9]:
#function takes a sentence as input and suggests possible words that could follow up
def suggest_next_word(input_, bigram_counts, trigram_counts, vocab):
    """Suggest up to three likely next words for a sentence with a trigram model.

    The input is lowercased and tokenized, and only its last two tokens are
    used as context. Every word ``w`` in ``vocab`` is scored with the estimate
    ``count(w1, w2, w) / count(w1, w2)``.

    Args:
        input_: The sentence typed so far.
        bigram_counts: Mapping from ``(w1, w2)`` tuples to occurrence counts.
        trigram_counts: Mapping from ``(w1, w2, w3)`` tuples to occurrence counts.
        vocab: Iterable of candidate next words.

    Returns:
        A list of at most three ``(word, probability)`` tuples sorted by
        descending probability. The list is empty when the input has fewer
        than two tokens or when its last two tokens never occur as a bigram
        in ``bigram_counts``, because the estimate is undefined in both cases.
    """
    #consider the last bigram of sentence
    tokenized_input = word_tokenize(input_.lower())
    if len(tokenized_input) < 2:
        # not enough context for a trigram model (previously an IndexError)
        return []
    last_bigram = tuple(tokenized_input[-2:])

    # the context is identical for every candidate word, so look it up once
    last_bigram_count = bigram_counts.get(last_bigram, 0)
    if last_bigram_count == 0:
        # unseen context: dividing by its count previously raised ZeroDivisionError
        return []

    #calculate the chance for every word in the vocab
    vocab_probabilities = {}
    for vocab_word in vocab:
        test_trigram = last_bigram + (vocab_word,)
        test_trigram_count = trigram_counts.get(test_trigram, 0)
        vocab_probabilities[vocab_word] = test_trigram_count / last_bigram_count

    #sorting to pick out the most probable words
    top_suggestions = sorted(vocab_probabilities.items(), key=lambda x: x[1], reverse=True)[:3]
    return top_suggestions

In [25]:
suggest_next_word('I am', bigram_counts, trigram_counts, vocab)

[('not', 0.11594202898550725),
 ('a', 0.06280193236714976),
 ('sure', 0.04830917874396135)]

In [26]:
suggest_next_word('I am not', bigram_counts, trigram_counts, vocab)

[('a', 0.08), ('to', 0.08), ('comparing', 0.04)]

In [27]:
suggest_next_word('I am not a', bigram_counts, trigram_counts, vocab)

[('single', 0.03289473684210526),
 ('man', 0.02631578947368421),
 ('``', 0.019736842105263157)]

In [29]:
suggest_next_word('I am not a man', bigram_counts, trigram_counts, vocab)

[('who', 0.14130434782608695),
 ('of', 0.09782608695652174),
 ('with', 0.06159420289855073)]

In [30]:
suggest_next_word('I am not a man who', bigram_counts, trigram_counts, vocab)

[('had', 0.09090909090909091),
 ('was', 0.06818181818181818),
 ('could', 0.06818181818181818)]

In [31]:
suggest_next_word('I am not a man who could', bigram_counts, trigram_counts, vocab)

[('not', 0.2), ('afford', 0.1), ('see', 0.06666666666666667)]

In [32]:
suggest_next_word('I am not a man who could see', bigram_counts, trigram_counts, vocab)

[('the', 0.25287356321839083),
 ('that', 0.09195402298850575),
 ('a', 0.04597701149425287)]

In [33]:
#....