In [1]:
import random
import json
import nltk
from nltk.corpus import reuters
from nltk import trigrams

Building a trigram model

In [None]:
def build_trigram_model(corpus):
    """Estimate next-token probabilities for every two-token context.

    Args:
        corpus: Object exposing ``sents()``, which yields sentences as
            sequences of tokens.

    Returns:
        dict: Maps each ``(first, second)`` token pair to a dict of
        ``{third: probability}``, where each probability is the relative
        frequency with which ``third`` followed that pair. Sentences are
        padded on both sides by ``trigrams``, so the padding symbol
        (presumably ``None`` — confirm against NLTK) shows up in contexts
        and as a possible next token.
    """
    # First pass: count how often each token follows each two-token context.
    follower_counts = {}
    for sentence in corpus.sents():
        for first, second, third in trigrams(sentence, pad_right=True, pad_left=True):
            followers = follower_counts.setdefault((first, second), {})
            followers[third] = followers.get(third, 0) + 1

    # Second pass: turn the raw counts of each context into relative frequencies.
    model = {}
    for context, followers in follower_counts.items():
        total = float(sum(followers.values()))
        model[context] = {token: count / total for token, count in followers.items()}

    return model

Building a function to generate text using a trigram model

In [4]:
def generate_text(model, seed_words, max_length=50):
    """Sample a token sequence from a trigram model, starting from a seed.

    Args:
        model: Mapping from a ``(token, token)`` context to a dict of
            ``{next_token: probability}``.
        seed_words: Iterable of starting tokens. When fewer than two tokens
            are given, the context is left-padded with ``None`` so that the
            sentence-start contexts of a padded model can be used.
        max_length: Upper bound on the number of tokens in the sequence,
            counting the seed tokens and any ``None`` padding tokens drawn.

    Returns:
        str: The seed and sampled tokens joined by single spaces, with falsy
        tokens (such as ``None`` padding) left out. If the current context is
        missing from ``model`` or has no candidates, generation stops and
        whatever has been collected so far is returned.
    """
    text = list(seed_words)
    while len(text) < max_length:
        # Left-pad so seeds shorter than two tokens still form a two-token key.
        context = tuple(([None, None] + text)[-2:])
        distribution = model.get(context)
        if not distribution:
            # Unknown context or no candidates: stop instead of raising
            # KeyError or spinning forever without appending anything.
            break

        # Inverse-CDF sampling: walk the candidates until the running
        # probability mass reaches the random threshold.
        r = random.random()
        accumulator = .0
        chosen = None
        for word, probability in distribution.items():
            chosen = word
            accumulator += probability
            if accumulator >= r:
                break
        # If rounding kept the accumulated mass just below r, `chosen` is the
        # last candidate, so every step still emits exactly one token.
        text.append(chosen)

        # Two consecutive None tokens mark the end of the sentence.
        if text[-2:] == [None, None]:
            break
    return ' '.join([t for t in text if t])

Output the conditional probability distribution of the third word given the bigram 'he is' in the trigram model

In [15]:
# Train the trigram model on the NLTK Reuters corpus, then pretty-print the
# next-word distribution that follows the context ("he", "is").
# NOTE(review): presumably the Reuters corpus data must already be available
# locally (e.g. via nltk.download) — confirm for fresh environments.
corpus = reuters
trigram_model = build_trigram_model(corpus)
he_is_distribution = trigram_model[('he', 'is')]
print(json.dumps(he_is_distribution, indent=3, sort_keys=True))

{
   "\"": 0.04918032786885246,
   "about": 0.01639344262295082,
   "also": 0.06557377049180328,
   "among": 0.01639344262295082,
   "attending": 0.01639344262295082,
   "barred": 0.01639344262295082,
   "comfortable": 0.01639344262295082,
   "concerned": 0.01639344262295082,
   "confident": 0.01639344262295082,
   "considering": 0.03278688524590164,
   "encouraged": 0.01639344262295082,
   "forecasting": 0.01639344262295082,
   "hardly": 0.01639344262295082,
   "likely": 0.01639344262295082,
   "looking": 0.01639344262295082,
   "not": 0.09836065573770492,
   "now": 0.03278688524590164,
   "offering": 0.04918032786885246,
   "one": 0.01639344262295082,
   "operating": 0.01639344262295082,
   "opposed": 0.01639344262295082,
   "optimistic": 0.04918032786885246,
   "pleased": 0.01639344262295082,
   "politically": 0.03278688524590164,
   "ready": 0.04918032786885246,
   "recommending": 0.01639344262295082,
   "sceptical": 0.03278688524590164,
   "seeking": 0.04918032786885246,
   "serio

Generate 5 sentences starting with 'he is' using the above trigram model and text-generation function

In [13]:
# Draw five independent samples that all start from the same seed words.
# generate_text relies on random.random() and no seed is set in the cells
# shown here, so the printed sentences differ from run to run.
seed = ["he", "is"]
for i in range(5):
    generated_text = generate_text(trigram_model, seed)
    print(f"Generated text {i+1}: {generated_text}\n")

Generated text 1: he is seeking offers to buy Pakistani manufactured goods .

Generated text 2: he is optimistic the approval of the International Coffee Organization council meeting , he added that he was inclined to favour joint ventures , to 11 mln stg and compares with 12 . 32 billion NOTE : 1986 net includes 15 . 6 mln dlrs was due entirely to the

Generated text 3: he is not receiving a considerable downturn at the mouth of the March intervention price ( 171 cts ) or below the outgoing 14 . 5 billion cubic feet , it said .

Generated text 4: he is politically the better for U . S . BASE RATE CUT NOT APPROPRIATE Prime Minister Jacques Chirac planned to increase the cumulative drop of 7 . 6 mln dlrs compared with 12 mln acres sown last season .

Generated text 5: he is offering to close in 1989 , when there was no immediate action is taken ."

