In [None]:
import nltk
import random
from collections import defaultdict, Counter

In [None]:
# Fetch the NLTK data packages this notebook requests: two tokenizer packages
# ('punkt_tab' and 'punkt' -- presumably to cover different NLTK versions)
# and the Gutenberg corpus that the training text is read from.
for resource in ("punkt_tab", "gutenberg", "punkt"):
    nltk.download(resource)

In [None]:
class NGramLM:
    """Word-level n-gram language model with backoff to shorter contexts.

    ``model`` maps a context (a tuple of 0 to ``n - 1`` lowercased words) to
    a ``Counter`` of the words observed immediately after that context.
    """

    def __init__(self, n=5):
        """Create an untrained model.

        Args:
            n: Maximum n-gram length; the longest context is ``n - 1`` words.

        Raises:
            ValueError: If ``n`` is smaller than 1.
        """
        if n < 1:
            raise ValueError(f"n must be at least 1, got {n}")
        self.n = n
        self.model = defaultdict(Counter)

    def train(self, words):
        """Add follower counts from ``words`` for every context length.

        Counts are accumulated for contexts of length ``0`` (the empty
        context, i.e. plain word frequencies) up to ``n - 1``. Storing the
        shorter contexts is what allows ``generate`` to back off when the
        longest context was never seen. Calling ``train`` again adds to the
        existing counts.

        Args:
            words: Iterable of word strings; they are lowercased here.
        """
        words = [w.lower() for w in words]
        for order in range(1, self.n + 1):
            context_len = order - 1
            # The range is empty when the text is shorter than this order.
            for start in range(len(words) - context_len):
                context = tuple(words[start:start + context_len])
                self.model[context][words[start + context_len]] += 1

    def _followers(self, history):
        """Return the follower counts for the longest known suffix of ``history``.

        Tries suffixes of ``n - 1`` words (or fewer if ``history`` is shorter)
        down to the empty context. Returns ``None`` when no context matches,
        which happens when the model holds no data.
        """
        for size in range(min(self.n - 1, len(history)), -1, -1):
            # Slice from an explicit start index: ``history[-0:]`` would be
            # the whole list rather than the empty suffix.
            context = tuple(history[len(history) - size:])
            if context in self.model:
                return self.model[context]
        return None

    def generate(self, seed, max_words=25):
        """Extend ``seed`` by sampling up to ``max_words`` further words.

        Each new word is drawn at random, weighted by its count, from the
        followers of the longest matching context. Generation stops early if
        the model has nothing to sample from (e.g. it was never trained).

        Args:
            seed: Prompt text, lowercased and tokenized with
                ``nltk.word_tokenize``; alternatively an iterable of tokens,
                which are lowercased and used as-is.
            max_words: Maximum number of words to append.

        Returns:
            The seed tokens and generated words joined by single spaces,
            passed through ``str.capitalize``.
        """
        if isinstance(seed, str):
            result = nltk.word_tokenize(seed.lower())
        else:
            result = [token.lower() for token in seed]
        for _ in range(max_words):
            choices = self._followers(result)
            if choices is None:
                break
            candidates = list(choices)
            weights = list(choices.values())
            result.append(random.choices(candidates, weights=weights)[0])
        return " ".join(result).capitalize()

In [None]:
# Word tokens of the Gutenberg corpus file "austen-emma.txt".
raw = nltk.corpus.gutenberg.words("austen-emma.txt")
# n=5: the longest context used to predict a word is the 4 preceding words.
model = NGramLM(n=5)
model.train(raw)  # train() lowercases the tokens itself

In [None]:
# Seed phrases to continue with the trained model; generation samples at
# random, so the printed continuations differ from run to run.
prompts = ["The day was very", "Deep into that", "It was a"]
for p in prompts:
    completion = model.generate(p)
    print(f"Input: {p}\nOutput: {completion}\n")