In [1]:
import string
import random
import time
from typing import List

In [2]:


class NgramModel(object):

    """
    Class init
    :param 'n': number of words in n-gram
    """
    def __init__(self, n):
        self.n = n
        # dictionary that keeps list of candidate words given context
        self.context = {}
        # keeps track of how many times ngram has appeared in the text before
        self.ngram_counter = {}

    '''
    Text tokenizer
    :param 'text'   : Takes input sentence
    :return         : tokenized sentence
    '''
    def tokenize(self, text: str) -> List[str]:
        for punct in string.punctuation:
            text = text.replace(punct, ' '+punct+' ')
        t = text.split()
        return t

    """
    Get ngrams with tuple form
    :param 'n'      : n-gram size
    :param 'tokens' : tokenized sentence
    :return         : list of ngrams
    """
    def get_ngrams(self, n: int, tokens: list) -> list:
        # tokens.append('<END>')
        tokens = (n-1)*['<START>']+tokens
        l = [(tuple([tokens[i-p-1] for p in reversed(range(n-1))]), tokens[i])
             for i in range(n-1, len(tokens))]
        return l

    """
    Updates Language Model
    :param 'sentence': input text
    """
    def update(self, sentence: str) -> None:
        n = self.n
        ngrams = self.get_ngrams(n, self.tokenize(sentence))
        for ngram in ngrams:
            if ngram in self.ngram_counter:
                self.ngram_counter[ngram] += 1.0
            else:
                self.ngram_counter[ngram] = 1.0

            prev_words, target_word = ngram
            if prev_words in self.context:
                self.context[prev_words].append(target_word)
            else:
                self.context[prev_words] = [target_word]


"""
:param 'n'      : number of words to be produced
:param 'path'   : text file path
:return         : generated text
"""
def create_ngram_model(n, path):
    m = NgramModel(n)
    with open(path, 'r') as f:
        text = f.read()
        text = text.split('.')
        for sentence in text:
            # add back the fullstop
            sentence += '.'
            m.update(sentence)
    return m


In [8]:
if __name__ == "__main__":
    start = time.time()
    n = 3
    m = create_ngram_model(n, '10_Best_Things_to_Do_in_Tartu.txt')

    print(f'Language Model creating time: {time.time() - start}')
    start = time.time()

    # calculate n-gram pop
    # sort_ngram_dict = dict(
    #     sorted(m.ngram_counter.items(), key=lambda item: item[1], reverse=True)[:5])
    ngram_dict = {}
    for key, value in m.ngram_counter.items():
        # print(f'{key}:{value}')
        word_dict = {}
        for word in m.context[key[0]]:
            if word in word_dict.keys():
                word_dict[word] += 1
            else:
                word_dict[word] = 1
        sum_value = sum(word_dict.values())
        for w_key, w_value in word_dict.items():
            word_dict[w_key] = w_value/sum_value

        ngram_dict[key[0]] = word_dict

    print(ngram_dict)


Language Model creating time: 0.0069315433502197266
{('<START>', '<START>'): {'Although': 0.046875, 'The': 0.078125, 'In': 0.015625, 'Found': 0.015625, 'This': 0.0625, 'So': 0.015625, 'There’s': 0.046875, 'Raekoja': 0.015625, 'Along': 0.015625, 'One': 0.078125, 'At': 0.03125, 'Since': 0.015625, 'Regularly': 0.015625, 'With': 0.03125, 'Built': 0.03125, 'Home': 0.015625, 'Despite': 0.015625, 'A': 0.03125, 'John’s': 0.015625, 'After': 0.03125, 'Affectionately': 0.015625, 'Soup': 0.015625, 'Just': 0.015625, 'It': 0.015625, 'Today': 0.015625, 'Two': 0.015625, 'These': 0.015625, 'Established': 0.015625, 'As': 0.015625, 'Chances': 0.015625, 'While': 0.015625, 'Like': 0.015625, 'From': 0.015625, 'Perhaps': 0.015625, 'Graffiti': 0.015625, 'Other': 0.015625, 'Translating': 0.015625, 'Passing': 0.015625, 'Thanks': 0.015625, 'For': 0.015625, 'Elsewhere': 0.015625, 'Our': 0.015625, 'Its': 0.015625, '.': 0.015625}, ('<START>', 'Although'): {'most': 0.3333333333333333, 'the': 0.3333333333333333, 'it'

## Reference
[1] [Text Generation Using N-Gram Model](https://towardsdatascience.com/text-generation-using-n-gram-model-8d12d9802aa0)