In [1]:
import string
import random
import time
from typing import List

In [2]:
'''
Text tokenizer
:param 'text'   : Takes input sentence
:return         : tokenized sentence
''' 
def tokenize(text: str) -> List[str]:
    for punct in string.punctuation:
        text = text.replace(punct, ' '+punct+' ')
    t = text.split()
    return t

"""
Get ngrams with tuple form
:param 'n'      : n-gram size
:param 'tokens' : tokenized sentence
:return         : list of ngrams
"""
def get_ngrams(n: int, tokens: list) -> list:
    # tokens.append('<END>')
    tokens = (n-1)*['<START>']+tokens
    l = [(tuple([tokens[i-p-1] for p in reversed(range(n-1))]), tokens[i])
         for i in range(n-1, len(tokens))]
    return l


class NgramModel(object):

    """
    Class init
    :param 'n': number of words in n-gram
    """
    def __init__(self, n):
        self.n = n
        # dictionary that keeps list of candidate words given context
        self.context = {}
        # keeps track of how many times ngram has appeared in the text before
        self.ngram_counter = {}

    """
    Updates Language Model
    :param 'sentence': input text
    """
    def update(self, sentence: str) -> None:
        n = self.n
        ngrams = get_ngrams(n, tokenize(sentence))
        for ngram in ngrams:
            if ngram in self.ngram_counter:
                self.ngram_counter[ngram] += 1.0
            else:
                self.ngram_counter[ngram] = 1.0

            prev_words, target_word = ngram
            if prev_words in self.context:
                self.context[prev_words].append(target_word)
            else:
                self.context[prev_words] = [target_word]

    """
    Calculates probability of a candidate token to be generated given a context
    :param 'context':
    :param 'token'  :
    :return         : conditional probability
    """
    def prob(self, context, token):
        try:
            count_of_token = self.ngram_counter[(context, token)]
            count_of_context = float(len(self.context[context]))
            result = count_of_token / count_of_context

        except KeyError:
            result = 0.0
        return result
    
    """
    Given a context we "semi-randomly" select the next word to append in a sequence
    :param 'context':
    :return         :
    """
    def random_token(self, context):
        r = random.random()
        map_to_probs = {}
        token_of_interest = self.context[context]
        for token in token_of_interest:
            map_to_probs[token] = self.prob(context, token)
            
        print(map_to_probs)
        summ = 0
        for token in sorted(map_to_probs):
            summ += map_to_probs[token]
            if summ > r:
                return token

    """
    :param 'token_count': number of words to be produced
    :return             : generated text
    """
    def generate_text(self, token_count: int):

        n = self.n
        context_queue = (n - 1) * ['<START>']
        result = []
        for _ in range(token_count):
            obj = self.random_token(tuple(context_queue))
            result.append(obj)
            if n > 1:
                context_queue.pop(0)
                if obj == '.':
                    context_queue = (n - 1) * ['<START>']
                else:
                    context_queue.append(obj)
        return ' '.join(result)

"""
:param 'n'      : number of words to be produced
:param 'path'   : text file path
:return         : generated text
"""
def create_ngram_model(n, path):
    m = NgramModel(n)
    with open(path, 'r') as f:
        text = f.read()
        text = text.split('.')
        for sentence in text:
            # add back the fullstop
            sentence += '.'
            m.update(sentence)
    return m


In [3]:
if __name__ == "__main__":
    start = time.time()
    m = create_ngram_model(2, '10_Best_Things_to_Do_in_Tartu.txt')

    print (f'Language Model creating time: {time.time() - start}')
    start = time.time()
    random.seed(7)
    print(f'{"="*50}\nGenerated text:')
    print(m.generate_text(50))
    print(f'{"="*50}')

Language Model creating time: 0.010240554809570312
Generated text:
{'Although': 0.046875, 'The': 0.078125, 'In': 0.015625, 'Found': 0.015625, 'This': 0.0625, 'So': 0.015625, 'There’s': 0.046875, 'Raekoja': 0.015625, 'Along': 0.015625, 'One': 0.078125, 'At': 0.03125, 'Since': 0.015625, 'Regularly': 0.015625, 'With': 0.03125, 'Built': 0.03125, 'Home': 0.015625, 'Despite': 0.015625, 'A': 0.03125, 'John’s': 0.015625, 'After': 0.03125, 'Affectionately': 0.015625, 'Soup': 0.015625, 'Just': 0.015625, 'It': 0.015625, 'Today': 0.015625, 'Two': 0.015625, 'These': 0.015625, 'Established': 0.015625, 'As': 0.015625, 'Chances': 0.015625, 'While': 0.015625, 'Like': 0.015625, 'From': 0.015625, 'Perhaps': 0.015625, 'Graffiti': 0.015625, 'Other': 0.015625, 'Translating': 0.015625, 'Passing': 0.015625, 'Thanks': 0.015625, 'For': 0.015625, 'Elsewhere': 0.015625, 'Our': 0.015625, 'Its': 0.015625, '.': 0.015625}
{'in': 1.0}
{'Estonia': 0.07142857142857142, 'the': 0.2857142857142857, 'this': 0.07142857142857

## Reference
[1] [Text Generation Using N-Gram Model](https://towardsdatascience.com/text-generation-using-n-gram-model-8d12d9802aa0)