In [1]:
import string
import random
import time
from typing import List

In [2]:


class NgramModel(object):
    """
    N-gram language model: counts every n-gram it is fed and remembers, for
    each context, the words that followed it
    :param 'n': number of words in n-gram (must be at least 1)
    """

    # translation table that pads every ASCII punctuation mark with spaces,
    # so tokenize() needs a single pass over the text
    _PUNCT_TABLE = str.maketrans({p: ' ' + p + ' ' for p in string.punctuation})

    def __init__(self, n):
        """
        Class init
        :param 'n': number of words in n-gram
        :raises ValueError: if 'n' is smaller than 1
        """
        if n < 1:
            raise ValueError(f"n must be at least 1, got {n!r}")
        self.n = n
        # dictionary that keeps list of candidate words given context
        # (one list entry per occurrence, so repeated words appear repeatedly)
        self.context = {}
        # keeps track of how many times ngram has appeared in the text before
        # (counts are stored as floats)
        self.ngram_counter = {}

    def tokenize(self, text: str) -> List[str]:
        """
        Text tokenizer: every character of string.punctuation becomes its own
        token, everything else is split on whitespace
        :param 'text'   : Takes input sentence
        :return         : tokenized sentence
        """
        return text.translate(self._PUNCT_TABLE).split()

    def get_ngrams(self, n: int, tokens: list) -> list:
        """
        Get ngrams with tuple form
        :param 'n'      : n-gram size
        :param 'tokens' : tokenized sentence (left unmodified)
        :return         : list of (context, word) pairs, where context is the
                          tuple of the n-1 preceding tokens, padded on the left
                          with '<START>'
        :raises ValueError: if 'n' is smaller than 1
        """
        if n < 1:
            # with n == 0 the index arithmetic below would start at -1 and
            # wrap around to the end of the sentence
            raise ValueError(f"n must be at least 1, got {n!r}")
        padded = (n - 1) * ['<START>'] + tokens
        return [(tuple(padded[i - n + 1:i]), padded[i])
                for i in range(n - 1, len(padded))]

    def update(self, sentence: str) -> None:
        """
        Updates Language Model
        :param 'sentence': input text
        """
        for ngram in self.get_ngrams(self.n, self.tokenize(sentence)):
            self.ngram_counter[ngram] = self.ngram_counter.get(ngram, 0.0) + 1.0

            prev_words, target_word = ngram
            self.context.setdefault(prev_words, []).append(target_word)


"""
:param 'n'      : number of words in n-gram
:param 'path'   : text file path
:return         : NgramModel updated with every sentence of the file
"""
def create_ngram_model(n, path, encoding='utf-8'):
    """
    Builds an n-gram language model from a text file
    :param 'n'       : number of words in n-gram
    :param 'path'    : text file path
    :param 'encoding': encoding used to decode the file
    :return          : NgramModel updated with every sentence of the file
    """
    m = NgramModel(n)
    # explicit encoding: the platform default differs between systems
    with open(path, 'r', encoding=encoding) as f:
        text = f.read()
    for sentence in text.split('.'):
        # splitting leaves blank pieces (after the final full stop, inside
        # "..."); feeding them in would record a sentence made of '.' only
        if not sentence.strip():
            continue
        # add back the fullstop
        m.update(sentence + '.')
    return m


In [3]:
"""
Calculates probability of a candidate token to be generated given a context
:param 'context': tuple of the preceding words the candidate is conditioned on
:param 'token'  : candidate next word
:return         : conditional probability
"""
def prob(self, context, token):
    """
    Conditional probability of 'token' following 'context'
    :param 'context': tuple of preceding words
    :param 'token'  : candidate next word
    :return         : count of (context, token) divided by the number of words
                      recorded after context; 0.0 if either was never seen
    """
    try:
        count_of_token = self.ngram_counter[(context, token)]
        count_of_context = float(len(self.context[context]))
    except KeyError:
        return 0.0
    return count_of_token / count_of_context


def random_token(self, context):
    """
    Given a context we "semi-randomly" select the next word to append in a sequence
    :param 'context': tuple of preceding words
    :return         : one of the words recorded after context, drawn with the
                      probability given by prob()
    :raises KeyError: if the context has no recorded continuation
    """
    r = random.random()
    candidates = sorted(set(self.context[context]))
    if not candidates:
        raise KeyError(context)
    cumulative = 0.0
    for token in candidates:
        # prob is a module-level function in this notebook, so the model is
        # passed explicitly instead of going through self.prob
        cumulative += prob(self, context, token)
        if cumulative > r:
            return token
    # floating-point rounding can leave the cumulative sum a hair below 1.0
    # (ten candidates of 0.1 add up to 0.9999999999999999), so a large r may
    # never be exceeded; the last candidate owns that remaining sliver
    return candidates[-1]


def generate_text(self, token_count: int):
    """
    Generates text by repeatedly sampling the next token from the model
    :param 'token_count': number of words to be produced
    :return             : generated text (tokens joined by single spaces)
    """
    n = self.n
    context_queue = (n - 1) * ['<START>']
    result = []
    for _ in range(token_count):
        # random_token is a module-level function in this notebook, so the
        # model is passed explicitly instead of going through self.random_token
        obj = random_token(self, tuple(context_queue))
        result.append(obj)
        if n > 1:
            if obj == '.':
                # a full stop ends the sentence: start over from the padding
                context_queue = (n - 1) * ['<START>']
            else:
                # slide the context window forward by one token
                context_queue.pop(0)
                context_queue.append(obj)
    return ' '.join(result)


In [27]:
if __name__ == "__main__":
    # time how long it takes to build the bigram model from the article
    start = time.time()
    n = 2
    m = create_ngram_model(n, '10_Best_Things_to_Do_in_Tartu.txt')
    print(f'Language Model creating time: {time.time() - start}')
    start = time.time()

    # keep the five most frequent n-grams, most frequent first
    sort_ngram_dict = dict(
        sorted(m.ngram_counter.items(), key=lambda pair: pair[1], reverse=True)[:5])
    for (prev_words, _target), _count in sort_ngram_dict.items():
        # tally every word that was recorded right after this context
        followers = {}
        for word in m.context[prev_words]:
            followers[word] = followers.get(word, 0) + 1
        print(prev_words, followers)
    # text generation demo, left disabled:
    # print(f'{"="*50}\nGenerated text:')
    # print(m.generate_text(50))
    # print(f'{"="*50}')


Language Model creating time: 0.0067691802978515625
('of',) {'the': 23, 'a': 1, 'elegance': 1, 'refined': 1, 'Tartu': 6, 'tourist': 1, 'government': 1, 'pink': 1, 'old': 2, 'these': 1, 'Toome': 2, 'Tartu’s': 3, 'mystery': 1, 'science': 1, 'this': 1, 'street': 1, 'buildings': 1, 'fun': 1, '19th': 1, 'Eduard': 1}
('the',) {'capital': 1, 'only': 2, 'oldest': 4, '11th': 1, 'country’s': 1, 'city': 10, 'ever': 1, 'most': 3, 'centre': 5, 'heart': 1, 'city’s': 1, 'Emajõgi': 2, 'square': 1, 'beloved': 1, 'fountain': 1, 'Town': 2, 'local': 2, 'previous': 1, 'design': 1, 'least': 1, 'world': 1, 'country': 2, 'church': 1, '14th': 1, 'nearly': 1, 'viewing': 1, 'more': 2, 'Supilinn': 1, 'university': 1, '7th': 1, 'hill': 1, 'Angel’s': 1, 'first': 1, 'old': 1, '13th': 1, 'Livonian': 1, 'cathedral': 1, 'University': 2, 'history': 1, 'museum': 1, 'best': 5, 'beating': 1, 'University’s': 2, 'elegant': 1, 'grand': 1, 'university’s': 1, '19th': 1, 'main': 1, 'Old': 1, 'UNESCO': 1, 'creative': 1, 'strong':

## Reference
[1] [Text Generation Using N-Gram Model](https://towardsdatascience.com/text-generation-using-n-gram-model-8d12d9802aa0)