In [None]:
def sentence_to_bigrams(sentence):
    """
    Tokenize one or more sentences into lower-case tokens and bigrams.

    Each sentence is split on whitespace, lower-cased, and wrapped in the
    start '<s>' and stop '</s>' tags.

    sentence: a single sentence string, or an iterable of sentence strings
    return: tuple (sentence_tokens, sentence_bigrams)
            sentence_tokens: list holding one token list per sentence
            sentence_bigrams: flat list of (word, next_word) tuples; a pair
                              never spans two different sentences
    """
    # A bare string would otherwise be iterated character by character,
    # turning every character into its own "sentence".
    sentences = [sentence] if isinstance(sentence, str) else sentence

    sentence_tokens = [
        ['<s>'] + [word.lower() for word in text.split()] + ['</s>']
        for text in sentences
    ]
    # Pair each token with its successor inside the same sentence.
    sentence_bigrams = [
        pair
        for tokens in sentence_tokens
        for pair in zip(tokens, tokens[1:])
    ]

    return sentence_tokens, sentence_bigrams


def bigram_mle(tokens, bigrams):
    """
    Provide a dictionary of probabilities for all bigrams in a corpus of text.
    The calculation is based on maximum likelihood estimation:
    P(w2 | w1) = count(w1, w2) / count(w1).
    Tag '<unk>' has been added for unknown bigrams, with probability 0.

    tokens: list of all tokens in the corpus; a list of per-sentence token
            lists is also accepted and is flattened before counting
    bigrams: list of all two word tuples in the corpus
    return: key: tuple of two bigram words, in order OR <unk> key
            value: float probability (the '<unk>' value is the int 0)
    raises: ZeroDivisionError if the first word of a bigram never occurs
            in tokens
    """
    try:
        token_raw_counts = Counter(tokens)
    except TypeError:
        # Per-sentence token lists are unhashable, so they cannot be counted
        # directly; count the individual tokens they contain instead.
        flat_tokens = []
        for item in tokens:
            if isinstance(item, list):
                flat_tokens.extend(item)
            else:
                flat_tokens.append(item)
        token_raw_counts = Counter(flat_tokens)

    bigram_raw_counts = Counter(bigrams)
    big_dict = {
        bigram: count / token_raw_counts[bigram[0]]
        for bigram, count in bigram_raw_counts.items()
    }
    big_dict['<unk>'] = 0
    return big_dict


def bigram_smoothing(sentence):
    """
    Provide a smoothed log probability dictionary based on Laplace (add-one)
    smoothing: log((count(w1, w2) + 1) / (count(w1) + V)), where V is the
    number of distinct tokens, start/stop tags included.

    sentence: a sentence string, or a list of sentence strings
    return: key: tuple of two bigram words, in order OR <unk> key
            value: natural-log smoothed probability; '<unk>' maps to
                   log(1 / V) for bigrams that were never seen
    raises: ZeroDivisionError if the input contains no sentences (V == 0)
    """
    # sentence_to_bigrams iterates over its argument, so a bare string is
    # wrapped to be treated as one sentence rather than split per character.
    sentences = [sentence] if isinstance(sentence, str) else sentence
    tokens_per_sentence, bigrams = sentence_to_bigrams(sentences)

    # Token lists come back grouped per sentence; Counter needs the hashable
    # tokens themselves, so flatten before counting.
    token_counts = Counter(
        token for sent_tokens in tokens_per_sentence for token in sent_tokens
    )
    bigram_counts = Counter(bigrams)
    vocab_count = len(token_counts)

    smooth_dict = {
        bigram: np.log((count + 1) / (token_counts[bigram[0]] + vocab_count))
        for bigram, count in bigram_counts.items()
    }
    smooth_dict['<unk>'] = np.log(1 / vocab_count)

    return smooth_dict


def log_prob(s_tokens, s_bigrams, bigram_log_dict):
    """
    Sum the log probabilities of a sentence's bigrams.

    s_tokens: tokens of the sentence (not used in the calculation)
    s_bigrams: list of two word tuples making up the sentence
    bigram_log_dict: log probability per bigram tuple, with a '<unk>' entry
                     used for any bigram that is not a key
    return: total log probability (0 when s_bigrams is empty)
    """
    score = 0
    for pair in s_bigrams:
        # Unseen bigrams fall back to the shared '<unk>' entry.
        key = pair if pair in bigram_log_dict else '<unk>'
        score += bigram_log_dict[key]

    return score