In [22]:
import numpy as np
# NOTE(review): `plt` conventionally names matplotlib.pyplot; this binds the
# top-level matplotlib package instead. Nothing in the cells shown uses it — confirm.
import matplotlib as plt
import nltk               # NLP toolkit
import re

nltk.download('punkt')    # Download the Punkt sentence tokenizer

[nltk_data] Downloading package punkt to
[nltk_data]     /Users/adithyashanker/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [23]:
def read_corpus(filepath):
    """Read a text file and return its entire contents as one string.

    Args:
        filepath: Path of the file to read; it is opened in text mode.

    Returns:
        str: The full contents of the file.

    Raises:
        OSError: If the file cannot be opened (for example, it does not exist).
    """
    # The context manager closes the handle even if read() raises; previously
    # the file object was left open until it happened to be garbage collected.
    with open(filepath, "r") as f:
        return f.read()

In [24]:
# Load the raw lyrics file into memory as a single string.
corpus = read_corpus("Data/Kanye West Lyrics.txt")

In [25]:
def preprocess(corpus, n):
    """Lowercase, clean and tokenize a raw corpus line by line.

    Args:
        corpus: Raw text. Each newline-separated line is treated as one sentence.
        n: Not read by this function; kept so existing calls keep working.

    Returns:
        list: One list of tokens per line, in the original line order. A line
        with nothing left to tokenize contributes an empty list.
    """
    # Strip every character that is not a letter, a digit, '.', '?', '!',
    # a space or a newline.
    cleaned = re.sub(r"[^a-zA-Z0-9.?! \n]+", "", corpus.lower())
    return [nltk.word_tokenize(line) for line in cleaned.split("\n")]


In [26]:
def count_prob(tokens):
    """Count how many times each token occurs across all sentences.

    Despite the name, the result holds raw occurrence counts, not probabilities.

    Args:
        tokens: Iterable of sentences, each itself an iterable of tokens.

    Returns:
        dict: Maps each token to its number of occurrences, with keys in order
        of first appearance.
    """
    frequencies = {}
    # Walk the nested structure as one flat stream of tokens.
    for word in (w for line in tokens for w in line):
        if word in frequencies:
            frequencies[word] += 1
        else:
            frequencies[word] = 1
    return frequencies

# NOTE(review): `corpus` is still the raw string at this point, so count_prob
# walks it character by character and this yields character counts, not token
# counts. `counts` is reassigned from the tokenized data in the next cell.
counts = count_prob(corpus)

In [27]:
# Tokenize the corpus line by line, then count how often each token occurs.
# The second argument (2) is accepted by preprocess but not used by it.
train_set = preprocess(corpus, 2)
counts = count_prob(train_set)

In [28]:
def create_closed_vocab(counts, threshold=2):
    """Select the tokens frequent enough to belong to the closed vocabulary.

    Args:
        counts: Mapping from token to its occurrence count.
        threshold: Minimum count (inclusive) a token needs to be kept.

    Returns:
        list: Tokens whose count is at least ``threshold``, in the iteration
        order of ``counts``.
    """
    return [word for word, frequency in counts.items() if frequency >= threshold]

In [29]:
# Keep only the tokens that meet the default frequency threshold (2).
vocab = create_closed_vocab(counts)
print(vocab)

['chorus', 'sing', 'every', 'hour', 'til', 'the', 'power', 'minute', 'of', 'lord', 'second', 'comes', 'each', 'and', 'millisecond', 'down', 'we', 'need', 'you', 'oh', 'verse', 'let', 'everything', 'that', 'have', 'breath', 'praise', 'god', 'cause', 'when', 'glory', 'in', 'for', 'his', 'mighty', 'grace', 'yeah', 'falls', 'are', 'to', 'us', 'bridge', '1', 'kanye', 'west', 'is', 'king', 'soldiers', 'out', 'i', 'get', 'heavens', 'aint', 'got', 'ta', 'peek', 'over', 'keepin', 'perfect', 'scream', 'at', 'mean', 'im', 'just', 'focused', 'pour', 'lean', 'clean', 'soda', 'before', 'people', 'judge', 'they', 'did', 'same', 'thing', 'everybody', 'wanted', 'then', 'jesus', 'christ', 'say', 'week', 'start', 'on', 'but', 'strong', 'sunday', 'wont', 'be', 'any', 'man', 'john', 'ye', 'should', 'made', 'free', 'whom', 'son', 'set', 'indeed', 'he', 'saved', 'a', 'like', 'me', 'service', 'choir', 'hallelujah', 'wonderful', '2', 'if', 'woke', 'wake', 'up', 'with', 'kiss', 'make', 'even', 'cup', 'my', 'bro

In [30]:
# Inspect the tokenized sentences (this prints the entire list).
print(train_set)

[['chorus'], ['sing', 'every', 'hour', 'every', 'hour', 'til', 'the', 'power'], ['every', 'minute', 'every', 'minute', 'of', 'the', 'lord'], ['every', 'second', 'every', 'second', 'comes'], ['sing', 'each', 'and', 'every', 'millisecond', 'down'], ['we', 'need', 'you', 'we', 'need', 'you', 'sing', 'til', 'the', 'power'], ['we', 'need', 'you', 'we', 'need', 'you', 'of', 'the', 'lord'], ['we', 'need', 'you', 'comes'], ['oh', 'we', 'need', 'you', 'down'], [], [], ['verse'], ['sing', 'til', 'the', 'power', 'of', 'the', 'lord', 'comes', 'down'], ['sing', 'til', 'the', 'power', 'of', 'the', 'lord', 'comes', 'down'], ['sing', 'til', 'the', 'power', 'of', 'the', 'lord', 'comes', 'down', 'let', 'everything', 'that', 'have', 'breath', 'praise', 'god'], ['sing', 'til', 'the', 'power', 'of', 'the', 'lord', 'comes', 'down', 'cause', 'when', 'we', 'sing', 'the', 'glory', 'of', 'the', 'lord', 'comes', 'down', 'down'], ['sing', 'til', 'the', 'power', 'of', 'the', 'lord', 'comes', 'down', 'praising', 't

In [31]:
def replace_oov_words(train_set, closed_vocab, unkown_token="<UNK>"):
    """Return a copy of the sentences with out-of-vocabulary tokens replaced.

    The input is left untouched: every sentence in the result is a new list.
    (A shallow ``list.copy()`` of the outer list would still share the inner
    sentence lists, so in-place replacement would also rewrite the caller's
    data.)

    Args:
        train_set: List of sentences, each a list of hashable tokens.
        closed_vocab: Iterable of tokens that are considered known.
        unkown_token: Marker substituted for every token that is not in
            ``closed_vocab``. (The parameter name keeps its original spelling
            so keyword callers continue to work.)

    Returns:
        list: New list of new token lists, aligned with ``train_set``.
    """
    # A set gives constant-time membership tests instead of scanning the
    # vocabulary list once per token.
    known_words = set(closed_vocab)
    return [
        [token if token in known_words else unkown_token for token in sentence]
        for sentence in train_set
    ]

In [32]:
# Replace every token that is not in `vocab` with the default "<UNK>" marker.
processed_train_set = replace_oov_words(train_set, vocab)
print(processed_train_set)

[['chorus'], ['sing', 'every', 'hour', 'every', 'hour', 'til', 'the', 'power'], ['every', 'minute', 'every', 'minute', 'of', 'the', 'lord'], ['every', 'second', 'every', 'second', 'comes'], ['sing', 'each', 'and', 'every', 'millisecond', 'down'], ['we', 'need', 'you', 'we', 'need', 'you', 'sing', 'til', 'the', 'power'], ['we', 'need', 'you', 'we', 'need', 'you', 'of', 'the', 'lord'], ['we', 'need', 'you', 'comes'], ['oh', 'we', 'need', 'you', 'down'], [], [], ['verse'], ['sing', 'til', 'the', 'power', 'of', 'the', 'lord', 'comes', 'down'], ['sing', 'til', 'the', 'power', 'of', 'the', 'lord', 'comes', 'down'], ['sing', 'til', 'the', 'power', 'of', 'the', 'lord', 'comes', 'down', 'let', 'everything', 'that', 'have', 'breath', 'praise', 'god'], ['sing', 'til', 'the', 'power', 'of', 'the', 'lord', 'comes', 'down', 'cause', 'when', 'we', 'sing', 'the', 'glory', 'of', 'the', 'lord', 'comes', 'down', 'down'], ['sing', 'til', 'the', 'power', 'of', 'the', 'lord', 'comes', 'down', '<UNK>', 'the'

In [33]:
def count_n_grams(processed_train_set, n=3, start_token='<s>', end_token = '<e>'):
    """Count every n-gram that occurs in the padded sentences.

    Each sentence is padded with ``n - 1`` start tokens in front and a single
    end token behind before the windows of length ``n`` are taken.

    Args:
        processed_train_set: Iterable of sentences, each a list of tokens.
        n: Length of the n-grams to count.
        start_token: Token used for the leading padding.
        end_token: Token appended to the end of every sentence.

    Returns:
        dict: Maps each n-gram (a tuple of ``n`` tokens) to its number of
        occurrences.
    """
    tallies = {}
    for sentence in processed_train_set:
        padded = [start_token] * (n-1) + sentence + [end_token]
        # Number of full-length windows that fit into the padded sentence.
        window_count = len(padded) - n + 1
        for start in range(window_count):
            key = tuple(padded[start:start + n])
            tallies[key] = tallies.get(key, 0) + 1
    return tallies
    
    


In [34]:
# Count n-grams with the default order (n=3) and the default boundary tokens.
n_grams = count_n_grams(processed_train_set)
print(n_grams)

{('<s>', '<s>', 'chorus'): 254, ('<s>', 'chorus', '<e>'): 69, ('<s>', '<s>', 'sing'): 19, ('<s>', 'sing', 'every'): 3, ('sing', 'every', 'hour'): 3, ('every', 'hour', 'every'): 3, ('hour', 'every', 'hour'): 3, ('every', 'hour', 'til'): 3, ('hour', 'til', 'the'): 3, ('til', 'the', 'power'): 22, ('the', 'power', '<e>'): 8, ('<s>', '<s>', 'every'): 31, ('<s>', 'every', 'minute'): 3, ('every', 'minute', 'every'): 3, ('minute', 'every', 'minute'): 3, ('every', 'minute', 'of'): 3, ('minute', 'of', 'the'): 3, ('of', 'the', 'lord'): 23, ('the', 'lord', '<e>'): 7, ('<s>', 'every', 'second'): 3, ('every', 'second', 'every'): 3, ('second', 'every', 'second'): 3, ('every', 'second', 'comes'): 3, ('second', 'comes', '<e>'): 3, ('<s>', 'sing', 'each'): 3, ('sing', 'each', 'and'): 3, ('each', 'and', 'every'): 4, ('and', 'every', 'millisecond'): 3, ('every', 'millisecond', 'down'): 3, ('millisecond', 'down', '<e>'): 3, ('<s>', '<s>', 'we'): 211, ('<s>', 'we', 'need'): 11, ('we', 'need', 'you'): 19, ('

In [35]:
def estimate_probability(word, previous_n_gram, n_gram_counts, n_plus1_gram_counts, vocabulary_size, k=1.0):
    """Estimate the add-k smoothed probability of ``word`` following a context.

    Computes ``(count(context + word) + k) / (count(context) + k * vocabulary_size)``,
    where a missing n-gram counts as zero.

    Args:
        word: Candidate next token.
        previous_n_gram: Sequence of context tokens; converted to a tuple.
        n_gram_counts: Mapping from context tuples to their counts.
        n_plus1_gram_counts: Mapping from (context + word) tuples to their counts.
        vocabulary_size: Number of distinct tokens used for smoothing.
        k: Smoothing constant added to the numerator and, scaled by
            ``vocabulary_size``, to the denominator.

    Returns:
        The smoothed probability estimate.
    """
    context = tuple(previous_n_gram)
    smoothed_total = n_gram_counts.get(context, 0) + k*vocabulary_size
    smoothed_hits = n_plus1_gram_counts.get(context + (word,), 0) + k
    return smoothed_hits / smoothed_total


In [36]:
# NOTE(review): the names are off by one — `bigrams` holds 3-token counts and
# `trigrams` holds 4-token counts, so together they serve as the context and
# continuation tables of a 4-gram model.
bigrams = count_n_grams(processed_train_set, 3)
trigrams = count_n_grams(processed_train_set, 4)
print(bigrams)

{('<s>', '<s>', 'chorus'): 254, ('<s>', 'chorus', '<e>'): 69, ('<s>', '<s>', 'sing'): 19, ('<s>', 'sing', 'every'): 3, ('sing', 'every', 'hour'): 3, ('every', 'hour', 'every'): 3, ('hour', 'every', 'hour'): 3, ('every', 'hour', 'til'): 3, ('hour', 'til', 'the'): 3, ('til', 'the', 'power'): 22, ('the', 'power', '<e>'): 8, ('<s>', '<s>', 'every'): 31, ('<s>', 'every', 'minute'): 3, ('every', 'minute', 'every'): 3, ('minute', 'every', 'minute'): 3, ('every', 'minute', 'of'): 3, ('minute', 'of', 'the'): 3, ('of', 'the', 'lord'): 23, ('the', 'lord', '<e>'): 7, ('<s>', 'every', 'second'): 3, ('every', 'second', 'every'): 3, ('second', 'every', 'second'): 3, ('every', 'second', 'comes'): 3, ('second', 'comes', '<e>'): 3, ('<s>', 'sing', 'each'): 3, ('sing', 'each', 'and'): 3, ('each', 'and', 'every'): 4, ('and', 'every', 'millisecond'): 3, ('every', 'millisecond', 'down'): 3, ('millisecond', 'down', '<e>'): 3, ('<s>', '<s>', 'we'): 211, ('<s>', 'we', 'need'): 11, ('we', 'need', 'you'): 19, ('

In [37]:
# Smoke test of the smoothed estimate. The context passed here has two tokens
# while the keys of `bigrams` have three, so both lookups miss and the result
# is k / (k * len(vocab)), i.e. 1 / len(vocab) with the default k.
x = estimate_probability("chocolate", ("lyn", "drinks"), bigrams, trigrams, len(vocab))
print(x)

0.00034423407917383823


In [38]:
def estimate_probabilities(previous_n_gram, n_gram_counts, n_plus1_gram_counts, vocabulary, end_token='<e>', unknown_token="<unk>",  k=1.0):
    """Estimate the smoothed probability of every candidate next token.

    The candidates are ``vocabulary`` extended with ``end_token`` and
    ``unknown_token``; the size of that extended list is used as the
    vocabulary size for smoothing.

    NOTE(review): replace_oov_words in this notebook defaults to "<UNK>"
    (upper case) while the default here is "<unk>", so counts recorded under
    the upper-case marker are not matched by this default — confirm which
    spelling is intended.

    Args:
        previous_n_gram: Sequence of context tokens; converted to a tuple.
        n_gram_counts: Mapping from context tuples to their counts.
        n_plus1_gram_counts: Mapping from (context + word) tuples to their counts.
        vocabulary: List of known tokens (it is concatenated with a list).
        end_token: End-of-sentence token added to the candidates.
        unknown_token: Unknown-word token added to the candidates.
        k: Smoothing constant forwarded to estimate_probability.

    Returns:
        dict: Maps each candidate token to its estimated probability.
    """
    context = tuple(previous_n_gram)
    candidates = vocabulary + [end_token, unknown_token]
    candidate_total = len(candidates)
    return {
        candidate: estimate_probability(
            candidate, context, n_gram_counts, n_plus1_gram_counts, candidate_total, k=k
        )
        for candidate in candidates
    }

In [39]:
def suggest_a_word(previous_tokens, n_gram_counts, n_plus1_gram_counts, vocabulary, k=1.0, start_with=None, start_token='<s>'):
    """Suggest the most probable next word for a sequence of tokens.

    The context length ``n`` is taken from the key length of ``n_gram_counts``.
    The history is left-padded with start tokens and its last ``n`` tokens are
    used as the context.

    Args:
        previous_tokens: Tokens typed so far (may be empty).
        n_gram_counts: Mapping from n-token tuples to counts; must be non-empty.
        n_plus1_gram_counts: Mapping from (n+1)-token tuples to counts.
        vocabulary: List of known tokens.
        k: Smoothing constant forwarded to estimate_probabilities.
        start_with: If given (and non-empty), only candidates beginning with
            this prefix are considered.
        start_token: Token used to pad a short history; should match the start
            token the counts were built with.

    Returns:
        tuple: ``(suggestion, probability)``. ``suggestion`` is ``None`` and the
        probability ``0`` when no candidate has a probability above zero or
        none matches ``start_with``.

    Raises:
        IndexError: If ``n_gram_counts`` is empty, because the context length
            cannot be inferred.
    """
    if not n_gram_counts:
        raise IndexError("n_gram_counts is empty; cannot infer the n-gram order")
    # Peek at one key instead of materialising the full key list on every call.
    n = len(next(iter(n_gram_counts)))

    # Pad with n start tokens (not n - 1) so that even an empty history yields
    # a context of exactly n tokens. For a non-empty history the last n tokens
    # are the same either way.
    padded_tokens = [start_token] * n + list(previous_tokens)
    previous_n_gram = padded_tokens[-n:]

    probabilities = estimate_probabilities(previous_n_gram, n_gram_counts, n_plus1_gram_counts, vocabulary, k=k)

    suggestion = None
    max_prob = 0
    for word, prob in probabilities.items():
        if start_with and not word.startswith(start_with):
            continue
        # Strict comparison: on ties the earliest candidate wins.
        if prob > max_prob:
            suggestion = word
            max_prob = prob

    return suggestion, max_prob


In [40]:
def generate_sentence(tokens, n_gram_counts, n_plus1_gram_counts, vocabulary, end_token='<e>', unknown_token="<unk>", k=1.0):
    """Greedily extend ``tokens`` with the most probable next word.

    Generation stops when the suggested word is ``end_token``, when the same
    word is suggested twice in a row, or when generation reaches a state it
    has already been in. Suggestions are deterministic, so a repeated state
    would otherwise loop forever.

    Args:
        tokens: Seed tokens. This list is extended in place and also returned.
        n_gram_counts: Mapping from n-token tuples to counts.
        n_plus1_gram_counts: Mapping from (n+1)-token tuples to counts.
        vocabulary: List of known tokens.
        end_token: Token that marks the end of a sentence.
        unknown_token: Currently not used by this function; kept so existing
            calls keep working.
        k: Smoothing constant forwarded to suggest_a_word.

    Returns:
        list: The same ``tokens`` list, extended with the generated words.
    """
    # Number of trailing tokens the next suggestion can depend on.
    context_size = len(next(iter(n_gram_counts))) if n_gram_counts else 0
    seen_states = set()
    prev_word = ""
    while True:
        # The next step is fully determined by the trailing context and the
        # previously appended word; seeing the same pair again means a cycle.
        state = (tuple(tokens[-context_size:]), prev_word)
        if state in seen_states:
            break
        seen_states.add(state)

        # Forward k so the caller's smoothing constant is actually applied.
        next_word, _ = suggest_a_word(tokens, n_gram_counts, n_plus1_gram_counts, vocabulary, k=k)
        if next_word == end_token:
            break
        if prev_word == next_word:
            break
        prev_word = next_word
        tokens.append(next_word)
        print(tokens)
    return tokens

In [52]:
# Greedily extend a seed sentence one suggested word at a time.
# NOTE(review): the recorded output below starts from 'how' rather than the ""
# seed used here, so it appears to come from an earlier run — re-run to confirm.
sent = generate_sentence([""], bigrams, trigrams, vocab)
print(sent)

['how', 'could']
['how', 'could', 'you']
['how', 'could', 'you', 'be']
['how', 'could', 'you', 'be', 'so']
['how', 'could', 'you', 'be', 'so', 'heartless']
['how', 'could', 'you', 'be', 'so', 'heartless', '?']
['how', 'could', 'you', 'be', 'so', 'heartless', '?']
