In [1]:
import nltk
from nltk.util import ngrams

In [2]:
data = """a fish keeps for a day; a fish keeps well if you put it in a cold place. 
          a fish keeps best if you put it in the fridge. If you’ll be having fish, 
          have it with an apple because one apple a day keeps the doctor away! 
          You know they also say that a day keeps coming."""

In [8]:
def gram_extractor(data, num):
    n_grams = ngrams(nltk.word_tokenize(data), num)
    return [" ".join(gram) for gram in n_grams]

In [13]:
# printing the first 5 tokens after gramification
print("\nWord-Level:")
print("1-gram: ", gram_extractor(data, 1)[0:10])
print()
print("2-gram: ", gram_extractor(data, 2)[0:10])
print()
print("3-gram: ", gram_extractor(data, 3)[0:10])
print()
print("4-gram: ", gram_extractor(data, 4)[0:10])


Word-Level:
1-gram:  ['a', 'fish', 'keeps', 'for', 'a', 'day', ';', 'a', 'fish', 'keeps']

2-gram:  ['a fish', 'fish keeps', 'keeps for', 'for a', 'a day', 'day ;', '; a', 'a fish', 'fish keeps', 'keeps well']

3-gram:  ['a fish keeps', 'fish keeps for', 'keeps for a', 'for a day', 'a day ;', 'day ; a', '; a fish', 'a fish keeps', 'fish keeps well', 'keeps well if']

4-gram:  ['a fish keeps for', 'fish keeps for a', 'keeps for a day', 'for a day ;', 'a day ; a', 'day ; a fish', '; a fish keeps', 'a fish keeps well', 'fish keeps well if', 'keeps well if you']


In [14]:
def char_gram_extractor(data, num):
    n_grams = ngrams(data, num)
    return [' '.join(gram) for gram in n_grams]

In [15]:
print("\nCharacter-Level:")
print("1-gram: ", char_gram_extractor(data, 1)[0:5])
print("2-gram: ", char_gram_extractor(data, 2)[0:5])
print("3-gram: ", char_gram_extractor(data, 3)[0:5])
print("4-gram: ", char_gram_extractor(data, 4)[0:5])
print("\n")


Character-Level:
1-gram:  ['a', ' ', 'f', 'i', 's']
2-gram:  ['a  ', '  f', 'f i', 'i s', 's h']
3-gram:  ['a   f', '  f i', 'f i s', 'i s h', 's h  ']
4-gram:  ['a   f i', '  f i s', 'f i s h', 'i s h  ', 's h   k']




## Skip-Grams
<b>While n-grams bring together contiguous sequences of words, skip-grams include grams that skip over certain terms!</b>

In [16]:
from nltk.util import skipgrams

In [17]:
def skipgram_exrtactor(data, n, k):
    skip_grams = skipgrams(nltk.word_tokenize(data), n, k)
    return [' '.join(skip) for skip in skip_grams]

In [19]:
data

'a fish keeps for a day; a fish keeps well if you put it in a cold place. \n          a fish keeps best if you put it in the fridge. If you’ll be having fish, \n          have it with an apple because one apple a day keeps the doctor away! \n          You know they also say that a day keeps coming.'

In [18]:
skipgram_exrtactor(data, 2, 3)

['a fish',
 'a keeps',
 'a for',
 'a a',
 'fish keeps',
 'fish for',
 'fish a',
 'fish day',
 'keeps for',
 'keeps a',
 'keeps day',
 'keeps ;',
 'for a',
 'for day',
 'for ;',
 'for a',
 'a day',
 'a ;',
 'a a',
 'a fish',
 'day ;',
 'day a',
 'day fish',
 'day keeps',
 '; a',
 '; fish',
 '; keeps',
 '; well',
 'a fish',
 'a keeps',
 'a well',
 'a if',
 'fish keeps',
 'fish well',
 'fish if',
 'fish you',
 'keeps well',
 'keeps if',
 'keeps you',
 'keeps put',
 'well if',
 'well you',
 'well put',
 'well it',
 'if you',
 'if put',
 'if it',
 'if in',
 'you put',
 'you it',
 'you in',
 'you a',
 'put it',
 'put in',
 'put a',
 'put cold',
 'it in',
 'it a',
 'it cold',
 'it place',
 'in a',
 'in cold',
 'in place',
 'in .',
 'a cold',
 'a place',
 'a .',
 'a a',
 'cold place',
 'cold .',
 'cold a',
 'cold fish',
 'place .',
 'place a',
 'place fish',
 'place keeps',
 '. a',
 '. fish',
 '. keeps',
 '. best',
 'a fish',
 'a keeps',
 'a best',
 'a if',
 'fish keeps',
 'fish best',
 'fish 

## Language Models
A language model refers to any method that assign probabilities to strings.
Let's try to develop a more concrete understanding of language models by building a simple one together. More specifically, <b>let's build a model that tries to predict the next word in a sentence, given the last word we saw</b>. One approach to accomplish this is to simply count, for each unique word, what words tend to follow it.
To do that, let's start by using the extract_word_ngrams tool we showed earlier, and collect all the unique words in the corpus:

In [20]:
unigrams = gram_extractor(data, 1)

In [22]:
len(unigrams)

66

In [23]:
vocabulary = list(set(unigrams))

In [27]:
print(f"Vocabulary : n = ({len(vocabulary)})")
print(vocabulary)

Vocabulary : n = (40)
['it', 'If', 'the', 'place', ',', 'in', 'well', 'best', 'coming', 'an', '!', 'also', 'doctor', 'having', 'away', 'say', 'be', 'll', 'apple', 'know', 'keeps', 'fish', 'if', '’', ';', 'for', 'put', '.', 'have', 'fridge', 'because', 'a', 'they', 'You', 'that', 'day', 'you', 'one', 'cold', 'with']


### Now that we have a list of the words, lets initialize a JSON object called counts that will keep track of the next word, given the current word.

In [28]:
counts = {}
for given_word in vocabulary:
    counts[given_word] = {}
    for next_word in vocabulary:
        counts[given_word][next_word] = 0

In [30]:
counts['fish']

{'it': 0,
 'If': 0,
 'the': 0,
 'place': 0,
 ',': 0,
 'in': 0,
 'well': 0,
 'best': 0,
 'coming': 0,
 'an': 0,
 '!': 0,
 'also': 0,
 'doctor': 0,
 'having': 0,
 'away': 0,
 'say': 0,
 'be': 0,
 'll': 0,
 'apple': 0,
 'know': 0,
 'keeps': 0,
 'fish': 0,
 'if': 0,
 '’': 0,
 ';': 0,
 'for': 0,
 'put': 0,
 '.': 0,
 'have': 0,
 'fridge': 0,
 'because': 0,
 'a': 0,
 'they': 0,
 'You': 0,
 'that': 0,
 'day': 0,
 'you': 0,
 'one': 0,
 'cold': 0,
 'with': 0}

In [40]:
for i in range(len(unigrams)-1):
    counts[unigrams[i]][unigrams[i+1]] += 1

In [42]:
counts['a']

{'it': 0,
 'If': 0,
 'the': 0,
 'place': 0,
 ',': 0,
 'in': 0,
 'well': 0,
 'best': 0,
 'coming': 0,
 'an': 0,
 '!': 0,
 'also': 0,
 'doctor': 0,
 'having': 0,
 'away': 0,
 'say': 0,
 'be': 0,
 'll': 0,
 'apple': 0,
 'know': 0,
 'keeps': 0,
 'fish': 3,
 'if': 0,
 '’': 0,
 ';': 0,
 'for': 0,
 'put': 0,
 '.': 0,
 'have': 0,
 'fridge': 0,
 'because': 0,
 'a': 0,
 'they': 0,
 'You': 0,
 'that': 0,
 'day': 3,
 'you': 0,
 'one': 0,
 'cold': 1,
 'with': 0}

In [43]:
# initialize the probabilites JSON object
probs = {}
for given_word in vocabulary:
    probs[given_word] = {}
    for next_word in vocabulary:
        probs[given_word][next_word] = 0

# convert the counts to probabilites
for key, value in counts.items():
    denominator = 0
    for key2, value2 in counts[key].items():
        denominator += value2

    for key2, value2 in counts[key].items():
        probs[key][key2] = value2 / denominator

In [45]:
probs['a']

{'it': 0.0,
 'If': 0.0,
 'the': 0.0,
 'place': 0.0,
 ',': 0.0,
 'in': 0.0,
 'well': 0.0,
 'best': 0.0,
 'coming': 0.0,
 'an': 0.0,
 '!': 0.0,
 'also': 0.0,
 'doctor': 0.0,
 'having': 0.0,
 'away': 0.0,
 'say': 0.0,
 'be': 0.0,
 'll': 0.0,
 'apple': 0.0,
 'know': 0.0,
 'keeps': 0.0,
 'fish': 0.42857142857142855,
 'if': 0.0,
 '’': 0.0,
 ';': 0.0,
 'for': 0.0,
 'put': 0.0,
 '.': 0.0,
 'have': 0.0,
 'fridge': 0.0,
 'because': 0.0,
 'a': 0.0,
 'they': 0.0,
 'You': 0.0,
 'that': 0.0,
 'day': 0.42857142857142855,
 'you': 0.0,
 'one': 0.0,
 'cold': 0.14285714285714285,
 'with': 0.0}

And there you have it! You have built your first data-driven language model. Congratulations! Now let's get to work doing some fun things with the language model, like simulating new sentences. Let's do that by choosing a word, and then selecting the next word in accordance to the probability given by our language model:

In [47]:
import numpy as np

def sample_next_gram_from_language_model(probs, given_token):
    distribution            = list(probs[given_token].values())
    sample_from_multinomial = np.random.multinomial(1,distribution)
    sample_index            = np.where(sample_from_multinomial==1)[0][0]
    word_keys               = list(probs[given_token].keys())
    next_word               = word_keys[sample_index]
    return(next_word)


In [50]:
given_token = "a"
next_token  = sample_next_gram_from_language_model(probs,given_token)

print(' Given the token :  ' + given_token)
print(' The next token is :  ' + next_token)
print('\n')

 Given the token :  a
 The next token is :  fish




In [56]:
def create_new_sentence(length, seed_token):
    tokens = [seed_token]
    for i in range(length):
        tokens.append(sample_next_gram_from_language_model(probs,tokens[-1]))  
    return tokens
      

In [59]:
print('---------------------------------------')
print(' '.join(create_new_sentence(10, 'a')))
print(' '.join(create_new_sentence(10, 'a')))
print(' '.join(create_new_sentence(10, 'a')))

---------------------------------------
a fish , have it in a fish , have it
a day keeps the fridge . If you put it in
a day ; a fish , have it in the fridge
