# Using n-grams

In [2]:
import re

In [1]:
# Example sentence to tokenize.
sentence = "Thomas Jefferson began building Monicello at the age of 26."

In [6]:
# Split on runs of separator characters. The capturing group makes
# ``split`` return the separators as well, so they are filtered out below.
pattern = re.compile(r'([-\s.,;!?]+)')
# Keep only real tokens: drop empty strings and every piece that consists
# solely of separator characters. Matching against the pattern (rather than a
# substring test on '- \t\n.,;!?') also removes multi-character separator
# runs such as ', ' or '. ', which are not contiguous in that string.
tokens = [piece for piece in pattern.split(sentence)
          if piece and not pattern.fullmatch(piece)]
tokens

['Thomas',
 'Jefferson',
 'began',
 'building',
 'Monicello',
 'at',
 'the',
 'age',
 'of',
 '26']

In [26]:
def n_gram(tokens, n_gram=2):
    """Build the n-grams of a token sequence.

    Quick and dirty n-gram builder for n-grams of arbitrary size: a window
    of ``n_gram`` consecutive tokens is moved over ``tokens`` one position
    at a time.

    Args:
        tokens: Sequence of tokens; it must support ``len`` and slicing.
        n_gram: Number of tokens per n-gram. The parameter shadows the
            function name inside the body; the name is kept so existing
            keyword calls keep working.

    Returns:
        A list of tuples, each holding ``n_gram`` consecutive tokens. The
        list is empty when ``n_gram`` is larger than ``len(tokens)``. For
        ``n_gram == 1`` the ``tokens`` object itself is returned unchanged
        (the tokens are not wrapped in 1-tuples).

    Raises:
        ValueError: If ``n_gram`` is smaller than 1.
    """
    if n_gram < 1:
        # ValueError is the builtin for an argument whose value is unusable.
        raise ValueError("Can't build n-gram of {}".format(n_gram))

    if n_gram == 1:
        return tokens

    # Number of full windows that fit; zero or negative yields an empty range.
    window_count = len(tokens) - n_gram + 1
    return [tuple(tokens[start:start + n_gram]) for start in range(window_count)]

In [23]:
# Bigrams: every pair of neighbouring tokens
result = n_gram(tokens, n_gram=2)
result

[('Thomas', 'Jefferson'),
 ('Jefferson', 'began'),
 ('began', 'building'),
 ('building', 'Monicello'),
 ('Monicello', 'at'),
 ('at', 'the'),
 ('the', 'age'),
 ('age', 'of'),
 ('of', '26')]

In [25]:
# Trigrams: every run of three neighbouring tokens
result = n_gram(tokens, n_gram=3)
result

[('Thomas', 'Jefferson', 'began'),
 ('Jefferson', 'began', 'building'),
 ('began', 'building', 'Monicello'),
 ('building', 'Monicello', 'at'),
 ('Monicello', 'at', 'the'),
 ('at', 'the', 'age'),
 ('the', 'age', 'of'),
 ('age', 'of', '26')]

In [31]:
# Requesting a window larger than the number of tokens
print("Number of tokens: {}".format(len(tokens)))
result = n_gram(tokens, n_gram=11)
result

Number of tokens: 10


[]

In [34]:
# Each n-gram tuple can be joined back into a single space-separated string
list(map(" ".join, n_gram(tokens)))

['Thomas Jefferson',
 'Jefferson began',
 'began building',
 'building Monicello',
 'Monicello at',
 'at the',
 'the age',
 'age of',
 'of 26']

In [38]:
# Remove stop words from the token list.
# The stop-word list differs depending on the library used.
stop_words = ['a', 'an', 'the', 'on', 'of', 'off', 'this', 'is']
tokens = ['the', 'house', 'is', 'on', 'fire']
tokens_wa_stop = [word for word in tokens if word not in stop_words]
tokens_wa_stop

['house', 'fire']

## Exploring different normalization techniques
Normalization is important for reducing the vocabulary size, which in turn improves performance.

In [None]:
# Case Folding (case normalization)
tokens = ['House', 'Visitor', 'Center']
normalized_tokens = 