# Bag Of Words

In [1]:
import nltk, re
from nltk.corpus import wordnet
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from collections import Counter

stop_words = stopwords.words('english')
normalizer = WordNetLemmatizer()

def get_part_of_speech(word):
    probable_part_of_speech = wordnet.synsets(word)
    pos_counts = Counter()
    pos_counts["n"] = len(  [ item for item in probable_part_of_speech if item.pos()=="n"]  )
    pos_counts["v"] = len(  [ item for item in probable_part_of_speech if item.pos()=="v"]  )
    pos_counts["a"] = len(  [ item for item in probable_part_of_speech if item.pos()=="a"]  )
    pos_counts["r"] = len(  [ item for item in probable_part_of_speech if item.pos()=="r"]  )
    most_likely_part_of_speech = pos_counts.most_common(1)[0][0]
    return most_likely_part_of_speech

def preprocess_text(text):
    cleaned = re.sub(r'\W+', ' ', text).lower()
    tokenized = word_tokenize(cleaned)
    normalized = [normalizer.lemmatize(token, get_part_of_speech(token)) for token in tokenized]
    return normalized

### Features Dictionary

In [2]:
# Create the Features dictionary by assigning an index to every word in the training document

def create_features_dictionary(documents):
    features_dictionary = {}
    merged = " ".join(documents)
    tokens = preprocess_text(merged)
    index = 0
    for token in tokens:
        if token not in features_dictionary.keys():
            features_dictionary[token] = index
            index += 1
    return features_dictionary, tokens

training_documents = ["Five fantastic fish flew off to find faraway functions.", "Maybe find another five fantastic fish?", "Find my fish with a function please!"]

print(create_features_dictionary(training_documents)[0])

{'five': 0, 'fantastic': 1, 'fish': 2, 'fly': 3, 'off': 4, 'to': 5, 'find': 6, 'faraway': 7, 'function': 8, 'maybe': 9, 'another': 10, 'my': 11, 'with': 12, 'a': 13, 'please': 14}


In [7]:
# Build a BoW vector

def text_to_bow_vector(some_text, features_dictionary):
    bow_vector = [0] * len(features_dictionary)
    tokens = preprocess_text(some_text)
    for token in tokens:
      index = features_dictionary[token]
      bow_vector[index] += 1
    return bow_vector, tokens

features_dictionary = {'function': 8, 'please': 14, 'find': 6, 'five': 0, 'with': 12, 'fantastic': 1, 'my': 11, 'another': 10, 'a': 13, 'maybe': 9, 'to': 5, 'off': 4, 'faraway': 7, 'fish': 2, 'fly': 3}

text = "Another five fish find another faraway fish."
print(text_to_bow_vector(text, features_dictionary)[0])

[1, 0, 2, 0, 0, 0, 1, 1, 0, 0, 2, 0, 0, 0, 0]


In [8]:
# It's better to convert tokens to bow_vector

def tokens_to_bow_vector(document_tokens, features_dictionary):
    bow_vector = [0] * len(features_dictionary)
    for token in document_tokens:
      if token in features_dictionary:
        feature_index = features_dictionary[token]
        bow_vector[feature_index] += 1
    return bow_vector

### Classifying text

We can use a Naive Bayes Classifier with our bags of words to classify text.

In [48]:
test_docs = ['I am in love with The Sun',
            'Love, love, love music, life',
            'All you need is love']
training_docs = ['I am in love with music', 
                 'Life is Love', 
                 'Cats are terrible people',
                 'love is the key to happiness', 
                 'The world is burning and I\'m eating icecream',
                 'I used to think that I should watch TV',
                 'I used think that it was good for me',
                 'We don\'t need no education',
                 'All you need is love']
training_labels = [1, 1, 0, 1, 0, 0, 0, 0, 1]

In [49]:
training_doc_tokens = [preprocess_text(text) for text in training_docs]

In [50]:
# Define bow_sms_dictionary:
bow_sms_dictionary = create_features_dictionary(training_docs)

# Define training_vectors:
training_vectors = [tokens_to_bow_vector(
    training_doc, bow_sms_dictionary) 
                    for training_doc in training_docs]

# Define test_vectors:
test_vectors = [tokens_to_bow_vector(
    test_doc, bow_sms_dictionary) for test_doc in test_docs]

In [51]:
# Classify
from sklearn.naive_bayes import MultinomialNB
classifier = MultinomialNB()
classifier.fit(training_vectors, training_labels)
predictions = classifier.predict(test_vectors)

In [52]:
print(predictions)

[0 0 0]


## Using libraries

Amazing work! As is the case with many tasks in Python, there’s already a library that can do all of that work for you.

For ``text_to_bow()``, you can approximate the functionality with the ``collections`` module’s ``Counter()`` function:

In [53]:
from collections import Counter
 
tokens = ['another', 'five', 'fish', 'find', 'another', 'faraway', 'fish']
print(Counter(tokens))

Counter({'another': 2, 'fish': 2, 'five': 1, 'find': 1, 'faraway': 1})


For vectorization, you can use ``CountVectorizer`` from the machine learning library ``scikit-learn``. You can use ``fit()`` to train the features dictionary and then ``transform()`` to transform text into a vector:

Text preprocessing, tokenizing and filtering of stopwords are all included in ``sklearn.feature_extraction.CountVectorizer``, which builds a dictionary of features and transforms documents to feature vectors.

In [54]:
from sklearn.feature_extraction.text import CountVectorizer
 
training_documents = ["Five fantastic fish flew off to find faraway functions.", "Maybe find another five fantastic fish?", "Find my fish with a function please!"]
test_text = ["Another five fish find another faraway fish."]
bow_vectorizer = CountVectorizer()
bow_vectorizer.fit(training_documents)
bow_vector = bow_vectorizer.transform(test_text)
print(bow_vector.toarray())

[[2 0 1 1 2 1 0 0 0 0 0 0 0 0 0]]


# Bi-grams

In [55]:
from nltk.util import ngrams
from collections import Counter

In [56]:
text = "It's exciting to watch flying fish after a hard day's work. I don't know why some fish prefer flying and other fish would rather swim. It seems like the fish just woke up one day and decided, 'hey, today is the day to fly away.'"
tokens = preprocess_text(text)

In [57]:
# Bigram approach:
bigrams_prepped = ngrams(tokens, 2)
bigrams = Counter(bigrams_prepped)
print("Three most frequent word sequences and the number of occurrences according to Bigrams:")
print(bigrams.most_common(3))

Three most frequent word sequences and the number of occurrences according to Bigrams:
[(('it', 's'), 1), (('s', 'excite'), 1), (('excite', 'to'), 1)]


In [58]:
# Bag-of-Words approach:
bag_of_words = Counter(tokens)
most_common_three = bag_of_words.most_common(3)

print("\nThree most frequent words and number of occurrences according to Bag-of-Words:")
print(most_common_three)


Three most frequent words and number of occurrences according to Bag-of-Words:
[('fish', 4), ('fly', 3), ('day', 3)]
