# Bag of Words Language Model

In [3]:
import nltk, re
from nltk.corpus import wordnet
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from collections import Counter

In [4]:
# English stop-word list from NLTK's stopwords corpus
# NOTE(review): stop_words is not referenced by any cell visible in this notebook — confirm it is still needed
stop_words = stopwords.words('english')
# shared WordNetLemmatizer instance; preprocess_text() uses it to lemmatize tokens
lemmatizer = WordNetLemmatizer()

In [12]:
def pos_tagging(word):
    """Guess the most likely WordNet part of speech for ``word``.

    Every WordNet synset of ``word`` is inspected and a tally is kept of how
    many are nouns ("n"), verbs ("v"), adjectives ("a") and adverbs ("r").
    Synsets with any other POS tag are ignored.

    Returns:
        The tag with the highest tally. Ties (including a word without any
        synsets) resolve in the order "n", "v", "a", "r".
    """
    candidate_tags = ("n", "v", "a", "r")

    # start every tag at zero so a word without synsets still yields an answer
    tally = Counter(dict.fromkeys(candidate_tags, 0))
    for synset in wordnet.synsets(word):
        tag = synset.pos()
        if tag in tally:
            tally[tag] += 1

    # max() returns the first of several equal maxima, i.e. the earliest tag
    return max(candidate_tags, key=tally.__getitem__)

In [14]:
def preprocess_text(text):
    """Normalize raw text into a list of lemmatized tokens.

    Steps:
        1. Replace every run of non-word characters (punctuation,
           whitespace) with a single space.
        2. Lower-case the text.
        3. Tokenize the text into words.
        4. Lemmatize each token using its most likely part of speech.

    Args:
        text: the raw string to normalize.

    Returns:
        A list with one lemmatized, lower-cased entry per token, in the
        order the tokens appear in ``text``.
    """
    # strip punctuation and lower-case the text
    cleaned = re.sub(r'\W+', ' ', text).lower()
    # tokenize text
    tokenized = word_tokenize(cleaned)
    # lemmatize each token with the POS guessed by pos_tagging(), the POS
    # helper defined in this notebook
    normalized = [lemmatizer.lemmatize(token, pos_tagging(token)) for token in tokenized]

    return normalized

In [28]:
def text_to_bow(some_text):
    """
    Build a Bag-of-Words dictionary for a piece of text.

    The text is passed through preprocess_text() and every resulting token
    is mapped to the number of times it occurs.
    """
    counts = {}
    # keys are created on first sight, so the dictionary keeps first-occurrence order
    for word in preprocess_text(some_text):
        counts[word] = counts.get(word, 0) + 1
    return counts

# demo: print the BoW dictionary for a short sample text
print(text_to_bow("""I love fantastic flying fish. These flying fish are just ok,
            so maybe I will find another few fantastic fish..."""))

{'i': 2, 'love': 1, 'fantastic': 2, 'fly': 2, 'fish': 3, 'these': 1, 'be': 1, 'just': 1, 'ok': 1, 'so': 1, 'maybe': 1, 'will': 1, 'find': 1, 'another': 1, 'few': 1}


In [29]:
from collections import Counter

# sample text for the Counter-based variant of the BoW count
text = """I love fantastic flying fish. These flying fish are just ok,
so maybe I will find another few fantastic fish..."""
# remove punctuation, lower-case, tokenize, lemmatize
tokens = preprocess_text(text)

# collections.Counter tallies the tokens directly, replacing the manual loop in text_to_bow()
print(Counter(tokens)) 

Counter({'fish': 3, 'i': 2, 'fantastic': 2, 'fly': 2, 'love': 1, 'these': 1, 'be': 1, 'just': 1, 'ok': 1, 'so': 1, 'maybe': 1, 'will': 1, 'find': 1, 'another': 1, 'few': 1})


When a BoW dictionary is not enough we convert the text into **BoW vectors**:
1. Create a **features dictionary** ('word': index)
2. Create a **BoW vector**

In [20]:
def create_features_dictionary(documents):
    """Map every distinct token in ``documents`` to a unique integer index.

    The documents are joined with single spaces, passed through
    preprocess_text(), and indices are handed out from 0 in order of first
    appearance.

    Returns:
        A ``(features_dictionary, tokens)`` tuple: the token -> index mapping
        and the tokens returned by preprocess_text() for the joined text.
    """
    # treat the whole corpus as one long text
    corpus_tokens = preprocess_text(" ".join(documents))

    token_to_index = {}
    for word in corpus_tokens:
        # a new word gets the next free index, which equals the current size
        token_to_index.setdefault(word, len(token_to_index))
    return token_to_index, corpus_tokens

# three short example documents used to build the features dictionary
training_documents = ["Five fantastic fish flew off to find faraway functions.",
                          "Maybe find another five fantastic fish?",
                          "Find my fish with a function please!"]

# print features dictionary ([0] selects the dictionary from the returned (dictionary, tokens) tuple)
print(create_features_dictionary(training_documents)[0])

{'five': 0, 'fantastic': 1, 'fish': 2, 'fly': 3, 'off': 4, 'to': 5, 'find': 6, 'faraway': 7, 'function': 8, 'maybe': 9, 'another': 10, 'my': 11, 'with': 12, 'a': 13, 'please': 14}


In [30]:
def text_to_bow_vector(some_text, features_dictionary):
    """Convert text to a Bag-of-Words vector using a features dictionary.

    Args:
        some_text: the raw text to vectorize.
        features_dictionary: mapping of token -> vector index; the vector has
            one slot per entry.

    Returns:
        A ``(bow_vector, tokens)`` tuple: the list of per-feature counts and
        the tokens returned by preprocess_text() for ``some_text``. Tokens
        that are missing from ``features_dictionary`` are not counted.
    """
    # create a list of 0s the length of features dictionary
    bow_vector = [0] * len(features_dictionary)
    # remove punctuation, lower-case, tokenize, lemmatize
    tokens = preprocess_text(some_text)

    for token in tokens:
        feature_index = features_dictionary.get(token)
        # a token that is not in the features dictionary has no slot in the
        # vector, so skip it instead of raising KeyError
        if feature_index is None:
            continue
        bow_vector[feature_index] += 1
    return bow_vector, tokens

# hard-coded token -> index mapping; each index is that token's position in the BoW vector
features_dictionary = {'function': 8, 'please': 14, 'find': 6, 'five': 0, 'with': 12, 'fantastic': 1, 'my': 11, 'another': 10, 'a': 13, 'maybe': 9, 'to': 5, 'off': 4, 'faraway': 7, 'fish': 2, 'fly': 3}

text = "Another five fish find another faraway fish."
# [0] selects the vector from the returned (vector, tokens) tuple
print(text_to_bow_vector(text, features_dictionary)[0])

[1, 0, 2, 0, 0, 0, 1, 1, 0, 0, 2, 0, 0, 0, 0]


In [32]:
from sklearn.feature_extraction.text import CountVectorizer
 
# documents the vectorizer is fitted on
training_documents = ["Five fantastic fish flew off to find faraway functions.",
                      "Maybe find another five fantastic fish?",
                      "Find my fish with a function please!"]

# text to vectorize; wrapped in a list because transform() is given a collection of documents here
test_text = ["Another five fish find another faraway fish."]

# instantiate CountVectorizer
bow_vectorizer = CountVectorizer()
# fit CountVectorizer to the training documents so it learns its features
bow_vectorizer.fit(training_documents)
# transform the test text into a BoW count vector over the fitted features
bow_vector = bow_vectorizer.transform(test_text)

# toarray() converts the transform() result for printing
print(bow_vector.toarray())

[[2 0 1 1 2 1 0 0 0 0 0 0 0 0 0]]


# Term Frequency-Inverse Document Frequency (tf-idf)
1. Term Frequency: how often a word appears in a document (same as BoW)
2. Inverse Document Frequency: a measure of how rare a word is across the documents of the overall corpus (the more documents a word appears in, the lower its score).

By penalizing the score of words that appear throughout a corpus, tf-idf can give better insight into how important a word is to a particular document of a corpus.

In [52]:
poem_1 = '''
Success is counted sweetest
By those who ne'er succeed.
To comprehend a nectar
Requires sorest need.

Not one of all the purple host
Who took the flag to-day
Can tell the definition,
So clear, of victory,

As he, defeated, dying,
On whose forbidden ear
The distant strains of triumph
Break, agonized and clear!'''

poem_2 = '''
Wild nights! Wild nights!
Were I with thee,
Wild nights should be
Our luxury!

Futile the winds
To a heart in port, —
Done with the compass,
Done with the chart.

Rowing in Eden!
Ah! the sea!
Might I but moor
To-night in thee!'''

poem_3 = '''
I'm nobody! Who are you?
Are you nobody, too?
Then there 's a pair of us — don't tell!
They 'd banish us, you know.

How dreary to be somebody!
How public, like a frog
To tell your name the livelong day
To an admiring bog!'''

poem_4 = '''
I felt a funeral in my brain,
   And mourners, to and fro,
Kept treading, treading, till it seemed
   That sense was breaking through.

And when they all were seated,
   A service like a drum
Kept beating, beating, till I thought
   My mind was going numb.

And then I heard them lift a box,
   And creak across my soul
With those same boots of lead, again.
   Then space began to toll

As all the heavens were a bell,
   And Being but an ear,
And I and silence some strange race,
   Wrecked, solitary, here.'''

poem_5 = '''
Hope is the thing with feathers
That perches in the soul,
And sings the tune without the words,
And never stops at all,

And sweetest in the gale is heard;
And sore must be the storm
That could abash the little bird
That kept so many warm.

I 've heard it in the chillest land,
And on the strangest sea;
Yet, never, in extremity,
It asked a crumb of me.'''

poem_6 = '''
The pedigree of honey
Does not concern the bee;
A clover, any time, to him
Is aristocracy.'''

# corpus for the tf-idf examples: one document per poem
poems = [poem_1, poem_2, poem_3, poem_4, poem_5, poem_6]

## Part I: Term Frequency

In [61]:
import pandas as pd

# preprocess text; preprocess_text() returns a list of tokens, but
# CountVectorizer expects one string per document, so re-join with spaces
processed_poems = [" ".join(preprocess_text(poem)) for poem in poems]

# initialize and fit CountVectorizer
vectorizer = CountVectorizer()
term_frequencies = vectorizer.fit_transform(processed_poems)

# get vocabulary of terms; get_feature_names() was removed in scikit-learn 1.2,
# so prefer get_feature_names_out() and fall back only on older versions
feature_names = (
    vectorizer.get_feature_names_out()
    if hasattr(vectorizer, "get_feature_names_out")
    else vectorizer.get_feature_names()
)

# get corpus index
corpus_index = [f"Poem {i+1}" for i in range(len(poems))]

# create pandas DataFrame with term frequencies (term-doc matrix);
# toarray() gives a plain ndarray rather than the legacy np.matrix from todense()
df_term_frequencies = pd.DataFrame(term_frequencies.T.toarray(), index=feature_names, columns=corpus_index)
print(df_term_frequencies)

         Poem 1  Poem 2  Poem 3  Poem 4  Poem 5  Poem 6
abash         0       0       0       0       1       0
across        0       0       0       1       0       0
admire        0       0       1       0       0       0
again         0       0       0       1       0       0
agonize       1       0       0       0       0       0
...         ...     ...     ...     ...     ...     ...
word          0       0       0       0       1       0
wreck         0       0       0       1       0       0
yet           0       0       0       0       1       0
you           0       0       3       0       0       0
your          0       0       1       0       0       0

[173 rows x 6 columns]


## Part II: Inverse Document Frequency
The intuition is that words that appear more frequently in the corpus give less insight into the topic or meaning of an individual document, and should thus be deprioritized.

In [56]:
from sklearn.feature_extraction.text import TfidfVectorizer

# TfidfVectorizer expects one string per document, so join any token lists
poem_texts = [poem if isinstance(poem, str) else " ".join(poem) for poem in processed_poems]

# initialize and fit TfidfVectorizer (norm=None is passed through unchanged)
vectorizer = TfidfVectorizer(norm=None)
tfidf_scores = vectorizer.fit_transform(poem_texts)

# get vocabulary of terms; get_feature_names() was removed in scikit-learn 1.2,
# so prefer get_feature_names_out() and fall back only on older versions
feature_names = (
    vectorizer.get_feature_names_out()
    if hasattr(vectorizer, "get_feature_names_out")
    else vectorizer.get_feature_names()
)

# get corpus index
corpus_index = [f"Poem {i+1}" for i in range(len(poems))]

# create pandas DataFrame with tf-idf scores; deliberately not wrapped in a
# bare try/except, so a failure shows a traceback instead of a missing table
df_tf_idf = pd.DataFrame(tfidf_scores.T.toarray(), index=feature_names, columns=corpus_index)
print(df_tf_idf)

           Poem 1  Poem 2    Poem 3    Poem 4    Poem 5  Poem 6
abash    0.000000     0.0  0.000000  0.000000  2.252763     0.0
across   0.000000     0.0  0.000000  2.252763  0.000000     0.0
admire   0.000000     0.0  2.252763  0.000000  0.000000     0.0
again    0.000000     0.0  0.000000  2.252763  0.000000     0.0
agonize  2.252763     0.0  0.000000  0.000000  0.000000     0.0
...           ...     ...       ...       ...       ...     ...
word     0.000000     0.0  0.000000  0.000000  2.252763     0.0
wreck    0.000000     0.0  0.000000  2.252763  0.000000     0.0
yet      0.000000     0.0  0.000000  0.000000  2.252763     0.0
you      0.000000     0.0  6.758289  0.000000  0.000000     0.0
your     0.000000     0.0  2.252763  0.000000  0.000000     0.0

[173 rows x 6 columns]


## Converting BoW into tf-idf

In [60]:
from sklearn.feature_extraction.text import TfidfTransformer

# CountVectorizer expects one string per document, so join any token lists
poem_texts = [poem if isinstance(poem, str) else " ".join(poem) for poem in processed_poems]

# initialize and fit CountVectorizer
vectorizer = CountVectorizer()
bow_matrix = vectorizer.fit_transform(poem_texts)

# get vocabulary of terms; get_feature_names() was removed in scikit-learn 1.2,
# so prefer get_feature_names_out() and fall back only on older versions
feature_names = (
    vectorizer.get_feature_names_out()
    if hasattr(vectorizer, "get_feature_names_out")
    else vectorizer.get_feature_names()
)

# get corpus index
corpus_index = [f"Poem {i+1}" for i in range(len(poems))]

# create pandas DataFrame with term frequencies
df_bag_of_words = pd.DataFrame(bow_matrix.T.toarray(), index=feature_names, columns=corpus_index)

# display term-document matrix of term frequencies (bag-of-words)
print(df_bag_of_words)

# initialize and fit TfidfTransformer, transform bag-of-words matrix
transformer = TfidfTransformer(norm=None)
tfidf_scores = transformer.fit_transform(bow_matrix)

# create pandas DataFrame with tf-idf scores; deliberately not wrapped in a
# bare try/except, so a failure shows a traceback instead of a missing table
df_tf_idf = pd.DataFrame(tfidf_scores.T.toarray(), index=feature_names, columns=corpus_index)
print(df_tf_idf)

         Poem 1  Poem 2  Poem 3  Poem 4  Poem 5  Poem 6
abash         0       0       0       0       1       0
across        0       0       0       1       0       0
admire        0       0       1       0       0       0
again         0       0       0       1       0       0
agonize       1       0       0       0       0       0
...         ...     ...     ...     ...     ...     ...
word          0       0       0       0       1       0
wreck         0       0       0       1       0       0
yet           0       0       0       0       1       0
you           0       0       3       0       0       0
your          0       0       1       0       0       0

[173 rows x 6 columns]
           Poem 1  Poem 2    Poem 3    Poem 4    Poem 5  Poem 6
abash    0.000000     0.0  0.000000  0.000000  2.252763     0.0
across   0.000000     0.0  0.000000  2.252763  0.000000     0.0
admire   0.000000     0.0  2.252763  0.000000  0.000000     0.0
again    0.000000     0.0  0.000000  2.252763  0