In [1]:
# Import the dependencies
import nltk
from nltk.corpus import stopwords
# Download the NLTK resources this notebook relies on: 'punkt' is the
# tokenizer model used by nltk.word_tokenize, and 'stopwords' is the corpus
# read by stopwords.words(). Without the 'stopwords' download the lookup
# below raises LookupError on a machine that has never fetched the corpus.
# NOTE(review): recent NLTK releases look for 'punkt_tab' when tokenizing —
# confirm against the installed version and add it to this tuple if needed.
for resource in ('punkt', 'stopwords'):
    nltk.download(resource)
# Initialize the stopwords (a set gives fast membership checks when filtering)
stop_words = set(stopwords.words('english'))

[nltk_data] Downloading package punkt to /Users/tberton/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [2]:
# The three example sentences that make up the corpus for this notebook.
sentence_1, sentence_2, sentence_3 = (
    "I want to invest for retirement.",
    "Should I invest in mutual funds, or should I invest in stocks?",
    "I should schedule an appointment with a financial planner.",
)

In [3]:
# Import regex
import re

# Regex matching every character that is neither an ASCII letter nor
# whitespace (e.g. punctuation and digits). `\s` already covers the plain
# space, so no separate space is needed inside the character class.
pattern = r'[^a-zA-Z\s]'

# Run all sentences through one shared pipeline instead of repeating the
# same statements once per sentence.
sentences = [sentence_1, sentence_2, sentence_3]

# Remove the unwanted characters from each sentence.
cleaned_sentences = [re.sub(pattern, '', sentence) for sentence in sentences]

# Lowercase and tokenize each cleaned sentence. `tokens` holds one list of
# word tokens per sentence, in the same order as `sentences`.
tokens = [nltk.word_tokenize(cleaned.lower()) for cleaned in cleaned_sentences]

# Keep the per-sentence names available for any cell that refers to them.
sentence_1_cleaned, sentence_2_cleaned, sentence_3_cleaned = cleaned_sentences
sentence_1_tokens, sentence_2_tokens, sentence_3_tokens = tokens

# Display the tokens.
tokens

[['i', 'want', 'to', 'invest', 'for', 'retirement'],
 ['should',
  'i',
  'invest',
  'in',
  'mutual',
  'funds',
  'or',
  'should',
  'i',
  'invest',
  'in',
  'stocks'],
 ['i',
  'should',
  'schedule',
  'an',
  'appointment',
  'with',
  'a',
  'financial',
  'planner']]

In [4]:
# Remove stopwords: keep, for every tokenized sentence, only the words that
# are not in the stop_words set (one filtered list per sentence).
filtered_tokens = [
    [word for word in sentence_tokens if word not in stop_words]
    for sentence_tokens in tokens
]

# Display the filtered_tokens
filtered_tokens

[['want', 'invest', 'retirement'],
 ['invest', 'mutual', 'funds', 'invest', 'stocks'],
 ['schedule', 'appointment', 'financial', 'planner']]

In [5]:
# Create the bag-of-words: map each remaining word to the number of times it
# occurs across all of the filtered sentences.
bag_of_words = {}
for sentence_tokens in filtered_tokens:
    for word in sentence_tokens:
        # Unseen words start from 0, then every occurrence adds 1.
        bag_of_words[word] = bag_of_words.get(word, 0) + 1

# Print the bag_of_words
print(bag_of_words)

{'want': 1,
 'invest': 3,
 'retirement': 1,
 'mutual': 1,
 'funds': 1,
 'stocks': 1,
 'schedule': 1,
 'appointment': 1,
 'financial': 1,
 'planner': 1}

### Using scikit-learn's `CountVectorizer`, demonstrate how a bag-of-words (BoW) is created.

In [6]:
# Import the dependencies
from sklearn.feature_extraction.text import CountVectorizer
import pandas as pd

In [7]:
# Build a CountVectorizer configured with scikit-learn's built-in English
# stop-word list.
vectorizer = CountVectorizer(stop_words='english')

# Learn the vocabulary from the three sentences and encode them as word
# counts in a single step.
corpus = [sentence_1, sentence_2, sentence_3]
bag_of_words = vectorizer.fit_transform(corpus)

# Show the resulting counts as a dense array.
print(bag_of_words.toarray())

[[0 0 0 1 0 0 1 0 0 1]
 [0 0 1 2 1 0 0 0 1 0]
 [1 1 0 0 0 1 0 1 0 0]]


In [8]:
# Wrap the counts in a DataFrame whose column labels are the terms the
# vectorizer learned.
feature_names = vectorizer.get_feature_names_out()
bow_df = pd.DataFrame(data=bag_of_words.toarray(), columns=feature_names)
bow_df

Unnamed: 0,appointment,financial,funds,invest,mutual,planner,retirement,schedule,stocks,want
0,0,0,0,1,0,0,1,0,0,1
1,0,0,1,2,1,0,0,0,1,0
2,1,1,0,0,0,1,0,1,0,0


In [9]:
# Print the vocabulary, i.e. the DataFrame's column labels as a plain list.
vocabulary = bow_df.columns.to_list()
print(vocabulary)

['appointment', 'financial', 'funds', 'invest', 'mutual', 'planner', 'retirement', 'schedule', 'stocks', 'want']


In [10]:
# Total each column to get the number of times every vocabulary word occurs
# across all of the sentences.
occurrence = bow_df.sum(axis="index")
print(occurrence)

appointment    1
financial      1
funds          1
invest         3
mutual         1
planner        1
retirement     1
schedule       1
stocks         1
want           1
dtype: int64
