# Classification with Bag of Words

In [None]:
import numpy as np

In [None]:
# Sample corpus: four short documents (one string per document).
# It is reused by the from-scratch and the scikit-learn Bag-of-Words cells below.

corpus_fox = [
    "the quick brown fox jumps over the lazy dog",
    "the quick brown fox is very quick",
    "the quick brown fox jumps",
    "the lazy dog jumps over the quick fox"
]

# Exercise: Bag-of-Words model from Scratch

1) Define a function that returns the vocabulary of a corpus as a **sorted** list of unique words
2) Define a function that returns the bag-of-words representation of the corpus as a 2D numpy array

In [None]:
# define a function to get the unique list of words of a corpus
# take care that it is sorted
def get_vocabulary(corpus):
    """Return the sorted list of unique words occurring in a corpus.

    Documents are tokenized by plain whitespace splitting; no lowercasing
    or punctuation stripping is applied.

    Args:
        corpus: iterable of documents, each document being one string.

    Returns:
        list of str: every distinct word exactly once, in ascending
        (lexicographic) order. An empty corpus yields an empty list.
    """

    ### YOUR SOLUTION HERE
    unique_words = set()
    for document in corpus:
        # a set drops duplicates within and across documents
        unique_words.update(document.split())
    # sorting fixes the column order used by the term frequency matrix
    vocab_list = sorted(unique_words)
    ### END OF SOLUTION

    return vocab_list

In [None]:
# Build the vocabulary of the sample corpus and show it
vocab = get_vocabulary(corpus_fox)
print(vocab)

In [None]:
# test if your implementation is correct
# corpus_fox contains 10 distinct words; the expected list is in sorted order
v_test = get_vocabulary(corpus_fox)
assert len(v_test) == 10
assert ['brown', 'dog', 'fox', 'is', 'jumps', 'lazy', 'over', 'quick', 'the', 'very'] == v_test

print("=========== Tests passed =============")

In [None]:
# define a function to get the bag of words or term frequency matrix
# it should return a 2D numpy array
# each row represents a document and each column a word from the vocabulary

def calculate_term_frequency_matrix(doc):
    """Compute the bag-of-words (term frequency) matrix of a corpus.

    Args:
        doc: the corpus, i.e. a sequence of documents, each one string.
            (The parameter name is kept for backward compatibility.)

    Returns:
        numpy.ndarray of shape (len(doc), len(vocabulary)) with float
        counts. Entry [i, j] is how often the j-th word of the sorted
        vocabulary occurs in the i-th document.

    Raises:
        KeyError: if a whitespace-separated token of a document is missing
            from the vocabulary returned by get_vocabulary, i.e. if the two
            functions do not tokenize the same way.
    """
    vocabulary = get_vocabulary(doc)
    word_to_id = {word: i for i, word in enumerate(vocabulary)} # dict mapping from word to index
    tf_matrix = np.zeros((len(doc), len(vocabulary)))

    ### YOUR SOLUTION HERE
    for row, document in enumerate(doc):
        for word in document.split():
            # one increment per occurrence, so repeated words count multiple times
            tf_matrix[row, word_to_id[word]] += 1
    ### END OF SOLUTION

    return tf_matrix

In [None]:
# Compute the term frequency matrix of the sample corpus;
# the bare expression on the last line displays it as the cell output
tf_matrix = calculate_term_frequency_matrix(corpus_fox)
tf_matrix

In [None]:
# test if your implementation is correct
# expected counts: one row per document of corpus_fox (in order),
# one column per word of the sorted vocabulary
test_matrix = np.array([[1., 1., 1., 0., 1., 1., 1., 1., 2., 0.],
       [1., 0., 1., 1., 0., 0., 0., 2., 1., 1.],
       [1., 0., 1., 0., 1., 0., 0., 1., 1., 0.],
       [0., 1., 1., 0., 1., 1., 1., 1., 2., 0.]])

# tf_matrix is the result computed in the previous cell
assert np.array_equal(tf_matrix, test_matrix), "The term frequency matrix is not correct"
print("=========== Tests passed =============")

# Bag of Words model with scikit-learn

The following code shows how to obtain the same vocabulary and count matrix with scikit-learn. There are other libraries for statistical NLP as well.

In [None]:
from sklearn.feature_extraction.text import CountVectorizer

In [None]:
# Create an instance of CountVectorizer
vectorizer = CountVectorizer()

# Fit the vectorizer to the corpus and transform the corpus into a Bag of Words matrix
# Note: fit_transform returns a sparse matrix!
bow_matrix = vectorizer.fit_transform(corpus_fox)

# Get the list of unique words (vocabulary)
vocab = vectorizer.get_feature_names_out()

# Convert the Bag of Words matrix to a dense numpy array for easier manipulation
bow_matrix_dense = bow_matrix.toarray()

# Print the results
print("Vocabulary:")
print(vocab)
print("\nSize of vocabulary:", len(vocab))
print("\nBag of Words matrix (dense):")
print(bow_matrix_dense)
print("\nBag of Words matrix (sparse):")
print(bow_matrix)


# Cross-check against the from-scratch implementation:
# tf_matrix was computed by calculate_term_frequency_matrix in an earlier cell
assert len(vocab) == 10, "The length of the vocabulary is not correct"
assert np.array_equal(bow_matrix_dense, tf_matrix), "The Bag of Words matrix is not correct"

print("=========== Tests passed =============")

# N-grams with scikit-learn 

Almost the same steps as before, but we can specify the n-gram range when creating the CountVectorizer instance!

In [None]:
# ngram_range is (min_n, max_n): (3, 3) extracts trigrams only
N = (3, 3)
vectorizer = CountVectorizer(ngram_range=N)

# Note: this overwrites vectorizer, bow_matrix, vocab and bow_matrix_dense from the unigram cell above
bow_matrix = vectorizer.fit_transform(corpus_fox)
vocab = vectorizer.get_feature_names_out()
bow_matrix_dense = bow_matrix.toarray()

# Print the results
print("Vocabulary:")
print(vocab)
print("\nSize of vocabulary:", len(vocab))
print("\nBag of Words matrix (dense):")
print(bow_matrix_dense)

# Classification with Bag of Words



In [None]:
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
# from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

# Step 1: Training data
corpus = [
    'I love this product',
    'This is a great product',
    'I hate this product',
    'This product is not good',
    'I am very happy with this product',
    'I am not satisfied with this product'
]

# Labels (1 for positive, 0 for negative)
labels = np.array([1, 1, 0, 0, 1, 0])

# Step 2: Use CountVectorizer to convert the corpus into a matrix of token counts
# Create an instance of CountVectorizer with n-gram range (1, 2)
vectorizer = CountVectorizer(ngram_range=(1, 2))

# Fit the vectorizer to the corpus and transform the corpus into n-gram count vectors
X = vectorizer.fit_transform(corpus)

# Step 3: Train a Logistic Regression classifier
classifier = LogisticRegression()
classifier.fit(X, labels)

# Step 4: Evaluate the classifier on an unseen test set
new_reviews = [
    'I love this product, it is amazing!',
    'I hate this product, it is terrible!',
    'It is rather bad'
]

new_labels = np.array([1, 0, 0])


### YOUR SOLUTION HERE
# Convert the new reviews into n-gram count vectors.
# Use transform (not fit_transform): the test data must be mapped onto the
# vocabulary learned from the training corpus, otherwise the columns would
# not line up with the features the classifier was trained on.
X_new = vectorizer.transform(new_reviews)

# Predict the labels for the test set
y_pred = classifier.predict(X_new)
### END OF SOLUTION

# Evaluate the classifier
accuracy = accuracy_score(new_labels, y_pred)
print(f'Accuracy: {accuracy * 100:.2f}%')
# accuracy should be 66.67%

# Think about: Why is the last review misclassified?