# Bag of Words

In [1]:
import pandas as pd
import numpy as np
import warnings
warnings.filterwarnings("ignore")

In [2]:
df = pd.DataFrame(
    {
        "Text": ['I love cats','cats are cute','I like cats','cats are cats','cats watching cats','my life my rule']
    
})

In [3]:
df

Unnamed: 0,Text
0,I love cats
1,cats are cute
2,I like cats
3,cats are cats
4,cats watching cats
5,my life my rule


In [4]:
from sklearn.feature_extraction.text import CountVectorizer
cv = CountVectorizer(lowercase=False, tokenizer = lambda txt : txt.split())

In [5]:
bow = cv.fit_transform(df['Text'])

In [6]:
bow

<6x10 sparse matrix of type '<class 'numpy.int64'>'
	with 16 stored elements in Compressed Sparse Row format>

In [7]:
cv.vocabulary_ # index number

{'I': 0,
 'love': 6,
 'cats': 2,
 'are': 1,
 'cute': 3,
 'like': 5,
 'watching': 9,
 'my': 7,
 'life': 4,
 'rule': 8}

In [8]:
bow[5].toarray()

array([[0, 0, 0, 0, 1, 0, 0, 2, 1, 0]], dtype=int64)

# One Hot Encoding

In [9]:
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer

# Sample list of documents
documents = [
    'I love cats',
    'cats are cute',
    'I like cats',
    'cats are cats',
    'cats watching cats',
    'my life my rule'
]

# Create a CountVectorizer object for one-hot encoding
vectorizer = CountVectorizer(binary=True,tokenizer = lambda txt : txt.split(),lowercase=False)

# Fit and transform the documents
one_hot_encoded = vectorizer.fit_transform(documents)

# Convert the one-hot encoded matrix to a DataFrame for better display
one_hot_df = pd.DataFrame(one_hot_encoded.toarray(), columns=vectorizer.get_feature_names_out())

# Display the one-hot encoded representation in a table format
print(one_hot_df)


   I  are  cats  cute  life  like  love  my  rule  watching
0  1    0     1     0     0     0     1   0     0         0
1  0    1     1     1     0     0     0   0     0         0
2  1    0     1     0     0     1     0   0     0         0
3  0    1     1     0     0     0     0   0     0         0
4  0    0     1     0     0     0     0   0     0         1
5  0    0     0     0     1     0     0   1     1         0


In [10]:
from sklearn.feature_extraction.text import CountVectorizer

# Sample list of documents
documents = [
    'I love cats',
    'cats are cute',
    'I like cats',
    'cats are cats',
    'cats watching cats',
    'my life my rule'
]

# Create a CountVectorizer object for one-hot encoding
vectorizer = CountVectorizer(binary=True,tokenizer = lambda txt : txt.split(),lowercase=False)

# Fit and transform the documents
one_hot_encoded = vectorizer.fit_transform(documents)

# Convert the one-hot encoded matrix to an array
one_hot_encoded_array = one_hot_encoded.toarray()

# Display the one-hot encoded representation
print(one_hot_encoded_array)


[[1 0 1 0 0 0 1 0 0 0]
 [0 1 1 1 0 0 0 0 0 0]
 [1 0 1 0 0 1 0 0 0 0]
 [0 1 1 0 0 0 0 0 0 0]
 [0 0 1 0 0 0 0 0 0 1]
 [0 0 0 0 1 0 0 1 1 0]]


In [11]:
# Define the sentences
sentences = ['I love cats','cats are cute','I like cats','cats are cats','cats watching cats','my life my rule']

# Create a vocabulary set
vocab = set()
for sentence in sentences:
        words = sentence.lower().split()
        for word in words:
            vocab.add(word)

# Create a dictionary to map words to integers
word_to_int = {word: i for i, word in enumerate(vocab)}

# Create a binary vector for each word in each sentence
vectors = []
for sentence in sentences:
        words = sentence.lower().split()
        sentence_vectors = []
        for word in words:
            binary_vector = np.zeros(len(vocab))
            binary_vector[word_to_int[word]] = 1
            sentence_vectors.append(binary_vector)
        vectors.append(sentence_vectors)

# Print the one-hot encoded vectors for each word in each sentence
for i in range(len(sentences)):
        print(f"Sentences {i + 1}:")
        for j in range(len(vectors[i])):
            print(f"{sentences[i].split()[j]}: {vectors[i][j]}")


Sentences 1:
I: [0. 0. 1. 0. 0. 0. 0. 0. 0. 0.]
love: [0. 1. 0. 0. 0. 0. 0. 0. 0. 0.]
cats: [0. 0. 0. 0. 0. 0. 0. 0. 0. 1.]
Sentences 2:
cats: [0. 0. 0. 0. 0. 0. 0. 0. 0. 1.]
are: [0. 0. 0. 0. 0. 0. 1. 0. 0. 0.]
cute: [0. 0. 0. 0. 0. 0. 0. 1. 0. 0.]
Sentences 3:
I: [0. 0. 1. 0. 0. 0. 0. 0. 0. 0.]
like: [0. 0. 0. 1. 0. 0. 0. 0. 0. 0.]
cats: [0. 0. 0. 0. 0. 0. 0. 0. 0. 1.]
Sentences 4:
cats: [0. 0. 0. 0. 0. 0. 0. 0. 0. 1.]
are: [0. 0. 0. 0. 0. 0. 1. 0. 0. 0.]
cats: [0. 0. 0. 0. 0. 0. 0. 0. 0. 1.]
Sentences 5:
cats: [0. 0. 0. 0. 0. 0. 0. 0. 0. 1.]
watching: [1. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
cats: [0. 0. 0. 0. 0. 0. 0. 0. 0. 1.]
Sentences 6:
my: [0. 0. 0. 0. 0. 0. 0. 0. 1. 0.]
life: [0. 0. 0. 0. 0. 1. 0. 0. 0. 0.]
my: [0. 0. 0. 0. 0. 0. 0. 0. 1. 0.]
rule: [0. 0. 0. 0. 1. 0. 0. 0. 0. 0.]


# N-Grams

In [12]:
from nltk import ngrams

In [13]:
sentence = "this is a foo bar sentences and i want to ngramize it"

In [14]:
n=4
ngramsres = ngrams(sentence.split(),n)
for grams in ngramsres:
    print(grams)

('this', 'is', 'a', 'foo')
('is', 'a', 'foo', 'bar')
('a', 'foo', 'bar', 'sentences')
('foo', 'bar', 'sentences', 'and')
('bar', 'sentences', 'and', 'i')
('sentences', 'and', 'i', 'want')
('and', 'i', 'want', 'to')
('i', 'want', 'to', 'ngramize')
('want', 'to', 'ngramize', 'it')
