### Bag of words

In [7]:
sent1 = 'It is good practice for us.'
sent2 = 'It was also good to know about it.'

In [8]:
import pandas as pd
from nltk.tokenize import word_tokenize
s1 = word_tokenize(sent1.lower())
s2 = word_tokenize(sent2.lower())
s1


['it', 'is', 'good', 'practice', 'for', 'us', '.']

In [9]:
s3 = s1+s2
s3 = list(set(s3))
type(s3)


list

In [10]:
df = pd.DataFrame(columns=s3)
cols = df.columns
a = [s1.count(col) for col in cols]  # col[0] to get the string from tuple
b = [s2.count(col) for col in cols]
df.loc[0] = a
df.loc[1] = b
df

Unnamed: 0,about,was,practice,to,.,good,us,also,it,know,is,for
0,0,0,1,0,1,1,1,0,1,0,1,1
1,1,1,0,1,1,1,0,1,2,1,0,0


# Count vectorizer

In [11]:
from sklearn.feature_extraction.text import CountVectorizer

cvt = CountVectorizer()
new_data = cvt.fit_transform([sent1,sent2])
new_data.toarray


<bound method _cs_matrix.toarray of <Compressed Sparse Row sparse matrix of dtype 'int64'
	with 13 stored elements and shape (2, 11)>>

In [12]:
features = cvt.get_feature_names_out()

In [13]:
df = pd.DataFrame(new_data.toarray(), columns=features)
df

Unnamed: 0,about,also,for,good,is,it,know,practice,to,us,was
0,0,0,1,1,1,1,0,1,0,1,0
1,1,1,0,1,0,2,1,0,1,0,1


In [14]:
sent3 = "It was about good practice."
new_data = cvt.fit_transform([sent1, sent2, sent3])
df = pd.DataFrame(new_data.toarray(), columns=cvt.get_feature_names_out())
df


Unnamed: 0,about,also,for,good,is,it,know,practice,to,us,was
0,0,0,1,1,1,1,0,1,0,1,0
1,1,1,0,1,0,2,1,0,1,0,1
2,1,0,0,1,0,1,0,1,0,0,1


In [15]:
# or add using this
cvt.transform([sent3]).toarray()

array([[1, 0, 0, 1, 0, 1, 0, 1, 0, 0, 1]])

In [21]:
sent4 = 'Inshallah good practice. Boys played well.'
cvt.transform([sent4]).toarray()

array([[0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0]])

# N-GRAM VECTORIZATION

In [16]:
# help(CountVectorizer)

<p>
ngram_range=(2,2) means: Create word pairs from consecutive words

Examples:

    Text: "I love python"

    Bigrams (2-word pairs): "I love", "love python"

What ngram_range does:

    (1,1) = single words only

    (2,2) = word pairs only

    (1,2) = single words AND word pairs

    (2,3) = word pairs AND three-word sequences
</p>

In [17]:
ngram_cvt = CountVectorizer(ngram_range=(2,2)) #explanation above
new_data = ngram_cvt.fit_transform([sent1, sent2])
new_data

<Compressed Sparse Row sparse matrix of dtype 'int64'
	with 12 stored elements and shape (2, 12)>

In [18]:
features = ngram_cvt.get_feature_names_out()
features

array(['about it', 'also good', 'for us', 'good practice', 'good to',
       'is good', 'it is', 'it was', 'know about', 'practice for',
       'to know', 'was also'], dtype=object)

In [19]:
df = pd.DataFrame(new_data.toarray(), columns=features)
df

Unnamed: 0,about it,also good,for us,good practice,good to,is good,it is,it was,know about,practice for,to know,was also
0,0,0,1,1,0,1,1,0,0,1,0,0
1,1,1,0,0,1,0,0,1,1,0,1,1
