In [1]:
from nltk.tokenize import word_tokenize
import pandas as pd
import string

In [2]:
sentence_1 = 'It is a good practice for us'
sentence_2 = 'It was also good to know about it'

In [3]:
text = sentence_1 + " " + sentence_2

In [4]:
columns = []

for word in word_tokenize(text.lower()):
    if word in string.punctuation:
        continue
    columns.append(word)

In [5]:
columns = pd.Series(columns)

In [6]:
columns = list(columns.unique())

In [7]:
columns

['it',
 'is',
 'a',
 'good',
 'practice',
 'for',
 'us',
 'was',
 'also',
 'to',
 'know',
 'about']

In [23]:
bag_of_words = pd.DataFrame(index = [0, 1], columns = columns)

In [25]:
bag_of_words

Unnamed: 0,it,is,a,good,practice,for,us,was,also,to,know,about
0,,,,,,,,,,,,
1,,,,,,,,,,,,


In [27]:
count1 = [sentence_1.count(word) for word in columns]
count2 = [sentence_2.count(word) for word in columns]

In [29]:
bag_of_words.iloc[0:] = count1
bag_of_words.iloc[1:] = count2

In [31]:
bag_of_words

Unnamed: 0,it,is,a,good,practice,for,us,was,also,to,know,about
0,0,1,2,1,1,1,1,0,0,0,0,0
1,1,0,3,1,0,0,0,1,1,1,1,1


Using Inbuilt Library

In [34]:
from sklearn.feature_extraction.text import CountVectorizer

In [36]:
# pip install scikit-learn

In [38]:
cvt = CountVectorizer()

In [40]:
x_new = cvt.fit_transform([sentence_1, sentence_2])

In [42]:
x_new

<2x11 sparse matrix of type '<class 'numpy.int64'>'
	with 13 stored elements in Compressed Sparse Row format>

In [44]:
x_new.toarray()

array([[0, 0, 1, 1, 1, 1, 0, 1, 0, 1, 0],
       [1, 1, 0, 1, 0, 2, 1, 0, 1, 0, 1]], dtype=int64)

In [46]:
cvt.get_feature_names_out()

array(['about', 'also', 'for', 'good', 'is', 'it', 'know', 'practice',
       'to', 'us', 'was'], dtype=object)

In [48]:
df = pd.DataFrame(data = x_new.toarray(), columns = cvt.get_feature_names_out())

In [50]:
df

Unnamed: 0,about,also,for,good,is,it,know,practice,to,us,was
0,0,0,1,1,1,1,0,1,0,1,0
1,1,1,0,1,0,2,1,0,1,0,1


In [64]:
new = 'It was good for us.'

In [66]:
new_features = cvt.transform([new])

In [68]:
new_features.toarray()

array([[0, 0, 1, 1, 0, 1, 0, 0, 0, 1, 1]], dtype=int64)

# N-grams

In [71]:
help(CountVectorizer)

Help on class CountVectorizer in module sklearn.feature_extraction.text:

class CountVectorizer(_VectorizerMixin, sklearn.base.BaseEstimator)
 |  CountVectorizer(*, input='content', encoding='utf-8', decode_error='strict', strip_accents=None, lowercase=True, preprocessor=None, tokenizer=None, stop_words=None, token_pattern='(?u)\\b\\w\\w+\\b', ngram_range=(1, 1), analyzer='word', max_df=1.0, min_df=1, max_features=None, vocabulary=None, binary=False, dtype=<class 'numpy.int64'>)
 |
 |  Convert a collection of text documents to a matrix of token counts.
 |
 |  This implementation produces a sparse representation of the counts using
 |  scipy.sparse.csr_matrix.
 |
 |  If you do not provide an a-priori dictionary and you do not use an analyzer
 |  that does some kind of feature selection then the number of features will
 |  be equal to the vocabulary size found by analyzing the data.
 |
 |  For an efficiency comparison of the different feature extractors, see
 |  :ref:`sphx_glr_auto_examp

In [89]:
ngram = CountVectorizer(ngram_range = (1,2))

In [91]:
new_ng = ngram.fit_transform([sentence_1, sentence_2])

In [93]:
ndf = pd.DataFrame(data = new_ng.toarray(), columns = ngram.get_feature_names_out())

In [79]:
ndf

Unnamed: 0,about,about it,also,also good,for,for us,good,good practice,good to,is,...,it was,know,know about,practice,practice for,to,to know,us,was,was also
0,0,0,0,0,1,1,1,1,0,1,...,0,0,0,1,1,0,0,1,0,0
1,1,1,1,1,0,0,1,0,1,0,...,1,1,1,0,0,1,1,0,1,1


In [81]:
ndf.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2 entries, 0 to 1
Data columns (total 23 columns):
 #   Column         Non-Null Count  Dtype
---  ------         --------------  -----
 0   about          2 non-null      int64
 1   about it       2 non-null      int64
 2   also           2 non-null      int64
 3   also good      2 non-null      int64
 4   for            2 non-null      int64
 5   for us         2 non-null      int64
 6   good           2 non-null      int64
 7   good practice  2 non-null      int64
 8   good to        2 non-null      int64
 9   is             2 non-null      int64
 10  is good        2 non-null      int64
 11  it             2 non-null      int64
 12  it is          2 non-null      int64
 13  it was         2 non-null      int64
 14  know           2 non-null      int64
 15  know about     2 non-null      int64
 16  practice       2 non-null      int64
 17  practice for   2 non-null      int64
 18  to             2 non-null      int64
 19  to know     

In [97]:
ngram = CountVectorizer(ngram_range = (2,2))
new_ng = ngram.fit_transform([sentence_1, sentence_2])
ndf = pd.DataFrame(data = new_ng.toarray(), columns = ngram.get_feature_names_out())
ndf

Unnamed: 0,about it,also good,for us,good practice,good to,is good,it is,it was,know about,practice for,to know,was also
0,0,0,1,1,0,1,1,0,0,1,0,0
1,1,1,0,0,1,0,0,1,1,0,1,1
