In [1]:
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
# toy training corpus: three short text messages
simple_train = [
    'call you tonight',
    'Call me a cab',
    'please call me.. please',
]

# vectorizer created with its default settings
vect = CountVectorizer()

In [2]:
# learn the 'vocabulary' of the training data; fitting happens in place on `vect`
vect.fit(simple_train)


In [3]:
# examine the fitted vocabulary; these names later label the columns of the
# document-term matrix. Note that 'Call' was lowercased to 'call' and the
# single-character token 'a' does not appear in the vocabulary.
vect.get_feature_names_out()

array(['cab', 'call', 'me', 'please', 'tonight', 'you'], dtype=object)

In [4]:
# transform training data into a 'document-term matrix':
# one row per document, one column per vocabulary term, entries are term counts
simple_train_dtm = vect.transform(simple_train)
simple_train_dtm

<3x6 sparse matrix of type '<class 'numpy.int64'>'
	with 9 stored elements in Compressed Sparse Row format>

In [5]:
# convert sparse matrix to a dense matrix so every entry (including zeros) is visible
simple_train_dtm.toarray()

array([[0, 1, 0, 0, 1, 1],
       [1, 1, 1, 0, 0, 0],
       [0, 1, 1, 2, 0, 0]], dtype=int64)

In [6]:
# examine the vocabulary and document-term matrix together:
# rows are the training documents, columns are the vocabulary terms
# general pattern: pd.DataFrame(matrix, columns=columns)
pd.DataFrame(simple_train_dtm.toarray(), columns=vect.get_feature_names_out())

Unnamed: 0,cab,call,me,please,tonight,you
0,0,1,0,0,1,1
1,1,1,1,0,0,0
2,0,1,1,2,0,0


In [7]:
# check the type of the document-term matrix (a SciPy sparse matrix, not a NumPy array)
type(simple_train_dtm)

scipy.sparse._csr.csr_matrix

In [8]:
# CountVectorizer() outputs a sparse matrix; printing it lists only the
# stored (non-zero) entries, one per line:
#   left:  (row, column) coordinates of the non-zero value
#   right: the value at that position
print('sparse matrix', simple_train_dtm, sep='\n')

# the same data as a dense array, for comparison
print("In Matrix format\n", simple_train_dtm.toarray())

sparse matrix
  (0, 1)	1
  (0, 4)	1
  (0, 5)	1
  (1, 0)	1
  (1, 1)	1
  (1, 2)	1
  (2, 1)	1
  (2, 2)	1
  (2, 3)	2
In Matrix format
 [[0 1 0 0 1 1]
 [1 1 1 0 0 0]
 [0 1 1 2 0 0]]


In [9]:
# example text for model testing
# In order to make a prediction, the new observation must have the same
# features as the training observations, both in number and meaning.
simple_test = ["Please don't call me"]


In [10]:
# transform testing data into a document-term matrix (using existing vocabulary);
# vect is not refit here, so the columns match the training matrix and words
# outside the learned vocabulary (here "don't") contribute nothing
simple_test_dtm = vect.transform(simple_test)
simple_test_dtm.toarray()

array([[0, 1, 1, 1, 0, 0]], dtype=int64)

In [11]:
# examine the vocabulary and the test document-term matrix together
# (same vocabulary columns as the training matrix)
pd.DataFrame(simple_test_dtm.toarray(), columns=vect.get_feature_names_out())

Unnamed: 0,cab,call,me,please,tonight,you
0,0,1,1,1,0,0
