# Bag of Words

In [1]:
import numpy as np
import pandas as pd

In [3]:
# Toy corpus: four short documents, each paired with a binary label
# stored in the "output" column.
df = pd.DataFrame(
    [
        ("people watch cs", 1),
        ("cs watch cs", 1),
        ("people write comment", 0),
        ("cs write comment", 0),
    ],
    columns=["text", "output"],
)
df

Unnamed: 0,text,output
0,people watch cs,1
1,cs watch cs,1
2,people write comment,0
3,cs write comment,0


In [4]:
# CountVectorizer turns each document into a vector of term counts,
# with one column per term of the vocabulary it learns during fit.
from sklearn.feature_extraction.text import CountVectorizer
cv = CountVectorizer()  # default settings; fitted in the next cell

In [5]:
# Learn the vocabulary from the text column and encode every document
# as one row of term counts.
bow = cv.fit_transform(df['text'])

# vocabulary_: maps each learned term to its column index in `bow`
print(cv.vocabulary_)

{'people': 2, 'watch': 3, 'cs': 1, 'write': 4, 'comment': 0}


In [6]:
# Print the dense count vector of the first three documents, one per line.
# Column positions correspond to the indices in cv.vocabulary_.
for doc_idx in range(3):
    print(bow[doc_idx].toarray())

[[0 1 1 1 0]]
[[0 2 0 1 0]]
[[1 0 1 0 1]]


In [7]:
# Encode a new document with the already-fitted vocabulary.
# 'ai' was not seen during fit, so it has no column and is ignored;
# only 'watch' and 'cs' contribute counts.
cv.transform(['ai watch cs']).toarray()

array([[0, 1, 0, 1, 0]])

# N-grams

In [8]:
# Same four-document toy corpus as in the Bag of Words section,
# recreated here so this section can be run on its own.
df = pd.DataFrame(
    [
        ("people watch cs", 1),
        ("cs watch cs", 1),
        ("people write comment", 0),
        ("cs write comment", 0),
    ],
    columns=["text", "output"],
)
df

Unnamed: 0,text,output
0,people watch cs,1
1,cs watch cs,1
2,people write comment,0
3,cs write comment,0


In [9]:
# Bigrams: ngram_range=(2, 2) makes every vocabulary term a pair of
# consecutive words instead of a single word.
# NOTE: this rebinds `cv`, replacing the unigram vectorizer created earlier.
from sklearn.feature_extraction.text import CountVectorizer
cv = CountVectorizer(ngram_range=(2,2))

In [10]:
# Fit the bigram vectorizer on the same texts; `bow` now has one column
# per two-word term, and vocabulary_ maps each such term to its column.
bow = cv.fit_transform(df['text'])
print(cv.vocabulary_)

{'people watch': 2, 'watch cs': 4, 'cs watch': 0, 'people write': 3, 'write comment': 5, 'cs write': 1}


In [11]:
# Print the dense bigram count vector of the first three documents.
# Column positions correspond to the indices in cv.vocabulary_.
for doc_idx in range(3):
    print(bow[doc_idx].toarray())

[[0 0 1 0 1 0]]
[[1 0 0 0 1 0]]
[[0 0 0 1 0 1]]


# TF-IDF (Term Frequency - Inverse Document Frequency)

In [12]:
# Same four-document toy corpus as in the earlier sections,
# recreated here so the TF-IDF section can be run on its own.
df = pd.DataFrame(
    [
        ("people watch cs", 1),
        ("cs watch cs", 1),
        ("people write comment", 0),
        ("cs write comment", 0),
    ],
    columns=["text", "output"],
)
df

Unnamed: 0,text,output
0,people watch cs,1
1,cs watch cs,1
2,people write comment,0
3,cs write comment,0


In [14]:
# TfidfVectorizer weights each term count by how rare the term is across
# the corpus, so terms shared by many documents contribute less.
from sklearn.feature_extraction.text import TfidfVectorizer
tfid = TfidfVectorizer()  # default settings; fitted in the next cell

In [15]:
# Fit on the text column and show the dense TF-IDF matrix:
# one row per document, one column per term (column order is given by
# get_feature_names_out() in a later cell).
tfid.fit_transform(df['text']).toarray()

array([[0.        , 0.49681612, 0.61366674, 0.61366674, 0.        ],
       [0.        , 0.8508161 , 0.        , 0.52546357, 0.        ],
       [0.57735027, 0.        , 0.57735027, 0.        , 0.57735027],
       [0.61366674, 0.49681612, 0.        , 0.        , 0.61366674]])

In [16]:
# Learned inverse-document-frequency weight of each term, in column order.
# 'cs' occurs in three of the four documents and therefore gets the
# lowest weight; every other term occurs in two documents.
print(tfid.idf_)

[1.51082562 1.22314355 1.51082562 1.51082562 1.51082562]


In [17]:
# Terms in column order: position i here names column i of the TF-IDF
# matrix and entry i of idf_.
print(tfid.get_feature_names_out())

['comment' 'cs' 'people' 'watch' 'write']
