In [None]:
# Toy corpus: four three-word documents, each paired with a binary label
import numpy as np
import pandas as pd

df = pd.DataFrame(
    data=[
        ('people watch campus', 1),
        ('campus watch campus', 1),
        ('people write comment', 0),
        ('campus write comment', 0),
    ],
    columns=['text', 'output'],
)
df

Unnamed: 0,text,output
0,people watch campus,1
1,campus watch campus,1
2,people write comment,0
3,campus write comment,0


# Bag of Words
*  Counts how many times each vocabulary word occurs in a document and represents the document as a V-dimensional vector, where V is the vocabulary size

In [None]:
# Bag of words with scikit-learn: learn the vocabulary from the corpus and
# build the document-term count matrix in one call
from sklearn.feature_extraction.text import CountVectorizer

cv = CountVectorizer()
bow = cv.fit_transform(raw_documents=df['text'])

In [None]:
# View the learned vocabulary: each term maps to its column index in the count matrix
cv.vocabulary_

{'people': 2, 'watch': 3, 'campus': 0, 'write': 4, 'comment': 1}

In [None]:
# View the count vector of every document, one 1 x V row per line.
# Looping over bow.shape[0] replaces four copy-pasted prints that were
# hard-coded to rows 0-3; the printed output is the same for this corpus.
for doc_idx in range(bow.shape[0]):
    print(bow[doc_idx].toarray())

[[1 0 1 1 0]]
[[2 0 0 1 0]]
[[0 1 1 0 1]]
[[1 1 0 0 1]]


In [None]:
# Encode a sentence that is not in the dataset using the already-fitted vocabulary.
# Words that were not seen during fit (here "and") have no column and are ignored.
unseen_docs = ["campus watch and campus write comment"]
cv.transform(unseen_docs).toarray()

array([[2, 1, 0, 1, 1]])

In [None]:
# binary=True sets all non-zero counts to 1, so each vector records
# whether a word occurs in the document rather than how often it occurs
cv = CountVectorizer(binary=True)
bow = cv.fit_transform(df['text'])
# Print one 1 x V row per document; the loop covers every row of bow
# instead of four prints hard-coded to rows 0-3
for doc_idx in range(bow.shape[0]):
    print(bow[doc_idx].toarray())

[[1 0 1 1 0]]
[[1 0 0 1 0]]
[[0 1 1 0 1]]
[[1 1 0 0 1]]


In [None]:
# max_features = int
# Build a vocabulary that keeps only the top max_features terms, ranked by
# their total frequency across the corpus; all other terms are dropped
cv = CountVectorizer(max_features=2)
bow = cv.fit_transform(df['text'])
# Print one row per document; the loop covers every row of bow
# instead of four prints hard-coded to rows 0-3
for doc_idx in range(bow.shape[0]):
    print(bow[doc_idx].toarray())

[[1 0]]
[[2 0]]
[[0 1]]
[[1 1]]


# N grams
- Uses sequences of N contiguous words as vocabulary entries, instead of treating each single word as a vocabulary entry
- Unigram → takes one word (normal bag of words)
- Bigram → takes two contiguous words
- Trigram → takes three contiguous words

In [None]:
# Bigrams only: ngram_range=(2, 2) sets both the smallest and largest n to 2,
# so every vocabulary entry is a pair of adjacent words
from sklearn.feature_extraction.text import CountVectorizer

cv = CountVectorizer(ngram_range=(2, 2))
bow = cv.fit_transform(raw_documents=df['text'])
print(cv.vocabulary_)
bow.toarray()

{'people watch': 2, 'watch campus': 4, 'campus watch': 0, 'people write': 3, 'write comment': 5, 'campus write': 1}


array([[0, 0, 1, 0, 1, 0],
       [1, 0, 0, 0, 1, 0],
       [0, 0, 0, 1, 0, 1],
       [0, 1, 0, 0, 0, 1]])

In [None]:
# Unigrams and bigrams together: ngram_range=(1, 2) keeps every single word
# as well as every pair of adjacent words in the vocabulary
from sklearn.feature_extraction.text import CountVectorizer

cv = CountVectorizer(ngram_range=(1, 2))
bow = cv.fit_transform(raw_documents=df['text'])
print(cv.vocabulary_)
bow.toarray()

{'people': 4, 'watch': 7, 'campus': 0, 'people watch': 5, 'watch campus': 8, 'campus watch': 1, 'write': 9, 'comment': 3, 'people write': 6, 'write comment': 10, 'campus write': 2}


array([[1, 0, 0, 0, 1, 1, 0, 1, 1, 0, 0],
       [2, 1, 0, 0, 0, 0, 0, 1, 1, 0, 0],
       [0, 0, 0, 1, 1, 0, 1, 0, 0, 1, 1],
       [1, 0, 1, 1, 0, 0, 0, 0, 0, 1, 1]])

In [None]:
# Trigrams only: ngram_range=(3, 3) makes every vocabulary entry a run of
# three adjacent words; each three-word document here yields one trigram
from sklearn.feature_extraction.text import CountVectorizer

cv = CountVectorizer(ngram_range=(3, 3))
bow = cv.fit_transform(raw_documents=df['text'])
print(cv.vocabulary_)
bow.toarray()

{'people watch campus': 2, 'campus watch campus': 0, 'people write comment': 3, 'campus write comment': 1}


array([[0, 0, 1, 0],
       [1, 0, 0, 0],
       [0, 0, 0, 1],
       [0, 1, 0, 0]])

# Tf-Idf
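- TF (term frequency): how often a term occurs in a document
- IDF (inverse document frequency): down-weights terms that occur in many documents; with scikit-learn's defaults, $\text{idf}(t) = \ln\frac{1 + n}{1 + \text{df}(t)} + 1$, where $n$ is the number of documents and $\text{df}(t)$ is the number of documents containing $t$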

In [None]:
# TF-IDF with scikit-learn: fit the vectorizer on the corpus and display the
# dense matrix of weights (one row per document, one column per term)
from sklearn.feature_extraction.text import TfidfVectorizer

tfdif = TfidfVectorizer()
tfdif.fit_transform(raw_documents=df['text']).toarray()

array([[0.49681612, 0.        , 0.61366674, 0.61366674, 0.        ],
       [0.8508161 , 0.        , 0.        , 0.52546357, 0.        ],
       [0.        , 0.57735027, 0.57735027, 0.        , 0.57735027],
       [0.49681612, 0.61366674, 0.        , 0.        , 0.61366674]])

In [None]:
# Feature names in column order, then the learned IDF weight of each term
print(tfdif.get_feature_names_out(), tfdif.idf_, sep='\n')

['campus' 'comment' 'people' 'watch' 'write']
[1.22314355 1.51082562 1.51082562 1.51082562 1.51082562]
