In [66]:
import numpy as np
import pandas as pd

In [67]:
# Toy corpus: four short sentences in 'text', each with a 0/1 value in 'output'.
df = pd.DataFrame(
    {
        'text': [
            'People watch campusx',
            'Campusx watch Campusx',
            'People write comment',
            'Campusx write comment',
        ],
        'output': [1, 1, 0, 0],
    }
)

In [68]:
df

Unnamed: 0,text,output
0,People watch campusx,1
1,Campusx watch Campusx,1
2,People write comment,0
3,Campusx write comment,0


In [69]:
from sklearn.feature_extraction.text import CountVectorizer

# Vectorizer with default settings. The vocabulary printed below is lowercased,
# so 'Campusx' and 'campusx' are counted as the same token.
cv = CountVectorizer()

# Bag of Words: learn the vocabulary from the corpus and build the
# document-term count matrix (one row per sentence, one column per word).

bow = cv.fit_transform(df['text'])

In [70]:
print(cv.vocabulary_)

{'people': 2, 'watch': 3, 'campusx': 0, 'write': 4, 'comment': 1}


- ```Here each number is the column index of that word in the feature matrix; the indices follow the alphabetical order of the vocabulary.```

In [71]:
print(bow.toarray())

[[1 0 1 1 0]
 [2 0 0 1 0]
 [0 1 1 0 1]
 [1 1 0 0 1]]


In [72]:
# Transform an unseen sentence using the vocabulary that was already fitted.
# 'and' was not in the training corpus, so it is ignored and adds no count.

cv.transform(['Campusx write and watch comment']).toarray()

array([[1, 1, 0, 1, 1]])

# 

# N-grams

In [73]:
# Re-bind cv to a bigram-only vectorizer: ngram_range=(2, 2) keeps only
# pairs of consecutive words. The unigram vectorizer above is replaced.
cv = CountVectorizer(ngram_range = (2,2))

In [74]:
bi_gram = cv.fit_transform(df['text'])

In [75]:
print(cv.vocabulary_)

{'people watch': 2, 'watch campusx': 4, 'campusx watch': 0, 'people write': 3, 'write comment': 5, 'campusx write': 1}


- ```Here each vocabulary entry is a bigram, i.e. a pair of consecutive words from a sentence.```

In [76]:
# With ngram_range=(1, 2) the vectorizer extracts both unigrams and bigrams,
# so the vocabulary holds single words as well as pairs of consecutive words.

cv = CountVectorizer(ngram_range = (1,2))

In [77]:
uni_bi_both = cv.fit_transform(df['text'])

In [78]:
print(cv.vocabulary_)

{'people': 4, 'watch': 7, 'campusx': 0, 'people watch': 5, 'watch campusx': 8, 'campusx watch': 1, 'write': 9, 'comment': 3, 'people write': 6, 'write comment': 10, 'campusx write': 2}


In [79]:
print(len(cv.vocabulary_))

11


In [80]:
# Trigrams work the same way: ngram_range=(3, 3) keeps only three-word sequences.
# Every sentence in this corpus has exactly three words, so each yields one trigram.

cv = CountVectorizer(ngram_range = (3,3))

In [81]:
trigrams = cv.fit_transform(df['text'])

In [82]:
print(cv.vocabulary_)

{'people watch campusx': 2, 'campusx watch campusx': 0, 'people write comment': 3, 'campusx write comment': 1}


# Tf-Idf

In [83]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [84]:
tf_idf = TfidfVectorizer()

In [85]:
# Fit on the corpus (learns the vocabulary and IDF weights) and return the TF-IDF matrix.
# The result is only displayed here, not stored; a later cell calls transform() again.
tf_idf.fit_transform(df['text'])

<4x5 sparse matrix of type '<class 'numpy.float64'>'
	with 11 stored elements in Compressed Sparse Row format>

In [86]:
print(tf_idf.vocabulary_)

{'people': 2, 'watch': 3, 'campusx': 0, 'write': 4, 'comment': 1}


In [87]:
# Dense view of the TF-IDF weights; columns follow the indices in tf_idf.vocabulary_.
# Each row in the output has unit Euclidean length (e.g. the third row is three equal weights of 0.577).
print(tf_idf.transform(df['text']).toarray())

[[0.49681612 0.         0.61366674 0.61366674 0.        ]
 [0.8508161  0.         0.         0.52546357 0.        ]
 [0.         0.57735027 0.57735027 0.         0.57735027]
 [0.49681612 0.61366674 0.         0.         0.61366674]]
