# Bag of words

In [1]:
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer

In [2]:
# Toy corpus: six short sentences, one document per list entry.
# The first entry keeps its original leading space and capital letter.
data = [
    " Most shark attacks occur about 10 feet from the beach since that is where the people are",
    "the efficiency with which he paired the socks in the drawer was quite admirable",
    "carol drank the blood as if she were a vampire",
    "giving directions that the mountains are to the west only works when you can see them",
    "the sign said there was road work ahead so he decided to speed up",
    "the gruff old man sat in the back of the bait shop grumbling to himself as he scooped out a handful of worms",
]

In [3]:
# Bag-of-words vectorizer created with no arguments, i.e. library defaults throughout.
countvec = CountVectorizer()

In [4]:
# Learn the vocabulary from `data` and produce the per-document term counts in one call.
# The result is a sparse matrix: one row per sentence, one column per vocabulary term.
countvec_fit = countvec.fit_transform(data)

In [5]:
# Show the sparse matrix summary (shape, dtype, number of stored entries), not its contents.
countvec_fit

<6x71 sparse matrix of type '<class 'numpy.int64'>'
	with 85 stored elements in Compressed Sparse Row format>

In [11]:
# Terms in the learned vocabulary. In the output they are lowercased ("Most" -> "most")
# and single-character words such as "a" do not appear.
countvec.get_feature_names_out()

array(['10', 'about', 'admirable', 'ahead', 'are', 'as', 'attacks',
       'back', 'bait', 'beach', 'blood', 'can', 'carol', 'decided',
       'directions', 'drank', 'drawer', 'efficiency', 'feet', 'from',
       'giving', 'gruff', 'grumbling', 'handful', 'he', 'himself', 'if',
       'in', 'is', 'man', 'most', 'mountains', 'occur', 'of', 'old',
       'only', 'out', 'paired', 'people', 'quite', 'road', 'said', 'sat',
       'scooped', 'see', 'shark', 'she', 'shop', 'sign', 'since', 'so',
       'socks', 'speed', 'that', 'the', 'them', 'there', 'to', 'up',
       'vampire', 'was', 'were', 'west', 'when', 'where', 'which', 'with',
       'work', 'works', 'worms', 'you'], dtype=object)

In [12]:
# Densify the sparse counts and label every column with its vocabulary term,
# giving a readable table: one row per sentence, one column per word.
bag_of_words = pd.DataFrame(
    data=countvec_fit.toarray(),
    columns=countvec.get_feature_names_out(),
)
bag_of_words

Unnamed: 0,10,about,admirable,ahead,are,as,attacks,back,bait,beach,blood,can,carol,decided,directions,drank,drawer,efficiency,feet,from,giving,gruff,grumbling,handful,he,himself,if,in,is,man,most,mountains,occur,of,old,only,out,paired,people,quite,road,said,sat,scooped,see,shark,she,shop,sign,since,so,socks,speed,that,the,them,there,to,up,vampire,was,were,west,when,where,which,with,work,works,worms,you
0,1,1,0,0,1,0,1,0,0,1,0,0,0,0,0,0,0,0,1,1,0,0,0,0,0,0,0,0,1,0,1,0,1,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,1,0,0,0,1,2,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0
1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,0,0,0,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,1,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,3,0,0,0,0,0,1,0,0,0,0,1,1,0,0,0,0
2,0,0,0,0,0,1,0,0,0,0,1,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,1,0,1,0,0,0,0,0,0,0,0,0
3,0,0,0,0,1,0,0,0,0,0,0,1,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1,2,1,0,1,0,0,0,0,1,1,0,0,0,0,1,0,1
4,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,0,0,0,0,0,0,1,0,1,0,1,0,1,0,1,1,1,0,1,0,0,0,0,0,0,1,0,0,0
5,0,0,0,0,0,1,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,1,1,1,1,1,0,1,0,1,0,0,0,2,1,0,1,0,0,0,0,0,1,1,0,0,0,1,0,0,0,0,0,0,3,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0


# TF-IDF

In [9]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [13]:
# Same corpus as the bag-of-words section, but each cell holds a TF-IDF weight
# instead of a raw count.
tfidfvec = TfidfVectorizer()
tfidf_fit = tfidfvec.fit_transform(data)

# Dense table labelled by vocabulary term: one row per sentence, one column per word.
tfidf_bag = pd.DataFrame(
    data=tfidf_fit.toarray(),
    columns=tfidfvec.get_feature_names_out(),
)
tfidf_bag

Unnamed: 0,10,about,admirable,ahead,are,as,attacks,back,bait,beach,blood,can,carol,decided,directions,drank,drawer,efficiency,feet,from,giving,gruff,grumbling,handful,he,himself,if,in,is,man,most,mountains,occur,of,old,only,out,paired,people,quite,road,said,sat,scooped,see,shark,she,shop,sign,since,so,socks,speed,that,the,them,there,to,up,vampire,was,were,west,when,where,which,with,work,works,worms,you
0,0.257061,0.257061,0.0,0.0,0.210794,0.0,0.257061,0.0,0.0,0.257061,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.257061,0.257061,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.257061,0.0,0.257061,0.0,0.257061,0.0,0.0,0.0,0.0,0.0,0.257061,0.0,0.0,0.0,0.0,0.0,0.0,0.257061,0.0,0.0,0.0,0.257061,0.0,0.0,0.0,0.210794,0.228219,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.257061,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.293641,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.293641,0.293641,0.0,0.0,0.0,0.0,0.0,0.0,0.203291,0.0,0.0,0.24079,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.293641,0.0,0.293641,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.293641,0.0,0.0,0.391041,0.0,0.0,0.0,0.0,0.0,0.24079,0.0,0.0,0.0,0.0,0.293641,0.293641,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.292313,0.0,0.0,0.0,0.0,0.356474,0.0,0.356474,0.0,0.0,0.356474,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.356474,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.356474,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.158238,0.0,0.0,0.0,0.0,0.356474,0.0,0.356474,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.222257,0.0,0.0,0.0,0.0,0.0,0.0,0.27104,0.0,0.0,0.27104,0.0,0.0,0.0,0.0,0.0,0.27104,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.27104,0.0,0.0,0.0,0.27104,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.27104,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.222257,0.240629,0.27104,0.0,0.187645,0.0,0.0,0.0,0.0,0.27104,0.27104,0.0,0.0,0.0,0.0,0.27104,0.0,0.27104
4,0.0,0.0,0.0,0.290766,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.290766,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.201301,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.290766,0.290766,0.0,0.0,0.0,0.0,0.0,0.0,0.290766,0.0,0.290766,0.0,0.290766,0.0,0.129071,0.0,0.290766,0.201301,0.290766,0.0,0.238432,0.0,0.0,0.0,0.0,0.0,0.0,0.290766,0.0,0.0,0.0
5,0.0,0.0,0.0,0.0,0.0,0.178615,0.0,0.21782,0.21782,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.21782,0.21782,0.21782,0.150799,0.21782,0.0,0.178615,0.0,0.21782,0.0,0.0,0.0,0.435639,0.21782,0.0,0.21782,0.0,0.0,0.0,0.0,0.0,0.21782,0.21782,0.0,0.0,0.0,0.21782,0.0,0.0,0.0,0.0,0.0,0.0,0.29007,0.0,0.0,0.150799,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.21782,0.0
