### Bag of words model

In [1]:
# load all necessary libraries
import pandas as pd
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import CountVectorizer

pd.set_option('max_colwidth', 100)

#### Let's build a basic bag of words model on three sample documents

In [2]:
documents = ["Gangs of Wasseypur is a great movie.", "The success of a movie depends on the performance of the actors.", "There are no new movies releasing this week."]
print(documents)

['Gangs of Wasseypur is a great movie.', 'The success of a movie depends on the performance of the actors.', 'There are no new movies releasing this week.']


In [3]:
import nltk
nltk.download('punkt')
nltk.download('stopwords')

def preprocess(document):
    'changes document to lower case and removes stopwords'

    # change sentence to lower case
    document = document.lower()

    # tokenize into words
    words = word_tokenize(document)

    # remove stop words
    words = [word for word in words if word not in stopwords.words("english")]

    # join words to make sentence
    document = " ".join(words)
    
    return document

documents = [preprocess(document) for document in documents]
print(documents)


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\SHAMBHAVVISEN\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\SHAMBHAVVISEN\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


['gangs wasseypur great movie .', 'success movie depends performance actors .', 'new movies releasing week .']


#### Creating bag of words model using count vectorizer function

In [4]:
vectorizer = CountVectorizer()
bow_model = vectorizer.fit_transform(documents)
print(bow_model)  # returns the rown and column number of cells which have 1 as value

  (0, 2)	1
  (0, 10)	1
  (0, 3)	1
  (0, 4)	1
  (1, 4)	1
  (1, 9)	1
  (1, 1)	1
  (1, 7)	1
  (1, 0)	1
  (2, 6)	1
  (2, 5)	1
  (2, 8)	1
  (2, 11)	1


In [5]:
# print the full sparse matrix
print(bow_model.toarray())

[[0 0 1 1 1 0 0 0 0 0 1 0]
 [1 1 0 0 1 0 0 1 0 1 0 0]
 [0 0 0 0 0 1 1 0 1 0 0 1]]


In [6]:
print(bow_model.shape)
print(vectorizer.get_feature_names_out())

(3, 12)
['actors' 'depends' 'gangs' 'great' 'movie' 'movies' 'new' 'performance'
 'releasing' 'success' 'wasseypur' 'week']


### Let's create a bag of words model on the spam dataset.

In [35]:
# load data
spam = pd.read_csv("SMSSpamCollection.csv", sep="\t+", names=["label", "message"], engine="python")
spam.head()

Unnamed: 0,label,message
0,"""ham","Go until jurong point, crazy.. Available only in bugis n great world la e buffet... Cine there g..."
1,"""ham","Ok lar... Joking wif u oni..."""
2,"""spam",Free entry in 2 a wkly comp to win FA Cup final tkts 21st May 2005. Text FA to 87121 to receive ...
3,"""ham","U dun say so early hor... U c already then say..."""
4,"""ham","Nah I don't think he goes to usf, he lives around here though"""


##### Let's take a subset of data (first 50 rows only) and create bag of word model on that.

In [36]:
spam = spam
print(spam)


      label  \
0      "ham   
1      "ham   
2     "spam   
3      "ham   
4      "ham   
...     ...   
5569  "spam   
5570   "ham   
5571   "ham   
5572   "ham   
5573   "ham   

                                                                                                  message  
0     Go until jurong point, crazy.. Available only in bugis n great world la e buffet... Cine there g...  
1                                                                          Ok lar... Joking wif u oni..."  
2     Free entry in 2 a wkly comp to win FA Cup final tkts 21st May 2005. Text FA to 87121 to receive ...  
3                                                      U dun say so early hor... U c already then say..."  
4                                          Nah I don't think he goes to usf, he lives around here though"  
...                                                                                                   ...  
5569  This is the 2nd time we have tried 2 contact u. U have wo

In [37]:
# extract the messages from the dataframe
messages = spam.message
print(messages)

0       Go until jurong point, crazy.. Available only in bugis n great world la e buffet... Cine there g...
1                                                                            Ok lar... Joking wif u oni..."
2       Free entry in 2 a wkly comp to win FA Cup final tkts 21st May 2005. Text FA to 87121 to receive ...
3                                                        U dun say so early hor... U c already then say..."
4                                            Nah I don't think he goes to usf, he lives around here though"
                                                       ...                                                 
5569    This is the 2nd time we have tried 2 contact u. U have won the £750 Pound prize. 2 claim is easy...
5570                                                                  Will ü b going to esplanade fr home?"
5571                                             Pity, * was in mood for that. So...any other suggestions?"
5572    The guy did some bit

In [38]:
# convert messages into list
messages = [message for message in messages]
# print(messages)
len(messages)

5574

In [39]:
# preprocess messages using the preprocess function
messages = [preprocess(message) for message in messages]
# print(messages)
len(messages)

5574

In [45]:
# bag of words model
vectorizer = CountVectorizer()
bow_model = vectorizer.fit_transform(messages)
bow_model

<5574x8645 sparse matrix of type '<class 'numpy.int64'>'
	with 48245 stored elements in Compressed Sparse Row format>

In [41]:
# look at the dataframe
pd.DataFrame(bow_model.toarray(), columns = vectorizer.get_feature_names_out())

Unnamed: 0,00,000,000pes,008704050406,0089,0121,01223585236,01223585334,0125698789,02,...,zhong,zindgi,zoe,zogtorius,zoom,zouk,zyada,èn,ú1,〨ud
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5569,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
5570,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
5571,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
5572,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [42]:
print(len(vectorizer.get_feature_names_out()))

8645


* A lot of duplicate tokens such as 'win'and 'winner'; 'reply' and 'replying'; 'want' and 'wanted' etc. 

In [43]:
bow_model.shape

(5574, 8645)