<a href="https://colab.research.google.com/github/Arya1790/NLP/blob/main/BagOfWords_spam_ham.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

A bag-of-words is a representation of text that describes the occurrence of words within a document. It involves two things:

1. A vocabulary of known words.
2. A measure of the presence of known words.

It is called a “bag” of words because any information about the order or structure of words in the document is discarded. The model is only concerned with whether known words occur in the document, not where in the document they occur.


In [1]:
import pandas as pd
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
nltk.download('stopwords')
## Create the Bag OF Words model
from sklearn.feature_extraction.text import CountVectorizer
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score,confusion_matrix,classification_report

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [2]:
# Four toy documents used to show what CountVectorizer produces.
corpus = [
    'This is the first document.',
    'This document is the second document.',
    'And this is the third one.',
    'Is this the first document?',
]

# Learn the vocabulary and build the document-term count matrix in one pass.
vectorizer = CountVectorizer()
X = vectorizer.fit_transform(corpus)

# Vocabulary terms, in the same order as the columns of X.
vectorizer.get_feature_names_out()

array(['and', 'document', 'first', 'is', 'one', 'second', 'the', 'third',
       'this'], dtype=object)

In [3]:
# vocabulary_ maps each unique (lower-cased) term to its column index in the count matrix
vectorizer.vocabulary_

{'this': 8,
 'is': 3,
 'the': 6,
 'first': 2,
 'document': 1,
 'second': 5,
 'and': 0,
 'third': 7,
 'one': 4}

In [4]:
# Densify the sparse count matrix so every per-document term count is visible.
doc_term_counts = X.toarray()
print(doc_term_counts)

[[0 1 1 1 0 0 1 0 1]
 [0 2 0 1 0 1 1 0 1]
 [1 0 0 1 1 0 1 1 1]
 [0 1 1 1 0 0 1 0 1]]


In [None]:
# Spam Ham Classification

In [5]:
# Tab-separated SMS file; the two column names are supplied here via `names`.
file = r'/content/drive/MyDrive/Dataset/SMSSpamCollection.txt'
df = pd.read_csv(
    file,
    sep='\t',
    names=['label', 'message'],
)
df.head(2)

Unnamed: 0,label,message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...


In [8]:
# Encode the text label as an integer class: ham -> 0, spam -> 1.
label_to_target = {'ham': 0, 'spam': 1}
df['target'] = df['label'].map(label_to_target)
df.head(2)

Unnamed: 0,label,message,target
0,ham,"Go until jurong point, crazy.. Available only ...",0
1,ham,Ok lar... Joking wif u oni...,0


In [28]:
# NOTE: this reads the 'clean_text' column, which is only added by a cell further
# down the notebook; run the preprocessing cells first or this raises a KeyError.
spam_rows = df['target'] == 1
df.loc[spam_rows, 'clean_text'].iloc[0]

'free entri wkli comp win fa cup final tkt st may text fa receiv entri question std txt rate c appli'

In [9]:
# Class balance, and a sanity check that each label maps to exactly one target value.
df.value_counts(subset=['label', 'target'])

Unnamed: 0_level_0,Unnamed: 1_level_0,count
label,target,Unnamed: 2_level_1
ham,0,4825
spam,1,747


In [10]:
# Clean every SMS: keep letters only, lower-case, drop English stop words, stem.
ps = PorterStemmer()

# Build the stop-word set once. Previously it was rebuilt from the NLTK list for
# every single token, which made this loop needlessly slow.
stop_words = set(stopwords.words('english'))

corpus = []
# Iterate over the column values directly (row order) instead of label-indexing
# with df['message'][i], which only works when the index is exactly 0..n-1.
for raw_message in df['message']:
  # Replace anything that is not an ASCII letter with a space, then tokenize.
  tokens = re.sub('[^a-zA-Z]', ' ', raw_message).lower().split()
  stems = [ps.stem(token) for token in tokens if token not in stop_words]
  corpus.append(' '.join(stems))

In [11]:
# Store the cleaned, stemmed text next to the raw message; `corpus` holds one entry per row of df, in row order
df['clean_text'] = corpus

In [12]:
# Hold out 33% of the messages for testing. The split is stratified on the label
# so both parts keep the same ham/spam ratio, and seeded for reproducibility.
X_train, X_test, y_train, y_test = train_test_split(
    df['clean_text'],
    df['target'],
    test_size=0.33,
    random_state=42,
    stratify=df['label'],
)

In [13]:
# Binary bag-of-words: binary=True records presence/absence of a term instead of
# its count. max_features=100 caps the vocabulary at 100 terms.
cv = CountVectorizer(
    max_features=100,
    binary=True,
)

# The vocabulary is learned from the training split only and reused on the test split.
X_train_cv = cv.fit_transform(X_train.values)
X_test_cv = cv.transform(X_test)


In [14]:
# Peek at the term that owns column 0 of the feature matrix.
feature_names = cv.get_feature_names_out()
feature_names[0]

'alreadi'

In [15]:
# Term -> column-index mapping learned from the training split (at most 100 entries, per max_features above)
cv.vocabulary_

{'go': 23,
 'home': 31,
 'call': 6,
 'claim': 8,
 'win': 93,
 'min': 46,
 'get': 21,
 'text': 76,
 'one': 58,
 'thank': 77,
 'dear': 14,
 'happi': 28,
 'say': 67,
 'got': 25,
 'think': 79,
 'wat': 89,
 'free': 19,
 'day': 13,
 'ask': 3,
 'make': 43,
 'txt': 84,
 'love': 41,
 'amp': 1,
 'need': 52,
 'send': 69,
 'tomorrow': 82,
 'pl': 61,
 'hi': 30,
 'realli': 63,
 'oh': 56,
 'come': 11,
 'ok': 57,
 'good': 24,
 'want': 88,
 'new': 53,
 'time': 80,
 'repli': 64,
 'thing': 78,
 'morn': 49,
 'tell': 75,
 'stop': 73,
 'find': 18,
 'pick': 60,
 'friend': 20,
 'give': 22,
 'mobil': 48,
 'msg': 50,
 'well': 92,
 'life': 38,
 'dont': 15,
 'miss': 47,
 'ur': 85,
 'tri': 83,
 'lt': 42,
 'gt': 27,
 'still': 72,
 'take': 74,
 'know': 33,
 'com': 10,
 'phone': 59,
 'ye': 97,
 'meet': 44,
 'today': 81,
 'pleas': 62,
 'number': 55,
 'wait': 87,
 'week': 91,
 'messag': 45,
 'would': 95,
 'anyth': 2,
 'said': 66,
 'lor': 40,
 'see': 68,
 'work': 94,
 'hey': 29,
 'use': 86,
 'night': 54,
 'last': 34,
 '

In [16]:
# Dense copy of the sparse training matrix, so individual rows can be inspected.
X_train_np = X_train_cv.toarray()

# Feature vector of the second training message.
X_train_np[1, :]

array([0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0])

In [17]:
# Column indices of the vocabulary terms present in the first training message.
np.nonzero(X_train_np[0])

(array([23, 31]),)

Naive Bayes Classifier

In [18]:
# Multinomial naive Bayes with default hyper-parameters, fitted on the
# bag-of-words features of the training split.
model = MultinomialNB()
model.fit(X=X_train_cv, y=y_train)

In [19]:
# Score the held-out messages and summarise per-class precision / recall / F1.
y_pred = model.predict(X_test_cv)
report = classification_report(y_test, y_pred)
print(report)

              precision    recall  f1-score   support

           0       0.97      0.99      0.98      1592
           1       0.90      0.81      0.85       247

    accuracy                           0.96      1839
   macro avg       0.93      0.90      0.91      1839
weighted avg       0.96      0.96      0.96      1839



In [30]:
# Test on a single hand-written message
raw_message = "free 20% off on parking, exclusing offer just for you"

# Apply the same cleaning as the training text (letters only, lower-case, stop
# words removed, Porter-stemmed). The vectorizer's vocabulary consists of stems,
# so an uncleaned message would miss every term whose stem differs from its
# surface form.
english_stop_words = set(stopwords.words('english'))
message_tokens = re.sub('[^a-zA-Z]', ' ', raw_message).lower().split()
cleaned_message = ' '.join(
    ps.stem(token) for token in message_tokens if token not in english_stop_words
)

# transform() takes an iterable of documents; use a one-element list rather than
# the set literal used before.
message = [cleaned_message]
message_cnt = cv.transform(message)

model.predict(message_cnt)

array([1])