**Text Representation Using Bag Of Words(BOW)**

In [1]:
import pandas as pd
import numpy as np

In [2]:
df=pd.read_csv("spam.csv")
df.head()

Unnamed: 0,Category,Message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [3]:
df.Category.value_counts() #Checking the no. of spam and ham

Unnamed: 0_level_0,count
Category,Unnamed: 1_level_1
ham,4825
spam,747


In [4]:
#Map spam as 1 and ham as 0
df['spam']=df['Category'].apply(lambda x:1 if x=='spam' else 0)

In [5]:
df.shape

(5572, 3)

In [6]:
df.head()

Unnamed: 0,Category,Message,spam
0,ham,"Go until jurong point, crazy.. Available only ...",0
1,ham,Ok lar... Joking wif u oni...,0
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,1
3,ham,U dun say so early hor... U c already then say...,0
4,ham,"Nah I don't think he goes to usf, he lives aro...",0


In [7]:
#Train Test Split
from sklearn.model_selection import train_test_split
X_train,X_test,y_train,y_test=train_test_split(df.Message,df.spam,test_size=0.2)

In [8]:
X_train.shape

(4457,)

In [9]:
X_test.shape

(1115,)

In [10]:
type(X_train)

In [11]:
X_train[:4]

Unnamed: 0,Message
1948,The battery is for mr adewale my uncle. Aka Egbon
5360,"Hey, iouri gave me your number, I'm wylie, rya..."
1676,"Painful words- ""I thought being Happy was the ..."
259,We tried to contact you re your reply to our o...


In [12]:
type(y_train)

In [13]:
y_train[:4]

Unnamed: 0,spam
1948,0
5360,0
1676,0
259,1


In [14]:
type(X_train.values)

numpy.ndarray

**Create bag of words representation using CountVectorizer**

In [15]:
from sklearn.feature_extraction.text import CountVectorizer
v=CountVectorizer()

X_train_cv=v.fit_transform(X_train.values)
X_train_cv

<4457x7711 sparse matrix of type '<class 'numpy.int64'>'
	with 59371 stored elements in Compressed Sparse Row format>

In [16]:
X_train_cv.toarray()[:2][0]

array([0, 0, 0, ..., 0, 0, 0])

In [17]:
X_train_cv.shape

(4457, 7711)

In [18]:
v.get_feature_names_out()[1771]

'checked'

In [19]:
v.vocabulary

In [20]:
X_train_np=X_train_cv.toarray()
X_train_np[0]

array([0, 0, 0, ..., 0, 0, 0])

In [21]:
np.where(X_train_np[0]!=0)

(array([ 821,  895, 1263, 2546, 2935, 3721, 4587, 4638, 6792, 7108]),)

In [22]:
X_train_np[0][1579]

0

**Train The Naive Bayes Model**

In [23]:
from sklearn.naive_bayes import MultinomialNB

model=MultinomialNB()
model.fit(X_train_cv,y_train)


In [24]:
X_test_cv=v.transform(X_test)

**Evaluate Performance**

In [25]:
from sklearn.metrics import classification_report
y_pred=model.predict(X_test_cv)

print(classification_report(y_test,y_pred))

              precision    recall  f1-score   support

           0       0.99      1.00      0.99       980
           1       0.99      0.93      0.96       135

    accuracy                           0.99      1115
   macro avg       0.99      0.96      0.98      1115
weighted avg       0.99      0.99      0.99      1115



In [26]:

emails = [
    'Hey mohan, can we get together to watch footbal game tomorrow?',
    'Upto 20% discount on parking, exclusive offer just for you. Dont miss this reward!'
]

emails_count = v.transform(emails)
model.predict(emails_count)

array([0, 1])

**Train the model using sklearn pipeline and reduce number of lines of codet**

In [27]:
from sklearn.pipeline import Pipeline

clf = Pipeline([
    ('vectorizer', CountVectorizer()),
    ('nb', MultinomialNB())
])

In [28]:
clf.fit(X_train,y_train)

In [29]:
y_pred=clf.predict(X_test)
print(classification_report(y_test,y_pred))

              precision    recall  f1-score   support

           0       0.99      1.00      0.99       980
           1       0.99      0.93      0.96       135

    accuracy                           0.99      1115
   macro avg       0.99      0.96      0.98      1115
weighted avg       0.99      0.99      0.99      1115

