**Importing packages**

In [33]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn import metrics

**Reading the dataset**

In [34]:
path = 'SMSSpamCollection'
features = ['label', 'message']
sms = pd.read_table(path, header=None, names=features)
# ham equal to 0 , spam equal to 1
sms['label_num'] = sms.label.map({'ham':0, 'spam':1})
# how to define X and y (from the SMS data) for use with COUNTVECTORIZER
X = sms.message
y = sms.label_num
# split X and y into training and testing sets (allocating 80% of data to train data)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.8,random_state=1)

**Vectorizing our dataset**

In [35]:
# instantiate the vectorizer
vect = CountVectorizer()
# fit and transform training data
X_train_dtm = vect.fit_transform(X_train)
# transform testing data (using fitted vocabulary) into a document-term matrix
X_test_dtm = vect.transform(X_test)

**Building a model**

In [36]:
# instantiate a Multinomial Naive Bayes model
nb = MultinomialNB()
# train the model 
%time nb.fit(X_train_dtm, y_train)
# make class predictions for X_test_dtm
y_pred_class = nb.predict(X_test_dtm)

CPU times: user 1.76 ms, sys: 1.05 ms, total: 2.8 ms
Wall time: 2.86 ms


**Evaluating the model**

In [37]:
# calculate : Accuracy , Recall , Precision , F1 measure
confusion_matrix = metrics.confusion_matrix(y_test, y_pred_class)
Precision = confusion_matrix[0][0]/(confusion_matrix[0][0]+confusion_matrix[0][1])
Recall = confusion_matrix[0][0]/(confusion_matrix[0][0]+confusion_matrix[1][0])
Accuracy = (confusion_matrix[0][0]+confusion_matrix[1][1])/(confusion_matrix[0][0]+confusion_matrix[1][0]+confusion_matrix[0][1]+confusion_matrix[1][1])
F1_measure = (2*Precision*Recall)/(Recall+Precision)
# Print : Confusion matrix , Accuracy , Recall , Precision , F1 measure
print("Confusion matrix is = \n",confusion_matrix,"\n","----------")
print("Accuracy is = \n",100*Accuracy," %","\n","----------")
print("Recall is = \n",Recall,"\n","----------")
print("Precision is = \n",Precision,"\n","----------")
print("F1 measure is = \n",F1_measure)

Confusion matrix is = 
 [[3840   18]
 [  89  511]] 
 ----------
Accuracy is = 
 97.59982054733064  % 
 ----------
Recall is = 
 0.9773479256808348 
 ----------
Precision is = 
 0.995334370139969 
 ----------
F1 measure is = 
 0.98625914986516


In [42]:
# false positives that ham incorrectly classified as spam
X_test[y_pred_class > y_test]

574                                Waiting for your call.
3375                              Also andros ice etc etc
1729    As per your request 'Maangalyam (Alaipayuthe)'...
216     Finally the match heading towards draw as your...
4557                              Gettin rdy to ship comp
4490             The new deus ex game comin early next yr
7       As per your request 'Melle Melle (Oru Minnamin...
1672                              Glad to see your reply.
4382               Mathews or tait or edwards or anderson
154     As per your request 'Melle Melle (Oru Minnamin...
1082                    Can u get pic msgs to your phone?
2173     Yavnt tried yet and never played original either
1506    Total video converter free download type this ...
103     As per your request 'Melle Melle (Oru Minnamin...
3649    As per your request 'Maangalyam (Alaipayuthe)'...
4600              Have you laid your airtel line to rest?
3728        Aldrine, rakhesh ex RTM here.pls call.urgent.
1290    Hey...

In [43]:
# false negatives that spam incorrectly classified as ham
X_test[y_pred_class < y_test]

1217    You have 1 new voicemail. Please call 08719181...
881     Reminder: You have not downloaded the content ...
3132    LookAtMe!: Thanks for your purchase of a video...
2295     You have 1 new message. Please call 08718738034.
420     Send a logo 2 ur lover - 2 names joined by a h...
                              ...                        
1252    Please CALL 08712402779 immediately as there i...
815     U were outbid by simonwatson5120 on the Shinco...
1699    Free msg. Sorry, a service you ordered from 81...
3360    Sorry I missed your call let's talk when you h...
4311    Someone U know has asked our dating service 2 ...
Name: message, Length: 89, dtype: object