In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
import scipy
import sklearn
import matplotlib.pyplot as plt


In [23]:
# Location of the SMS Spam Collection file: one record per line, no header row,
# with the label and the message text separated by a tab.
data_path = "SMSSpamCollection"

# Read through data_path so the location is defined in exactly one place.
# read_table splits on tabs by default; header=None keeps the first record as data.
# NOTE(review): pandas' default quote handling may merge lines that contain
# unbalanced double quotes; if the row count looks short against the dataset's
# documented size, try quoting=csv.QUOTE_NONE -- verify before relying on counts.
sms_raw = pd.read_table(data_path, header=None)
# The 'spam' column holds the class label ('ham' or 'spam'), 'message' the text.
sms_raw.columns = ['spam', 'message']


In [24]:
# Peek at the first two rows to confirm the label and message columns loaded.
sms_raw.head(n=2)

Unnamed: 0,spam,message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...


In [25]:
# Class balance: number of messages carrying each label.
sms_raw['spam'].value_counts()

ham     4825
spam     747
Name: spam, dtype: int64

In [30]:
# Substrings that tend to appear in spam messages; each becomes a boolean feature column.
keywords = ['click','offer','buy','free','cash','winner','urgent','money']
for key in keywords:
    # Case-insensitive substring match (so 'free' also matches 'FreeMsg').
    # regex=False treats the keyword literally, so an entry containing regex
    # metacharacters (e.g. '$' or '100%') cannot silently change meaning;
    # na=False keeps the column strictly boolean if a message is missing.
    sms_raw[key] = sms_raw.message.str.contains(key, case=False, regex=False, na=False)

In [31]:
    sms_raw.head(10)

Unnamed: 0,spam,message,click,offer,buy,free,cash,winner,urgent,money
0,ham,"Go until jurong point, crazy.. Available only ...",False,False,False,False,False,False,False,False
1,ham,Ok lar... Joking wif u oni...,False,False,False,False,False,False,False,False
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,False,False,False,True,False,False,False,False
3,ham,U dun say so early hor... U c already then say...,False,False,False,False,False,False,False,False
4,ham,"Nah I don't think he goes to usf, he lives aro...",False,False,False,False,False,False,False,False
5,spam,FreeMsg Hey there darling it's been 3 week's n...,False,False,False,True,False,False,False,False
6,ham,Even my brother is not like to speak with me. ...,False,False,False,False,False,False,False,False
7,ham,As per your request 'Melle Melle (Oru Minnamin...,False,False,False,False,False,False,False,False
8,spam,WINNER!! As a valued network customer you have...,False,False,False,False,False,True,False,False
9,spam,Had your mobile 11 months or more? U R entitle...,False,False,False,True,False,False,False,False


In [33]:
# Writing entirely in capitals is another candidate spam signal: flag messages
# whose cased characters are all upper-case.
sms_raw['allcaps'] = sms_raw['message'].str.isupper()
sms_raw.head(n=10)

Unnamed: 0,spam,message,click,offer,buy,free,cash,winner,urgent,money,allcaps
0,ham,"Go until jurong point, crazy.. Available only ...",False,False,False,False,False,False,False,False,False
1,ham,Ok lar... Joking wif u oni...,False,False,False,False,False,False,False,False,False
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,False,False,False,True,False,False,False,False,False
3,ham,U dun say so early hor... U c already then say...,False,False,False,False,False,False,False,False,False
4,ham,"Nah I don't think he goes to usf, he lives aro...",False,False,False,False,False,False,False,False,False
5,spam,FreeMsg Hey there darling it's been 3 week's n...,False,False,False,True,False,False,False,False,False
6,ham,Even my brother is not like to speak with me. ...,False,False,False,False,False,False,False,False,False
7,ham,As per your request 'Melle Melle (Oru Minnamin...,False,False,False,False,False,False,False,False,False
8,spam,WINNER!! As a valued network customer you have...,False,False,False,False,False,True,False,False,False
9,spam,Had your mobile 11 months or more? U R entitle...,False,False,False,True,False,False,False,False,False


In [35]:
# Show only the all-caps messages. 'allcaps' is already a boolean column, so it
# is used directly as the row mask instead of being compared to True.
sms_raw.loc[sms_raw['allcaps']]

Unnamed: 0,spam,message,click,offer,buy,free,cash,winner,urgent,money,allcaps
14,ham,I HAVE A DATE ON SUNDAY WITH WILL!!,False,False,False,False,False,False,False,False,True
43,ham,WHO ARE YOU SEEING?,False,False,False,False,False,False,False,False,True
72,ham,HI BABE IM AT HOME NOW WANNA DO SOMETHING? XX,False,False,False,False,False,False,False,False,True
263,ham,MY NO. IN LUTON 0125698789 RING ME IF UR AROUN...,False,False,False,False,False,False,False,False,True
445,ham,HEY HEY WERETHE MONKEESPEOPLE SAY WE MONKEYARO...,False,False,False,False,False,False,False,False,True
...,...,...,...,...,...,...,...,...,...,...,...
5202,ham,WOT STUDENT DISCOUNT CAN U GET ON BOOKS?,False,False,False,False,False,False,False,False,True
5224,ham,OH FUCK. JUSWOKE UP IN A BED ON A BOATIN THE D...,False,False,False,False,False,False,False,False,True
5266,ham,HI DARLIN ITS KATE ARE U UP FOR DOIN SOMETHIN ...,False,False,False,False,False,False,False,False,True
5388,ham,NOT MUCH NO FIGHTS. IT WAS A GOOD NITE!!,False,False,False,False,False,False,False,False,True


In [37]:
# Feature matrix: the keyword indicator columns plus the all-caps flag.
data = sms_raw[[*keywords, 'allcaps']]

# Labels the model has to predict ('ham' or 'spam') from the features above.
target = sms_raw['spam']

data.head(n=10)

Unnamed: 0,click,offer,buy,free,cash,winner,urgent,money,allcaps
0,False,False,False,False,False,False,False,False,False
1,False,False,False,False,False,False,False,False,False
2,False,False,False,True,False,False,False,False,False
3,False,False,False,False,False,False,False,False,False
4,False,False,False,False,False,False,False,False,False
5,False,False,False,True,False,False,False,False,False
6,False,False,False,False,False,False,False,False,False
7,False,False,False,False,False,False,False,False,False
8,False,False,False,False,False,True,False,False,False
9,False,False,False,True,False,False,False,False,False


In [38]:
from sklearn.naive_bayes import BernoulliNB

# Bernoulli naive Bayes suits the boolean feature columns. The model is fitted
# and then scored on the same rows, so the count below is an in-sample error.
bnb = BernoulliNB().fit(data, target)
y_pred = bnb.predict(data)

n_points = data.shape[0]
n_mislabeled = (target != y_pred).sum()
print('number of mislabeled points out of a total {} points: {}'.format(n_points, n_mislabeled))

number of mislabeled points out of a total 5572 points: 501


In [39]:
from sklearn.metrics import confusion_matrix

# Rows are the true labels and columns the predicted ones, both ordered ham, spam.
confusion_matrix(y_true=target, y_pred=y_pred)

array([[4731,   94],
       [ 407,  340]], dtype=int64)