In [33]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
from sklearn.feature_extraction.text import CountVectorizer

In [10]:
sms_messages=pd.read_csv('SMSSpamCollection',sep='\t', names=['label','message'])

In [11]:
sms_messages.head()

Unnamed: 0,label,message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [14]:
sms_messages.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5572 entries, 0 to 5571
Data columns (total 2 columns):
label      5572 non-null object
message    5572 non-null object
dtypes: object(2)
memory usage: 87.1+ KB


In [15]:
sms_messages.tail()

Unnamed: 0,label,message
5567,spam,This is the 2nd time we have tried 2 contact u...
5568,ham,Will ü b going to esplanade fr home?
5569,ham,"Pity, * was in mood for that. So...any other s..."
5570,ham,The guy did some bitching but I acted like i'd...
5571,ham,Rofl. Its true to its name


In [17]:
sms_messages.describe()

Unnamed: 0,label,message
count,5572,5572
unique,2,5169
top,ham,"Sorry, I'll call later"
freq,4825,30


In [18]:
sms_messages.label.value_counts()

ham     4825
spam     747
Name: label, dtype: int64

In [20]:
sms_messages['label_number']=sms_messages.label.map({'ham':0,'spam':1})

In [22]:
sms_messages.head()

Unnamed: 0,label,message,label_number
0,ham,"Go until jurong point, crazy.. Available only ...",0
1,ham,Ok lar... Joking wif u oni...,0
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,1
3,ham,U dun say so early hor... U c already then say...,0
4,ham,"Nah I don't think he goes to usf, he lives aro...",0


In [25]:
X=sms_messages.message
y=sms_messages.label_number

In [26]:
print(X.shape,y.shape)

(5572,) (5572,)


In [29]:
from sklearn.model_selection import train_test_split

In [30]:
xtr,xte,ytr,yte=train_test_split(X,y,test_size=0.3,random_state=13)

In [31]:
print(xtr.shape,yte.shape)

(3900,) (1672,)


In [34]:
vector=CountVectorizer()

In [36]:
vector.fit(xtr)

CountVectorizer(analyzer='word', binary=False, decode_error='strict',
                dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
                lowercase=True, max_df=1.0, max_features=None, min_df=1,
                ngram_range=(1, 1), preprocessor=None, stop_words=None,
                strip_accents=None, token_pattern='(?u)\\b\\w\\w+\\b',
                tokenizer=None, vocabulary=None)

In [37]:
xtr_dtmatrix=vector.transform(xtr)

In [38]:
xtr_dtmatrix.shape

(3900, 7125)

In [39]:
# also can be doen in a single step
# by vector.fit_transform(xtr)

In [40]:
xtr_dtmatrix

<3900x7125 sparse matrix of type '<class 'numpy.int64'>'
	with 51886 stored elements in Compressed Sparse Row format>

In [41]:
xte_dtmatrix=vector.transform(xte)

In [43]:
xte_dtmatrix

<1672x7125 sparse matrix of type '<class 'numpy.int64'>'
	with 20507 stored elements in Compressed Sparse Row format>

In [44]:
from sklearn.naive_bayes import MultinomialNB

In [45]:
nb_model=MultinomialNB()

In [46]:
nb_model.fit(xtr_dtmatrix,ytr)

MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

In [47]:
y_prediction=nb_model.predict(xte_dtmatrix)

In [48]:
from sklearn import metrics

In [49]:
metrics.accuracy_score(yte,y_prediction)

0.9808612440191388

In [50]:
metrics.confusion_matrix(yte,y_prediction)

array([[1431,    8],
       [  24,  209]], dtype=int64)

In [51]:
#logistic regression model


In [52]:
from sklearn.linear_model import LogisticRegression

In [53]:
log_model=LogisticRegression()

In [55]:
log_model.fit(xtr_dtmatrix,ytr)



LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='warn', n_jobs=None, penalty='l2',
                   random_state=None, solver='warn', tol=0.0001, verbose=0,
                   warm_start=False)

In [56]:
y_prediction2=log_model.predict(xte_dtmatrix)

In [58]:
metrics.accuracy_score(yte,y_prediction2)

0.979066985645933

In [59]:
metrics.accuracy_score(yte,y_prediction)

0.9808612440191388

In [61]:
metrics.confusion_matrix(yte,y_prediction2)

array([[1433,    6],
       [  29,  204]], dtype=int64)

In [62]:
metrics.confusion_matrix(yte,y_prediction)

array([[1431,    8],
       [  24,  209]], dtype=int64)

In [64]:
xte[yte<y_prediction] #0,1

1672                              Glad to see your reply.
4622                   Received, understood n acted upon!
4862                               Nokia phone is lovly..
574                                Waiting for your call.
216     Finally the match heading towards draw as your...
991                                          26th OF JULY
4729    I (Career Tel) have added u as a contact on IN...
4702                               I liked the new mobile
Name: message, dtype: object

In [65]:
xte[yte>y_prediction]

2558    This message is brought to you by GMW Ltd. and...
1500    SMS. ac JSco: Energy is high, but u may not kn...
2354    Please CALL 08712402902 immediately as there i...
4527    I want some cock! My hubby's away, I need a re...
3425    Am new 2 club & dont fink we met yet Will B gr...
4298    thesmszone.com lets you send free anonymous an...
3064    Hi babe its Jordan, how r u? Im home from abro...
3391    Please CALL 08712402972 immediately as there i...
731     Email AlertFrom: Jeri StewartSize: 2KBSubject:...
684     Hi I'm sue. I am 20 years old and work as a la...
4821    Check Out Choose Your Babe Videos @ sms.shsex....
4213    Missed call alert. These numbers called but le...
1663    Hi if ur lookin 4 saucy daytime fun wiv busty ...
751     Do you realize that in about 40 years, we'll h...
1940    More people are dogging in your area now. Call...
672     SMS. ac sun0819 posts HELLO:"You seem cool, wa...
1269    Can U get 2 phone NOW? I wanna chat 2 set up m...
4069    TBS/PE

In [67]:
len(xte[yte>y_prediction])

24

In [68]:
xte[731]

'Email AlertFrom: Jeri StewartSize: 2KBSubject: Low-cost prescripiton drvgsTo listen to email call 123'

In [69]:
#most common objects

In [70]:
xtr_words=vector.get_feature_names()

In [71]:
len(xtr_words)

7125

In [72]:
xtr_words[:5]

['00', '000', '000pes', '008704050406', '0089']

In [73]:
xtr_words[500:506]

['5249', '526', '528', '530', '54', '542']

In [74]:
#feature count

In [75]:
nb_model.feature_count_

array([[ 0.,  0.,  1., ...,  1.,  1.,  1.],
       [ 6., 25.,  0., ...,  0.,  0.,  0.]])

In [76]:
nb_model.feature_count_.shape

(2, 7125)

In [77]:
ham_word_count=nb_model.feature_count_[0,:]

In [78]:
spam_word_count=nb_model.feature_count_[1,:]

In [79]:
ham_word_count

array([0., 0., 1., ..., 1., 1., 1.])

In [80]:
ham_word_count.shape

(7125,)

In [81]:
spam_word_count

array([ 6., 25.,  0., ...,  0.,  0.,  0.])

In [82]:
spam_word_count.shape

(7125,)

In [83]:
tokens=pd.DataFrame({'token':xtr_words,'ham':ham_word_count,'spam':spam_word_count})

In [85]:
tokens.head()

Unnamed: 0,token,ham,spam
0,00,0.0,6.0
1,000,0.0,25.0
2,000pes,1.0,0.0
3,008704050406,0.0,1.0
4,0089,0.0,1.0


In [86]:
tokens=tokens.set_index('token')

In [87]:
tokens.head()

Unnamed: 0_level_0,ham,spam
token,Unnamed: 1_level_1,Unnamed: 2_level_1
00,0.0,6.0
000,0.0,25.0
000pes,1.0,0.0
008704050406,0.0,1.0
0089,0.0,1.0


In [88]:
nb_model.class_count_

array([3386.,  514.])

In [89]:
tokens['ham']=tokens.ham+10

In [90]:
tokens['spam']=tokens.spam+10

In [91]:
tokens.head()

Unnamed: 0_level_0,ham,spam
token,Unnamed: 1_level_1,Unnamed: 2_level_1
00,10.0,16.0
000,10.0,35.0
000pes,11.0,10.0
008704050406,10.0,11.0
0089,10.0,11.0


In [94]:
tokens['ham']=tokens.ham/nb_model.class_count_[0]

In [95]:
tokens['spam']=tokens.spam/nb_model.class_count_[1]

In [97]:
tokens.head()

Unnamed: 0_level_0,ham,spam
token,Unnamed: 1_level_1,Unnamed: 2_level_1
00,8.722201e-07,0.031128
000,8.722201e-07,0.068093
000pes,9.594421e-07,0.019455
008704050406,8.722201e-07,0.021401
0089,8.722201e-07,0.021401


In [98]:
tokens['ham_to_spam_ratio']=tokens.ham/tokens.spam

In [99]:
tokens.head()

Unnamed: 0_level_0,ham,spam,ham_to_spam_ratio
token,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
00,8.722201e-07,0.031128,2.8e-05
000,8.722201e-07,0.068093,1.3e-05
000pes,9.594421e-07,0.019455,4.9e-05
008704050406,8.722201e-07,0.021401,4.1e-05
0089,8.722201e-07,0.021401,4.1e-05


In [100]:
tokens.sort_values('ham_to_spam_ratio',ascending=False).head(20) # most common ham word

Unnamed: 0_level_0,ham,spam,ham_to_spam_ratio
token,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
my,4.8e-05,0.033074,0.001466
but,2.9e-05,0.027237,0.00105
gt,1.9e-05,0.019455,0.001
lt,1.9e-05,0.019455,0.000995
me,4.8e-05,0.05642,0.000852
he,1.6e-05,0.019455,0.000816
ll,1.7e-05,0.021401,0.000803
come,1.6e-05,0.021401,0.000725
ok,1.9e-05,0.027237,0.000692
it,4.5e-05,0.068093,0.000658


In [101]:
tokens.sort_values('ham_to_spam_ratio',ascending=False).tail(20) #most common spam word

Unnamed: 0_level_0,ham,spam,ham_to_spam_ratio
token,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
000,8.722201e-07,0.068093,1.3e-05
ringtone,8.722201e-07,0.070039,1.2e-05
16,9.594421e-07,0.077821,1.2e-05
awarded,8.722201e-07,0.07393,1.2e-05
cs,8.722201e-07,0.075875,1.1e-05
1000,8.722201e-07,0.077821,1.1e-05
50,1.133886e-06,0.103113,1.1e-05
co,9.594421e-07,0.087549,1.1e-05
500,8.722201e-07,0.079767,1.1e-05
18,8.722201e-07,0.087549,1e-05


In [104]:
tokens['spam_to_ham_ratio']=tokens.spam/tokens.ham

In [105]:
tokens.sort_values('spam_to_ham_ratio',ascending=False).head(20) #most common spam words

Unnamed: 0_level_0,ham,spam,ham_to_spam_ratio,spam-to_ham_ratio,spam_to_ham_ratio
token,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
claim,8.722201e-07,0.171206,5e-06,196287.869261,196287.869261
prize,8.722201e-07,0.155642,6e-06,178443.51751,178443.51751
www,9.594421e-07,0.142023,7e-06,148027.008843,148027.008843
txt,1.657218e-06,0.229572,7e-06,138528.520172,138528.520172
150p,8.722201e-07,0.116732,7e-06,133832.638132,133832.638132
tone,8.722201e-07,0.108949,8e-06,124910.462257,124910.462257
uk,9.594421e-07,0.118677,8e-06,123693.80191,123693.80191
mobile,1.569996e-06,0.180934,9e-06,115244.771725,115244.771725
guaranteed,8.722201e-07,0.099222,9e-06,113757.742412,113757.742412
nokia,9.594421e-07,0.099222,1e-05,103416.129466,103416.129466


In [107]:
tokens.sort_values('spam_to_ham_ratio',ascending=False).tail(20) #most common ham words

Unnamed: 0_level_0,ham,spam,ham_to_spam_ratio,spam-to_ham_ratio,spam_to_ham_ratio
token,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
so,2.9e-05,0.060311,0.000476,2101.72836,2101.72836
da,9e-06,0.019455,0.00048,2084.620532,2084.620532
its,1.5e-05,0.029183,0.000499,2003.482607,2003.482607
home,1e-05,0.019455,0.000502,1991.557115,1991.557115
lor,1e-05,0.019455,0.000511,1956.617517,1956.617517
not,2.8e-05,0.052529,0.00053,1887.921228,1887.921228
she,1.1e-05,0.019455,0.000556,1798.825781,1798.825781
when,1.9e-05,0.033074,0.000572,1747.430759,1747.430759
how,2e-05,0.033074,0.000591,1692.823548,1692.823548
that,3.5e-05,0.058366,0.000604,1656.344531,1656.344531
