### SPAM Detection - The Naive Bayes Algorithm in Python with Scikit-Learn 
D. Shahrokhian
https://stackabuse.com/the-naive-bayes-algorithm-in-python-with-scikit-learn/

In [40]:
import pandas as pd

# Data: UCI "SMS Spam Collection" data set
# https://archive.ics.uci.edu/ml/datasets/sms+spam+collection
# The file is read as tab-separated text with no header row, so the two
# columns are named explicitly here.
df = pd.read_table('SMSSpamCollection', sep='\t', header=None, names=['label', 'message'])
df.head()

Unnamed: 0,label,message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [41]:
# Encode the class label numerically: ham -> 0, spam -> 1.
# NOTE: this overwrites the column in place, so re-running the cell maps the
# already-numeric labels to NaN; reload `df` before executing it again.
df['label'] = df.label.map({'ham': 0, 'spam': 1})

# Lower-case every message with the vectorised string accessor.
df['message'] = df.message.str.lower()

# Strip punctuation (anything that is neither a word character nor whitespace).
# `regex=True` is required: since pandas 2.0 `str.replace` treats the pattern
# as a literal string by default, which would silently leave the text untouched.
# The raw string avoids invalid-escape warnings for `\w` and `\s`.
df['message'] = df.message.str.replace(r'[^\w\s]', '', regex=True)
df.head()

Unnamed: 0,label,message
0,0,go until jurong point crazy available only in ...
1,0,ok lar joking wif u oni
2,1,free entry in 2 a wkly comp to win fa cup fina...
3,0,u dun say so early hor u c already then say
4,0,nah i dont think he goes to usf he lives aroun...


In [42]:
# Split every message into a list of word tokens with NLTK
# (Natural Language Toolkit, https://www.nltk.org/).
# The Punkt tokenizer data is downloaded first:
# https://www.nltk.org/api/nltk.tokenize.html#module-nltk.tokenize.punkt
import nltk
nltk.download('punkt')

# Element-wise tokenisation: each string becomes a list of tokens.
df['message'] = df['message'].map(nltk.word_tokenize)
df.head()

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\gh\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


Unnamed: 0,label,message
0,0,"[go, until, jurong, point, crazy, available, o..."
1,0,"[ok, lar, joking, wif, u, oni]"
2,1,"[free, entry, in, 2, a, wkly, comp, to, win, f..."
3,0,"[u, dun, say, so, early, hor, u, c, already, t..."
4,0,"[nah, i, dont, think, he, goes, to, usf, he, l..."


In [43]:
# Reduce every token to its stem with the Porter stemmer.
# https://www.nltk.org/api/nltk.stem.html
# https://tartarus.org/martin/PorterStemmer/
from nltk.stem import PorterStemmer
stemmer = PorterStemmer()


def stem_tokens(tokens):
    """Return a new list holding the Porter stem of each token, in order."""
    return [stemmer.stem(token) for token in tokens]


df['message'] = df['message'].apply(stem_tokens)
df.head()

Unnamed: 0,label,message
0,0,"[go, until, jurong, point, crazi, avail, onli,..."
1,0,"[ok, lar, joke, wif, u, oni]"
2,1,"[free, entri, in, 2, a, wkli, comp, to, win, f..."
3,0,"[u, dun, say, so, earli, hor, u, c, alreadi, t..."
4,0,"[nah, i, dont, think, he, goe, to, usf, he, li..."


In [44]:
# Collapse each list of stemmed tokens back into one space-separated string,
# the input format expected by the vectorizer in the next step.
df['message'] = df['message'].map(' '.join)
df.head()

Unnamed: 0,label,message
0,0,go until jurong point crazi avail onli in bugi...
1,0,ok lar joke wif u oni
2,1,free entri in 2 a wkli comp to win fa cup fina...
3,0,u dun say so earli hor u c alreadi then say
4,0,nah i dont think he goe to usf he live around ...


In [45]:
# Convert the collection of messages into a sparse matrix of token counts.
# http://scikit-learn.org/stable/modules/generated/sklearn.feature_extraction.text.CountVectorizer.html
# http://scikit-learn.org/stable/modules/feature_extraction.html#text-feature-extraction
# The custom token_pattern (\w+ between word boundaries) also keeps
# one-character tokens such as "u" or "2" in the vocabulary.
from sklearn.feature_extraction.text import CountVectorizer
count_vect = CountVectorizer(token_pattern=r"(?u)\b\w+\b")
counts = count_vect.fit_transform(df['message'])
# print() with parentheses runs on both Python 2 and Python 3; the bare
# `print counts` statement is a SyntaxError on Python 3.
print(counts)

  (0, 7712)	1
  (0, 1146)	1
  (0, 3387)	1
  (0, 7127)	1
  (0, 2028)	1
  (0, 1747)	1
  (0, 4272)	1
  (0, 7922)	1
  (0, 3424)	1
  (0, 1749)	1
  (0, 3871)	1
  (0, 5290)	1
  (0, 1339)	1
  (0, 2247)	1
  (0, 5633)	1
  (0, 4127)	1
  (0, 7494)	1
  (0, 3335)	1
  (1, 5287)	1
  (1, 7832)	1
  (1, 4093)	1
  (1, 4307)	1
  (1, 5255)	1
  (2, 71)	1
  (2, 1220)	1
  :	:
  (5570, 2759)	1
  (5570, 1776)	1
  (5570, 6593)	1
  (5570, 1772)	1
  (5570, 7531)	1
  (5570, 2491)	1
  (5570, 5047)	1
  (5570, 1462)	1
  (5570, 7106)	1
  (5570, 3104)	1
  (5570, 6584)	1
  (5570, 4395)	1
  (5570, 3822)	1
  (5570, 1160)	1
  (5570, 7751)	1
  (5570, 3986)	1
  (5570, 3558)	1
  (5570, 7233)	1
  (5570, 3147)	1
  (5570, 3871)	1
  (5571, 6112)	1
  (5571, 7363)	1
  (5571, 4969)	1
  (5571, 3986)	2
  (5571, 7233)	1


In [46]:
# Dimensions of the token-count matrix: one row per message, one column per
# vocabulary term learned by the vectorizer.
counts.shape

(5572, 8165)

In [56]:
# Train a multinomial Naive Bayes classifier on 90% of the messages and
# evaluate it on the remaining 10%.
# A fixed random_state makes the split reproducible:
# https://stackoverflow.com/questions/28064634/random-state-pseudo-random-numberin-scikit-learn
import numpy as np
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB

X_train, X_test, y_train, y_test = train_test_split(
    counts, df['label'], test_size=0.1, random_state=69)

model = MultinomialNB()
model.fit(X_train, y_train)
predicted = model.predict(X_test)

# Accuracy: fraction of test messages whose predicted label matches the truth.
print(np.mean(predicted == y_test))

# Confusion matrix: rows are the true classes, columns the predicted ones.
print(confusion_matrix(y_test, predicted))

0.985663082437276
[[477   5]
 [  3  73]]


In [60]:
# Repeat the experiment on several random 90/10 splits (no fixed seed) and
# report the accuracy and confusion matrix of each run plus the mean accuracy.
# Imports are done once here rather than on every loop iteration.
import numpy as np
from sklearn.metrics import confusion_matrix

n_runs = 10  # number of random train/test splits to average over
per = 0      # running sum of the per-run accuracies
for i in range(n_runs):
    X_train, X_test, y_train, y_test = train_test_split(counts, df['label'], test_size=0.1)
    model = MultinomialNB().fit(X_train, y_train)

    predicted = model.predict(X_test)
    # Compute the accuracy once and reuse it for printing and accumulation.
    accuracy = np.mean(predicted == y_test)
    print(accuracy)
    per += accuracy

    print(confusion_matrix(y_test, predicted))

# print() calls work on both Python 2 and Python 3; the original bare print
# statements are a SyntaxError on Python 3.
print("average perfromance")
print(per / float(n_runs))

0.982078853046595
[[480   6]
 [  4  68]]
0.985663082437276
[[471   7]
 [  1  79]]
0.9838709677419355
[[473   5]
 [  4  76]]
0.978494623655914
[[478  10]
 [  2  68]]
0.9838709677419355
[[471   8]
 [  1  78]]
0.974910394265233
[[479   9]
 [  5  65]]
0.9838709677419355
[[484   5]
 [  4  65]]
0.9731182795698925
[[467   7]
 [  8  76]]
0.974910394265233
[[476   3]
 [ 11  68]]
0.985663082437276
[[492   6]
 [  2  58]]
average perfromance
0.9806451612903226


In [61]:
# Repeat the experiment on several random 80/20 splits (no fixed seed) and
# report the accuracy and confusion matrix of each run plus the mean accuracy.
# Imports are done once here rather than on every loop iteration.
import numpy as np
from sklearn.metrics import confusion_matrix

n_runs = 10  # number of random train/test splits to average over
per = 0      # running sum of the per-run accuracies
for i in range(n_runs):
    X_train, X_test, y_train, y_test = train_test_split(counts, df['label'], test_size=0.2)
    model = MultinomialNB().fit(X_train, y_train)

    predicted = model.predict(X_test)
    # Compute the accuracy once and reuse it for printing and accumulation.
    accuracy = np.mean(predicted == y_test)
    print(accuracy)
    per += accuracy

    print(confusion_matrix(y_test, predicted))

# print() calls work on both Python 2 and Python 3; the original bare print
# statements are a SyntaxError on Python 3.
print("average perfromance")
print(per / float(n_runs))

0.9766816143497757
[[952  13]
 [ 13 137]]
0.9775784753363229
[[951  10]
 [ 15 139]]
0.9766816143497757
[[952  13]
 [ 13 137]]
0.9802690582959641
[[952  13]
 [  9 141]]
0.9757847533632287
[[936  21]
 [  6 152]]
0.9748878923766816
[[954  15]
 [ 13 133]]
0.9766816143497757
[[953  14]
 [ 12 136]]
0.9748878923766816
[[950  17]
 [ 11 137]]
0.9730941704035875
[[935  18]
 [ 12 150]]
0.9802690582959641
[[970  15]
 [  7 123]]
average perfromance
0.9766816143497756


In [62]:
# Repeat the experiment on several random 50/50 splits (no fixed seed) and
# report the accuracy and confusion matrix of each run plus the mean accuracy.
# Imports are done once here rather than on every loop iteration.
import numpy as np
from sklearn.metrics import confusion_matrix

n_runs = 10  # number of random train/test splits to average over
per = 0      # running sum of the per-run accuracies
for i in range(n_runs):
    X_train, X_test, y_train, y_test = train_test_split(counts, df['label'], test_size=0.5)
    model = MultinomialNB().fit(X_train, y_train)

    predicted = model.predict(X_test)
    # Compute the accuracy once and reuse it for printing and accumulation.
    accuracy = np.mean(predicted == y_test)
    print(accuracy)
    per += accuracy

    print(confusion_matrix(y_test, predicted))

# print() calls work on both Python 2 and Python 3; the original bare print
# statements are a SyntaxError on Python 3.
print("average perfromance")
print(per / float(n_runs))

0.9727207465900933
[[2380   40]
 [  36  330]]
0.9777458722182341
[[2376   27]
 [  35  348]]
0.9770279971284996
[[2389   31]
 [  33  333]]
0.9752333094041636
[[2360   38]
 [  31  357]]
0.9748743718592965
[[2381   32]
 [  38  335]]
0.9788226848528356
[[2387   26]
 [  33  340]]
0.9748743718592965
[[2379   25]
 [  45  337]]
0.9734386216798278
[[2385   31]
 [  43  327]]
0.9773869346733668
[[2400   28]
 [  35  323]]
0.9784637473079684
[[2383   30]
 [  30  343]]
average perfromance
0.9760588657573581
