### SPAM Detection - The Naive Bayes Algorithm in Python with Scikit-Learn 
D. Shahrokhian
https://stackabuse.com/the-naive-bayes-algorithm-in-python-with-scikit-learn/

In [1]:
import pandas as pd

# Tiny hand-written corpus used to walk through the Naive Bayes pipeline.
raw_test_documents = [
    "send us your password",
    "send us your review",
    "review your password",
    "review us",
    "send your password",
    "send us your account"
]

# One label per document, in the same order as raw_test_documents.
# Presumably 1 = spam and 0 = ham (this matches the commented-out
# 'ham'/'spam' mapping in the next cell) -- confirm.
documents_classes = [1, 0, 0, 1, 1, 1]

test_set = pd.DataFrame({
    "label": documents_classes,
    "message": raw_test_documents
})

# Work on a copy: the following cells overwrite df['message'] several times,
# and a plain `df = test_set` alias would silently rewrite test_set as well.
df = test_set.copy()
df.head()

Unnamed: 0,label,message
0,1,send us your password
1,0,send us your review
2,0,review your password
3,1,review us
4,1,send your password


In [2]:
# The labels built above are already numeric, so the textual
# 'ham'/'spam' -> 0/1 mapping below is not needed for this toy corpus:
# df['label'] = df.label.map({'ham': 0, 'spam': 1})

# Lower-case every message with the vectorised string accessor.
df['message'] = df['message'].str.lower()

# Strip everything that is neither a word character nor whitespace.
# regex=True is required: without it recent pandas versions treat the
# pattern as a literal substring and nothing is removed. The raw string
# keeps \w and \s from being read as (invalid) Python string escapes.
df['message'] = df['message'].str.replace(r'[^\w\s]', '', regex=True)
df.head()

Unnamed: 0,label,message
0,1,send us your password
1,0,send us your review
2,0,review your password
3,1,review us
4,1,send your password


In [3]:
# NLTK (Natural Language Toolkit): https://www.nltk.org/
# Punkt sentence tokenizer reference:
# https://www.nltk.org/api/nltk.tokenize.html#module-nltk.tokenize.punkt
import nltk

# Fetch the 'punkt' resource before tokenising.
nltk.download('punkt')

# Replace each message string with the list of tokens NLTK extracts from it.
tokenised_messages = df['message'].apply(nltk.word_tokenize)
df['message'] = tokenised_messages
df.head()

[nltk_data] Downloading package punkt to /home/cegard/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


Unnamed: 0,label,message
0,1,"[send, us, your, password]"
1,0,"[send, us, your, review]"
2,0,"[review, your, password]"
3,1,"[review, us]"
4,1,"[send, your, password]"


In [4]:
# NLTK stemmers: https://www.nltk.org/api/nltk.stem.html
# Porter stemming algorithm: https://tartarus.org/martin/PorterStemmer/
from nltk.stem import PorterStemmer

stemmer = PorterStemmer()


def _stem_tokens(tokens):
    """Return the Porter stem of every token in ``tokens``, preserving order."""
    return [stemmer.stem(token) for token in tokens]


# Each row of df['message'] is a list of tokens at this point.
df['message'] = df['message'].apply(_stem_tokens)
df.head()

Unnamed: 0,label,message
0,1,"[send, us, your, password]"
1,0,"[send, us, your, review]"
2,0,"[review, your, password]"
3,1,"[review, us]"
4,1,"[send, your, password]"


In [5]:
# Collapse each token list back into one space-separated string, which is
# the input form the vectoriser in the next step is given.
df['message'] = df['message'].apply(' '.join)
df.head()

Unnamed: 0,label,message
0,1,send us your password
1,0,send us your review
2,0,review your password
3,1,review us
4,1,send your password


In [6]:
# Convert a collection of text documents to a matrix of token counts
# http://scikit-learn.org/stable/modules/generated/sklearn.feature_extraction.text.CountVectorizer.html
# http://scikit-learn.org/stable/modules/feature_extraction.html#text-feature-extraction
from sklearn.feature_extraction.text import CountVectorizer

# This token_pattern accepts tokens of one or more word characters, so
# one-letter words are counted as well.
count_vect = CountVectorizer(token_pattern=r"(?u)\b\w+\b")
counts = count_vect.fit_transform(df['message'])

# print() is called as a function so the cell runs on Python 3 as well as
# Python 2; the matrix prints as (row, column) <tab> value entries.
print(counts)

  (0, 1)	1
  (0, 5)	1
  (0, 4)	1
  (0, 3)	1
  (1, 2)	1
  (1, 5)	1
  (1, 4)	1
  (1, 3)	1
  (2, 2)	1
  (2, 1)	1
  (2, 5)	1
  (3, 2)	1
  (3, 4)	1
  (4, 1)	1
  (4, 5)	1
  (4, 3)	1
  (5, 0)	1
  (5, 5)	1
  (5, 4)	1
  (5, 3)	1


In [7]:
# (rows, columns) of the matrix returned by count_vect.fit_transform above;
# per scikit-learn's CountVectorizer docs that is (n_samples, n_features).
counts.shape

(6, 6)

In [8]:
from sklearn.feature_extraction.text import TfidfTransformer

# Fit the TF-IDF transformer on the raw token counts.
transformer = TfidfTransformer().fit(counts)

# NOTE: `counts` is rebound from raw counts to TF-IDF weights because the
# later cells read the features under this name. Re-running this cell on its
# own would therefore fit and transform already-weighted values; re-run the
# CountVectorizer cell first.
counts = transformer.transform(counts)

# print() is called as a function so the cell runs on Python 3 as well as
# Python 2.
print(counts)

  (0, 3)	0.49340910033240876
  (0, 4)	0.49340910033240876
  (0, 5)	0.4260982255952437
  (0, 1)	0.5757909530054383
  (1, 3)	0.49340910033240876
  (1, 4)	0.49340910033240876
  (1, 5)	0.4260982255952437
  (1, 2)	0.5757909530054383
  (2, 5)	0.46363523955327796
  (2, 1)	0.6265151094125243
  (2, 2)	0.6265151094125243
  (3, 4)	0.6506955769621281
  (3, 2)	0.7593387031634324
  (4, 3)	0.5672690203831101
  (4, 5)	0.4898821745637731
  (4, 1)	0.66198286500351
  (5, 3)	0.4230672894407296
  (5, 4)	0.4230672894407296
  (5, 5)	0.36535244529668764
  (5, 0)	0.7131239218581008


In [9]:
# A fixed random_state is passed so this particular split is repeatable; see
# https://stackoverflow.com/questions/28064634/random-state-pseudo-random-numberin-scikit-learn
import numpy as np
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB

# NOTE(review): with the six documents defined at the top, test_size=0.1
# leaves a very small hold-out set (the recorded confusion matrix is 1x1),
# so this score is illustrative only.
X_train, X_test, y_train, y_test = train_test_split(
    counts,
    df['label'],
    test_size=0.1,
    random_state=69,
)

# Train a multinomial Naive Bayes classifier on the training portion.
model = MultinomialNB()
model.fit(X_train, y_train)

# Fraction of held-out documents whose predicted label equals the true one.
predicted = model.predict(X_test)
hits = predicted == y_test
print(np.mean(hits))

print(confusion_matrix(y_test, predicted))

1.0
[[1]]


In [10]:
# Repeat the split / fit / score cycle on fresh random splits (no
# random_state is passed, so every run and every re-execution differs) and
# report the mean accuracy over all runs.
# train_test_split and MultinomialNB are the names imported in the previous
# cell.
import numpy as np
from sklearn.metrics import confusion_matrix

N_RUNS = 10      # number of random splits to average over
TEST_SIZE = 0.1  # fraction of the documents held out in each split

per = 0  # running sum of the per-run accuracies
for i in range(N_RUNS):
    X_train, X_test, y_train, y_test = train_test_split(
        counts, df['label'], test_size=TEST_SIZE)
    model = MultinomialNB().fit(X_train, y_train)

    predicted = model.predict(X_test)
    # Compute the accuracy once and reuse it for both the printout and the sum.
    accuracy = np.mean(predicted == y_test)
    print(accuracy)
    per += accuracy

    print(confusion_matrix(y_test, predicted))

# print() is called as a function so the cell runs on Python 3 as well as
# Python 2.
print("average perfromance")
print(per / float(N_RUNS))

0.0
[[0 1]
 [0 0]]
0.0
[[0 1]
 [0 0]]
1.0
[[1]]
0.0
[[0 1]
 [0 0]]
0.0
[[0 1]
 [0 0]]
0.0
[[0 1]
 [0 0]]
0.0
[[0 0]
 [1 0]]
1.0
[[1]]
1.0
[[1]]
0.0
[[0 1]
 [0 0]]
average perfromance
0.3


In [11]:
# Repeat the split / fit / score cycle on fresh random splits (no
# random_state is passed, so every run and every re-execution differs) and
# report the mean accuracy over all runs.
# train_test_split and MultinomialNB are the names imported in an earlier
# cell.
# NOTE(review): the splits are not stratified, so with this few documents a
# training set may end up containing a single class -- confirm whether that
# is acceptable for this demonstration.
import numpy as np
from sklearn.metrics import confusion_matrix

N_RUNS = 10      # number of random splits to average over
TEST_SIZE = 0.2  # fraction of the documents held out in each split

per = 0  # running sum of the per-run accuracies
for i in range(N_RUNS):
    X_train, X_test, y_train, y_test = train_test_split(
        counts, df['label'], test_size=TEST_SIZE)
    model = MultinomialNB().fit(X_train, y_train)

    predicted = model.predict(X_test)
    # Compute the accuracy once and reuse it for both the printout and the sum.
    accuracy = np.mean(predicted == y_test)
    print(accuracy)
    per += accuracy

    print(confusion_matrix(y_test, predicted))

# print() is called as a function so the cell runs on Python 3 as well as
# Python 2.
print("average perfromance")
print(per / float(N_RUNS))

0.5
[[0 1]
 [0 1]]
0.5
[[0 1]
 [0 1]]
0.5
[[0 1]
 [0 1]]
0.5
[[0 0]
 [1 1]]
0.5
[[0 0]
 [1 1]]
0.5
[[0 1]
 [0 1]]
0.0
[[0 2]
 [0 0]]
0.5
[[0 0]
 [1 1]]
0.5
[[0 0]
 [1 1]]
0.5
[[0 1]
 [0 1]]
average perfromance
0.45


  self.class_log_prior_ = (np.log(self.class_count_) -


In [12]:
# Repeat the split / fit / score cycle on fresh random splits (no
# random_state is passed, so every run and every re-execution differs) and
# report the mean accuracy over all runs.
# train_test_split and MultinomialNB are the names imported in an earlier
# cell.
# NOTE(review): the splits are not stratified, so with this few documents a
# training set may end up containing a single class -- confirm whether that
# is acceptable for this demonstration.
import numpy as np
from sklearn.metrics import confusion_matrix

N_RUNS = 10      # number of random splits to average over
TEST_SIZE = 0.5  # fraction of the documents held out in each split

per = 0  # running sum of the per-run accuracies
for i in range(N_RUNS):
    X_train, X_test, y_train, y_test = train_test_split(
        counts, df['label'], test_size=TEST_SIZE)
    model = MultinomialNB().fit(X_train, y_train)

    predicted = model.predict(X_test)
    # Compute the accuracy once and reuse it for both the printout and the sum.
    accuracy = np.mean(predicted == y_test)
    print(accuracy)
    per += accuracy

    print(confusion_matrix(y_test, predicted))

# print() is called as a function so the cell runs on Python 3 as well as
# Python 2.
print("average perfromance")
print(per / float(N_RUNS))

0.0
[[0 0]
 [3 0]]
0.6666666666666666
[[0 1]
 [0 2]]
0.6666666666666666
[[0 1]
 [0 2]]
0.0
[[0 0]
 [3 0]]
0.3333333333333333
[[0 2]
 [0 1]]
0.6666666666666666
[[0 1]
 [0 2]]
0.6666666666666666
[[0 1]
 [0 2]]
0.6666666666666666
[[0 1]
 [0 2]]
0.3333333333333333
[[0 2]
 [0 1]]
0.6666666666666666
[[0 1]
 [0 2]]
average perfromance
0.4666666666666666
