### SPAM Detection - The Naive Bayes Algorithm in Python with Scikit-Learn 
D. Shahrokhian
https://stackabuse.com/the-naive-bayes-algorithm-in-python-with-scikit-learn/

In [1]:
import pandas as pd

# Tiny hand-written corpus used to walk through the pipeline.
raw_test_documents = [
    "Chinese Beijing Chinese",
    "Chinese Chinese Shanghai",
    "Chinese Macao",
    "Tokyo Japan Chinese",
]

# Integer class label for each document, in the same order as the corpus.
documents_classes = [1, 1, 1, 0]

# Build the frame row by row: one (label, message) pair per document.
test_set = pd.DataFrame(
    list(zip(documents_classes, raw_test_documents)),
    columns=["label", "message"],
)

# `df` is the name the rest of the notebook works on (same object as test_set).
df = test_set
df.head()

Unnamed: 0,label,message
0,1,Chinese Beijing Chinese
1,1,Chinese Chinese Shanghai
2,1,Chinese Macao
3,0,Tokyo Japan Chinese


In [2]:
# Label encoding is not needed for this toy frame because `label` already
# holds integers. For a raw ham/spam dataset the mapping would be:
#   df['label'] = df.label.map({'ham': 0, 'spam': 1})

# Normalise the text: lower-case every message, then drop every character
# that is neither a word character nor whitespace (i.e. punctuation).
df['message'] = df['message'].str.lower()
# The pattern is a raw string so `\w` / `\s` are not treated as (invalid)
# string escapes, and regex=True is explicit because newer pandas releases
# treat the pattern as a literal string by default.
df['message'] = df['message'].str.replace(r'[^\w\s]', '', regex=True)
df.head()

Unnamed: 0,label,message
0,1,chinese beijing chinese
1,1,chinese chinese shanghai
2,1,chinese macao
3,0,tokyo japan chinese


In [3]:
# NLTK (Natural Language Toolkit): https://www.nltk.org/
# Punkt Sentence Tokenizer:
# https://www.nltk.org/api/nltk.tokenize.html#module-nltk.tokenize.punkt
import nltk

# Fetch the Punkt tokenizer data before tokenizing.
nltk.download('punkt')

# Replace each message string with its list of word tokens.
tokenized_messages = df['message'].map(nltk.word_tokenize)
df['message'] = tokenized_messages
df.head()

[nltk_data] Downloading package punkt to /home/cegard/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


Unnamed: 0,label,message
0,1,"[chinese, beijing, chinese]"
1,1,"[chinese, chinese, shanghai]"
2,1,"[chinese, macao]"
3,0,"[tokyo, japan, chinese]"


In [4]:
# NLTK stemmers: https://www.nltk.org/api/nltk.stem.html
# Porter stemming algorithm: https://tartarus.org/martin/PorterStemmer/
from nltk.stem import PorterStemmer

stemmer = PorterStemmer()


def _stem_tokens(tokens):
    """Return a new list with the Porter stem of every token in `tokens`."""
    return [stemmer.stem(token) for token in tokens]


# Each row holds a list of tokens; stem them one row at a time.
df['message'] = df['message'].map(_stem_tokens)
df.head()

Unnamed: 0,label,message
0,1,"[chines, beij, chines]"
1,1,"[chines, chines, shanghai]"
2,1,"[chines, macao]"
3,0,"[tokyo, japan, chines]"


In [5]:
# Turn each row's list of tokens back into a single space-separated string.
joined_messages = df['message'].map(' '.join)
df['message'] = joined_messages
df.head()

Unnamed: 0,label,message
0,1,chines beij chines
1,1,chines chines shanghai
2,1,chines macao
3,0,tokyo japan chines


In [6]:
# Convert the collection of text documents to a sparse matrix of token counts.
# http://scikit-learn.org/stable/modules/generated/sklearn.feature_extraction.text.CountVectorizer.html
# http://scikit-learn.org/stable/modules/feature_extraction.html#text-feature-extraction
from sklearn.feature_extraction.text import CountVectorizer

# This token_pattern also accepts one-letter words as tokens.
count_vect = CountVectorizer(token_pattern=r"(?u)\b\w+\b")
counts = count_vect.fit_transform(df['message'])
# print() as a function call: valid on Python 3 and, with a single argument,
# prints the same text on Python 2.
print(counts)

  (0, 0)	1
  (0, 1)	2
  (1, 4)	1
  (1, 1)	2
  (2, 3)	1
  (2, 1)	1
  (3, 2)	1
  (3, 5)	1
  (3, 1)	1


In [7]:
# Dimensions of the token-count matrix: one row per document and one column
# per distinct token found by the vectorizer.
counts.shape

(4, 6)

In [8]:
from sklearn.feature_extraction.text import TfidfTransformer

# Learn the IDF weights from the raw token counts.
transformer = TfidfTransformer().fit(counts)

# NOTE: `counts` is rebound from raw counts to TF-IDF weights here, so this
# cell is not idempotent: re-running it on its own would apply TF-IDF to
# already-weighted values. Re-run the vectorizer cell first.
counts = transformer.transform(counts)
# print() as a function call: valid on Python 3 and, with a single argument,
# prints the same text on Python 2.
print(counts)

  (0, 1)	0.7220560017292982
  (0, 0)	0.6918346120039814
  (1, 1)	0.7220560017292982
  (1, 4)	0.6918346120039814
  (2, 1)	0.46263733109032296
  (2, 3)	0.8865476297873808
  (3, 1)	0.34618161159873423
  (3, 5)	0.6633846138519129
  (3, 2)	0.6633846138519129


In [9]:
import numpy as np
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import BernoulliNB

# Background on the random_state argument:
# https://stackoverflow.com/questions/28064634/random-state-pseudo-random-numberin-scikit-learn
X_train, X_test, y_train, y_test = train_test_split(
    counts,
    df['label'],
    test_size=0.1,
    random_state=69,
)

# Fit a Bernoulli Naive Bayes classifier on the training portion.
model = BernoulliNB()
model.fit(X_train, y_train)

# Accuracy = fraction of held-out documents whose predicted label is correct.
predicted = model.predict(X_test)
accuracy = np.mean(predicted == y_test)
print(accuracy)

print(confusion_matrix(y_test, predicted))

1.0
[[1]]


In [10]:
import numpy as np
from sklearn.metrics import confusion_matrix

# Repeat the split / fit / score experiment over several different splits
# (10% of the documents held out each time) and average the accuracy.
N_RUNS = 10

per = 0
for i in range(N_RUNS):
    # Seeding each split with the run index keeps the splits different from
    # run to run while making the whole cell reproducible on re-execution.
    X_train, X_test, y_train, y_test = train_test_split(
        counts, df['label'], test_size=0.1, random_state=i)
    model = BernoulliNB().fit(X_train, y_train)

    # Accuracy of this run: fraction of held-out labels predicted correctly.
    predicted = model.predict(X_test)
    accuracy = np.mean(predicted == y_test)
    print(accuracy)
    per += accuracy

    print(confusion_matrix(y_test, predicted))

# print() as a function call works on both Python 2 and Python 3.
print("average performance")
print(per / float(N_RUNS))

1.0
[[1]]
0.0
[[0 1]
 [0 0]]
0.0
[[0 1]
 [0 0]]
1.0
[[1]]
0.0
[[0 1]
 [0 0]]
1.0
[[1]]
1.0
[[1]]
1.0
[[1]]
1.0
[[1]]
1.0
[[1]]
average perfromance
0.7


  self.class_log_prior_ = (np.log(self.class_count_) -


In [11]:
import numpy as np
from sklearn.metrics import confusion_matrix

# Repeat the split / fit / score experiment over several different splits
# (20% of the documents held out each time) and average the accuracy.
N_RUNS = 10

per = 0
for i in range(N_RUNS):
    # Seeding each split with the run index keeps the splits different from
    # run to run while making the whole cell reproducible on re-execution.
    X_train, X_test, y_train, y_test = train_test_split(
        counts, df['label'], test_size=0.2, random_state=i)
    model = BernoulliNB().fit(X_train, y_train)

    # Accuracy of this run: fraction of held-out labels predicted correctly.
    predicted = model.predict(X_test)
    accuracy = np.mean(predicted == y_test)
    print(accuracy)
    per += accuracy

    print(confusion_matrix(y_test, predicted))

# print() as a function call works on both Python 2 and Python 3.
print("average performance")
print(per / float(N_RUNS))

1.0
[[1]]
0.0
[[0 1]
 [0 0]]
1.0
[[1]]
1.0
[[1]]
1.0
[[1]]
1.0
[[1]]
1.0
[[1]]
1.0
[[1]]
1.0
[[1]]
1.0
[[1]]
average perfromance
0.9


In [12]:
import numpy as np
from sklearn.metrics import confusion_matrix

# Repeat the split / fit / score experiment over several different splits
# (50% of the documents held out each time) and average the accuracy.
N_RUNS = 10

per = 0
for i in range(N_RUNS):
    # Seeding each split with the run index keeps the splits different from
    # run to run while making the whole cell reproducible on re-execution.
    X_train, X_test, y_train, y_test = train_test_split(
        counts, df['label'], test_size=0.5, random_state=i)
    model = BernoulliNB().fit(X_train, y_train)

    # Accuracy of this run: fraction of held-out labels predicted correctly.
    predicted = model.predict(X_test)
    accuracy = np.mean(predicted == y_test)
    print(accuracy)
    per += accuracy

    print(confusion_matrix(y_test, predicted))

# print() as a function call works on both Python 2 and Python 3.
print("average performance")
print(per / float(N_RUNS))

0.5
[[0 1]
 [0 1]]
1.0
[[2]]
1.0
[[2]]
0.5
[[0 1]
 [0 1]]
1.0
[[2]]
0.5
[[0 1]
 [0 1]]
0.5
[[0 1]
 [0 1]]
0.5
[[0 1]
 [0 1]]
0.5
[[0 1]
 [0 1]]
1.0
[[2]]
average perfromance
0.7
