### SPAM Detection - The Naive Bayes Algorithm in Python with Scikit-Learn 
D. Shahrokhian
https://stackabuse.com/the-naive-bayes-algorithm-in-python-with-scikit-learn/

In [4]:
import pandas as pd

# Toy corpus: four tiny "documents" that are small enough to check by hand.
raw_test_documents = [
    "Taipei Taiwan",
    "Macao Taiwan Shanghai",
    "Japan Sapporo",
    "Sapporo Osaka Taiwan"
]

# One class label per document, in the same order as raw_test_documents.
documents_classes = [1, 1, 0, 0]

test_set = pd.DataFrame({
    "label": documents_classes,
    "message": raw_test_documents
})

# Plain-array snapshot of the untouched toy data.
data = test_set.values

# SMS Spam Collection Data Set
# https://archive.ics.uci.edu/ml/datasets/sms+spam+collection
#
# Previously the corpus file was always read and then immediately replaced by
# the toy frame, so the read was wasted work (and an error if the file was
# missing). The switch below makes the choice explicit; the default keeps the
# toy data, exactly as before.
USE_TOY_DATA = True

if USE_TOY_DATA:
    # Copy, so the in-place cleaning done on df later does not also rewrite
    # test_set (plain assignment would make both names point at one frame).
    df = test_set.copy()
else:
    # Tab-separated file without a header row: column 0 is the label,
    # column 1 the message text.
    df = pd.read_table('data/SMSSpamCollection',
                       sep='\t',
                       header=None,
                       names=['label', 'message'])
df.head()

Unnamed: 0,label,message
0,1,Taipei Taiwan
1,1,Macao Taiwan Shanghai
2,0,Japan Sapporo
3,0,Sapporo Osaka Taiwan


In [5]:
#df['label'] = df.label.map({'ham': 0, 'spam': 1})

# Normalise the text: lower-case every message, then drop every character
# that is neither a word character nor whitespace (i.e. punctuation).
df['message'] = df['message'].str.lower()
# regex=True must be explicit: recent pandas versions treat the pattern as a
# literal string by default, which would silently leave punctuation in place.
# The raw string avoids the invalid-escape warning for "\w" / "\s".
df['message'] = df['message'].str.replace(r'[^\w\s]', '', regex=True)
df.head()

Unnamed: 0,label,message
0,1,taipei taiwan
1,1,macao taiwan shanghai
2,0,japan sapporo
3,0,sapporo osaka taiwan


In [6]:
# NLTK (Natural Language Toolkit): https://www.nltk.org/
# Punkt Sentence Tokenizer: https://www.nltk.org/api/nltk.tokenize.html#module-nltk.tokenize.punkt
import nltk

# Make sure the 'punkt' resource is available locally before tokenizing.
nltk.download('punkt')

# Turn each message string into a list of word tokens.
df['message'] = df['message'].map(nltk.word_tokenize)
df.head()

[nltk_data] Downloading package punkt to /home/cegard/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


Unnamed: 0,label,message
0,1,"[taipei, taiwan]"
1,1,"[macao, taiwan, shanghai]"
2,0,"[japan, sapporo]"
3,0,"[sapporo, osaka, taiwan]"


In [7]:
# NLTK stemmers: https://www.nltk.org/api/nltk.stem.html
# Porter stemming algorithm: https://tartarus.org/martin/PorterStemmer/
from nltk.stem import PorterStemmer

stemmer = PorterStemmer()


def _stem_all(tokens):
    """Return the Porter stem of every token in `tokens`, order preserved."""
    return [stemmer.stem(token) for token in tokens]


# Each row holds a list of tokens; replace it with the list of their stems.
df['message'] = df['message'].apply(_stem_all)
df.head()

Unnamed: 0,label,message
0,1,"[taipei, taiwan]"
1,1,"[macao, taiwan, shanghai]"
2,0,"[japan, sapporo]"
3,0,"[sapporo, osaka, taiwan]"


In [8]:
# Collapse each token list back into a single space-delimited string.
df['message'] = df['message'].map(' '.join)
df.head()

Unnamed: 0,label,message
0,1,taipei taiwan
1,1,macao taiwan shanghai
2,0,japan sapporo
3,0,sapporo osaka taiwan


In [9]:
# Convert a collection of text documents to a matrix of token counts
# http://scikit-learn.org/stable/modules/generated/sklearn.feature_extraction.text.CountVectorizer.html
# http://scikit-learn.org/stable/modules/feature_extraction.html#text-feature-extraction
# The custom token_pattern below also accepts one-letter words.
from sklearn.feature_extraction.text import CountVectorizer
count_vect = CountVectorizer(token_pattern=r"(?u)\b\w+\b")
# Learn the vocabulary from the messages and build the sparse
# document-by-token count matrix in one step.
counts = count_vect.fit_transform(df['message'])
# print() with parentheses runs on both Python 2 and Python 3; the bare
# `print counts` statement is a SyntaxError on Python 3.
print(counts)

  (0, 6)	1
  (0, 5)	1
  (1, 4)	1
  (1, 1)	1
  (1, 6)	1
  (2, 3)	1
  (2, 0)	1
  (3, 2)	1
  (3, 3)	1
  (3, 6)	1


In [10]:
# Dimensions of the count matrix; the recorded output (4, 7) matches the
# 4 toy documents and their 7 distinct tokens.
counts.shape

(4, 7)

In [12]:
from sklearn.feature_extraction.text import TfidfTransformer

# Fit the TF-IDF weighting on the raw token counts.
transformer = TfidfTransformer().fit(counts)

# NOTE: this rebinds `counts` from raw counts to TF-IDF weights, so the cell
# is not idempotent: running it again without first re-running the
# CountVectorizer cell would apply the weighting to already-weighted values.
counts = transformer.transform(counts)
# print() with parentheses runs on both Python 2 and Python 3; the bare
# `print counts` statement is a SyntaxError on Python 3.
print(counts)

  (0, 5)	0.8429263481500496
  (0, 6)	0.5380289691033573
  (1, 6)	0.41137791133379387
  (1, 1)	0.6445029922609534
  (1, 4)	0.6445029922609534
  (2, 0)	0.7852882757103967
  (2, 3)	0.6191302964899972
  (3, 6)	0.4480997313625986
  (3, 3)	0.5534923152870045
  (3, 2)	0.7020348194149619


In [13]:
# https://stackoverflow.com/questions/28064634/random-state-pseudo-random-numberin-scikit-learn
import numpy as np
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB

# Hold out 10% of the rows for testing; the fixed random_state makes the
# split identical on every run.
X_train, X_test, y_train, y_test = train_test_split(
    counts, df['label'], test_size=0.1, random_state=69)

model = MultinomialNB().fit(X_train, y_train)

predicted = model.predict(X_test)
# Accuracy: fraction of held-out rows whose predicted label equals the truth.
print(np.mean(predicted == y_test))

# Pin the label set so the matrix always has one row/column per class.
# Without it the shape depends on which classes happen to show up in the
# (tiny) test split, which makes outputs of different runs incomparable.
print(confusion_matrix(y_test, predicted, labels=np.unique(df['label'])))

0.0
[[0 0]
 [1 0]]


In [14]:
# Repeat the split/fit/score experiment several times with a 10% test split
# and report the mean accuracy.
import numpy as np
from sklearn.metrics import confusion_matrix

n_runs = 10
# Fixed label set so every confusion matrix has the same shape.
all_labels = np.unique(df['label'])

per = 0
for i in range(n_runs):
    # Seed each repetition with its index: the splits still differ from run
    # to run, but the whole experiment is reproducible.
    X_train, X_test, y_train, y_test = train_test_split(
        counts, df['label'], test_size=0.1, random_state=i)
    model = MultinomialNB().fit(X_train, y_train)

    predicted = model.predict(X_test)
    accuracy = np.mean(predicted == y_test)
    print(accuracy)
    per += accuracy

    print(confusion_matrix(y_test, predicted, labels=all_labels))

# print() with parentheses runs on both Python 2 and Python 3.
print("average performance")
print(per / float(n_runs))

0.0
[[0 0]
 [1 0]]
0.0
[[0 1]
 [0 0]]
0.0
[[0 1]
 [0 0]]
0.0
[[0 1]
 [0 0]]
0.0
[[0 1]
 [0 0]]
0.0
[[0 1]
 [0 0]]
0.0
[[0 1]
 [0 0]]
0.0
[[0 0]
 [1 0]]
0.0
[[0 1]
 [0 0]]
0.0
[[0 1]
 [0 0]]
average perfromance
0.0


In [15]:
# Repeat the split/fit/score experiment several times with a 20% test split
# and report the mean accuracy.
import numpy as np
from sklearn.metrics import confusion_matrix

n_runs = 10
# Fixed label set so every confusion matrix has the same shape.
all_labels = np.unique(df['label'])

per = 0
for i in range(n_runs):
    # Seed each repetition with its index: the splits still differ from run
    # to run, but the whole experiment is reproducible.
    X_train, X_test, y_train, y_test = train_test_split(
        counts, df['label'], test_size=0.2, random_state=i)
    model = MultinomialNB().fit(X_train, y_train)

    predicted = model.predict(X_test)
    accuracy = np.mean(predicted == y_test)
    print(accuracy)
    per += accuracy

    print(confusion_matrix(y_test, predicted, labels=all_labels))

# print() with parentheses runs on both Python 2 and Python 3.
print("average performance")
print(per / float(n_runs))

0.0
[[0 1]
 [0 0]]
0.0
[[0 0]
 [1 0]]
0.0
[[0 0]
 [1 0]]
0.0
[[0 1]
 [0 0]]
0.0
[[0 0]
 [1 0]]
0.0
[[0 0]
 [1 0]]
0.0
[[0 0]
 [1 0]]
0.0
[[0 1]
 [0 0]]
0.0
[[0 0]
 [1 0]]
0.0
[[0 1]
 [0 0]]
average perfromance
0.0


In [16]:
# Repeat the split/fit/score experiment several times with a 50% test split
# and report the mean accuracy.
import numpy as np
from sklearn.metrics import confusion_matrix

n_runs = 10
# Fixed label set so every confusion matrix has the same shape.
all_labels = np.unique(df['label'])

per = 0
for i in range(n_runs):
    # Seed each repetition with its index: the splits still differ from run
    # to run, but the whole experiment is reproducible.
    X_train, X_test, y_train, y_test = train_test_split(
        counts, df['label'], test_size=0.5, random_state=i)
    model = MultinomialNB().fit(X_train, y_train)

    predicted = model.predict(X_test)
    accuracy = np.mean(predicted == y_test)
    print(accuracy)
    per += accuracy

    print(confusion_matrix(y_test, predicted, labels=all_labels))

# print() with parentheses runs on both Python 2 and Python 3.
print("average performance")
print(per / float(n_runs))

0.5
[[1 0]
 [1 0]]
1.0
[[1 0]
 [0 1]]
0.5
[[1 0]
 [1 0]]
1.0
[[1 0]
 [0 1]]
0.5
[[1 0]
 [1 0]]
1.0
[[1 0]
 [0 1]]
1.0
[[1 0]
 [0 1]]
0.5
[[1 0]
 [1 0]]
0.0
[[0 0]
 [2 0]]
1.0
[[1 0]
 [0 1]]
average perfromance
0.7


  self.class_log_prior_ = (np.log(self.class_count_) -
