In [1]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn import svm
from sklearn.svm import SVC
from sklearn import model_selection
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from sklearn.ensemble import VotingClassifier
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
import string

In [2]:
# Load the SMS spam corpus (path is relative to the kernel's working
# directory) and display it; the rendered frame has a Category (ham/spam)
# column and a Message column.
spam_df = pd.read_csv("Downloads/spam.csv")
spam_df

Unnamed: 0,Category,Message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."
...,...,...
5567,spam,This is the 2nd time we have tried 2 contact u...
5568,ham,Will ü b going to esplanade fr home?
5569,ham,"Pity, * was in mood for that. So...any other s..."
5570,ham,The guy did some bitching but I acted like i'd...


In [3]:
_STOP_WORD_SETS = {}


def _stop_words_for(language):
    """Return NLTK's stop-word list for `language` as a frozenset, loading it only once."""
    if language not in _STOP_WORD_SETS:
        _STOP_WORD_SETS[language] = frozenset(stopwords.words(language))
    return _STOP_WORD_SETS[language]


def preprocessMessage(words):
    """Normalize one message into a space-separated string of content tokens.

    The text is lower-cased, tokenized with NLTK's ``word_tokenize``, and
    every token that is an English stop word or is not purely alphanumeric
    (e.g. punctuation) is dropped.

    Parameters
    ----------
    words : str
        Raw message text.

    Returns
    -------
    str
        The surviving tokens joined by single spaces ('' if none survive).

    Notes
    -----
    Presumably requires the NLTK 'stopwords' corpus and tokenizer data to be
    downloaded beforehand -- confirm for the target environment.
    """
    # The corpus file id is the lower-case 'english'; the previous 'English'
    # spelling only resolves on case-insensitive file systems.
    stop_words = _stop_words_for('english')
    # Single pass: a token survives only if it passes both filters.
    return ' '.join(
        token
        for token in word_tokenize(words.lower())
        if token not in stop_words and token.isalnum()
    )

In [4]:
# Smoke-test the preprocessing helper on one advert-style message; the
# recorded output shows the text lower-cased with stop words and the
# '?' / '+' punctuation removed.
msg = 'Do you want a New Nokia 3510i colour phone DeliveredTomorrow? With 300 free minutes to any mobile + 100 free texts + Free Camcorder reply or call 08000930705'
preprocessMessage(msg)

'want new nokia 3510i colour phone deliveredtomorrow 300 free minutes mobile 100 free texts free camcorder reply call 08000930705'

In [5]:
# Per-class summary of the messages (count, distinct texts, most frequent
# text and its frequency); the recorded output shows far more ham than spam.
spam_df.groupby('Category').describe()

Unnamed: 0_level_0,Message,Message,Message,Message
Unnamed: 0_level_1,count,unique,top,freq
Category,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
ham,4825,4516,"Sorry, I'll call later",30
spam,747,641,Please call our customer service representativ...,4


In [6]:
# Encode the target as an integer flag: 1 where Category is exactly 'spam',
# 0 for every other value. The comparison is vectorized, so no per-row
# Python callback is needed.
spam_df['spam'] = spam_df['Category'].eq('spam').astype('int64')
spam_df

Unnamed: 0,Category,Message,spam
0,ham,"Go until jurong point, crazy.. Available only ...",0
1,ham,Ok lar... Joking wif u oni...,0
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,1
3,ham,U dun say so early hor... U c already then say...,0
4,ham,"Nah I don't think he goes to usf, he lives aro...",0
...,...,...,...
5567,spam,This is the 2nd time we have tried 2 contact u...,1
5568,ham,Will ü b going to esplanade fr home?,0
5569,ham,"Pity, * was in mood for that. So...any other s...",0
5570,ham,The guy did some bitching but I acted like i'd...,0


In [7]:
# Hold out 25% of the messages for testing; the fixed random_state keeps the
# split reproducible across kernel restarts.
# NOTE(review): the split is not stratified even though the classes are
# imbalanced (see the groupby summary) -- confirm that is intended.
x_train, x_test, y_train, y_test = train_test_split(
    spam_df["Message"],
    spam_df["spam"],
    test_size=0.25,
    random_state=50,
)

In [8]:
# Quick look at the training split: row count, number of distinct messages
# and the most repeated message.
x_train.describe()

count                       4179
unique                      3922
top       Sorry, I'll call later
freq                          24
Name: Message, dtype: object

In [9]:
# Learn the bag-of-words vocabulary from the training messages only and turn
# them into a document-term count matrix for the Naive Bayes model.
# NOTE(review): the raw text is vectorized here; preprocessMessage is not
# applied to it in this cell.
nbCV = CountVectorizer()
nb_x_train_count = nbCV.fit_transform(x_train.values)

In [10]:
# Display the training count matrix as a dense array.
# NOTE(review): toarray() materializes the whole documents x vocabulary
# matrix just for display; nb_x_train_count.shape would be far cheaper --
# confirm the dense view is actually needed.
nb_x_train_count.toarray()

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]], dtype=int64)

In [11]:
# Fit a multinomial Naive Bayes classifier (constructor defaults) on the
# training counts and the 0/1 spam labels.
nbModel = MultinomialNB()
nbModel.fit(nb_x_train_count, y_train)

In [12]:
# Spot check before the full evaluation: a casual, conversational message.
# It goes through the already-fitted vectorizer (transform, not
# fit_transform) so its columns match what nbModel was trained on.
# Recorded prediction: 0 (ham).
email_ham = ["Hey do you want to grab some dinner tonight?"]
email_ham_count = nbCV.transform(email_ham)
nbModel.predict(email_ham_count)

array([0], dtype=int64)

In [13]:
# Spot check before the full evaluation: a prize-claim style message.
# It goes through the already-fitted vectorizer (transform, not
# fit_transform) so its columns match what nbModel was trained on.
# Recorded prediction: 1 (spam).
email_spam = ["Claim your prize today!"]
email_spam_count = nbCV.transform(email_spam)
nbModel.predict(email_spam_count)

array([1], dtype=int64)

In [14]:
# Evaluate Naive Bayes on the held-out split: vectorize x_test with the
# vocabulary learned from the training data, then report the model's score.
# NOTE(review): a single accuracy figure can flatter an imbalanced ham/spam
# split; classification_report is imported but not used in the visible cells.
nb_x_test_count = nbCV.transform(x_test)
nbScore = nbModel.score(nb_x_test_count, y_test)
print("NB Accuracy Score:", nbScore)

NB Accuracy Score: 0.9842067480258435


In [15]:
# Separate vectorizer for the SVM, fitted on the same training messages.
# NOTE(review): it is configured and fitted exactly like nbCV, so the two
# should produce the same features -- confirm whether a second one is needed.
svmCV = CountVectorizer()
svm_x_train_count = svmCV.fit_transform(x_train.values)

In [16]:
# Display the SVM training count matrix as a dense array.
# NOTE(review): toarray() materializes the whole matrix just for display;
# svm_x_train_count.shape would be far cheaper.
svm_x_train_count.toarray()

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]], dtype=int64)

In [17]:
# Fit a support-vector classifier on the training counts. Only C is set;
# every other hyper-parameter is left at the library default.
# NOTE(review): C = 1000 is a hand-picked value with no visible tuning
# step -- confirm how it was chosen.
svmModel = svm.SVC(C = 1000)
svmModel.fit(svm_x_train_count, y_train)

In [18]:
# Evaluate the SVM on the held-out split, using the vocabulary learned from
# the training data, and report the model's score.
svm_x_test_count = svmCV.transform(x_test)
svmScore = svmModel.score(svm_x_test_count, y_test)
print("SVM Accuracy Score:", svmScore)

SVM Accuracy Score: 0.9827709978463748


In [19]:
# Vectorizer for the combined NB + SVM ensemble, again fitted on the same
# training messages as nbCV and svmCV, followed by a dense display of the
# resulting count matrix.
# NOTE(review): toarray() materializes the whole matrix just for display.
nbsvmCV = CountVectorizer()
nbsvm_x_train_count = nbsvmCV.fit_transform(x_train.values)
nbsvm_x_train_count.toarray()

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]], dtype=int64)

In [20]:
# Combine the Naive Bayes and SVM estimators into a majority-vote ensemble.
# NOTE(review): with only two voters, hard voting has no majority whenever
# they disagree; per the scikit-learn docs ties resolve to the lowest class
# label (0 = ham here), so the ensemble would flag spam only when both
# models agree -- confirm that this is the intended behaviour.
nbsvmModel = VotingClassifier(estimators=[
    ('naive_bayes', nbModel),
    ('svm', svmModel)
], voting='hard')

In [21]:
# Train the ensemble on the count matrix built by nbsvmCV and the same 0/1
# labels used for the individual models.
nbsvmModel.fit(nbsvm_x_train_count, y_train)

In [22]:
# Evaluate the voting ensemble on the held-out split, using the vocabulary
# learned from the training data, and report the model's score.
nbsvm_x_test_count = nbsvmCV.transform(x_test)
nbsvmScore = nbsvmModel.score(nbsvm_x_test_count, y_test)
print("NBSVM Accuracy Score:", nbsvmScore)

NBSVM Accuracy Score: 0.9806173725771715


In [23]:
# Side-by-side recap of the three held-out scores from the same train/test
# split. Each tuple holds the positional arguments of one print() call.
same_split_lines = [
    ("SCORE COMPARISONS: SAME DATASET SPLIT",),
    ("NB Accuracy Score:", nbScore),
    ("SVM Accuracy Score:", svmScore),
    ("NBSVM Accuracy Score:", nbsvmScore),
]
for fields in same_split_lines:
    print(*fields)

SCORE COMPARISONS: SAME DATASET SPLIT
NB Accuracy Score: 0.9842067480258435
SVM Accuracy Score: 0.9827709978463748
NBSVM Accuracy Score: 0.9806173725771715


In [24]:
# Cross-dataset check: load a second corpus (Ling-Spam e-mails) that the
# models were not trained on. The rendered frame has a Body text column and
# a 0/1 Label column. Path is relative to the kernel's working directory.
spam_df_2 = pd.read_csv("Downloads/lingSpam.csv")
spam_df_2

Unnamed: 0.1,Unnamed: 0,Body,Label
0,0,Subject: great part-time or summer job !\n \n ...,1
1,1,Subject: auto insurance rates too high ?\n \n ...,1
2,2,Subject: do want the best and economical hunti...,1
3,3,Subject: email 57 million people for $ 99\n \n...,1
4,4,Subject: do n't miss these !\n \n attention ! ...,1
...,...,...,...
2600,2600,Subject: computationally - intensive methods i...,0
2601,2601,Subject: books : a survey of american linguist...,0
2602,2602,Subject: wecol ' 98 - - western conference on ...,0
2603,2603,Subject: euralex ' 98 - revised programme\n \n...,0


In [25]:
# External-dataset evaluation: score the three models trained on the SMS
# corpus against every row of spam_df_2, without re-fitting anything.
#
# Missing bodies are replaced by empty strings so each row is a valid text
# document. fillna('') returns a new Series, so spam_df_2 itself is left
# untouched; the previous in-place fill on a column pulled from the frame
# could write through to the frame depending on the pandas version.
x_test_2 = spam_df_2.Body.fillna('')
y_test_2 = spam_df_2.Label

print("SCORE COMPARISONS: ORIGINAL MODEL VS EXTERNAL DATASET")

# Each model is paired with the vectorizer it was trained with, so feature
# columns line up with the fitted vocabulary.
nb_x_test_count_2 = nbCV.transform(x_test_2)
nbScore2 = nbModel.score(nb_x_test_count_2, y_test_2)
print("NB 2 Accuracy Score:", nbScore2)

svm_x_test_count_2 = svmCV.transform(x_test_2)
svmScore2 = svmModel.score(svm_x_test_count_2, y_test_2)
print("SVM 2 Accuracy Score:", svmScore2)

nbsvm_x_test_count_2 = nbsvmCV.transform(x_test_2)
nbsvmScore2 = nbsvmModel.score(nbsvm_x_test_count_2, y_test_2)
print("NBSVM 2 Accuracy Score:", nbsvmScore2)

SCORE COMPARISONS: ORIGINAL MODEL VS EXTERNAL DATASET
NB 2 Accuracy Score: 0.7930902111324376
SVM 2 Accuracy Score: 0.837236084452975
NBSVM 2 Accuracy Score: 0.837236084452975


In [26]:
# Cross-dataset check on a filtered copy of the Ling-Spam corpus (file name
# and the extra "Ad and Ham" column suggest spam rows were restricted to
# advertisements -- confirm against how the CSV was produced).
# Path is relative to the kernel's working directory.
spam_df_3 = pd.read_csv("Downloads/LingSpam AD ONLY - lingSpam.csv")
spam_df_3

Unnamed: 0.1,Unnamed: 0,Body,Label,Ad and Ham
0,1,Subject: auto insurance rates too high ?\n \n ...,1,1
1,2,Subject: do want the best and economical hunti...,1,1
2,4,Subject: do n't miss these !\n \n attention ! ...,1,1
3,5,Subject: see amazing world record sex !\n \n a...,1,1
4,6,Subject: we can help you get a loan ! ! !\n \n...,1,1
...,...,...,...,...
2504,2600,Subject: computationally - intensive methods i...,0,1
2505,2601,Subject: books : a survey of american linguist...,0,1
2506,2602,Subject: wecol ' 98 - - western conference on ...,0,1
2507,2603,Subject: euralex ' 98 - revised programme\n \n...,0,1


In [27]:
# Ad-filtered external evaluation: score the three models trained on the SMS
# corpus against every row of spam_df_3, without re-fitting anything.
#
# Missing bodies are replaced by empty strings so each row is a valid text
# document. fillna('') returns a new Series, so spam_df_3 itself is left
# untouched; the previous in-place fill on a column pulled from the frame
# could write through to the frame depending on the pandas version.
x_test_adj = spam_df_3.Body.fillna('')
y_test_adj = spam_df_3.Label

print("SCORE COMPARISONS: ORIGINAL MODEL VS EXTERNAL DATASET (FILTERED FOR ADS)")

# Each model is paired with the vectorizer it was trained with, so feature
# columns line up with the fitted vocabulary.
nb_x_test_count_adj = nbCV.transform(x_test_adj)
nbScore3 = nbModel.score(nb_x_test_count_adj, y_test_adj)
print("NB 3 Accuracy Score:", nbScore3)

svm_x_test_count_adj = svmCV.transform(x_test_adj)
svmScore3 = svmModel.score(svm_x_test_count_adj, y_test_adj)
print("SVM 3 Accuracy Score:", svmScore3)

nbsvm_x_test_count_adj = nbsvmCV.transform(x_test_adj)
nbsvmScore3 = nbsvmModel.score(nbsvm_x_test_count_adj, y_test_adj)
print("NBSVM 3 Accuracy Score:", nbsvmScore3)

SCORE COMPARISONS: ORIGINAL MODEL VS EXTERNAL DATASET (FILTERED FOR ADS)
NB 3 Accuracy Score: 0.8134715025906736
SVM 3 Accuracy Score: 0.8688720605819051
NBSVM 3 Accuracy Score: 0.8688720605819051


In [28]:
# Final report: the three evaluation rounds (held-out split, external
# corpus, ad-filtered external corpus) printed one after another.
# Each tuple holds the positional arguments of one print() call; a trailing
# "\n" argument produces the blank line that separates the sections.
final_report_lines = [
    ("RESULTS ACROSS ALL THREE TESTS",),
    ("-------------------------------------------------------",),
    ("SCORE COMPARISONS: SAME DATASET WITH TRAINING/TEST SPLIT",),
    ("NB Accuracy Score:", nbScore),
    ("SVM Accuracy Score:", svmScore),
    ("NBSVM Accuracy Score: ", nbsvmScore, "\n"),
    ("SCORE COMPARISONS: ORIGINAL MODEL VS EXTERNAL DATASET",),
    ("NB 2 Accuracy Score:", nbScore2),
    ("SVM 2 Accuracy Score:", svmScore2),
    ("NBSVM 2 Accuracy Score:", nbsvmScore2, "\n"),
    ("SCORE COMPARISONS: ORIGINAL MODEL VS EXTERNAL DATASET (FILTERED FOR ADS)",),
    ("NB 3 Accuracy Score:", nbScore3),
    ("SVM 3 Accuracy Score:", svmScore3),
    ("NBSVM 3 Accuracy Score:", nbsvmScore3),
    ("-------------------------------------------------------",),
]
for fields in final_report_lines:
    print(*fields)

RESULTS ACROSS ALL THREE TESTS
-------------------------------------------------------
SCORE COMPARISONS: SAME DATASET WITH TRAINING/TEST SPLIT
NB Accuracy Score: 0.9842067480258435
SVM Accuracy Score: 0.9827709978463748
NBSVM Accuracy Score:  0.9806173725771715 

SCORE COMPARISONS: ORIGINAL MODEL VS EXTERNAL DATASET
NB 2 Accuracy Score: 0.7930902111324376
SVM 2 Accuracy Score: 0.837236084452975
NBSVM 2 Accuracy Score: 0.837236084452975 

SCORE COMPARISONS: ORIGINAL MODEL VS EXTERNAL DATASET (FILTERED FOR ADS)
NB 3 Accuracy Score: 0.8134715025906736
SVM 3 Accuracy Score: 0.8688720605819051
NBSVM 3 Accuracy Score: 0.8688720605819051
-------------------------------------------------------
