# Data Acquisition + Preprocessing

In [None]:
# Import libraries
import numpy as np
import pandas as pd
import nltk
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import CountVectorizer
from sklearn import svm
from sklearn.model_selection import train_test_split
import string


In [None]:
# Fetch the labelled e-mail corpus (a CSV hosted on GitHub) into a DataFrame
csvUrl = 'https://raw.githubusercontent.com/OmkarPathak/Playing-with-datasets/master/Email%20Spam%20Filtering/emails.csv'
dataset = pd.read_csv(csvUrl)

# Preview the first ten rows, then list the column labels
print(dataset.head(10))
dataset.columns


                                                text  spam
0  Subject: naturally irresistible your corporate...     1
1  Subject: the stock trading gunslinger  fanny i...     1
2  Subject: unbelievable new homes made easy  im ...     1
3  Subject: 4 color printing special  request add...     1
4  Subject: do not have money , get software cds ...     1
5  Subject: great nnews  hello , welcome to medzo...     1
6  Subject: here ' s a hot play in motion  homela...     1
7  Subject: save your money buy getting this thin...     1
8  Subject: undeliverable : home based business f...     1
9  Subject: save your money buy getting this thin...     1


Index(['text', 'spam'], dtype='object')

In [None]:
# Show the shape of the raw dataset as a (rows, columns) tuple
dataset.shape

(5728, 2)

In [None]:
# Show the column labels of the dataset
dataset.columns

Index(['text', 'spam'], dtype='object')

In [None]:
# Drop rows that are exact duplicates of an earlier row.
# Reassigning the result (instead of inplace=True) keeps the step explicit
# and avoids mutating the frame object the cells above already displayed.
dataset = dataset.drop_duplicates()

# Show the new shape (after removing duplicates)
dataset.shape

(5695, 2)

Dropping duplicates removed 33 rows (5728 → 5695 emails).

In [None]:
# Count the missing values in each column (summing the null mask down the rows)
dataset.isnull().sum(axis=0)


text    0
spam    0
dtype: int64

In [None]:
# Download the NLTK 'stopwords' corpus, which stopwords.words('english')
# reads when processText filters tokens
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [None]:
# Translation table that deletes every ASCII punctuation character.
_PUNCTUATION_TABLE = str.maketrans('', '', string.punctuation)

# Stop-word sets already loaded from NLTK, keyed by language name.
_STOPWORD_CACHE = {}


def _englishStopwords():
  """Return NLTK's English stop words as a frozenset, reading the corpus only once."""
  if 'english' not in _STOPWORD_CACHE:
    _STOPWORD_CACHE['english'] = frozenset(stopwords.words('english'))
  return _STOPWORD_CACHE['english']


def processText(text, stop_words=None):
  """Split a text into tokens with punctuation and stop words removed.

  Parameters
  ----------
  text : str
      The raw text to tokenize.
  stop_words : iterable of str, optional
      Lower-case words to discard. When omitted, NLTK's English stop-word
      list is used (loaded once and cached as a set, instead of being
      re-read and scanned as a list for every single word).

  Returns
  -------
  list of str
      The whitespace-separated tokens of `text`, in order, after all
      characters in string.punctuation have been deleted and every token
      whose lower-cased form is a stop word has been dropped. Kept tokens
      retain their original casing.
  """
  if stop_words is None:
    stop_words = _englishStopwords()
  elif not isinstance(stop_words, (set, frozenset)):
    # Accept any iterable, but make membership tests constant-time.
    stop_words = frozenset(stop_words)

  #1 Remove the punctuation
  nopunc = text.translate(_PUNCTUATION_TABLE)

  #2 Remove the stopwords (compared case-insensitively, after punctuation is gone)
  cleanWord = [word for word in nopunc.split() if word.lower() not in stop_words]

  #3 Return a list of clean text words
  return cleanWord


In [None]:
# Preview the tokenization: run processText on the text of the first five rows.
# Each result is a list of plain tokens (processText only strips punctuation
# and stop words; it does no stemming or lemmatization).
dataset.head(5)['text'].apply(processText)

0    [Subject, naturally, irresistible, corporate, ...
1    [Subject, stock, trading, gunslinger, fanny, m...
2    [Subject, unbelievable, new, homes, made, easy...
3    [Subject, 4, color, printing, special, request...
4    [Subject, money, get, software, cds, software,...
Name: text, dtype: object

In [None]:
# Convert the collection of e-mails into a bag-of-words matrix of token counts,
# using processText as the analyzer (CountVectorizer is imported in the first cell).
# The fitted vectorizer is kept in its own variable rather than discarded:
# its learned vocabulary is needed to transform any e-mail outside `dataset`
# into the same feature space the classifiers are trained on.
# NOTE(review): the vocabulary is fit on the full dataset before the
# train/test split, so tokens that occur only in test e-mails still shape
# the feature columns — consider fitting on the training rows only.
vectorizer = CountVectorizer(analyzer=processText)
messageBow = vectorizer.fit_transform(dataset['text'])


In [None]:
# Features are the bag-of-words matrix; labels are the 'spam' column
x, y = messageBow, dataset['spam']

In [None]:
# Hold out 20% of the rows for testing; random_state=1 pins the shuffle.
# (train_test_split is imported in the first cell.)
xTrain, xTest, yTrain, yTest = train_test_split(
    x,
    y,
    test_size=0.20,
    random_state=1,
)

# Naive Bayes



In [None]:
from sklearn.naive_bayes import MultinomialNB

# Build a multinomial Naive Bayes model, then fit it on the training split.
# The trailing semicolon keeps the cell from echoing the estimator.
classifier = MultinomialNB()
classifier.fit(xTrain, yTrain);

In [None]:
# Predict labels for the held-out rows and print them above the true labels
predNB = classifier.predict(xTest)
print(predNB, yTest.values, sep='\n')


[0 0 0 ... 1 1 0]
[0 0 0 ... 1 1 0]


In [None]:
# Tabulate the Naive Bayes predictions next to the true labels
data = {
    'Predicted Values': predNB,
    'Actual Values': yTest.values,
}
df = pd.DataFrame(data)
print(df)

      Predicted Values  Actual Values
0                    0              0
1                    0              0
2                    0              0
3                    0              0
4                    1              1
...                ...            ...
1134                 1              1
1135                 0              0
1136                 1              1
1137                 1              1
1138                 0              0

[1139 rows x 2 columns]


In [None]:
from sklearn.metrics import accuracy_score

# Score the Naive Bayes model on the held-out rows
predNB = classifier.predict(xTest)
nbAccuracy = accuracy_score(yTest, predNB)

print('Naive Bayes Accuracy: ', nbAccuracy)

Naive Bayes Accuracy:  0.9912203687445127


# SVM


In [None]:
from sklearn.svm import SVC

# Support-vector classifier with an RBF kernel, fitted on the same training split
svcOptions = {'kernel': 'rbf', 'random_state': 0}
model = SVC(**svcOptions)
model.fit(xTrain, yTrain)

SVC(random_state=0)

In [None]:
# Predict labels for the held-out rows with the SVC and print them above the true labels
predSVC = model.predict(xTest)
print(predSVC, yTest.values, sep='\n')

[0 0 0 ... 1 1 0]
[0 0 0 ... 1 1 0]


In [None]:
# Tabulate the SVC predictions next to the true labels
data = {
    'Predicted Values': predSVC,
    'Actual Values': yTest.values,
}
df = pd.DataFrame(data)
print(df)

      Predicted Values  Actual Values
0                    0              0
1                    0              0
2                    0              0
3                    0              0
4                    1              1
...                ...            ...
1134                 1              1
1135                 0              0
1136                 1              1
1137                 1              1
1138                 0              0

[1139 rows x 2 columns]


In [None]:
from sklearn.metrics import accuracy_score

# Score the SVC on the held-out rows
predSVC = model.predict(xTest)
svcAccuracy = accuracy_score(yTest, predSVC)

print('SVC Accuracy: ', svcAccuracy)

SVC Accuracy:  0.971027216856892


In [None]:
# Line up both models' predictions against the true labels
compare = {
    'Predicted Values SVC': predSVC,
    'Predicted Values NB': predNB,
    'Actual Values': yTest.values,
}
df1 = pd.DataFrame(compare)
print(df1)

      Predicted Values SVC  Predicted Values NB  Actual Values
0                        0                    0              0
1                        0                    0              0
2                        0                    0              0
3                        0                    0              0
4                        1                    1              1
...                    ...                  ...            ...
1134                     1                    1              1
1135                     0                    0              0
1136                     1                    1              1
1137                     1                    1              1
1138                     0                    0              0

[1139 rows x 3 columns]
