In [1]:
#Data manipulation
import numpy as np 
import pandas as pd 
#Text processing
import nltk
from nltk.corpus import stopwords
#Necessary for analyze function
import string

In [2]:
#Dataset: read the emails CSV (path is relative to the notebook's working directory)
df = pd.read_csv('emails.csv')
#Preview the first 10 rows
df.head(10)

Unnamed: 0,text,spam
0,Subject: naturally irresistible your corporate...,1
1,Subject: the stock trading gunslinger fanny i...,1
2,Subject: unbelievable new homes made easy im ...,1
3,Subject: 4 color printing special request add...,1
4,"Subject: do not have money , get software cds ...",1
5,"Subject: great nnews hello , welcome to medzo...",1
6,Subject: here ' s a hot play in motion homela...,1
7,Subject: save your money buy getting this thin...,1
8,Subject: undeliverable : home based business f...,1
9,Subject: save your money buy getting this thin...,1


In [3]:
#Number of rows & columns in the dataset as loaded (before duplicates are dropped)
df.shape

(5728, 2)

In [4]:
#Column names of the loaded frame
df.columns

Index(['text', 'spam'], dtype='object')

In [5]:
#Cleaning up data: remove rows that are exact duplicates of an earlier row
#Reassigning (rather than inplace=True) makes it explicit that df is replaced by the de-duplicated frame
df = df.drop_duplicates()

In [6]:
#Number of rows & columns after dropping duplicate rows
df.shape

(5695, 2)

In [7]:
#Checking for useless data: count of missing (null) values in each column
df.isnull().sum()

text    0
spam    0
dtype: int64

In [8]:
#Download the NLTK 'stopwords' corpus, which analyze reads through stopwords.words('english')
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/alexreyes/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [9]:
#Function to process words
#Cache of stopword sets keyed by language, filled on first use
_STOPWORD_SETS = {}

#Translation table that deletes every character listed in string.punctuation
_PUNCTUATION_TABLE = str.maketrans('', '', string.punctuation)


def _stopword_set(language='english'):
    """Return the NLTK stopword list for `language` as a frozenset, loading it only once."""
    if language not in _STOPWORD_SETS:
        _STOPWORD_SETS[language] = frozenset(stopwords.words(language))
    return _STOPWORD_SETS[language]


def analyze(text):
    """Tokenize an email body for the bag-of-words model.

    Steps:
      1. Delete every character found in ``string.punctuation``.
      2. Split the remaining text on whitespace.
      3. Drop tokens whose lowercase form is an English NLTK stopword.

    Parameters
    ----------
    text : str
        Raw message text.

    Returns
    -------
    list of str
        The surviving tokens, in their original order and original casing
        (lowercasing is only used for the stopword comparison).
    """
    #The stopword list is fetched once and held in a set; previously
    #stopwords.words('english') was evaluated again for every single token
    stop_words = _stopword_set()
    #Removing punctuation
    no_punctuation = text.translate(_PUNCTUATION_TABLE)
    #Removing stopwords and returning the list of words
    return [word for word in no_punctuation.split() if word.lower() not in stop_words]

In [10]:
#Display list: token lists produced by analyze for the first 10 emails (sanity check)
df['text'].head(10).apply(analyze)

0    [Subject, naturally, irresistible, corporate, ...
1    [Subject, stock, trading, gunslinger, fanny, m...
2    [Subject, unbelievable, new, homes, made, easy...
3    [Subject, 4, color, printing, special, request...
4    [Subject, money, get, software, cds, software,...
5    [Subject, great, nnews, hello, welcome, medzon...
6    [Subject, hot, play, motion, homeland, securit...
7    [Subject, save, money, buy, getting, thing, tr...
8    [Subject, undeliverable, home, based, business...
9    [Subject, save, money, buy, getting, thing, tr...
Name: text, dtype: object

In [11]:
from sklearn.feature_extraction.text import CountVectorizer
#Keep a handle on the vectorizer so the learned vocabulary can be reused later
#(e.g. vectorizer.transform(new_texts)) instead of being thrown away after fitting
vectorizer = CountVectorizer(analyzer=analyze)
#Bag-of-words count matrix built from every email, tokenized by analyze
#NOTE(review): the vocabulary is fitted on all rows before the train/test split below,
#so tokens that only occur in test emails still shape the feature space - consider
#fitting on the training texts only
messages_bow = vectorizer.fit_transform(df['text'])

In [12]:
#Hold out 30% of the rows for testing (70/30 train/test split); the fixed seed keeps the split repeatable
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(
    messages_bow,
    df['spam'],
    test_size=0.30,
    random_state=0,
)

In [13]:
#Get the shape of messages_bow: one row per email passed to fit_transform, one column per vocabulary token
messages_bow.shape

(5695, 37229)

In [14]:
#Machine Learning: fit a multinomial Naive Bayes model on the training split
from sklearn.naive_bayes import MultinomialNB
classifier = MultinomialNB().fit(X_train, y_train)
#Show the fitted estimator as the cell output
classifier

MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

In [15]:
#Training data: predicted labels on the first line, true labels on the second
print(classifier.predict(X=X_train))
print(y_train.to_numpy())

[0 0 0 ... 0 0 0]
[0 0 0 ... 0 0 0]


In [17]:
#Training data evaluation: how well the model reproduces the labels it was fitted on
from sklearn.metrics import classification_report,confusion_matrix, accuracy_score
pred = classifier.predict(X=X_train)
#Per-class precision / recall / f1
print(classification_report(y_true=y_train, y_pred=pred))
#Counts of true labels (rows) against predicted labels (columns)
print('Confusion Matrix: \n', confusion_matrix(y_true=y_train, y_pred=pred))
print()
#Fraction of training emails classified correctly
print('Accuracy: ', accuracy_score(y_true=y_train, y_pred=pred))

              precision    recall  f1-score   support

           0       1.00      1.00      1.00      3022
           1       0.99      1.00      0.99       964

    accuracy                           1.00      3986
   macro avg       0.99      1.00      1.00      3986
weighted avg       1.00      1.00      1.00      3986

Confusion Matrix: 
 [[3011   11]
 [   0  964]]

Accuracy:  0.9972403411941796


In [18]:
#Test data: predicted labels for the held-out emails, followed by their true labels
print('Predicted value:', classifier.predict(X=X_test))
print('Actual value:', y_test.to_numpy())

Predicted value: [1 0 0 ... 0 0 1]
Actual value: [1 0 0 ... 0 0 0]


In [19]:
#Test data evaluation: performance on the held-out 30% of emails
from sklearn.metrics import classification_report,confusion_matrix, accuracy_score
pred = classifier.predict(X=X_test)
#Per-class precision / recall / f1
print(classification_report(y_true=y_test, y_pred=pred))
#Counts of true labels (rows) against predicted labels (columns)
print('Confusion Matrix: \n', confusion_matrix(y_true=y_test, y_pred=pred))
print()
#Fraction of test emails classified correctly
print('Accuracy:', accuracy_score(y_true=y_test, y_pred=pred))

              precision    recall  f1-score   support

           0       1.00      0.99      0.99      1305
           1       0.97      0.99      0.98       404

    accuracy                           0.99      1709
   macro avg       0.98      0.99      0.99      1709
weighted avg       0.99      0.99      0.99      1709

Confusion Matrix: 
 [[1292   13]
 [   4  400]]

Accuracy: 0.9900526623756583
