In [None]:
# Sources
# https://medium.com/@randerson112358/email-spam-detection-using-python-machine-learning-abe38c889855
# https://www.kaggle.com/balakishan77/spam-or-ham-email-classification

#Import all necessary libraries
import numpy as np 
import pandas as pd 
import nltk
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report,confusion_matrix, accuracy_score
import string

# Load the e-mail corpus and discard exact duplicate rows in one step
dataset = pd.read_csv('emails.csv').drop_duplicates()

# Exploratory Data Analysis: first rows, column names, then (rows, columns)
for overview in (dataset.head(4), dataset.columns, dataset.shape):
  print(overview)

# Fetch the NLTK stop-word corpus that the tokenizer relies on
nltk.download('stopwords')

# Tokenization Process
_STOPWORD_CACHE = {}


def _english_stopwords():
  """Return NLTK's English stop words as a frozenset, loading them only once."""
  if 'english' not in _STOPWORD_CACHE:
    _STOPWORD_CACHE['english'] = frozenset(stopwords.words('english'))
  return _STOPWORD_CACHE['english']


def nlp_processing(inText, stop_words=None):
  """Tokenize a text: strip punctuation, split on whitespace, drop stop words.

  Args:
    inText: The raw text (a string) to tokenize.
    stop_words: Optional collection of lower-case words to discard. When
      omitted, NLTK's English stop-word list is used (the 'stopwords'
      corpus must already be downloaded).

  Returns:
    A list of the remaining tokens in their original order and original
    casing. Only the stop-word comparison is case-insensitive.
  """
  if stop_words is None:
    # Fetch the list once instead of calling stopwords.words() for every
    # single token; the frozenset also makes each membership test O(1).
    stop_words = _english_stopwords()

  # Drop every character listed in string.punctuation, then re-join the text
  no_punc = ''.join(char for char in inText if char not in string.punctuation)

  return [word for word in no_punc.split() if word.lower() not in stop_words]

# Preview the tokenizer output on the first few emails
print(dataset['text'].head().apply(nlp_processing))

# Split the raw text BEFORE vectorizing: 80% train and 20% test.
# Fitting the vocabulary on the full corpus would let the test emails
# influence the feature space (and the Naive Bayes smoothing) — a data leak.
text_train, text_test, y_train, y_test = train_test_split(
    dataset['text'], dataset['spam'], test_size=0.2, random_state=0)

# Use CountVectorizer to convert text into token counts. The vocabulary is
# learned from the training emails only; the fitted vectorizer is kept so
# that unseen emails can be transformed into the same feature space later.
vectorizer = CountVectorizer(analyzer=nlp_processing)
x_train = vectorizer.fit_transform(text_train)
x_test = vectorizer.transform(text_test)

# Create and train Naive Bayes classifier
ML_classifier = MultinomialNB()
ML_classifier.fit(x_train, y_train)

                                                text  spam
0  Subject: naturally irresistible your corporate...     1
1  Subject: the stock trading gunslinger  fanny i...     1
2  Subject: unbelievable new homes made easy  im ...     1
3  Subject: 4 color printing special  request add...     1
Index(['text', 'spam'], dtype='object')
(5695, 2)
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
0    [Subject, naturally, irresistible, corporate, ...
1    [Subject, stock, trading, gunslinger, fanny, m...
2    [Subject, unbelievable, new, homes, made, easy...
3    [Subject, 4, color, printing, special, request...
4    [Subject, money, get, software, cds, software,...
Name: text, dtype: object


MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

In [None]:
# Score the classifier on the same emails it was fitted on
prediction = ML_classifier.predict(x_train)
train_accuracy = accuracy_score(y_train, prediction)
print('Train Accuracy: ', train_accuracy)

Train Accuracy:  0.9971466198419666


In [None]:
# Score the classifier on the held-out 20% of the emails
prediction_test = ML_classifier.predict(x_test)
test_accuracy = accuracy_score(y_test, prediction_test)
print('Test Accuracy: ', test_accuracy)

Test Accuracy:  0.9920983318700615
