## Data: https://www.kaggle.com/adarshkumarjha/email-spam-detection-sentiment-analysis

In [1]:
import warnings
warnings.filterwarnings("ignore")
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats
from sklearn.preprocessing import *
from sklearn.metrics import *
import spacy
from sklearn.feature_extraction.text import TfidfVectorizer
from collections import Counter
import string
from nltk.tokenize import word_tokenize
import math
from plotly import __version__
from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot
import plotly.graph_objs as go
from nltk import pos_tag
from nltk.stem import PorterStemmer
from nltk.corpus import stopwords

In [2]:
stop_words=set(stopwords.words('english'))

In [3]:
df=pd.read_csv('emails.csv')

In [4]:
df.head()

Unnamed: 0,text,spam
0,Subject: naturally irresistible your corporate...,1
1,Subject: the stock trading gunslinger fanny i...,1
2,Subject: unbelievable new homes made easy im ...,1
3,Subject: 4 color printing special request add...,1
4,"Subject: do not have money , get software cds ...",1


In [5]:
for i in range(len(df)):
    df.text[i]=word_tokenize(df.text[i])

In [6]:
df.head()

Unnamed: 0,text,spam
0,"[Subject, :, naturally, irresistible, your, co...",1
1,"[Subject, :, the, stock, trading, gunslinger, ...",1
2,"[Subject, :, unbelievable, new, homes, made, e...",1
3,"[Subject, :, 4, color, printing, special, requ...",1
4,"[Subject, :, do, not, have, money, ,, get, sof...",1


In [7]:
filtered_sentence=[]
for i in range(len(df)):
    for word in df.text[i]:
        if word not in stop_words:
            filtered_sentence.append(word)
    df.text[i]=filtered_sentence
    filtered_sentence=[]

In [8]:
df.head()

Unnamed: 0,text,spam
0,"[Subject, :, naturally, irresistible, corporat...",1
1,"[Subject, :, stock, trading, gunslinger, fanny...",1
2,"[Subject, :, unbelievable, new, homes, made, e...",1
3,"[Subject, :, 4, color, printing, special, requ...",1
4,"[Subject, :, money, ,, get, software, cds, !, ...",1


In [9]:
from nltk.stem.lancaster import LancasterStemmer
stemmer = LancasterStemmer()

In [10]:
word_stems = []
for i in range(len(df)):
    for word in df.text[i]:
        word_stems.append(stemmer.stem(word))
    df.text[i]=word_stems
    word_stems=[]

In [11]:
df.head()

Unnamed: 0,text,spam
0,"[subject, :, nat, irresist, corp, id, lt, real...",1
1,"[subject, :, stock, trad, gunsl, fanny, merril...",1
2,"[subject, :, unbeliev, new, hom, mad, easy, im...",1
3,"[subject, :, 4, col, print, spec, request, add...",1
4,"[subject, :, money, ,, get, softw, cds, !, sof...",1


In [12]:
import re

In [13]:
all_words=[]
for i in range(len(df)):
    for word in df.text[i]:
        if word not in [':',';',',','.','-','\\','[',']','?','','_','Subject']:
            all_words.append(word.strip().lower())
    df.text[i]=all_words
    all_words=[]

In [14]:
df.head()

Unnamed: 0,text,spam
0,"[subject, nat, irresist, corp, id, lt, real, h...",1
1,"[subject, stock, trad, gunsl, fanny, merril, m...",1
2,"[subject, unbeliev, new, hom, mad, easy, im, w...",1
3,"[subject, 4, col, print, spec, request, addit,...",1
4,"[subject, money, get, softw, cds, !, softw, co...",1


In [15]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer

In [16]:
vectorizer = CountVectorizer(lowercase=False)

In [17]:
df.shape

(5728, 2)

In [18]:
from sklearn.model_selection import train_test_split

In [19]:
df.text[0][0:10]

['subject',
 'nat',
 'irresist',
 'corp',
 'id',
 'lt',
 'real',
 'hard',
 'recollect',
 'company']

In [20]:
df.head()

Unnamed: 0,text,spam
0,"[subject, nat, irresist, corp, id, lt, real, h...",1
1,"[subject, stock, trad, gunsl, fanny, merril, m...",1
2,"[subject, unbeliev, new, hom, mad, easy, im, w...",1
3,"[subject, 4, col, print, spec, request, addit,...",1
4,"[subject, money, get, softw, cds, !, softw, co...",1


In [21]:
[word for word in df.text[0]][0:10]

['subject',
 'nat',
 'irresist',
 'corp',
 'id',
 'lt',
 'real',
 'hard',
 'recollect',
 'company']

In [22]:
df[df.spam==1].shape

(1368, 2)

In [23]:
df=df.iloc[0:2700,:]

### New stuff

In [24]:
def process_text(text):
    
    nopunc = [char for char in text if char not in string.punctuation]
    nopunc = ''.join(nopunc)
    clean_words = [word for word in nopunc.split() if word.lower() not in stopwords.words('english')]

    return clean_words

In [25]:
from sklearn.feature_extraction.text import CountVectorizer
messages = CountVectorizer(analyzer=process_text).fit_transform(df['text'])

In [26]:
messages.shape

(2700, 2678)

In [27]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(messages, df['spam'], test_size = 0.20, random_state = 15)

In [28]:
y_train.value_counts()

1    1093
0    1067
Name: spam, dtype: int64

In [31]:
from sklearn.naive_bayes import MultinomialNB, GaussianNB
classifier = MultinomialNB()
classifier.fit(X_train, y_train)

MultinomialNB()

In [32]:
from sklearn.metrics import classification_report,confusion_matrix, accuracy_score
pred = classifier.predict(X_train)
print(classification_report(y_train ,pred ))
print('Confusion Matrix: \n',confusion_matrix(y_train,pred))
print()
print('Accuracy: ', accuracy_score(y_train,pred))

              precision    recall  f1-score   support

           0       1.00      1.00      1.00      1067
           1       1.00      1.00      1.00      1093

    accuracy                           1.00      2160
   macro avg       1.00      1.00      1.00      2160
weighted avg       1.00      1.00      1.00      2160

Confusion Matrix: 
 [[1067    0]
 [   0 1093]]

Accuracy:  1.0


In [33]:
from sklearn.metrics import classification_report,confusion_matrix, accuracy_score
pred = classifier.predict(X_test)
print(classification_report(y_test ,pred ))
print('Confusion Matrix: \n',confusion_matrix(y_test,pred))
print()
print('Accuracy: ', accuracy_score(y_test,pred))

              precision    recall  f1-score   support

           0       1.00      0.02      0.03       265
           1       0.51      1.00      0.68       275

    accuracy                           0.52       540
   macro avg       0.76      0.51      0.35       540
weighted avg       0.75      0.52      0.36       540

Confusion Matrix: 
 [[  4 261]
 [  0 275]]

Accuracy:  0.5166666666666667
