### Email Spam Detection

In [1]:
# importing important libraries
import numpy as np
import pandas as pd
%matplotlib inline
import matplotlib.pyplot as plt

In [2]:
# Load the e-mail table from disk and preview its first ten rows
data = pd.read_csv(filepath_or_buffer='table_cut.csv')
data.head(n=10)

Unnamed: 0.2,Unnamed: 0.1,Unnamed: 0,label,path,txt
0,0,0,1,../data/000/000,Received: from hp-5e1fe6310264 ([218.79.188.13...
1,1,1,0,../data/000/001,Received: from jdl.ac.cn ([159.226.42.8])\n\tb...
2,2,2,1,../data/000/002,Received: from 163.con ([61.141.165.252])\n\tb...
3,3,3,1,../data/000/003,Received: from 12.com ([222.50.6.150])\n\tby s...
4,4,4,1,../data/000/004,Received: from dghhkjk.com ([59.36.183.208])\n...
5,5,5,1,../data/000/005,Received: from 163.com ([218.16.68.44])\n\tby ...
6,6,6,0,../data/000/006,Received: from jdl.ac.cn ([159.226.42.8])\n\tb...
7,7,7,1,../data/000/007,Received: from tom.com ([219.133.131.41])\n\tb...
8,8,8,1,../data/000/008,Received: from davidchans.com ([60.162.187.16]...
9,9,9,0,../data/000/009,Received: from jdl.ac.cn ([159.226.42.8])\n\tb...


In [3]:
# Count the missing (NaN) entries in every column
data.isnull().sum(axis=0)

Unnamed: 0.1    0
Unnamed: 0      0
label           0
path            0
txt             0
dtype: int64

In [4]:
# Discard the leftover index columns and the file path; only `label` and `txt` are used below
data = data.drop(columns=['Unnamed: 0.1', 'Unnamed: 0', 'path'])
data.head(n=10)

Unnamed: 0,label,txt
0,1,Received: from hp-5e1fe6310264 ([218.79.188.13...
1,0,Received: from jdl.ac.cn ([159.226.42.8])\n\tb...
2,1,Received: from 163.con ([61.141.165.252])\n\tb...
3,1,Received: from 12.com ([222.50.6.150])\n\tby s...
4,1,Received: from dghhkjk.com ([59.36.183.208])\n...
5,1,Received: from 163.com ([218.16.68.44])\n\tby ...
6,0,Received: from jdl.ac.cn ([159.226.42.8])\n\tb...
7,1,Received: from tom.com ([219.133.131.41])\n\tb...
8,1,Received: from davidchans.com ([60.162.187.16]...
9,0,Received: from jdl.ac.cn ([159.226.42.8])\n\tb...


### Using CountVectorizer on the `txt` column and MultinomialNB as the model

In [5]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB

# Vectorizer instance with default settings; it is fitted on the training text
# and reused to transform the test text in the cells below.
v = CountVectorizer()

### Separating the Feature (`txt`) and Target (`label`) Variables

In [6]:
# x: raw message text used as model input; y: the label column used as the target
x, y = data['txt'], data['label']

In [7]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split reproducible
xtrain, xtest, ytrain, ytest = train_test_split(
    x,
    y,
    test_size=0.2,
    random_state=42,
)

In [8]:
# Shapes of the four splits, in the order: xtrain, xtest, ytrain, ytest
tuple(part.shape for part in (xtrain, xtest, ytrain, ytest))

((51696,), (12924,), (51696,), (12924,))

In [10]:
# Fit the CountVectorizer on the training text and turn it into a sparse
# document-term count matrix. This cell must run: the model-fitting cell below
# reads `xtrain_new`, and leaving it commented out causes a NameError on a
# fresh kernel.
xtrain_new = v.fit_transform(xtrain.values)

# Preview the first three rows only. Slice the sparse matrix *before* calling
# toarray() so that only three rows are densified instead of the whole matrix.
xtrain_new[:3].toarray()

In [11]:
# Train a multinomial Naive Bayes classifier on the vectorized training text
model = MultinomialNB()
model.fit(X=xtrain_new, y=ytrain)

In [12]:
# Transform the held-out text with the already-fitted vectorizer (transform only,
# no re-fitting), then score the classifier on it
xtest_new = v.transform(xtest)
acc = model.score(X=xtest_new, y=ytest)
print(f'Accuracy = {acc*100} %')

Accuracy = 98.66914268028474 %


### Using a Pipeline makes the model simpler and easier to code, because we no longer need to transform the text by hand before every fit, score or predict call

In [13]:
from sklearn.pipeline import Pipeline

# Chain vectorization and classification so raw text can be passed in directly
clf = Pipeline(
    steps=[
        ('vectorizer', CountVectorizer()),
        ('nb', MultinomialNB()),
    ]
)

In [14]:
# Fit both pipeline steps on the raw training text and its labels
clf.fit(X=xtrain, y=ytrain)

In [15]:
# Score the pipeline on the raw held-out text; it vectorizes the input internally
acc2 = clf.score(X=xtest, y=ytest)
print(f'Accuracy = {acc2*100} %')

Accuracy = 98.66914268028474 %


#### WOW!!!! We can see that we are getting an accuracy of about 98.7%

## Let's try some of our own examples and see whether they are predicted correctly

In [16]:
# Two hand-written messages to sanity-check the pipeline on unseen text
email = [
    'Hey there!! How are you ? ',                       # Ham
    'Big Offer!!!!! Click now to claim your prize!!!',  # Spam
]

clf.predict(email)

array([0, 1], dtype=int64)

#### We can see from our results that the first message is predicted as 0 (Ham), which is correct, and the second as 1 (Spam), which is also correct

In [17]:
import pickle

# Persist the fitted pipeline to disk. The `with` block guarantees the file
# handle is flushed and closed even if pickling raises; the bare open() call
# used previously left the handle open.
# NOTE: only unpickle 'email.pkl' from a trusted source, because pickle.load
# can execute arbitrary code.
with open('email.pkl', 'wb') as model_file:
    pickle.dump(clf, model_file)