### Email Spam Detection

In [1]:
# importing important libraries
import numpy as np
import pandas as pd
%matplotlib inline
import matplotlib.pyplot as plt

In [2]:
# load the spam dataset from the CSV file in the working directory
data = pd.read_csv(filepath_or_buffer='spam.csv')
# preview the first 10 rows
data.head(n=10)

Unnamed: 0,Category,Message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."
5,spam,FreeMsg Hey there darling it's been 3 week's n...
6,ham,Even my brother is not like to speak with me. ...
7,ham,As per your request 'Melle Melle (Oru Minnamin...
8,spam,WINNER!! As a valued network customer you have...
9,spam,Had your mobile 11 months or more? U R entitle...


In [3]:
# count the missing (NaN) values in every column
data.isna().sum()

Category    0
Message     0
dtype: int64

In [4]:
# Replace the text labels in the Category column with integer codes
from sklearn.preprocessing import LabelEncoder

le = LabelEncoder()
# the fitted encoder stays available in `le`; the column is overwritten with the codes
data['Category'] = le.fit_transform(y=data['Category'])

In [5]:
# show the first 10 rows again to inspect the encoded Category column
data.head(n=10)

Unnamed: 0,Category,Message
0,0,"Go until jurong point, crazy.. Available only ..."
1,0,Ok lar... Joking wif u oni...
2,1,Free entry in 2 a wkly comp to win FA Cup fina...
3,0,U dun say so early hor... U c already then say...
4,0,"Nah I don't think he goes to usf, he lives aro..."
5,1,FreeMsg Hey there darling it's been 3 week's n...
6,0,Even my brother is not like to speak with me. ...
7,0,As per your request 'Melle Melle (Oru Minnamin...
8,1,WINNER!! As a valued network customer you have...
9,1,Had your mobile 11 months or more? U R entitle...


### Using CountVectorizer for the Message Column and MultinomialNB as the model

In [6]:
# CountVectorizer is instantiated below; MultinomialNB is imported here for the model cells
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB

# vectorizer created with default settings; it is fitted on the training messages in a later cell
v = CountVectorizer()

### Separating the Feature (Message) and Target (Category) Variables

In [7]:
# x holds the Message column (input), y holds the Category column (target)
x, y = data['Message'], data['Category']

In [8]:
from sklearn.model_selection import train_test_split

# hold out 20% of the rows for testing; the fixed random_state keeps the split reproducible
xtrain, xtest, ytrain, ytest = train_test_split(
    x,
    y,
    test_size=0.2,
    random_state=42,
)

In [9]:
# shapes of the four pieces produced by the split, in the order xtrain, xtest, ytrain, ytest
tuple(part.shape for part in (xtrain, xtest, ytrain, ytest))

((4457,), (1115,), (4457,), (1115,))

In [10]:
# using the CountVectorizer on the xtrain data: fit it and build the sparse count matrix
xtrain_new = v.fit_transform(xtrain.values)
# slice the first 3 rows BEFORE densifying, so only those rows are converted to a dense array
# (calling .toarray() on the full matrix first would allocate a dense copy of the whole
# training matrix just to preview three rows)
xtrain_new[:3].toarray()

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]], dtype=int64)

In [11]:
# train a multinomial Naive Bayes classifier on the vectorized training messages
model = MultinomialNB()
model.fit(X=xtrain_new, y=ytrain)

In [12]:
# vectorize the test messages with the already-fitted vectorizer (transform only, no re-fit)
xtest_new = v.transform(xtest)

# score the model on the held-out test set and report it as a percentage
acc = model.score(X=xtest_new, y=ytest)
print(f'Accuracy = {acc*100} %')

Accuracy = 99.19282511210761 %


### Using a Pipeline makes the model much simpler and easier to code, as we do not need to transform the values again and again

In [13]:
from sklearn.pipeline import Pipeline

# chain the vectorizer and the classifier so raw text can be passed straight to fit/predict
clf = Pipeline(
    steps=[
        ('vectorizer', CountVectorizer()),
        ('nb', MultinomialNB()),
    ]
)

In [14]:
# fit the whole pipeline directly on the raw training messages
clf.fit(X=xtrain, y=ytrain)

In [17]:
# score the pipeline on the raw test messages and report it as a percentage
acc2 = clf.score(X=xtest, y=ytest)
print(f'Accuracy = {acc2*100} %')

Accuracy = 99.19282511210761 %


#### WOW!!!! We can see that we are getting an accuracy of about 99%

## Let's try some of our own examples and see whether they are predicted correctly or not

In [18]:
# two hand-written messages with the class we expect for each
email = [
    'Hey there!! How are you ? ',  # Ham
    'Big Offer!!!!! Click now to claim your prize!!!',  # Spam
]

clf.predict(X=email)

array([0, 1])

#### We can see from our results that the first message is predicted as 0 (Ham), which is correct, and the second message is predicted as 1 (Spam), which is also correct