# E-Mail Spam Classifier


## Program to detect email is spam or not.

In [3]:
#importing libraries

import numpy as np
import pandas as pd
import nltk
from nltk.corpus import stopwords
import string


In [4]:
#loading data
df= pd.read_csv("spam_or_not_spam.csv")

In [6]:
df.shape


(3000, 2)

In [7]:
df.count


<bound method DataFrame.count of                                                   email  label
0      date wed NUMBER aug NUMBER NUMBER NUMBER NUMB...      0
1     martin a posted tassos papadopoulos the greek ...      0
2     man threatens explosion in moscow thursday aug...      0
3     klez the virus that won t die already the most...      0
4      in adding cream to spaghetti carbonara which ...      0
...                                                 ...    ...
2995   abc s good morning america ranks it the NUMBE...      1
2996   hyperlink hyperlink hyperlink let mortgage le...      1
2997   thank you for shopping with us gifts for all ...      1
2998   the famous ebay marketing e course learn to s...      1
2999   hello this is chinese traditional 子 件 NUMBER世...      1

[3000 rows x 2 columns]>

In [8]:
#drop duplicates.
df.drop_duplicates(inplace=True)
df.shape

(2873, 2)

In [9]:
#to check there is null data.
df.isnull().sum()

email    1
label    0
dtype: int64

In [10]:
df=df.dropna()

In [11]:
df.isnull().sum()

email    0
label    0
dtype: int64

### Important part of program begins from here

In [12]:
#download stopwords
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\123ta\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [13]:
def process_text(text):
    #1->remove punctuations
    #2->remove stopwords
    #3->return a list of clean text words
    
    #1-->
    nopunc=[char for char in text if char not in string.punctuation]
    nopunc=''.join(nopunc)
    
    #2-->
    clean_words=[word for word in nopunc.split() if word.lower() not in stopwords.words('english')]
    
    #3-->
    return clean_words
    

In [16]:
#tokenization
df['email'].head().apply(process_text)

0    [date, wed, NUMBER, aug, NUMBER, NUMBER, NUMBE...
1    [martin, posted, tassos, papadopoulos, greek, ...
2    [man, threatens, explosion, moscow, thursday, ...
3    [klez, virus, die, already, prolific, virus, e...
4    [adding, cream, spaghetti, carbonara, effect, ...
Name: email, dtype: object

In [21]:
#text conversion to token matrix
from sklearn.feature_extraction.text import CountVectorizer
messages_bow=CountVectorizer(analyzer=process_text).fit_transform(df['email'])

In [24]:
#splitting data to 80 and 20 ratio.
from sklearn.model_selection import train_test_split
X_train,X_test, y_train, y_test= train_test_split(messages_bow,df['label'],test_size=0.20,random_state=0)

In [26]:
messages_bow.shape

(2872, 33759)

In [29]:
#Training Naive-Bayes model.
from sklearn.naive_bayes import MultinomialNB
classifier=MultinomialNB().fit(X_train,y_train)

In [35]:
#prediction ofresults.
print(classifier.predict(X_train))

#printing actual values of output.
print(y_train.values)

[0 0 0 ... 0 1 1]
[0 0 0 ... 0 1 1]


In [42]:
#evaluation of model over training data.
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
pred= classifier.predict(X_train)
print(classification_report(y_train, pred))
print()
print("Confusion matrix:",confusion_matrix(y_train,pred))
print()
print("Accuracy:",accuracy_score(y_train, pred))

              precision    recall  f1-score   support

           0       1.00      1.00      1.00      1971
           1       1.00      0.99      0.99       326

    accuracy                           1.00      2297
   macro avg       1.00      1.00      1.00      2297
weighted avg       1.00      1.00      1.00      2297


Confusion matrix: [[1970    1]
 [   3  323]]

Accuracy: 0.9982585981715281


### final model implementation over test data.

In [43]:
#prediction ofresults.
print(classifier.predict(X_test))

#printing actual values of output.
print(y_test.values)

[0 0 0 0 0 0 1 0 0 0 0 0 1 0 0 0 0 0 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0
 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 1 0 0 1 1 0 0 0 0 0 0 0 0 0 0 0 0
 0 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 0 1 0 0 0 0 0 0 1 0 0 0 0 0 0 1
 1 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 1 0 0 0 0 0 1 1 0 0 1 1 0 0 1
 0 0 0 0 0 0 0 0 0 0 0 1 0 1 0 0 0 0 0 0 0 0 1 0 0 0 0 1 1 0 0 0 1 1 0 0 0
 1 0 0 0 1 1 1 1 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 1 1 0 0 0
 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1
 0 0 0 0 0 0 0 1 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 1 0 0 0 0 0 1 0
 0 1 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 1 0 0 0 0 0
 0 0 0 1 0 1 0 1 0 0 0 0 0 0 0 1 0 1 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1
 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 1 0 0 1 0 1 1 0 0 1 0 0 0 0
 0 1 0 1 0 0 0 0 0 1 0 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 1 0 0 0 0
 1 1 0 0 1 0 0 0 1 0 0 0 0 0 0 1 1 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0
 0 0 1 0 0 0 0 1 0 0 0 0 

In [44]:
#evaluation of model over test data.
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
pred= classifier.predict(X_test)
print(classification_report(y_test, pred))
print()
print("Confusion matrix:",confusion_matrix(y_test,pred))
print()
print("Accuracy:",accuracy_score(y_test, pred))

              precision    recall  f1-score   support

           0       0.98      0.99      0.98       474
           1       0.93      0.91      0.92       101

    accuracy                           0.97       575
   macro avg       0.96      0.95      0.95       575
weighted avg       0.97      0.97      0.97       575


Confusion matrix: [[467   7]
 [  9  92]]

Accuracy: 0.9721739130434782
