# Email Spam Detection


## Importing Libraries


In [1]:
import nltk
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [2]:
import re
import pandas as pd
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.stem import WordNetLemmatizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report


## Reading the Dataset


In [4]:
messages = pd.read_csv('../datasets/spam.csv', encoding='ISO-8859-1')
messages.head()


Unnamed: 0,v1,v2,Unnamed: 2,Unnamed: 3,Unnamed: 4
0,ham,"Go until jurong point, crazy.. Available only ...",,,
1,ham,Ok lar... Joking wif u oni...,,,
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,,,
3,ham,U dun say so early hor... U c already then say...,,,
4,ham,"Nah I don't think he goes to usf, he lives aro...",,,


In [5]:
ps = PorterStemmer()
lemmatize = WordNetLemmatizer()
corpus = []
for i in range(0, len(messages)):
    review = re.sub('[^a-zA-Z]', ' ', messages['v2'][i])
    review = review.lower()
    review = review.split()

    review = [ps.stem(word)
              for word in review if not word in stopwords.words('english')]
    review = ' '.join(review)
    corpus.append(review)


In [6]:
cv = CountVectorizer(max_features=2500)
X = cv.fit_transform(corpus).toarray()
y = pd.get_dummies(messages['v1'])
y = y.iloc[:, 1].values


## Splitting the Dataset into Train and Test Datasets


In [7]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)


## Fitting the Train Dataset Into Multinomial Naive Bayes Model


In [8]:
model = MultinomialNB().fit(X_train, y_train)


## Using Test Dataset for Prediction


In [9]:
y_pred = model.predict(X_test)


## Measuring the Performance of the Model


### Confusion Matrix

In [10]:
print(confusion_matrix(y_test, y_pred))


[[962  11]
 [  8 134]]


### Accuracy Score


In [11]:
print(f'Accuracy Score {accuracy_score(y_test, y_pred)}')


Accuracy Score 0.9829596412556054


### Classification Report


In [12]:
print(f'Classification report: {classification_report(y_test, y_pred)}')


Classification report:               precision    recall  f1-score   support

           0       0.99      0.99      0.99       973
           1       0.92      0.94      0.93       142

    accuracy                           0.98      1115
   macro avg       0.96      0.97      0.96      1115
weighted avg       0.98      0.98      0.98      1115

