# Email Spam Detection Using Machine Learning

In [None]:
# Import libraries

# 1. NumPy for mathematical analysis

# 2. pandas for data exploration


import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report,accuracy_score,confusion_matrix

In [None]:
# Load the raw dataset from the CSV file that sits next to the notebook.
# NOTE(review): latin-1 is requested explicitly, presumably because the file
# is not valid UTF-8 -- confirm against the source file.

data = pd.read_csv(
    "spam.csv",
    sep=",",
    encoding="latin-1",
)
data.head()

Unnamed: 0,v1,v2,Unnamed: 2,Unnamed: 3,Unnamed: 4
0,ham,"Go until jurong point, crazy.. Available only ...",,,
1,ham,Ok lar... Joking wif u oni...,,,
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,,,
3,ham,U dun say so early hor... U c already then say...,,,
4,ham,"Nah I don't think he goes to usf, he lives aro...",,,


In [None]:
# Drop the three unused "Unnamed" columns and give the remaining two
# descriptive names: v1 -> label, v2 -> text.
#
# errors="ignore" keeps this cell safe to re-run: `data` is reassigned here,
# so on a second execution the columns are already gone and a plain drop()
# would raise KeyError. rename() already skips names that are not present.

data = (
    data
    .drop(columns=["Unnamed: 2", "Unnamed: 3", "Unnamed: 4"], errors="ignore")
    .rename(columns={"v1": "label", "v2": "text"})
)
data.head()

Unnamed: 0,label,text
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [None]:
# Encode the text labels as a binary target: ham -> 0, spam -> 1.
label_to_target = {'ham': 0, 'spam': 1}
data['y'] = data['label'].map(label_to_target)

# Series.map() yields NaN for any value missing from the mapping instead of
# raising, so verify every row was encoded before the target reaches the model.
unmapped_labels = data.loc[data['y'].isna(), 'label'].unique()
assert len(unmapped_labels) == 0, f"Unexpected label values: {unmapped_labels!r}"

data.head()

Unnamed: 0,label,text,y
0,ham,"Go until jurong point, crazy.. Available only ...",0
1,ham,Ok lar... Joking wif u oni...,0
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,1
3,ham,U dun say so early hor... U c already then say...,0
4,ham,"Nah I don't think he goes to usf, he lives aro...",0


In [None]:
# Shape of the data as a (rows, columns) tuple.

data.shape

(5572, 3)

In [None]:
# Number of messages per class, to check how balanced the labels are.
data["label"].value_counts()

ham     4825
spam     747
Name: label, dtype: int64

In [None]:
# Model input (raw message text) and binary target (0 = ham, 1 = spam).
x, y = data['text'], data['y']

In [None]:
# Split the dataset into training (80%) and testing (20%) data.
#
# random_state pins the shuffle so that Restart & Run All reproduces the same
# split and therefore the same metrics. stratify=y keeps the ham/spam ratio
# the same in both partitions, which matters because spam is the minority class.

x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)

In [None]:

# Report the size of each partition, in the order: train/test inputs, then
# train/test targets.
for split in (x_train, x_test, y_train, y_test):
    print(split.shape)

(4457,)
(1115,)
(4457,)
(1115,)


In [None]:
# Bag-of-words features: CountVectorizer tokenizes each message and counts how
# often every token occurs. With its default settings (used here) the text is
# converted to lowercase before tokenizing.

from sklearn.feature_extraction.text import CountVectorizer
count_vector=CountVectorizer()

In [None]:
# Learn the vocabulary from the training messages only, then reuse that same
# vocabulary to encode the test messages (no information leaks from the test set).
#
# The results are kept as the sparse matrices CountVectorizer returns:
# almost every entry of a bag-of-words matrix is zero, so converting to a
# dense array only multiplies memory use. LogisticRegression's fit() and
# predict() accept sparse input directly.
training_data = count_vector.fit_transform(x_train)
testing_data = count_vector.transform(x_test)

In [None]:
# Train a logistic-regression classifier (library defaults) on the
# vectorized training messages.

lr = LogisticRegression()
lr.fit(X=training_data, y=y_train)

In [None]:
# Predicted class (0 = ham, 1 = spam) for every message in the test set.
y_pred = lr.predict(X=testing_data)

In [None]:

# Display the predicted labels for the test set.
y_pred

array([0, 1, 0, ..., 1, 0, 0])

In [None]:
# Accuracy: the fraction of test messages whose predicted label matches the
# true label.
print(accuracy_score(y_true=y_test, y_pred=y_pred))

0.9811659192825112


In [None]:
# Per-class precision, recall and F1 on the test set (0 = ham, 1 = spam).

print(classification_report(y_true=y_test, y_pred=y_pred))

              precision    recall  f1-score   support

           0       0.98      1.00      0.99       944
           1       1.00      0.88      0.93       171

    accuracy                           0.98      1115
   macro avg       0.99      0.94      0.96      1115
weighted avg       0.98      0.98      0.98      1115



In [None]:
# Confusion matrix on the test set: rows are the true class and columns the
# predicted class, both in the order 0 (ham), 1 (spam).
print(confusion_matrix(y_true=y_test, y_pred=y_pred))

[[944   0]
 [ 21 150]]
