# Devisree
# EMAIL SPAM DETECTION WITH MACHINE LEARNING

In [1]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

In [2]:
import warnings
warnings.filterwarnings('ignore')

### Examining Data

In [3]:
df=pd.read_csv('spam.csv')

In [4]:
df.head()

Unnamed: 0,v1,v2,Unnamed: 2,Unnamed: 3,Unnamed: 4
0,ham,"Go until jurong point, crazy.. Available only ...",,,
1,ham,Ok lar... Joking wif u oni...,,,
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,,,
3,ham,U dun say so early hor... U c already then say...,,,
4,ham,"Nah I don't think he goes to usf, he lives aro...",,,


In [5]:
df.isnull().sum()

v1               0
v2               0
Unnamed: 2    5522
Unnamed: 3    5560
Unnamed: 4    5566
dtype: int64

In [6]:
df.dropna(axis=1,inplace=True)

In [7]:
df.shape

(5572, 2)

In [8]:
df.head()

Unnamed: 0,v1,v2
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [9]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5572 entries, 0 to 5571
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   v1      5572 non-null   object
 1   v2      5572 non-null   object
dtypes: object(2)
memory usage: 87.2+ KB


In [10]:
df.describe()

Unnamed: 0,v1,v2
count,5572,5572
unique,2,5169
top,ham,"Sorry, I'll call later"
freq,4825,30


### Cleaning Data

In [11]:
df.isnull().sum()

v1    0
v2    0
dtype: int64

In [12]:
df.duplicated().sum()

403

In [13]:
df.drop_duplicates(keep='first',inplace=True)

In [14]:
df.shape

(5169, 2)

In [15]:
df.head()

Unnamed: 0,v1,v2
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [16]:
df.loc[df['v1']=='spam','v1',]=int(1)
df.loc[df['v1']=='ham','v1',]=int(0)

In [17]:
df['v1']

0       0
1       0
2       1
3       0
4       0
       ..
5567    1
5568    0
5569    0
5570    0
5571    0
Name: v1, Length: 5169, dtype: object

In [18]:
df.rename(columns={'v1':'detected','v2':'message'},inplace=True)

In [19]:
df.head()

Unnamed: 0,detected,message
0,0,"Go until jurong point, crazy.. Available only ..."
1,0,Ok lar... Joking wif u oni...
2,1,Free entry in 2 a wkly comp to win FA Cup fina...
3,0,U dun say so early hor... U c already then say...
4,0,"Nah I don't think he goes to usf, he lives aro..."


In [20]:
X=df['message']
Y=df['detected']

In [21]:
X

0       Go until jurong point, crazy.. Available only ...
1                           Ok lar... Joking wif u oni...
2       Free entry in 2 a wkly comp to win FA Cup fina...
3       U dun say so early hor... U c already then say...
4       Nah I don't think he goes to usf, he lives aro...
                              ...                        
5567    This is the 2nd time we have tried 2 contact u...
5568                Will �_ b going to esplanade fr home?
5569    Pity, * was in mood for that. So...any other s...
5570    The guy did some bitching but I acted like i'd...
5571                           Rofl. Its true to its name
Name: message, Length: 5169, dtype: object

In [22]:
Y

0       0
1       0
2       1
3       0
4       0
       ..
5567    1
5568    0
5569    0
5570    0
5571    0
Name: detected, Length: 5169, dtype: object

### Fitting the model

In [23]:
from sklearn.model_selection import train_test_split as tts
from sklearn.feature_extraction.text import CountVectorizer
cv=CountVectorizer()
x=cv.fit_transform(X)
X_train,X_test,Y_train,Y_test=tts(x,Y,test_size=0.2)

In [24]:
X_train.shape

(4135, 8625)

In [25]:
X_test.shape

(1034, 8625)

In [26]:
from sklearn.naive_bayes import MultinomialNB
model=MultinomialNB()
Y_train=Y_train.astype('int')
model.fit(X_train,Y_train)

MultinomialNB()

In [27]:
Y_test=Y_test.astype('int')
print(model.score(X_test,Y_test)*100)

97.48549323017409


### Testing Own Data

In [28]:
msg="You are lucky to win 5000 rupees"
data=[msg]
trans=cv.transform(data).toarray()
my_pred=model.predict(trans)
print(my_pred)
if(my_pred==1):
    print("spam")
else:
    print("Ham")

[1]
spam


In [29]:
msg=X[5568]
data=[msg]
trans=cv.transform(data).toarray()
my_pred=model.predict(trans)
print(my_pred)
if(my_pred==1):
    print("spam")
else:
    print("Ham")

[0]
Ham
