# Email spam detection with Machine Learning (TASK-4)

In [1]:
# Display the spam-classification illustration hosted at the URL below.
from IPython.display import Image
Image(url='https://www.pantechelearning.com/wp-content/uploads/2021/12/Spam-classification.png', width=600)

In [2]:
import pandas as pd
import numpy as np
import seaborn as sns
from matplotlib import pyplot as plt 
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import confusion_matrix,accuracy_score,classification_report

In [3]:
# Read spam.csv (decoded with the 'latin' codec) into `ds`,
# then echo the frame as the cell output.
ds = pd.read_csv("spam.csv", encoding="latin")
ds

Unnamed: 0,v1,v2,Unnamed: 2,Unnamed: 3,Unnamed: 4
0,ham,"Go until jurong point, crazy.. Available only ...",,,
1,ham,Ok lar... Joking wif u oni...,,,
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,,,
3,ham,U dun say so early hor... U c already then say...,,,
4,ham,"Nah I don't think he goes to usf, he lives aro...",,,
...,...,...,...,...,...
5567,spam,This is the 2nd time we have tried 2 contact u...,,,
5568,ham,Will Ì_ b going to esplanade fr home?,,,
5569,ham,"Pity, * was in mood for that. So...any other s...",,,
5570,ham,The guy did some bitching but I acted like i'd...,,,


In [4]:
# Peek at 10 random rows. The seed is fixed so that Restart & Run All shows the
# same rows every time; 3 matches the seed used for the train/test split.
ds.sample(10, random_state=3)

Unnamed: 0,v1,v2,Unnamed: 2,Unnamed: 3,Unnamed: 4
2100,ham,Oh Howda gud gud.. Mathe en samachara chikku:-),,,
4543,ham,when you and derek done with class?,,,
3161,ham,I can't describe how lucky you are that I'm ac...,,,
1710,ham,"(No promises on when though, haven't even gott...",,,
4113,ham,Where are you ? What do you do ? How can you s...,,,
976,ham,Ok i shall talk to him,,,
4957,ham,Why didn't u call on your lunch?,,,
1983,ham,I wnt to buy a BMW car urgently..its vry urgen...,,,
547,ham,"Sorry that took so long, omw now",,,
4596,ham,Full heat pa:-) i have applyed oil pa.,,,


In [5]:
# (rows, columns) of the frame as loaded from the CSV.
ds.shape

(5572, 5)

In [6]:
# Report the frame's dimensions with labels.
n_rows, n_cols = ds.shape
print("Rows-->", n_rows)
print("Columns-->", n_cols)

Rows--> 5572
Columns--> 5


In [7]:
# Element-wise missing-value mask: True wherever a cell is missing.
ds.isnull()

Unnamed: 0,v1,v2,Unnamed: 2,Unnamed: 3,Unnamed: 4
0,False,False,True,True,True
1,False,False,True,True,True
2,False,False,True,True,True
3,False,False,True,True,True
4,False,False,True,True,True
...,...,...,...,...,...
5567,False,False,True,True,True
5568,False,False,True,True,True
5569,False,False,True,True,True
5570,False,False,True,True,True


In [8]:
# Number of missing values in each column.
ds.isnull().sum()

v1               0
v2               0
Unnamed: 2    5522
Unnamed: 3    5560
Unnamed: 4    5566
dtype: int64

In [9]:
# Column labels of the frame at this point.
ds.columns

Index(['v1', 'v2', 'Unnamed: 2', 'Unnamed: 3', 'Unnamed: 4'], dtype='object')

In [10]:
# Fraction of missing values per column (mean of the boolean missing-value mask).
ds.isna().mean()

v1            0.000000
v2            0.000000
Unnamed: 2    0.991027
Unnamed: 3    0.997846
Unnamed: 4    0.998923
dtype: float64

In [11]:
# Drop the three "Unnamed" columns, which are ~99% empty, keeping only the
# label (v1) and message text (v2).
# - `axis=0` was removed: it is ignored when `columns=` is given and wrongly
#   suggested a row-wise drop.
# - Rebinding `ds` instead of using `inplace=True`, together with
#   errors='ignore', makes the cell safe to re-run; the in-place version
#   raised KeyError on a second execution because the columns were already gone.
ds = ds.drop(columns=['Unnamed: 2', 'Unnamed: 3', 'Unnamed: 4'], errors='ignore')

In [12]:
# Echo the frame to check which columns remain.
ds

Unnamed: 0,v1,v2
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."
...,...,...
5567,spam,This is the 2nd time we have tried 2 contact u...
5568,ham,Will Ì_ b going to esplanade fr home?
5569,ham,"Pity, * was in mood for that. So...any other s..."
5570,ham,The guy did some bitching but I acted like i'd...


In [13]:
# (rows, columns) of the frame at this point.
ds.shape

(5572, 2)

In [14]:
# Give the label and text columns descriptive names.
# Renaming by label (rather than assigning a positional list to `ds.columns`)
# cannot mislabel columns if their order changes, and is a no-op when re-run.
ds = ds.rename(columns={'v1': 'spam/ham', 'v2': 'SMS'})

In [15]:
# Encode the labels in place: spam -> 0, ham -> 1.
# Integers are written into the existing column, so its dtype remains object.
ds.loc[ds['spam/ham'].eq('spam'), 'spam/ham'] = 0
ds.loc[ds['spam/ham'].eq('ham'), 'spam/ham'] = 1

In [16]:
# Echo the frame to check the encoded labels (0 = spam, 1 = ham).
ds

Unnamed: 0,spam/ham,SMS
0,1,"Go until jurong point, crazy.. Available only ..."
1,1,Ok lar... Joking wif u oni...
2,0,Free entry in 2 a wkly comp to win FA Cup fina...
3,1,U dun say so early hor... U c already then say...
4,1,"Nah I don't think he goes to usf, he lives aro..."
...,...,...
5567,0,This is the 2nd time we have tried 2 contact u...
5568,1,Will Ì_ b going to esplanade fr home?
5569,1,"Pity, * was in mood for that. So...any other s..."
5570,1,The guy did some bitching but I acted like i'd...


# Train the model on the spam dataset

In [17]:
# Model input: the raw message text.
x = ds['SMS']
x

0       Go until jurong point, crazy.. Available only ...
1                           Ok lar... Joking wif u oni...
2       Free entry in 2 a wkly comp to win FA Cup fina...
3       U dun say so early hor... U c already then say...
4       Nah I don't think he goes to usf, he lives aro...
                              ...                        
5567    This is the 2nd time we have tried 2 contact u...
5568                Will Ì_ b going to esplanade fr home?
5569    Pity, * was in mood for that. So...any other s...
5570    The guy did some bitching but I acted like i'd...
5571                           Rofl. Its true to its name
Name: SMS, Length: 5572, dtype: object

In [18]:
# Model target: the encoded label column (0 = spam, 1 = ham).
y = ds.loc[:, 'spam/ham']
y

0       1
1       1
2       0
3       1
4       1
       ..
5567    0
5568    1
5569    1
5570    1
5571    1
Name: spam/ham, Length: 5572, dtype: object

In [19]:
# Hold out 20% of the messages for evaluation; the fixed seed keeps the split repeatable.
xtrain, xtest, ytrain, ytest = train_test_split(
    x,
    y,
    test_size=0.2,
    random_state=3,
)

In [20]:
# Size of the full message series before splitting.
print(x.shape)

(5572,)


In [21]:
# Number of training messages.
print(xtrain.shape)

(4457,)


In [22]:
# Number of held-out test messages.
print(xtest.shape)

(1115,)


In [23]:
# Number of training labels; should equal the number of training messages.
print(ytrain.shape)

(4457,)


In [24]:
# Number of test labels; should equal the number of test messages.
print(ytest.shape)

(1115,)


In [25]:
# Echo both text splits as a (train, test) tuple.
xtrain,xtest

(3075    Mum, hope you are having a great day. Hoping t...
 1787                           Yes:)sura in sun tv.:)lol.
 1614    Me sef dey laugh you. Meanwhile how's my darli...
 4304                Yo come over carlos will be here soon
 3266                    Ok then i come n pick u at engin?
                               ...                        
 789                          Gud mrng dear hav a nice day
 968             Are you willing to go for aptitude class.
 1667    So now my dad is gonna call after he gets out ...
 3321    Ok darlin i supose it was ok i just worry too ...
 1688                     Nan sonathaya soladha. Why boss?
 Name: SMS, Length: 4457, dtype: object,
 2632                       I WILL CAL YOU SIR. In meeting
 454     Loan for any purpose å£500 - å£75,000. Homeown...
 983     LOOK AT THE FUCKIN TIME. WHAT THE FUCK YOU THI...
 1282    Ever green quote ever told by Jerry in cartoon...
 4610                                  Wat time Ì_ finish?
               

In [26]:
# Echo both label splits as a (train, test) tuple.
ytrain,ytest

(3075    1
 1787    1
 1614    1
 4304    1
 3266    1
        ..
 789     1
 968     1
 1667    1
 3321    1
 1688    1
 Name: spam/ham, Length: 4457, dtype: object,
 2632    1
 454     0
 983     1
 1282    1
 4610    1
        ..
 4827    1
 5291    1
 3325    1
 3561    1
 1136    0
 Name: spam/ham, Length: 1115, dtype: object)

In [27]:
# TF-IDF feature extractor configured to lower-case text and drop English stop words.
feat_vect = TfidfVectorizer(lowercase=True, stop_words='english')
feat_vect

In [28]:
# The encoded labels are still dtype object; cast both splits to integers
# before they are handed to the classifier.
ytrain, ytest = ytrain.astype('int'), ytest.astype('int')

In [29]:
# Fit the vectoriser on the training messages only and transform them to TF-IDF features.
xtrain_vec =feat_vect.fit_transform(xtrain)

In [30]:
# Transform the test messages with the vectoriser already fitted on the training split
# (transform only, no re-fit).
xtest_vec =feat_vect.transform(xtest)
# NOTE(review): this prints the raw training text, not either vectorised matrix — confirm that is intended.
print(xtrain)

3075    Mum, hope you are having a great day. Hoping t...
1787                           Yes:)sura in sun tv.:)lol.
1614    Me sef dey laugh you. Meanwhile how's my darli...
4304                Yo come over carlos will be here soon
3266                    Ok then i come n pick u at engin?
                              ...                        
789                          Gud mrng dear hav a nice day
968             Are you willing to go for aptitude class.
1667    So now my dad is gonna call after he gets out ...
3321    Ok darlin i supose it was ok i just worry too ...
1688                     Nan sonathaya soladha. Why boss?
Name: SMS, Length: 4457, dtype: object


In [31]:
# Dump both TF-IDF matrices; each printed entry is "(row, column)  weight".
# The bare `xtrain_vec` expression that used to open this cell was removed:
# only a cell's last expression is displayed, so it had no effect.
print("xtrain--->",xtrain_vec)
print("xtest---->",xtest_vec)

xtrain--->   (0, 741)	0.3219352588930141
  (0, 3979)	0.2410582143632299
  (0, 4296)	0.3891385935794867
  (0, 6599)	0.20296878731699391
  (0, 3386)	0.3219352588930141
  (0, 2122)	0.38613577623520473
  (0, 3136)	0.440116181574609
  (0, 3262)	0.25877035357606315
  (0, 3380)	0.21807195185332803
  (0, 4513)	0.2909649098524696
  (1, 4061)	0.380431198316959
  (1, 6872)	0.4306015894277422
  (1, 6417)	0.4769136859540388
  (1, 6442)	0.5652509076654626
  (1, 7443)	0.35056971070320353
  (2, 933)	0.4917598465723273
  (2, 2109)	0.42972812260098503
  (2, 3917)	0.40088501350982736
  (2, 2226)	0.413484525934624
  (2, 5825)	0.4917598465723273
  (3, 6140)	0.4903863168693604
  (3, 1599)	0.5927091854194291
  (3, 1842)	0.3708680641487708
  (3, 7453)	0.5202633571003087
  (4, 2531)	0.7419319091456392
  :	:
  (4452, 2122)	0.31002103760284144
  (4453, 999)	0.6760129013031282
  (4453, 7273)	0.5787739591782677
  (4453, 1762)	0.45610005640082985
  (4454, 3029)	0.42618909997886
  (4454, 2086)	0.3809693742808703
  (

In [32]:
# Logistic-regression classifier with library-default settings.
model=LogisticRegression()

In [33]:
# Train on the TF-IDF features and integer labels of the training split.
model.fit(xtrain_vec,ytrain)

In [34]:
# Accuracy on the training split (compare with the test score to gauge overfitting).
model.score(xtrain_vec,ytrain)

0.9661207089970832

In [35]:
# Accuracy on the held-out test split.
model.score(xtest_vec,ytest)

0.9623318385650225

In [36]:
# Predicted labels for the test messages (0 = spam, 1 = ham); echoed as the cell output.
predict_model=model.predict(xtest_vec)
predict_model

array([1, 1, 1, ..., 1, 1, 1])

In [37]:
# Fraction of test messages whose predicted label matches the true label.
accuracy_score(ytest,predict_model)

0.9623318385650225

In [38]:
# Confusion matrix on the test split: rows are true labels, columns are predicted
# labels, in label order 0 (spam), 1 (ham). The top-right entry therefore counts
# spam messages that were predicted as ham.
confusion_matrix(ytest,predict_model)

array([[114,  41],
       [  1, 959]], dtype=int64)

In [39]:
# Per-class precision, recall and F1 on the test split (class 0 = spam, class 1 = ham).
print(classification_report(ytest,predict_model))

              precision    recall  f1-score   support

           0       0.99      0.74      0.84       155
           1       0.96      1.00      0.98       960

    accuracy                           0.96      1115
   macro avg       0.98      0.87      0.91      1115
weighted avg       0.96      0.96      0.96      1115

