# Email SPAM Detection Application

In [2]:
import pandas as pd
import seaborn as sns
import plotly.express as px
import matplotlib.pyplot as plt
import numpy as np
from warnings import filterwarnings as fw
fw('ignore')

In [3]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer, TfidfVectorizer
import nltk
from string import punctuation
from nltk.corpus import stopwords, wordnet
from nltk.stem import PorterStemmer, WordNetLemmatizer


In [4]:
data = pd.read_csv('SMSSpamCollection.tsv',sep='\t',names = ['type','msg'])
data.head()

Unnamed: 0,type,msg
0,ham,I've been searching for the right words to tha...
1,spam,Free entry in 2 a wkly comp to win FA Cup fina...
2,ham,"Nah I don't think he goes to usf, he lives aro..."
3,ham,Even my brother is not like to speak with me. ...
4,ham,I HAVE A DATE ON SUNDAY WITH WILL!!


In [5]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5568 entries, 0 to 5567
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   type    5568 non-null   object
 1   msg     5568 non-null   object
dtypes: object(2)
memory usage: 87.1+ KB


In [6]:
data.describe()

Unnamed: 0,type,msg
count,5568,5568
unique,2,5165
top,ham,"Sorry, I'll call later"
freq,4822,30


In [7]:
data.groupby('type').describe()

Unnamed: 0_level_0,msg,msg,msg,msg
Unnamed: 0_level_1,count,unique,top,freq
type,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
ham,4822,4513,"Sorry, I'll call later",30
spam,746,652,Please call our customer service representativ...,4


In [8]:
for i in data['msg'][data['type']=='spam']:
    print(i)
    print('-')

Free entry in 2 a wkly comp to win FA Cup final tkts 21st May 2005. Text FA to 87121 to receive entry question(std txt rate)T&C's apply 08452810075over18's
-
WINNER!! As a valued network customer you have been selected to receivea £900 prize reward! To claim call 09061701461. Claim code KL341. Valid 12 hours only.
-
Had your mobile 11 months or more? U R entitled to Update to the latest colour mobiles with camera for Free! Call The Mobile Update Co FREE on 08002986030
-
SIX chances to win CASH! From 100 to 20,000 pounds txt> CSH11 and send to 87575. Cost 150p/day, 6days, 16+ TsandCs apply Reply HL 4 info
-
URGENT! You have won a 1 week FREE membership in our £100,000 Prize Jackpot! Txt the word: CLAIM to No: 81010 T&C www.dbuk.net LCCLTD POBOX 4403LDNW1A7RW18
-
XXXMobileMovieClub: To use your credit, click the WAP link in the next txt message or click here>> http://wap. xxxmobilemovieclub.com?n=QJKGIGHJJGCBL
-
England v Macedonia - dont miss the goals/team news. Txt ur national team to

In [9]:
data['length'] = data['msg'].str.len()
data.sample(5)

Unnamed: 0,type,msg,length
4195,spam,Want to funk up ur fone with a weekly new tone...,155
5201,spam,Had your mobile 11mths ? Update for FREE to Or...,160
828,ham,Hi mate its RV did u hav a nice hol just a mes...,129
4357,ham,"Night sweet, sleep well! I've just been to see...",117
3112,ham,I wait 4 ü inside da car park...,32


In [10]:
data.groupby('type').describe()

Unnamed: 0_level_0,length,length,length,length,length,length,length,length
Unnamed: 0_level_1,count,mean,std,min,25%,50%,75%,max
type,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2
ham,4822.0,71.487764,58.451963,2.0,33.0,52.0,93.0,910.0
spam,746.0,138.659517,28.891361,13.0,133.0,149.0,157.0,223.0


In [11]:
data['word_count'] = data['msg'].str.split().str.len()
data.sample(5)

Unnamed: 0,type,msg,length,word_count
5132,ham,There are some nice pubs near here or there is...,87,17
1421,ham,Yes.. now only saw your message..,33,6
1222,ham,"sir, you will receive the account no another 1...",75,14
1989,spam,Free tones Hope you enjoyed your new content. ...,129,18
5461,ham,Shall I bring us a bottle of wine to keep us a...,87,18


In [12]:
data.groupby('type').describe().T

Unnamed: 0,type,ham,spam
length,count,4822.0,746.0
length,mean,71.487764,138.659517
length,std,58.451963,28.891361
length,min,2.0,13.0
length,25%,33.0,133.0
length,50%,52.0,149.0
length,75%,93.0,157.0
length,max,910.0,223.0
word_count,count,4822.0,746.0
word_count,mean,14.311489,23.900804


In [13]:
data['msg'][(data['word_count']==2)&(data['type']=='spam')]

3738         2/2 146tf150p
3977    ringtoneking 84484
Name: msg, dtype: object

In [14]:
mm = data['msg'][data['length']==max(data['length'])].values[0]
mm

"For me the love should start with attraction.i should feel that I need her every time around me.she should be the first thing which comes in my thoughts.I would start the day and end it with her.she should be there every time I dream.love will be then when my every breath has her name.my life should happen around her.my life will be named to her.I would cry for her.will give all my happiness and take all her sorrows.I will be ready to fight with anyone for her.I will be in love when I will be doing the craziest things for her.love will be when I don't have to proove anyone that my girl is the most beautiful lady on the whole planet.I will always be singing praises for her.love will be when I start up making chicken curry and end up makiing sambar.life will be the most beautiful then.will get every morning and thank god for the day because she is with me.I would like to say a lot..will tell later.."

In [15]:
data['msg'][data['length']>100]

0       I've been searching for the right words to tha...
1       Free entry in 2 a wkly comp to win FA Cup fina...
5       As per your request 'Melle Melle (Oru Minnamin...
6       WINNER!! As a valued network customer you have...
7       Had your mobile 11 months or more? U R entitle...
                              ...                        
5551    Yeh. Indians was nice. Tho it did kane me off ...
5553    No. I meant the calculation is the same. That ...
5562    REMINDER FROM O2: To get 2.50 pounds free call...
5563    This is the 2nd time we have tried 2 contact u...
5566    The guy did some bitching but I acted like i'd...
Name: msg, Length: 1763, dtype: object

In [16]:
m7 = data['msg'][7]
m7

'Had your mobile 11 months or more? U R entitled to Update to the latest colour mobiles with camera for Free! Call The Mobile Update Co FREE on 08002986030'

In [17]:
lemma = WordNetLemmatizer()
stemmer = PorterStemmer()

In [18]:
sw = ["i've", "i'll", "we'll", "'ve"]

In [19]:
def text_process(t):
    all_words = []
    t = t.replace('.','. ').replace('!','! ').replace('?','? ')
    for sent in nltk.sent_tokenize(t.lower()):
        #print(sent,end='\n\n')
        words = nltk.word_tokenize(sent)        
        words = [word for word in words if (word not in stopwords.words('english')) and (word not in sw)]        
        #words = [lemma.lemmatize(word, wordnet.ADJ) for word in words]
        words = [stemmer.stem(word) for word in words]
        words = [word for word in words if word not in punctuation]
        words = [word for word in words if not word.isnumeric()]
        all_words += words    
    all_words = ''.join([ch for ch in ' '.join(all_words) if (ch not in punctuation) and (not ch.isnumeric())])    
    return all_words

In [20]:
m7

'Had your mobile 11 months or more? U R entitled to Update to the latest colour mobiles with camera for Free! Call The Mobile Update Co FREE on 08002986030'

In [21]:
text7 = text_process(m7)
print(text7)

mobil month u r entitl updat latest colour mobil camera free call mobil updat co free


In [22]:
print(mm)
print()
textm = text_process(mm)
print(textm)

For me the love should start with attraction.i should feel that I need her every time around me.she should be the first thing which comes in my thoughts.I would start the day and end it with her.she should be there every time I dream.love will be then when my every breath has her name.my life should happen around her.my life will be named to her.I would cry for her.will give all my happiness and take all her sorrows.I will be ready to fight with anyone for her.I will be in love when I will be doing the craziest things for her.love will be when I don't have to proove anyone that my girl is the most beautiful lady on the whole planet.I will always be singing praises for her.love will be when I start up making chicken curry and end up makiing sambar.life will be the most beautiful then.will get every morning and thank god for the day because she is with me.I would like to say a lot..will tell later..

love start attract feel need everi time around first thing come thought would start day 

In [23]:
data['msg'].head(3)

0    I've been searching for the right words to tha...
1    Free entry in 2 a wkly comp to win FA Cup fina...
2    Nah I don't think he goes to usf, he lives aro...
Name: msg, dtype: object

In [24]:
data['msg'].head(3).apply(text_process)

0    search right word thank breather promis wont t...
1    free entri wkli comp win fa cup final tkt st m...
2              nah nt think goe usf live around though
Name: msg, dtype: object

In [25]:
tfidfV = TfidfVectorizer()
tf_idf = tfidfV.fit_transform(data['msg'].apply(text_process))

In [26]:
tf_df = pd.DataFrame(tf_idf.toarray(),columns = tfidfV.get_feature_names())
tf_df.head()

Unnamed: 0,aa,aah,aaniy,aaooooright,aathi,ab,abbey,abdomen,abeg,abel,...,zero,zf,zhong,zindgi,zoe,zogtoriu,zoom,zouk,zyada,èn
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [27]:
tfidfV.get_feature_names()

['aa',
 'aah',
 'aaniy',
 'aaooooright',
 'aathi',
 'ab',
 'abbey',
 'abdomen',
 'abeg',
 'abel',
 'aberdeen',
 'abi',
 'abil',
 'abiola',
 'abj',
 'abl',
 'abnorm',
 'abouta',
 'abroad',
 'absenc',
 'absolut',
 'absolutli',
 'abstract',
 'abt',
 'abta',
 'aburo',
 'abus',
 'ac',
 'academ',
 'acc',
 'accent',
 'accentur',
 'accept',
 'access',
 'accid',
 'accident',
 'accommod',
 'accommodationvouch',
 'accomod',
 'accordin',
 'accordingli',
 'account',
 'accumul',
 'ach',
 'achan',
 'achiev',
 'acid',
 'acknowledg',
 'aclpm',
 'acnt',
 'aco',
 'across',
 'acsmsreward',
 'act',
 'actin',
 'action',
 'activ',
 'actor',
 'actual',
 'acubootydeli',
 'acugoldvik',
 'acuhmmross',
 'acunat',
 'acunataliek',
 'acwicmbcktzr',
 'ad',
 'adam',
 'add',
 'addamsfa',
 'addi',
 'addict',
 'address',
 'adewal',
 'adi',
 'adjust',
 'admin',
 'administr',
 'admir',
 'admiss',
 'admit',
 'ador',
 'adp',
 'adress',
 'adrian',
 'adrink',
 'adsens',
 'adult',
 'advanc',
 'adventur',
 'advic',
 'advis',
 'a

In [28]:
tf_df.shape

(5568, 6533)

In [29]:
from sklearn.model_selection import train_test_split

In [30]:
xtrain, xtest, ytrain, ytest = train_test_split(tf_df, data['type'], test_size=0.25, random_state=101)

In [31]:
from sklearn.naive_bayes import BernoulliNB, MultinomialNB
from sklearn.metrics import confusion_matrix, classification_report

In [32]:
model_bnb = BernoulliNB().fit(xtrain,ytrain)
print(model_bnb.score(xtrain,ytrain))
print(model_bnb.score(xtest,ytest))

0.9844348659003831
0.9691091954022989


In [33]:
model_mnb = MultinomialNB().fit(xtrain,ytrain)
print(model_mnb.score(xtrain,ytrain))
print(model_mnb.score(xtest,ytest))

0.9738984674329502
0.9691091954022989


In [34]:
xtrain1,xtest1,ytrain1,ytest1 = train_test_split(data['msg'], data['type'], test_size=0.25, random_state=11)

In [35]:
from sklearn.pipeline import Pipeline

In [36]:
ddd = map(text_process, data['msg'])

In [37]:
TfidfVectorizer().fit_transform(ddd)

<5568x6533 sparse matrix of type '<class 'numpy.float64'>'
	with 45659 stored elements in Compressed Sparse Row format>

In [38]:
model_pipe = Pipeline([
    ('bow', CountVectorizer(analyzer=text_process)),    
    ('tfidf', TfidfTransformer()),
    ('clf', BernoulliNB())
])

In [39]:
model_pipe.fit(xtrain1,ytrain1)

Pipeline(steps=[('bow',
                 CountVectorizer(analyzer=<function text_process at 0x000001E0482D44C0>)),
                ('tfidf', TfidfTransformer()), ('clf', BernoulliNB())])

In [40]:
model_pipe.score(xtrain1,ytrain1)

0.8436302681992337

In [41]:
model_pipe.score(xtest1,ytest1)

0.8347701149425287

In [42]:
ypred1 = model_pipe.predict(xtest1)

In [43]:
print(classification_report(ytest1,ypred1))

              precision    recall  f1-score   support

         ham       0.97      0.84      0.90      1206
        spam       0.44      0.82      0.57       186

    accuracy                           0.83      1392
   macro avg       0.70      0.83      0.73      1392
weighted avg       0.90      0.83      0.85      1392

