In [1]:
import nltk
nltk.download('wordnet')
import pandas as pd

[nltk_data] Downloading package wordnet to /Users/haozhou/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [2]:
#nltk.download_shell()

### import the data

In [3]:
mails = [line.rstrip() for line in open('smsspamcollection/SMSSpamCollection')]

In [4]:
mails[1]

'ham\tOk lar... Joking wif u oni...'

### Let's take a look at some mails:

In [5]:
for mail_no, mail in enumerate(mails[:9]):
    print(mail_no, mail)
    print('\n')

0 ham	Go until jurong point, crazy.. Available only in bugis n great world la e buffet... Cine there got amore wat...


1 ham	Ok lar... Joking wif u oni...


2 spam	Free entry in 2 a wkly comp to win FA Cup final tkts 21st May 2005. Text FA to 87121 to receive entry question(std txt rate)T&C's apply 08452810075over18's


3 ham	U dun say so early hor... U c already then say...


4 ham	Nah I don't think he goes to usf, he lives around here though


5 spam	FreeMsg Hey there darling it's been 3 week's now and no word back! I'd like some fun you up for it still? Tb ok! XxX std chgs to send, £1.50 to rcv


6 ham	Even my brother is not like to speak with me. They treat me like aids patent.


7 ham	As per your request 'Melle Melle (Oru Minnaminunginte Nurungu Vettam)' has been set as your callertune for all Callers. Press *9 to copy your friends Callertune


8 spam	WINNER!! As a valued network customer you have been selected to receivea £900 prize reward! To claim call 09061701461. Claim code 

 The data in our database has been separated by the category and the content, we will process the data later.

### Now, we read in the data as a dataframe using pandas.

In [6]:
mails = pd.read_csv('smsspamcollection/SMSSpamCollection', sep='\t', names=['spam/ham', 'mail'])

In [7]:
mails.head(10)

Unnamed: 0,spam/ham,mail
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."
5,spam,FreeMsg Hey there darling it's been 3 week's n...
6,ham,Even my brother is not like to speak with me. ...
7,ham,As per your request 'Melle Melle (Oru Minnamin...
8,spam,WINNER!! As a valued network customer you have...
9,spam,Had your mobile 11 months or more? U R entitle...


In [8]:
import string

In [9]:
import re
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer
from nltk.stem import WordNetLemmatizer

## Firstly, we need to create a base model.

The method here means that we need to at least remove the punctuations, and join them according to the whitespace between
them, because if there's no mistakes in the sentence, we consider the words are seperated by white space. 

In [10]:
def base_process(word):
    nonpunc = [char for char in word if char not in string.punctuation]
    nonpunc = ''.join(nonpunc)
    return nonpunc.split()

In [11]:
from sklearn.feature_extraction.text import CountVectorizer

In [12]:
Bag_trans_base = CountVectorizer(analyzer=base_process).fit(mails['mail'])

In [13]:
print(Bag_trans_base.vocabulary_)



In [14]:
mails_bag_base = Bag_trans_base.transform(mails['mail'])

In [15]:
print('Shape:', mails_bag_base.shape)

Shape: (5572, 11747)


In [16]:
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.naive_bayes import MultinomialNB
from sklearn.cross_validation import train_test_split



Then I split the data into train and test part, the first part is for train, and the second is 
to test the how the outcome of the train performs

In [17]:
mil_train, mil_test, label_train, label_test = train_test_split(mails['mail'], mails['spam/ham'], test_size = 0.5)

## Scikit-learn Tf-IDF topic model

Now we use the tf-idf to calculate the importance of every word in the corpus

#### If we want to use the tf-idf model, we must use the dictionary of all words with their frequency occurs in the corpus.Just like we did before.

In [18]:
tfidf_transformer_base = TfidfTransformer().fit(mails_bag_base)

#### And now, tf-idf will count the importance of each word according to their frequency with its own equation, the bigger the frequency is doesn't mean it is important, because there are many useless words in the corpus but they are a lot like 'a', 'and', 'very' something like this.

In [19]:
mails_tfidf = tfidf_transformer_base.transform(mails_bag_base)

 We use pipeline to combine three models together.(Data preprocessing model, language model, topic model.)
And we use classification report to test the value of the big model.

In [20]:
from sklearn.pipeline import Pipeline

pipeline = Pipeline([
    ('bow',CountVectorizer(analyzer=base_process)),
    ('tfidf', TfidfTransformer()),
    ('classifier', MultinomialNB())
])

pipeline.fit(mil_train, label_train)

pred = pipeline.predict(mil_test)

from sklearn.metrics import classification_report
print(classification_report(label_test, pred))

             precision    recall  f1-score   support

        ham       0.93      1.00      0.96      2411
       spam       1.00      0.53      0.70       375

avg / total       0.94      0.94      0.93      2786



## Second, we want to add method to preprocess the text and then we can get how the methods affect the outcome. 

 The two methods we use are stem and Lemmatization. The goal of these two methods are to remove the prefix and suffix of a word.

### But the lemmatization is to return the word to its root, it's a more dictionary way to complete. If we want the word to return back to its root(lemmatization), we need to know its part of speech. 
### And we need to define a method to get its part of speech.

In [21]:
from nltk.corpus import wordnet
from collections import Counter
def get_part_of_speech(word):
  probable_part_of_speech = wordnet.synsets(word)
  pos_counts = Counter()
  pos_counts["n"] = len(  [ item for item in probable_part_of_speech if item.pos()=="n"]  )
  pos_counts["v"] = len(  [ item for item in probable_part_of_speech if item.pos()=="v"]  )
  pos_counts["a"] = len(  [ item for item in probable_part_of_speech if item.pos()=="a"]  )
  pos_counts["r"] = len(  [ item for item in probable_part_of_speech if item.pos()=="r"]  )
  
  most_likely_part_of_speech = pos_counts.most_common(1)[0][0]
  return most_likely_part_of_speech

Stem follows an algorithm with steps to perform on the words which make it faster. 

In [22]:
def text_process_stem(text):
    """
    1.tokenize
    2.stemmenize
    3.remove stopwords
    4.return cleaned text words
    """
    nonpunc = [char for char in text if char not in string.punctuation]
    nonpunc = ''.join(nonpunc)
    stemmer = PorterStemmer()
    stemmed = [stemmer.stem(token) for token in nonpunc.split()]
    stopword_remove = [word for word in stemmed if word.lower() not in stopwords.words('english')]
    return stopword_remove

In [23]:
def text_process_lemma(text):
    """
    1.tokenize
    2.lemmatizer
    3.remove stopwords
    4.return cleaned text words
    """
    nonpunc = [char for char in text if char not in string.punctuation]
    nonpunc = ''.join(nonpunc)
    lemmatizer = WordNetLemmatizer()
    lemmatized = [lemmatizer.lemmatize(token, get_part_of_speech(token)) for token in nonpunc.split()]
    stopword_remove = [word for word in lemmatized if word.lower() not in stopwords.words('english')]
    return stopword_remove

Both of the two methods remove the stop words, because they are useless to the corpus.

Now, lets extract some mails which use these two methods, and compare the difference.

In [24]:
mails['mail'].head(10).apply(text_process_stem)

0    [Go, jurong, point, crazi, avail, onli, bugi, ...
1                         [Ok, lar, joke, wif, u, oni]
2    [free, entri, 2, wkli, comp, win, FA, cup, fin...
3        [U, dun, say, earli, hor, U, c, alreadi, say]
4    [nah, dont, think, goe, usf, live, around, tho...
5    [freemsg, hey, darl, 3, week, word, back, Id, ...
6    [even, brother, like, speak, treat, like, aid,...
7    [per, request, mell, mell, oru, minnaminungint...
8    [winner, valu, network, custom, select, receiv...
9    [mobil, 11, month, U, R, entitl, updat, latest...
Name: mail, dtype: object

This is the output generated from the stem method.

In [25]:
mails['mail'].head(10).apply(text_process_lemma)

0    [Go, jurong, point, crazy, Available, bugis, n...
1                       [Ok, lar, Joking, wif, u, oni]
2    [Free, entry, 2, wkly, comp, win, FA, Cup, fin...
3        [U, dun, say, early, hor, U, c, already, say]
4    [Nah, dont, think, go, usf, life, around, though]
5    [FreeMsg, Hey, darling, 3, week, word, back, I...
6    [Even, brother, like, speak, treat, like, aid,...
7    [per, request, Melle, Melle, Oru, Minnaminungi...
8    [WINNER, value, network, customer, select, rec...
9    [mobile, 11, month, U, R, entitle, Update, lat...
Name: mail, dtype: object

This is the output generated from the lemmatization method.

We can find that the key difference between the word 'crazi' under stem process and 'crazy' under lemmatization process, 'alreadi' and 'already', 'darl' and 'darling', 'entitl' and 'entitle', 'updat' and 'Update', 'mobil' and 'mobile', which can see the lemmatization is more like a dictionary transform.

#### Now we will convert each messages into a vector the Scikit Learn's algorithm models can work with

#### Bag of words model:

1. Count how many times does a word occur in each message.
2. Weight the counts, so that frequent tokens get lower weight.
3. Normalize the vectors to unit length, to abstract from the original text length.

And this model will convert a collection of text documents to a matrix od token counts.
And it will be surely for the scikit learn to output a sparse matrix

In [26]:
from sklearn.feature_extraction.text import CountVectorizer

In [27]:
Bag_trans = CountVectorizer(analyzer=text_process_lemma).fit(mails['mail'])

In [28]:
print(Bag_trans.vocabulary_)

{'Go': 2055, 'jurong': 7070, 'point': 8222, 'crazy': 5561, 'Available': 1105, 'bugis': 5108, 'n': 7713, 'great': 6532, 'world': 10056, 'la': 7167, 'e': 5936, 'buffet': 5107, 'Cine': 1478, 'get': 6437, 'amore': 4629, 'wat': 9891, 'Ok': 3059, 'lar': 7196, 'Joking': 2446, 'wif': 9984, 'u': 9659, 'oni': 7946, 'Free': 1936, 'entry': 6029, '2': 420, 'wkly': 10024, 'comp': 5445, 'win': 9994, 'FA': 1828, 'Cup': 1546, 'final': 6216, 'tkts': 9504, '21st': 440, 'May': 2799, '2005': 427, 'Text': 3948, '87121': 866, 'receive': 8505, 'questionstd': 8429, 'txt': 9649, 'rateTCs': 8464, 'apply': 4702, '08452810075over18s': 73, 'U': 4063, 'dun': 5924, 'say': 8735, 'early': 5939, 'hor': 6740, 'c': 5142, 'already': 4605, 'Nah': 2943, 'dont': 5861, 'think': 9440, 'go': 6469, 'usf': 9750, 'life': 7260, 'around': 4738, 'though': 9454, 'FreeMsg': 1938, 'Hey': 2217, 'darling': 5642, '3': 540, 'week': 9925, 'word': 10046, 'back': 4833, 'Id': 2341, 'like': 7272, 'fun': 6376, 'still': 9158, 'Tb': 3925, 'ok': 7924

In [29]:
for key in Bag_trans.vocabulary_:
    print(key)

Go
jurong
point
crazy
Available
bugis
n
great
world
la
e
buffet
Cine
get
amore
wat
Ok
lar
Joking
wif
u
oni
Free
entry
2
wkly
comp
win
FA
Cup
final
tkts
21st
May
2005
Text
87121
receive
questionstd
txt
rateTCs
apply
08452810075over18s
U
dun
say
early
hor
c
already
Nah
dont
think
go
usf
life
around
though
FreeMsg
Hey
darling
3
week
word
back
Id
like
fun
still
Tb
ok
XxX
std
chgs
send
£150
rcv
Even
brother
speak
treat
aid
patent
per
request
Melle
Oru
Minnaminunginte
Nurungu
Vettam
set
callertune
Callers
Press
9
copy
friend
Callertune
WINNER
value
network
customer
select
receivea
£900
prize
reward
claim
call
09061701461
Claim
code
KL341
Valid
12
hour
mobile
11
month
R
entitle
Update
late
colour
camera
Call
Mobile
Co
FREE
08002986030
Im
gonna
home
soon
want
talk
stuff
anymore
tonight
k
Ive
cry
enough
today
SIX
chance
CASH
100
20000
pound
CSH11
87575
Cost
150pday
6days
16
TsandCs
Reply
HL
4
info
URGENT
1
membership
£100000
Prize
Jackpot
Txt
CLAIM
81010
TC
wwwdbuknet
LCCLTD
POBOX
4403LDNW1A7RW

fat
finger
press
button
Ummmmmaah
Many
HAPPY
BIRTHDAY
tirupur
wwwApplausestorecom
MonthlySubscription50pmsg
max6month
TCsC
web
age16
2stop
famous
develop
ability
listen
unconditionally
temper
self
confidence
MARRIED
Pa
oclock
mine
bash
recovery
rather
cooped
hotel
invitation
Cali
english
bloke
weddin
omw
BTW
alibi
Imagine
sink
pace
cage
cock
surround
remind
Enjoy
cuck
Hurry
weeddeficient
three
Sure
acknowledgement
astoundingly
tactless
generally
faggy
demand
blood
oath
fo
Every
warm
milk
youll
magic
loose
weight
I‘ll
pan
it‘s
cheap
perhaps
that‘s
silly
isn‘t
likely
uv
mutation
Sunscreen
essential
thesedays
lunchyou
onlinewhy
Princess
pic
Aiyo
fast
workin
huh
bao
sugardad
ahgee
meim
brownie
Geeeee
barely
Fuck
Cant
2mrw
ninish
icky
American
freek
callin
Jen
eh
Oooh
ey
anyways
gym
whatever
Daddy
pleasure
slap
dick
WOT
MISSY
Yar
mum
sch
clean
lab
goggles
door
daddy
2000
call09050000327
ring
09050005321
Arngd
marriage
walkin
unfortuntly
snake
bite
dance
frnt
sayin
Bite
izzit
textand
0800298

Sweet
gauge
patty
mondaynxt
vl
Might
ax
chill
6hrs
surgical
emergency
unfold
Crazy
marry
frens
korean
leonas
Fredericksburg
Que
pas
un
buen
tiempo
Xavier
chillin
super
FREE2DAY
St
Georges
JordanTxt
PIC
89080
saucy
celeb4
0870241182716
Bugis
tmrw
Heart
Compass
Soul
Guide
worldgnun
Sent
WAY2SMSCOM
Goodnight
Baaaaabe
youuuuu
ned
tht
witot
main
sweetie
blackberry
buyer
melike
4a
pple
becz
undrstndng
chain
suffer
whn
arrest
Shuhui
suntec
steamboat
moji
yahoo
messenger
tp
HOT
LIVE
FANTASIES
08707509020
20p
1327
Croydon
CR9
5WB
0870k
Bbq
6ish
everyso
often
panicks
outhave
fill
write
Put
Dictionary
3Cover
screen
4Press
5Gently
Bears
Pic
Nick
Tom
Dick
fact
08718730666
Auntie
huai
juan
Linerental
call2optoutLF56
tlk
ideal
path
appear
front
reserve
thirunelvali
tirunelvali
sunday
evei
netno
availablei
tackle
Storming
phne
HELLO
wt
Margaret
girlfrnd
f
Grahmbell
invnted
telphone
MoralOne
4get
h
tonght
plough
pile
ironing
Staying
chinky
ki
wi
nz
unsold
muz
Geelater
aust
bk
cafe
recharge
papa
shld
st

mental
smoothly
challenge
2marrow
hon
pple700
900
nightsExcellent
breakfast
hamper
cc100pmin
reality
daal
POLYS
0870737910216yrs
£150wk
Unni
rechargeRakhesh
lack
particular
dramastorms
forfeit
urself
digi
9pm
fab
coupla
wks
077xxx
09066362206
CAL
SIR
sundayish
prasad
Tiwary
rcbbattle
bang
kochi
cancer
Moms
checkup
aka
pap
smear
gobi
Pandy
4w
technology
todayhe
Em
olowoyey
uscedu
argentina
secretary
taxt
massagetiepos
argh
Lool
shake
booty
timeslil
busyi
sarcasm
scarcasim
naal
eruku
chikkuwat
W4
5WQ
bro
amongst
bros
impressively
sensible
Whens
PARK
69696
Nyt
3lp
£150msg
response
alsoor
danalla
obedient
ft
combination
needy
pout
stomp
northampton
abj
serve
playng
1McFlyAll
Ab
Sara
JorgeShock
SmithSwitch
anna
nagar
Chasing
Yupz
modelsony
ericson
der
luks
modl
cheesy
frosty
witin
PLUS
SPORT
0870141701216
4txt120p
Europe
10th
Sept
09050000555
BA128NNFWFLY150ppm
nudist
theme
YM
pump
petrol
£12
Feb
VALUED
FRNDS
evn
ignore
Evr
signal
neither
unusual
Hugs
snog
Omw
west
palm
printing
optout
8302

bimbo
ugos
Safe
Show
Euro
241
Flag
3Lions
portege
m100
semiobscure
PASS
polyphonic
gprs
repeat
Loosu
De
careless
MySpace
PARTNERS
method
calculation
blur
clothe
jewelry
Block
Breaker
deluxe
feature
graphic
£5
BBDELUXE
lush
fumble
day2find
taunton
weekday
Haiz
Cut
nail
common
asia
greatest
courage
defeat
heartgn
tc
STIL
FUCKED
NITE
WENT
TOBED
430
Beauty
pimple
Natalja
25F
YES440
NO440
wwwSMSacunat27081980
asthma
attack
Ball
spin
Haiyoh
million
02
prsn
somtimes
nothin
Science
sunlight
BCozI
loss
audiitions
relocate
pock
motivating
brison
cap
bullshit
GWR
motherfucker
Kit
Strip
1013
IG11
OJA
08712402578
thesmszonecom
anonymous
mask
messagesim
theredo
abuse
Woodland
avenue
parish
magazine
TA
billy
Awww
useless
loo
helloed
gland
previous
Captain
bcaz
STU
2IM
TRUBLE
MOMENT
EVONE
HATES
EVEN
HELL
AV
WONT
BCK
DAN
mokka
view
dual
hostile
haircut
breezy
09061744553
polyH
1AppleDayNo
Doctor
1Tulsi
LeafDayNo
Cancer
1LemonDayNo
Fat
1Cup
MilkdayNo
Bone
Problms
Litres
WatrDayNo
Diseases
Snd
Care
ALSO


The corpus becomes a dictionary of the words' appearance and its frequency in the corpus.

In [30]:
print(len(Bag_trans.vocabulary_))

10307


In [31]:
mess1 = mails['mail'][2]
print(mess1)

Free entry in 2 a wkly comp to win FA Cup final tkts 21st May 2005. Text FA to 87121 to receive entry question(std txt rate)T&C's apply 08452810075over18's


#### This is give an example of one mail and count the words frequency and frequency of that words in the whole mails overall.

In [32]:
bow1 = Bag_trans.transform([mess1])
print(bow1)

  (0, 73)	1
  (0, 420)	1
  (0, 427)	1
  (0, 440)	1
  (0, 866)	1
  (0, 1546)	1
  (0, 1828)	2
  (0, 1936)	1
  (0, 2799)	1
  (0, 3948)	1
  (0, 4702)	1
  (0, 5445)	1
  (0, 6029)	2
  (0, 6216)	1
  (0, 8429)	1
  (0, 8464)	1
  (0, 8505)	1
  (0, 9504)	1
  (0, 9649)	1
  (0, 9994)	1
  (0, 10024)	1


The shape of this vector, which means that this is one column in a matrix, because we only choose only one mail to check.

In [33]:
print(bow1.shape)

(1, 10307)


We can easily according to the frequency the word occurs in the corpus to get the word like the 'entry'

In [34]:
Bag_trans.get_feature_names()[6029]

'entry'

In [35]:
mess0 = mails['mail'][5570]
print(mess0)

The guy did some bitching but I acted like i'd be interested in buying something else next week and he gave it to us for free


In [36]:
bow0 = Bag_trans.transform([mess0])
print(bow0)

  (0, 4501)	1
  (0, 4973)	1
  (0, 5131)	1
  (0, 5989)	1
  (0, 6326)	1
  (0, 6458)	1
  (0, 6569)	1
  (0, 6829)	1
  (0, 6926)	1
  (0, 7272)	1
  (0, 7787)	1
  (0, 9016)	1
  (0, 9659)	1
  (0, 9925)	1


I can check the appearance of the words in any single sentence. And the others are 0, 
means they don't have those words in the sentence.

In [37]:
print(bow0.shape)

(1, 10307)


#### Lets look at the shape of the matrix:

In [38]:
mails_bag = Bag_trans.transform(mails['mail'])
print('Shape:', mails_bag.shape)

Shape: (5572, 10307)


The shape of the matrix, which means give me the all words appear in all the mails.

#### Scikit-learn Tf-IDF topic model

In [39]:
from sklearn.feature_extraction.text import TfidfTransformer

In [40]:
tfidf_transformer = TfidfTransformer().fit(mails_bag)

In [41]:
tfidf_transformer.idf_[Bag_trans.vocabulary_['U']]

3.966206571809915

 This is the importance of the letter "U"

In [42]:
mails_tfidf = tfidf_transformer.transform(mails_bag)

In [43]:
print(mails_tfidf)

  (0, 10056)	0.22851328683149133
  (0, 9891)	0.19320182292856627
  (0, 8222)	0.2206047642981321
  (0, 7713)	0.17267405357648447
  (0, 7167)	0.26744965756318645
  (0, 7070)	0.316581887144617
  (0, 6532)	0.18584279065768133
  (0, 6437)	0.11694033219453179
  (0, 5936)	0.19160269099296262
  (0, 5561)	0.25307940162220666
  (0, 5108)	0.2721821986617856
  (0, 5107)	0.3022116312036372
  (0, 4629)	0.316581887144617
  (0, 2055)	0.2451708790888474
  (0, 1478)	0.316581887144617
  (0, 1105)	0.29201577235390175
  (1, 9984)	0.40103983782285263
  (1, 9659)	0.20138826796353052
  (1, 7946)	0.5048746566867236
  (1, 7196)	0.3771390523485955
  (1, 3059)	0.2915079040425609
  (1, 2446)	0.5625839245085561
  (2, 10024)	0.19189047507643978
  (2, 9994)	0.13698686353239958
  (2, 9649)	0.14057561470333804
  :	:
  (5568, 6469)	0.24367405620663118
  (5568, 6320)	0.48803929715486144
  (5568, 6051)	0.569508095387208
  (5568, 4822)	0.39356061779270535
  (5569, 9238)	0.5029225278766244
  (5569, 7639)	0.4381269476511542


This is output shows me the whole matrix with every words and its frequency occurs in corpus.

### Naive Bayes Alogorithm

In [44]:
from sklearn.naive_bayes import MultinomialNB

#### We use the matrix consists of the importance of each word and their labels to fit in the algorithm of Naive Bayes. 

In [45]:
spam_detect_model = MultinomialNB().fit(mails_tfidf, mails['spam/ham'])

In [46]:
all_pred = spam_detect_model.predict(mails_tfidf)

#### Let's check the outcome.

In [47]:
all_pred

array(['ham', 'ham', 'spam', ..., 'ham', 'ham', 'ham'], dtype='<U4')

In [48]:
from sklearn.cross_validation import train_test_split

In [49]:
mil_train, mil_test, label_train, label_test = train_test_split(mails['mail'], mails['spam/ham'], test_size = 0.3)

### Naive Bayes Algorithm

### Lemmatization

In [50]:
from sklearn.pipeline import Pipeline

pipeline = Pipeline([
    ('bow',CountVectorizer(analyzer=text_process_lemma)),
    ('tfidf', TfidfTransformer()),
    ('classifier', MultinomialNB())
])

pipeline.fit(mil_train, label_train)

pred = pipeline.predict(mil_test)

from sklearn.metrics import classification_report
print(classification_report(label_test, pred))

             precision    recall  f1-score   support

        ham       0.96      1.00      0.98      1476
       spam       1.00      0.71      0.83       196

avg / total       0.97      0.97      0.96      1672



### Stem

In [51]:
pipeline = Pipeline([
    ('bow',CountVectorizer(analyzer=text_process_stem)),
    ('tfidf', TfidfTransformer()),
    ('classifier', MultinomialNB())
])

pipeline.fit(mil_train, label_train)

pred = pipeline.predict(mil_test)

print(classification_report(label_test, pred))

             precision    recall  f1-score   support

        ham       0.97      1.00      0.98      1476
       spam       0.99      0.73      0.84       196

avg / total       0.97      0.97      0.97      1672



### GBM Algorithm

### Base model

In [52]:
from sklearn.ensemble import GradientBoostingClassifier

pipeline = Pipeline([
    ('bow',CountVectorizer(analyzer=base_process)),
    ('tfidf', TfidfTransformer()),
    ('classifier', GradientBoostingClassifier())
])

pipeline.fit(mil_train, label_train)

pred = pipeline.predict(mil_test)

print(classification_report(label_test, pred))

  from numpy.core.umath_tests import inner1d


             precision    recall  f1-score   support

        ham       0.96      1.00      0.98      1476
       spam       1.00      0.70      0.82       196

avg / total       0.97      0.96      0.96      1672



### Lemmatization

In [53]:
pipeline = Pipeline([
    ('bow',CountVectorizer(analyzer=text_process_lemma)),
    ('tfidf', TfidfTransformer()),
    ('classifier', GradientBoostingClassifier())
])

pipeline.fit(mil_train, label_train)

pred = pipeline.predict(mil_test)

print(classification_report(label_test, pred))

             precision    recall  f1-score   support

        ham       0.96      1.00      0.98      1476
       spam       0.96      0.67      0.79       196

avg / total       0.96      0.96      0.95      1672



### Stem

In [54]:
pipeline = Pipeline([
    ('bow',CountVectorizer(analyzer=text_process_stem)),
    ('tfidf', TfidfTransformer()),
    ('classifier', GradientBoostingClassifier())
])

pipeline.fit(mil_train, label_train)

pred = pipeline.predict(mil_test)

print(classification_report(label_test, pred))

             precision    recall  f1-score   support

        ham       0.96      0.99      0.98      1476
       spam       0.93      0.72      0.81       196

avg / total       0.96      0.96      0.96      1672



### Randomforest Algorithm

### Base model

In [55]:
from sklearn.ensemble import RandomForestClassifier

pipeline = Pipeline([
    ('bow',CountVectorizer(analyzer=base_process)),
    ('tfidf', TfidfTransformer()),
    ('classifier', RandomForestClassifier())
])

pipeline.fit(mil_train, label_train)

pred = pipeline.predict(mil_test)

print(classification_report(label_test, pred))

             precision    recall  f1-score   support

        ham       0.96      1.00      0.98      1476
       spam       0.99      0.70      0.82       196

avg / total       0.96      0.96      0.96      1672



### Lemmatization

In [56]:
pipeline = Pipeline([
    ('bow',CountVectorizer(analyzer=text_process_lemma)),
    ('tfidf', TfidfTransformer()),
    ('classifier', RandomForestClassifier())
])

pipeline.fit(mil_train, label_train)

pred = pipeline.predict(mil_test)

print(classification_report(label_test, pred))

             precision    recall  f1-score   support

        ham       0.97      1.00      0.98      1476
       spam       1.00      0.75      0.86       196

avg / total       0.97      0.97      0.97      1672



### Stem

In [57]:
pipeline = Pipeline([
    ('bow',CountVectorizer(analyzer=text_process_stem)),
    ('tfidf', TfidfTransformer()),
    ('classifier', RandomForestClassifier())
])

pipeline.fit(mil_train, label_train)

pred = pipeline.predict(mil_test)

print(classification_report(label_test, pred))

             precision    recall  f1-score   support

        ham       0.97      1.00      0.98      1476
       spam       0.96      0.76      0.85       196

avg / total       0.97      0.97      0.97      1672



### Logistic Regression

### Base model

In [58]:
from sklearn.linear_model import LogisticRegression
pipeline = Pipeline([
    ('baw', CountVectorizer(analyzer=base_process)),  
    ('tfidf', TfidfTransformer()),  
    ('model', LogisticRegression()), 
])
pipeline.fit(mil_train, label_train)
pred = pipeline.predict(mil_test)
#Evaluation
print (classification_report(label_test, pred))

             precision    recall  f1-score   support

        ham       0.96      1.00      0.98      1476
       spam       0.99      0.69      0.82       196

avg / total       0.96      0.96      0.96      1672



### Lemmatization

In [59]:
pipeline = Pipeline([
    ('baw', CountVectorizer(analyzer=text_process_lemma)),  
    ('tfidf', TfidfTransformer()),  
    ('model', LogisticRegression()), 
])
pipeline.fit(mil_train, label_train)
pred = pipeline.predict(mil_test)
#Evaluation
print (classification_report(label_test, pred))

             precision    recall  f1-score   support

        ham       0.95      1.00      0.98      1476
       spam       0.98      0.64      0.77       196

avg / total       0.96      0.96      0.95      1672



### Stem

In [60]:
pipeline = Pipeline([
    ('bow',CountVectorizer(analyzer=text_process_stem)),
    ('tfidf', TfidfTransformer()),
    ('classifier', RandomForestClassifier())
])

pipeline.fit(mil_train, label_train)

pred = pipeline.predict(mil_test)

print(classification_report(label_test, pred))

             precision    recall  f1-score   support

        ham       0.97      1.00      0.98      1476
       spam       0.98      0.74      0.84       196

avg / total       0.97      0.97      0.97      1672

