In [19]:
import numpy as np
import pandas as pd
import nltk
import  re
import sklearn

In [45]:
from sklearn.preprocessing  import LabelEncoder
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

# Load Dataset

In [13]:
data=pd.read_table("SMSSpamCollection",header=None,encoding='utf-8')

  """Entry point for launching an IPython kernel.


In [4]:
data.head()

Unnamed: 0,0,1
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [5]:
data.describe()

Unnamed: 0,0,1
count,5572,5572
unique,2,5169
top,ham,"Sorry, I'll call later"
freq,4825,30


In [6]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5572 entries, 0 to 5571
Data columns (total 2 columns):
0    5572 non-null object
1    5572 non-null object
dtypes: object(2)
memory usage: 87.1+ KB


In [9]:
print(data[0].value_counts())

ham     4825
spam     747
Name: 0, dtype: int64


# Data Preprocessing

In [14]:
# Load Encoder for classification of ham and spam
encoder=LabelEncoder()

#Use Encoder
data[0]=encoder.fit_transform(data[0])
data.head()

Unnamed: 0,0,1
0,0,"Go until jurong point, crazy.. Available only ..."
1,0,Ok lar... Joking wif u oni...
2,1,Free entry in 2 a wkly comp to win FA Cup fina...
3,0,U dun say so early hor... U c already then say...
4,0,"Nah I don't think he goes to usf, he lives aro..."


In [16]:
# Separating The Dataset
text_messages= data[1]
classes= data[0]

In [17]:
text_messages.head()

0    Go until jurong point, crazy.. Available only ...
1                        Ok lar... Joking wif u oni...
2    Free entry in 2 a wkly comp to win FA Cup fina...
3    U dun say so early hor... U c already then say...
4    Nah I don't think he goes to usf, he lives aro...
Name: 1, dtype: object

In [35]:
# Replace Email Addresses
processed = text_messages.str.replace('^\w+@[a-zA-Z_]+?\.[a-zA-Z]{2,3}$','emailaddr')

# Replace Web Urls
web_process= processed.str.replace(r'^[a-zA-Z0-9\-\.]+\.(com|org|net|mil|edu|COM|ORG|NET|MIL|EDU)$','webadress')

# Replace Symbols
sym_process = web_process.str.replace(r'@$|$%&','symbol')

# Replace Phone Numbers
phn_process = sym_process.str.replace(r'([0]{1}[6]{1}[-\s]*([1-9]{1}[\s]*){8})|([0]{1}[1-9]{1}[0-9]{1}[0-9]{1}[-\s]*([1-9]{1}[\s]*){6})|([0]{1}[1-9]{1}[0-9]{1}[-\s]*([1-9]{1}[\s]*){7})','phoneno')

# Replace Numbers
no_process=phn_process.str.replace(r'\d+(\.\d+)?','no')


In [36]:
no_process.head()

0    Go until jurong point, crazy.. Available only ...
1                        Ok lar... Joking wif u oni...
2    Free entry in no a wkly comp to win FA Cup fin...
3    U dun say so early hor... U c already then say...
4    Nah I don't think he goes to usf, he lives aro...
Name: 1, dtype: object

In [37]:
# Remove Punctuations
no_pun= no_process.str.replace(r'[^\w\d\s]',' ')

# Replace Double Space with Single Space
sin_proc= no_pun.str.replace(r'\s+',' ')

# Replace Leading and trailing whitespace
final_processing= sin_proc.str.replace(r'^\s+|\s+?$','')

In [38]:
final_processing.head()

0    Go until jurong point crazy Available only in ...
1                              Ok lar Joking wif u oni
2    Free entry in no a wkly comp to win FA Cup fin...
3          U dun say so early hor U c already then say
4    Nah I don t think he goes to usf he lives arou...
Name: 1, dtype: object

In [40]:
# Change to Lower case 

final_processing=final_processing.str.lower()

final_processing.head()

0    go until jurong point crazy available only in ...
1                              ok lar joking wif u oni
2    free entry in no a wkly comp to win fa cup fin...
3          u dun say so early hor u c already then say
4    nah i don t think he goes to usf he lives arou...
Name: 1, dtype: object

# StopWords Removal

In [42]:
# Remove Stop Words
stop_words= set(stopwords.words('english'))

processed_words= final_processing.apply(lambda x: ' '.join(term for term in x.split() if term not in stop_words))

processed_words.head()

0    go jurong point crazy available bugis n great ...
1                              ok lar joking wif u oni
2    free entry wkly comp win fa cup final tkts nos...
3                  u dun say early hor u c already say
4               nah think goes usf lives around though
Name: 1, dtype: object

# Porter Stemmer

In [43]:
ps = nltk.PorterStemmer()

processed_words=processed_words.apply(lambda x: ' '.join(ps.stem(term) for term in x.split()))

processed_words.head()

0    go jurong point crazi avail bugi n great world...
1                                ok lar joke wif u oni
2    free entri wkli comp win fa cup final tkt nost...
3                  u dun say earli hor u c alreadi say
4                 nah think goe usf live around though
Name: 1, dtype: object

# Bag of Words

In [47]:
# Create Dictionary
all_words=[]

for messages in processed_words:
    msg=word_tokenize(messages)
    for w in msg:
        all_words.append(w)
# Frequency Distribution

all_words= nltk.FreqDist(all_words)

In [48]:
# Check words 
print("No of words : ",len(all_words))
print("Common Words : ",all_words.most_common(20))

No of words :  6569
Common Words :  [('u', 1207), ('call', 679), ('go', 456), ('get', 452), ('ur', 391), ('gt', 318), ('lt', 316), ('come', 304), ('ok', 293), ('free', 284), ('day', 276), ('know', 275), ('love', 266), ('like', 261), ('got', 252), ('time', 252), ('good', 248), ('want', 247), ('text', 231), ('send', 214)]


# Word Features

In [57]:
word_features=list(all_words.keys())[:1600]

def find_features(msg):
    word_sent=word_tokenize(msg)
    features={}
    for words in word_features:
        features[words]= (words in word_sent)
    return features

# Lets check an example
check= find_features(processed_words[5])
for key,val in check.items():
    if val==True:
        print(key)
    

ok
std
freemsg
hey
darl
week
word
back
like
fun
still
tb
xxx
chg
send
rcv


In [58]:
print(processed_words[5])

freemsg hey darl week word back like fun still tb ok xxx std chg send rcv


NameError: name 'features' is not defined