In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
import sklearn
import nltk

In [4]:
# Load the dataset
df = pd.read_table('SMSSpamCollection', header=None, encoding='utf-8')

In [5]:
df.head()

Unnamed: 0,0,1
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [6]:
df.describe()

Unnamed: 0,0,1
count,5572,5572
unique,2,5169
top,ham,"Sorry, I'll call later"
freq,4825,30


In [8]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5572 entries, 0 to 5571
Data columns (total 2 columns):
0    5572 non-null object
1    5572 non-null object
dtypes: object(2)
memory usage: 87.1+ KB


In [16]:
#check class distribution
classes = df[0]
text_messages = df[1]
classes.value_counts()

ham     4825
spam     747
Name: 0, dtype: int64

In [10]:
## Data Preprocessing

#convert class labels to 0=ham and 1=spam
from sklearn.preprocessing import LabelEncoder

In [14]:
encoder = LabelEncoder()
Y = encoder.fit_transform(classes)

In [15]:
print(Y[:10])

[0 0 1 0 0 1 0 0 1 1]


In [17]:
print(text_messages[:10])

0    Go until jurong point, crazy.. Available only ...
1                        Ok lar... Joking wif u oni...
2    Free entry in 2 a wkly comp to win FA Cup fina...
3    U dun say so early hor... U c already then say...
4    Nah I don't think he goes to usf, he lives aro...
5    FreeMsg Hey there darling it's been 3 week's n...
6    Even my brother is not like to speak with me. ...
7    As per your request 'Melle Melle (Oru Minnamin...
8    WINNER!! As a valued network customer you have...
9    Had your mobile 11 months or more? U R entitle...
Name: 1, dtype: object


In [18]:
#use regular expressinos  to replace email adreess, phone no, other numbers, symbols, urls 
# www.regexlib.com

# Replace email addresses with 'email'
processed = text_messages.str.replace(r'^.+@[^\.].*\.[a-z]{2,}$',
                                 'emailaddress')

# Replace URLs with 'webaddress'
processed = processed.str.replace(r'^http\://[a-zA-Z0-9\-\.]+\.[a-zA-Z]{2,3}(/\S*)?$',
                                  'webaddress')

# Replace money symbols with 'moneysymb' (£ can by typed with ALT key + 156)
processed = processed.str.replace(r'£|\$', 'moneysymb')
    
# Replace 10 digit phone numbers (formats include paranthesis, spaces, no spaces, dashes) with 'phonenumber'
processed = processed.str.replace(r'^\(?[\d]{3}\)?[\s-]?[\d]{3}[\s-]?[\d]{4}$',
                                  'phonenumbr')
    
# Replace numbers with 'numbr'
processed = processed.str.replace(r'\d+(\.\d+)?', 'numbr')

# Remove punctuation
processed = processed.str.replace(r'[^\w\d\s]', ' ')

# Replace whitespace between terms with a single space
processed = processed.str.replace(r'\s+', ' ')

# Remove leading and trailing whitespace
processed = processed.str.replace(r'^\s+|\s+?$', '')

In [20]:
processed[0:10]

0    Go until jurong point crazy Available only in ...
1                              Ok lar Joking wif u oni
2    Free entry in numbr a wkly comp to win FA Cup ...
3          U dun say so early hor U c already then say
4    Nah I don t think he goes to usf he lives arou...
5    FreeMsg Hey there darling it s been numbr week...
6    Even my brother is not like to speak with me T...
7    As per your request Melle Melle Oru Minnaminun...
8    WINNER As a valued network customer you have b...
9    Had your mobile numbr months or more U R entit...
Name: 1, dtype: object

In [21]:
#change words to lowercase

processed = processed.str.lower()
print(processed[:10])

0    go until jurong point crazy available only in ...
1                              ok lar joking wif u oni
2    free entry in numbr a wkly comp to win fa cup ...
3          u dun say so early hor u c already then say
4    nah i don t think he goes to usf he lives arou...
5    freemsg hey there darling it s been numbr week...
6    even my brother is not like to speak with me t...
7    as per your request melle melle oru minnaminun...
8    winner as a valued network customer you have b...
9    had your mobile numbr months or more u r entit...
Name: 1, dtype: object


In [24]:
# Remove stopwords

from nltk.corpus import stopwords

stop_words = set(stopwords.words('english'))

In [25]:
processed = processed.apply(lambda x: ' '.join(term for term in x.split() if term not in stop_words))

In [26]:
processed[:10]

0    go jurong point crazy available bugis n great ...
1                              ok lar joking wif u oni
2    free entry numbr wkly comp win fa cup final tk...
3                  u dun say early hor u c already say
4               nah think goes usf lives around though
5    freemsg hey darling numbr week word back like ...
6       even brother like speak treat like aids patent
7    per request melle melle oru minnaminunginte nu...
8    winner valued network customer selected receiv...
9    mobile numbr months u r entitled update latest...
Name: 1, dtype: object

In [29]:
# Stemming of words, like we dont need tenses and ing words and sort like that

ps = nltk.PorterStemmer()

processed = processed.apply(lambda x: ' '.join(ps.stem(term) for term in x.split()))
processed[:10]

0    go jurong point crazi avail bugi n great world...
1                                ok lar joke wif u oni
2    free entri numbr wkli comp win fa cup final tk...
3                  u dun say earli hor u c alreadi say
4                 nah think goe usf live around though
5    freemsg hey darl numbr week word back like fun...
6        even brother like speak treat like aid patent
7    per request mell mell oru minnaminungint nurun...
8    winner valu network custom select receivea mon...
9    mobil numbr month u r entitl updat latest colo...
Name: 1, dtype: object

In [31]:
from nltk.tokenize import word_tokenize

In [33]:
all_words = []
for msg in processed:
    words = word_tokenize(msg)
    for w in words:
        all_words.append(w)
        
all_words = nltk.FreqDist(all_words)

In [34]:
#Print total np. of words
print(len(all_words))

6554


In [35]:
# Use 1500 mose common for features

words_features = list(all_words.keys())[:1500]

In [39]:
#define find features funtion

def find_features(message):
    words = word_tokenize(message)
    features = {}
    for word in words_features:
        features[word] = (word in words)
        
    return features

In [40]:
features = find_features(processed[0])
for key, value in features.items():
    if value == True:
        print(key)

go
jurong
point
crazi
avail
bugi
n
great
world
la
e
buffet
cine
got
amor
wat


In [41]:
processed[0] 

'go jurong point crazi avail bugi n great world la e buffet cine got amor wat'

In [45]:
#find features for all messages
messages = zip(processed, Y)

seed=1
np.random.seed = seed
np.random.shuffle(messages)

#call find_feature for each of these messages

#featuresets = [(find_features(text), label) for (text, label) in messages]

TypeError: object of type 'zip' has no len()