## ------- Import necessary libraries ----------

In [4]:
import sys
import nltk
import sklearn
import pandas as pd
import numpy as np

### ------------- Load Dataset ---------------

In [6]:
Dataset = pd.read_table("smsspamcollection/SMSSpamCollection", header= None, encoding='utf-8')

In [7]:
Dataset.head()

Unnamed: 0,0,1
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [10]:
Dataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5572 entries, 0 to 5571
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   0       5572 non-null   object
 1   1       5572 non-null   object
dtypes: object(2)
memory usage: 87.2+ KB


In [11]:
## Class distribution i.e. number of spam and ham
Dataset[0].value_counts()

ham     4825
spam     747
Name: 0, dtype: int64

### Observation:
- Clear unbalanced dataset.


## --------- Preprocessing Data ----------------

In [15]:
# convert class value as binary value i.e. 0 to ham and 1 to spam
# using sklearn encoding
from sklearn.preprocessing import LabelEncoder
encoder = LabelEncoder()
Classes = encoder.fit_transform(Dataset[0])

In [18]:
print(Classes)
print(Dataset[0])

[0 0 1 ... 0 0 0]
0        ham
1        ham
2       spam
3        ham
4        ham
        ... 
5567    spam
5568     ham
5569     ham
5570     ham
5571     ham
Name: 0, Length: 5572, dtype: object


In [21]:
## store SMS messages
Text_Messages = Dataset[1]
print(Text_Messages[:10])

0    Go until jurong point, crazy.. Available only ...
1                        Ok lar... Joking wif u oni...
2    Free entry in 2 a wkly comp to win FA Cup fina...
3    U dun say so early hor... U c already then say...
4    Nah I don't think he goes to usf, he lives aro...
5    FreeMsg Hey there darling it's been 3 week's n...
6    Even my brother is not like to speak with me. ...
7    As per your request 'Melle Melle (Oru Minnamin...
8    WINNER!! As a valued network customer you have...
9    Had your mobile 11 months or more? U R entitle...
Name: 1, dtype: object


## --------- Perform regular expression ----------
- Help to remove clutter.
- Eg. email address with email , url as website, phone numbers, currency sign etc.
- https://regexlib.com/Search.aspx?k=numbers&c=-1&m=-1&ps=20

In [28]:
## Replace email address as 'emailaddress'
Text_Messages_Processed = Text_Messages.str.replace('^\w+@[a-zA-Z_]+?\.[a-zA-Z]{2,3}$','emailaddress')

## Replace web address as 'weburls'
Text_Messages_Processed = Text_Messages_Processed.str.replace('^#?([a-f]|[A-F]|[0-9]){3}(([a-f]|[A-F]|[0-9]){3})?$','weburls')

## replace currency symbol with 'CurrencySymbol'
Text_Messages_Processed = Text_Messages_Processed.str.replace('£|\$','CurrencySymbol')

## replace 10 digit phone_number
Text_Messages_Processed = Text_Messages_Processed.str.replace('^([\+][0-9]{1,3}([ \.\-])?)?([\(]{1}[0-9]{3}[\)])?([0-9A-Z \.\-]{1,32})((x|ext|extension)?[0-9]{1,4}?)$','phone_number')

## Normal numbers with number

Text_Messages_Processed = Text_Messages_Processed.str.replace('\d+(\.\d+)?','number')


## Remove punctuation with space
Text_Messages_Processed = Text_Messages_Processed.str.replace('[^\w\d\s]',' ')

## replace white space between terms with single space
Text_Messages_Processed = Text_Messages_Processed.str.replace('\s+',' ')

## Remove leading and trailing whitespaces
Text_Messages_Processed = Text_Messages_Processed.str.replace('^\s+|\s+?$','')




In [29]:
Text_Messages_Processed

0       Go until jurong point crazy Available only in ...
1                                 Ok lar Joking wif u oni
2       Free entry in number a wkly comp to win FA Cup...
3             U dun say so early hor U c already then say
4       Nah I don t think he goes to usf he lives arou...
                              ...                        
5567    This is the numbernd time we have tried number...
5568                  Will ü b going to esplanade fr home
5569    Pity was in mood for that So any other suggest...
5570    The guy did some bitching but I acted like i d...
5571                            Rofl Its true to its name
Name: 1, Length: 5572, dtype: object

In [31]:
## change all characters in text message in lower case
Text_Messages_Processed = Text_Messages_Processed.str.lower()
Text_Messages_Processed



0       go until jurong point crazy available only in ...
1                                 ok lar joking wif u oni
2       free entry in number a wkly comp to win fa cup...
3             u dun say so early hor u c already then say
4       nah i don t think he goes to usf he lives arou...
                              ...                        
5567    this is the numbernd time we have tried number...
5568                  will ü b going to esplanade fr home
5569    pity was in mood for that so any other suggest...
5570    the guy did some bitching but i acted like i d...
5571                            rofl its true to its name
Name: 1, Length: 5572, dtype: object

In [33]:
## Now remove stop words from text messages
from nltk.corpus import stopwords
stop_words = set(stopwords.words('english'))

Text_Messages_Processed = Text_Messages_Processed.apply(lambda x: ' '.join(
term for term in x.split() if term not in stop_words))



In [37]:
## Extract stem i.e ing after words
pstemmer = nltk.PorterStemmer()

Text_Messages_Processed = Text_Messages_Processed.apply(lambda x: ' '.join(
pstemmer.stem(term) for term in x.split()))

In [39]:
Text_Messages_Processed

0       go jurong point crazi avail bugi n great world...
1                                   ok lar joke wif u oni
2       free entri number wkli comp win fa cup final t...
3                     u dun say earli hor u c alreadi say
4                    nah think goe usf live around though
                              ...                        
5567    numbernd time tri number contact u u currencys...
5568                              ü b go esplanad fr home
5569                                    piti mood suggest
5570    guy bitch act like interest buy someth els nex...
5571                                       rofl true name
Name: 1, Length: 5572, dtype: object

## ----------- Feature Generation ------------

In [43]:
from nltk.tokenize import word_tokenize

all_words = []

for message in Text_Messages_Processed:
    words = word_tokenize(message)
    for w in words:
        all_words.append(w)

Text_words = nltk.FreqDist(all_words)
Text_words

FreqDist({'number': 2758, 'u': 1207, 'call': 674, 'go': 456, 'get': 452, 'ur': 391, 'gt': 318, 'lt': 316, 'come': 304, 'currencysymbolnumb': 303, ...})

In [50]:
# use the 1500 most common words as features
word_features = list(all_words.keys())[:1500]

AttributeError: 'list' object has no attribute 'keys'

In [49]:

def find_features(text):
    words = word_tokenize(text)
    features = {}
    for word in word_features:
        features[word] = (word in words)
    return features

# Lets see an example!
features = find_features(Text_Messages_Processed[0])
for key, value in features.items():
    if value == True:
        print(key)

NameError: name 'word_features' is not defined