#Problem Statement:Predicting the Given Message is Spam Or Ham And To Test Several Slassifiers On The Data Set With Different Features


In [1]:
import sys 
import nltk 
import sklearn 
import pandas as pd
import numpy as np


1. loading the data set "spam.csv"

In [2]:
data = pd.read_csv("spam.csv",header=None,encoding = 'latin-1') 
data.info() 
data.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5572 entries, 0 to 5571
Data columns (total 5 columns):
0    5572 non-null object
1    5572 non-null object
2    50 non-null object
3    12 non-null object
4    6 non-null object
dtypes: object(5)
memory usage: 217.7+ KB


Unnamed: 0,0,1,2,3,4
0,ham,"Go until jurong point, crazy.. Available only ...",,,
1,ham,Ok lar... Joking wif u oni...,,,
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,,,
3,ham,U dun say so early hor... U c already then say...,,,
4,ham,"Nah I don't think he goes to usf, he lives aro...",,,


In [3]:
#Selecting The Only The Required Coloumns or Removing The Unnecessary Columns
data = data.iloc[0:,:2]
data = data.rename(columns={0:"class", 1:"text"})
data.head()


Unnamed: 0,class,text
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [4]:
#Class Distribution 
classes = data['class'] 
print(classes.value_counts())

ham     4825
spam     747
Name: class, dtype: int64


In [5]:
#Data describes 
data.groupby('class').describe()

Unnamed: 0_level_0,text,text,text,text
Unnamed: 0_level_1,count,unique,top,freq
class,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
ham,4825,4516,"Sorry, I'll call later",30
spam,747,653,Please call our customer service representativ...,4


2.Preprocess the Data

In [6]:
#Preprocessing the data is an essential step in natural language process. 
#converting class labels to binary values using the LabelEncoder from sklearn 
from sklearn.preprocessing import LabelEncoder 

#class labels to binary values, 0 = ham and 1 = spam
encoder = LabelEncoder()
Y = encoder.fit_transform(classes)

print(Y[:10])

[0 0 1 0 0 1 0 0 1 1]


In [7]:
# store the message data
text_messages = data['text']
print(text_messages[:10])


0    Go until jurong point, crazy.. Available only ...
1                        Ok lar... Joking wif u oni...
2    Free entry in 2 a wkly comp to win FA Cup fina...
3    U dun say so early hor... U c already then say...
4    Nah I don't think he goes to usf, he lives aro...
5    FreeMsg Hey there darling it's been 3 week's n...
6    Even my brother is not like to speak with me. ...
7    As per your request 'Melle Melle (Oru Minnamin...
8    WINNER!! As a valued network customer you have...
9    Had your mobile 11 months or more? U R entitle...
Name: text, dtype: object


In [8]:
# Replace email addresses with 'email'
processed = text_messages.str.replace(r'^.+@[^\.].*\.[a-z]{2,}$',
                                 'emailaddress')

# Replace URLs with 'webaddress'
processed = processed.str.replace(r'^http\://[a-zA-Z0-9\-\.]+\.[a-zA-Z]{2,3}(/\S*)?$',
                                  'webaddress')

# Replace money symbols with 'moneysymb' (£ can by typed with ALT key + 156)
processed = processed.str.replace(r'£|\$', 'moneysymb')
    
# Replace 10 digit phone numbers (formats include paranthesis, spaces, no spaces, dashes) with 'phonenumber'
processed = processed.str.replace(r'^\(?[\d]{3}\)?[\s-]?[\d]{3}[\s-]?[\d]{4}$',
                                  'phonenumbr')
    
# Replace numbers with 'numbr'
processed = processed.str.replace(r'\d+(\.\d+)?', 'numbr')

# Remove punctuation
processed = processed.str.replace(r'[^\w\d\s]', ' ')

# Replace whitespace between terms with a single space
processed = processed.str.replace(r'\s+', ' ')

# Remove leading and trailing whitespace
processed = processed.str.replace(r'^\s+|\s+?$', '')

# change words to lower case - Hello, HELLO, hello are all the same word
processed = processed.str.lower()

print(processed)


0       go until jurong point crazy available only in ...
1                                 ok lar joking wif u oni
2       free entry in numbr a wkly comp to win fa cup ...
3             u dun say so early hor u c already then say
4       nah i don t think he goes to usf he lives arou...
5       freemsg hey there darling it s been numbr week...
6       even my brother is not like to speak with me t...
7       as per your request melle melle oru minnaminun...
8       winner as a valued network customer you have b...
9       had your mobile numbr months or more u r entit...
10      i m gonna be home soon and i don t want to tal...
11      six chances to win cash from numbr to numbr nu...
12      urgent you have won a numbr week free membersh...
13      i ve been searching for the right words to tha...
14                      i have a date on sunday with will
15      xxxmobilemovieclub to use your credit click th...
16                                 oh k i m watching here
17      eh u r

In [9]:
# removing stop words from text messages 
import nltk 
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\USER\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [10]:
from nltk.corpus import stopwords

stop_words = set(stopwords.words('english'))

processed = processed.apply(lambda x: ' '.join(
    term for term in x.split() if term not in stop_words))

In [11]:
#Removing word stems using a Porter stemmer 
ps = nltk.PorterStemmer() 

processed = processed.apply(lambda x:' '.join(ps.stem(term) for term in x.split()))

3.Generating Features 

In [12]:
from nltk.tokenize import word_tokenize

# create bag-of-words
words_bag = []

for message in processed:
    words = word_tokenize(message)
    for w in words:
        words_bag.append(w)
        
words_bag = nltk.FreqDist(words_bag)


In [13]:
print("Number Of Words In A Bag:",len(words_bag)) 
print("Most common words:",words_bag.most_common(15))

Number Of Words In A Bag: 6538
Most common words: [('numbr', 2628), ('u', 1192), ('call', 672), ('go', 453), ('get', 451), ('ur', 385), ('gt', 318), ('lt', 316), ('come', 301), ('ok', 292), ('åmoneysymbnumbr', 288), ('free', 284), ('know', 274), ('day', 273), ('love', 260)]


In [14]:
#Using the 1500 most common words as features 
word_features = list(words_bag.keys())[:1500] 

In [15]:
# The find_features function will determine which of the 1500 word features are contained in the review 
def find_features(message):
    
    words = word_tokenize(message) 
    features = {} 
    for word in word_features: 
        features[word] = (word in words) 
        
    return features

#for an Example  

features = find_features(processed[0]) 
for key,value in features.items():
    if value == True:
        print(key)


go
jurong
point
crazi
avail
bugi
n
great
world
la
e
buffet
cine
got
amor
wat


In [16]:
# for all the messages
messages = list(zip(processed, Y))

# seed for reproducibility
seed = 1
np.random.seed = seed
np.random.shuffle(messages)

# call find_features function for each SMS message
featuresets = [(find_features(text), label) for (text, label) in messages]

In [17]:
# spliting the featuresets into training and testing datasets using sklearn
from sklearn import model_selection

# splitting the data into training and testing datasets
training, testing = model_selection.train_test_split(featuresets, test_size = 0.25, random_state=seed)

In [18]:
print(len(training))
print(len(testing))

4179
1393


4. Scikit-Learn Classifiers with NLTK

In [19]:
# We can use sklearn algorithms in NLTK
from nltk.classify.scikitlearn import SklearnClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix

model = SklearnClassifier(MultinomialNB()) 


# training the model on the training data
model.train(training)

# testing on the testing dataset!
accuracy = nltk.classify.accuracy(model, testing)*100
print("Naive Bayes Accuracy: {}".format(accuracy))


Naive Bayes Accuracy: 97.91816223977028


In [20]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import SVC
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix

# Define models to train
names = ["K Nearest Neighbors", "Decision Tree", "Random Forest", "Logistic Regression", "SGD Classifier",
         "Naive Bayes", "SVM Linear"]

classifiers = [
    KNeighborsClassifier(),
    DecisionTreeClassifier(),
    RandomForestClassifier(),
    LogisticRegression(),
    SGDClassifier(max_iter = 100),
    MultinomialNB(),
    SVC(kernel = 'linear')
]

models = zip(names, classifiers)

for name, model in models:
    nltk_model = SklearnClassifier(model)
    nltk_model.train(training)
    accuracy = nltk.classify.accuracy(nltk_model, testing)*100
    print("{} Accuracy: {}".format(name, accuracy))


K Nearest Neighbors Accuracy: 93.3237616654702
Decision Tree Accuracy: 97.77458722182341




Random Forest Accuracy: 97.98994974874373




Logistic Regression Accuracy: 98.20531227566404




SGD Classifier Accuracy: 97.98994974874373
Naive Bayes Accuracy: 97.91816223977028
SVM Linear Accuracy: 98.49246231155779


In [21]:
# Ensemble methods - Voting classifier
from sklearn.ensemble import VotingClassifier

names = ["K Nearest Neighbors", "Decision Tree", "Random Forest", "Logistic Regression", "SGD Classifier",
         "Naive Bayes", "SVM Linear"]

classifiers = [
    KNeighborsClassifier(),
    DecisionTreeClassifier(),
    RandomForestClassifier(),
    LogisticRegression(),
    SGDClassifier(max_iter = 100),
    MultinomialNB(),
    SVC(kernel = 'linear')
]

models = list(zip(names, classifiers))

nltk_ensemble = SklearnClassifier(VotingClassifier(estimators = models, voting = 'hard', n_jobs = -1))
nltk_ensemble.train(training)
accuracy = nltk.classify.accuracy(nltk_model, testing)*100
print("Voting Classifier: Accuracy: {}".format(accuracy))


Voting Classifier: Accuracy: 98.49246231155779


In [22]:
# make class label prediction for testing set
txt_features, labels = zip(*testing)
prediction = nltk_ensemble.classify_many(txt_features)

In [23]:
# print a confusion matrix and a classification report
print(classification_report(labels, prediction))

pd.DataFrame(
    confusion_matrix(labels, prediction),
    index = [['actual', 'actual'], ['ham', 'spam']],
    columns = [['predicted', 'predicted'], ['ham', 'spam']])


              precision    recall  f1-score   support

           0       0.98      1.00      0.99      1188
           1       0.99      0.89      0.94       205

   micro avg       0.98      0.98      0.98      1393
   macro avg       0.99      0.94      0.96      1393
weighted avg       0.98      0.98      0.98      1393



Unnamed: 0_level_0,Unnamed: 1_level_0,predicted,predicted
Unnamed: 0_level_1,Unnamed: 1_level_1,ham,spam
actual,ham,1187,1
actual,spam,23,182
