In [61]:
import sys
import nltk
import sklearn
import pandas as pd
import numpy as np

## load dataset

In [62]:
df=pd.read_csv('spam.csv')

In [63]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5572 entries, 0 to 5571
Data columns (total 5 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   v1          5572 non-null   object
 1   v2          5572 non-null   object
 2   Unnamed: 2  50 non-null     object
 3   Unnamed: 3  12 non-null     object
 4   Unnamed: 4  6 non-null      object
dtypes: object(5)
memory usage: 217.8+ KB


In [64]:
df.head()

Unnamed: 0,v1,v2,Unnamed: 2,Unnamed: 3,Unnamed: 4
0,ham,"Go until jurong point, crazy.. Available only ...",,,
1,ham,Ok lar... Joking wif u oni...,,,
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,,,
3,ham,U dun say so early hor... U c already then say...,,,
4,ham,"Nah I don't think he goes to usf, he lives aro...",,,


In [65]:
df.drop(df[['Unnamed: 2','Unnamed: 3','Unnamed: 4']], axis=1, inplace=True)

In [66]:
df.head()

Unnamed: 0,v1,v2
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [67]:
classes=df['v1']
classes.value_counts()

ham     4825
spam     747
Name: v1, dtype: int64

## 2.preprocess the data

In [68]:
# converting ham=0, spam=1
from sklearn.preprocessing import LabelEncoder
encoder=LabelEncoder()
y=encoder.fit_transform(classes)

In [69]:
y[:10]

array([0, 0, 1, 0, 0, 1, 0, 0, 1, 1])

In [70]:
#store the sms data
text_messages=df['v2']
print(text_messages[:10])

0    Go until jurong point, crazy.. Available only ...
1                        Ok lar... Joking wif u oni...
2    Free entry in 2 a wkly comp to win FA Cup fina...
3    U dun say so early hor... U c already then say...
4    Nah I don't think he goes to usf, he lives aro...
5    FreeMsg Hey there darling it's been 3 week's n...
6    Even my brother is not like to speak with me. ...
7    As per your request 'Melle Melle (Oru Minnamin...
8    WINNER!! As a valued network customer you have...
9    Had your mobile 11 months or more? U R entitle...
Name: v2, dtype: object


In [71]:
#we will use regular expressions to replace email, urls, phone numbers, other numbers, symbols
#replace email with 'emailaddr'
processed=text_messages.str.replace(r'^.+@[^\.].*\.[a-z]{2,}$','emailaddr')

#replace url with 'webaddress'
processed=processed.str.replace(r'^http\://[a-zA-Z0-9\-\.]+\.[a-zA-Z]{2,3}(/\S*)?$','webaddress')
#replace money symbols with 'moneysymb'
processed=processed.str.replace(r'£|\$','moneysymb')
#replace 10 digit phone numbers with 'phonenumber'
processed=processed.str.replace(r'^\(?[\d]{3}\)?[\s-]?[\d]{3}[\s-]?[\d]{4}$','phonenumber')
# replace number with 'number'
processed=processed.str.replace(r'\d+(\.\d+)?','number')

  processed=text_messages.str.replace(r'^.+@[^\.].*\.[a-z]{2,}$','emailaddr')
  processed=processed.str.replace(r'^http\://[a-zA-Z0-9\-\.]+\.[a-zA-Z]{2,3}(/\S*)?$','webaddress')
  processed=processed.str.replace(r'£|\$','moneysymb')
  processed=processed.str.replace(r'^\(?[\d]{3}\)?[\s-]?[\d]{3}[\s-]?[\d]{4}$','phonenumber')
  processed=processed.str.replace(r'\d+(\.\d+)?','number')


In [72]:
#remove punctuation
processed=processed.str.replace(r'[^\w\d\s]',' ')
#replace whitespace between terms with single space
processed=processed.str.replace(r'\s+',' ')
#remove leading and trailing with whitespace
processed=processed.str.replace(r'\s+|\s+?$',' ')

  processed=processed.str.replace(r'[^\w\d\s]',' ')
  processed=processed.str.replace(r'\s+',' ')
  processed=processed.str.replace(r'\s+|\s+?$',' ')


In [73]:
#change str to lower
processed=processed.str.lower()

print(processed)


0       go until jurong point crazy available only in ...
1                                ok lar joking wif u oni 
2       free entry in number a wkly comp to win fa cup...
3            u dun say so early hor u c already then say 
4       nah i don t think he goes to usf he lives arou...
                              ...                        
5567    this is the numbernd time we have tried number...
5568                 will _ b going to esplanade fr home 
5569    pity was in mood for that so any other suggest...
5570    the guy did some bitching but i acted like i d...
5571                            rofl its true to its name
Name: v2, Length: 5572, dtype: object


In [74]:
#remove stop words from text messages
from nltk.corpus import stopwords
stop_words=set(stopwords.words('english'))
processed=processed.apply(lambda x: ' '.join(term for term in x.split() if term not in stop_words))

In [75]:
#remove word stemming using Porter stemming
ps=nltk.PorterStemmer()
processed=processed.apply(lambda x: ' '.join(ps.stem(term) for term in x.split()))

In [76]:
processed

0       go jurong point crazi avail bugi n great world...
1                                   ok lar joke wif u oni
2       free entri number wkli comp win fa cup final t...
3                     u dun say earli hor u c alreadi say
4                    nah think goe usf live around though
                              ...                        
5567    numbernd time tri number contact u u number po...
5568                              _ b go esplanad fr home
5569                                    piti mood suggest
5570    guy bitch act like interest buy someth els nex...
5571                                       rofl true name
Name: v2, Length: 5572, dtype: object

In [77]:
from nltk.tokenize import word_tokenize
all_words=[]
for message in processed:
    words=word_tokenize(message)
    for w in words:
        all_words.append(w)
all_words=nltk.FreqDist(all_words)        
        

In [78]:
#print the total words and 15 most common words
print('number of words:{}'.format(len(all_words)))
print('most common words:{}'.format(all_words.most_common(15)))

number of words:6477
most common words:[('number', 3052), ('u', 1192), ('call', 677), ('go', 453), ('get', 451), ('ur', 385), ('gt', 318), ('lt', 316), ('come', 301), ('ok', 292), ('free', 284), ('day', 275), ('know', 274), ('love', 260), ('like', 259)]


In [79]:
#use 1500 most common words as features
word_features=list(all_words.keys())[:1500]

In [80]:
#define a find_features function
def find_features(message):
    words=word_tokenize(message)
    features={}
    for word in word_features:
        features[word]=(word in words)
    return features
#lets see an example
features=find_features(processed[0])
for key, value in features.items():
    if value == True:
        print(key)
        

go
jurong
point
crazi
avail
bugi
n
great
world
la
e
buffet
cine
got
amor
wat


In [81]:
#find features for all labels
messages=list(zip(processed, y))
seed=1
np.random.seed=seed
np.random.shuffle(messages)
#call find_features for each message
featuresets=[(find_features(text),label) for (text, label ) in messages]

In [82]:
from sklearn.model_selection import train_test_split
training, testing=train_test_split(featuresets, test_size=0.25, random_state=seed)

In [83]:
print(len(training))
print(len(testing))

4179
1393


## Scikit-learn classifiers with NLTK

In [84]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

In [85]:
#define models to train
names=['K neares neighbors', 'Decision tree', 'random forest','logistic regression','SGD classifier','naive bayes','svm linear']
classifier=[
   KNeighborsClassifier(),
   DecisionTreeClassifier(),
   RandomForestClassifier(),
   LogisticRegression(), 
   SGDClassifier(max_iter=100) ,
   MultinomialNB(),
   SVC(kernel ='linear') 
    
]
models=zip(names,classifier)

In [86]:
print(models)

<zip object at 0x000001C162DA6880>


In [87]:
#wrap models in nltk
from nltk.classify.scikitlearn import SklearnClassifier
for name, model in models:
    nltk_model=SklearnClassifier(model)
    nltk_model.train(training)
    accuracy=nltk.classify.accuracy(nltk_model, testing)*100
    print('{} accuracy:{}'.format(name,accuracy))

K neares neighbors accuracy:93.96984924623115
Decision tree accuracy:97.20028715003589
random forest accuracy:98.56424982053123
logistic regression accuracy:98.63603732950466
SGD classifier accuracy:97.77458722182341
naive bayes accuracy:97.5592246949031
svm linear accuracy:98.06173725771716


In [89]:
#ensemble method -voting classifier
from sklearn.ensemble import VotingClassifier
#define models to train
names=['K neares neighbors', 'Decision tree', 'random forest','logistic regression','SGD classifier','naive bayes','svm linear']
classifier=[
   KNeighborsClassifier(),
   DecisionTreeClassifier(),
   RandomForestClassifier(),
   LogisticRegression(), 
   SGDClassifier(max_iter=100) ,
   MultinomialNB(),
   SVC(kernel ='linear') 
    
]
models=list(zip(names,classifier))
nltk_ensemble=SklearnClassifier(VotingClassifier(estimators=models, voting='hard', n_jobs=-1))
nltk_ensemble.train(training)
accuracy=nltk.classify.accuracy(nltk_ensemble, testing)*100
print(' accuracy:{}'.format(accuracy))

 accuracy:98.49246231155779


In [90]:
#make class label prediction for testing set
txt_features, labels=zip(*testing)
prediction=nltk_ensemble.classify_many(txt_features)

In [92]:
#print classification report and confusion matrix
print(classification_report(labels, prediction))
pd.DataFrame(
    confusion_matrix(labels, prediction),
    index=[['actual','actual'],['ham','spam']],
    columns=[['predicted','predicted'],['ham','spam']]
)

              precision    recall  f1-score   support

           0       0.98      1.00      0.99      1204
           1       0.99      0.89      0.94       189

    accuracy                           0.98      1393
   macro avg       0.99      0.95      0.97      1393
weighted avg       0.99      0.98      0.98      1393



Unnamed: 0_level_0,Unnamed: 1_level_0,predicted,predicted
Unnamed: 0_level_1,Unnamed: 1_level_1,ham,spam
actual,ham,1203,1
actual,spam,20,169
