In [1]:
import numpy as np 
import pandas as pd

import nltk
nltk.download('stopwords')
nltk.download('punkt')
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.classify.scikitlearn import SklearnClassifier

from sklearn.preprocessing import LabelEncoder
from sklearn import model_selection
from sklearn.ensemble import VotingClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import SVC
from sklearn.metrics import classification_report, confusion_matrix

SEED = 1

[nltk_data] Downloading package stopwords to
[nltk_data]     /home/domagoj/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /home/domagoj/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [2]:
def find_features(message, word_features):
    words = word_tokenize(message)
    features = {}
    for word in word_features:
        features[word] = (word in words)
    return features

In [3]:
# import data
df = pd.read_table('./datasets/SMSSpamCollection.csv', header = None, encoding = 'utf-8')
df

Unnamed: 0,0,1
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."
...,...,...
5567,spam,This is the 2nd time we have tried 2 contact u...
5568,ham,Will ü b going to esplanade fr home?
5569,ham,"Pity, * was in mood for that. So...any other s..."
5570,ham,The guy did some bitching but I acted like i'd...


In [4]:
# class labels to binary values, ham=0, spam=1
encode = LabelEncoder()
y = encode.fit_transform(df[0])

# store the SMS message data
text_messages = df[1]

# regularni izrazi da zamijenimo email, brojeve, brojeve telefona, url, simbole sa rijecima
# regular expressions to swap email, numbers, phone numbers, urls and simbols
processed = text_messages.str.replace(r'^.+@[^\.].*\.[a-z]{2,}$', 'emailaddress')
processed = processed.str.replace(r'^http\://[a-zA-Z0-9\-\.]+\.[a-zA-Z]{2,3}(/\S*)?$', 'webaddress')
processed = processed.str.replace(r'£|\$', 'moneysymb')
processed = processed.str.replace(r'^\(?[\d]{3}\)?[\s-]?[\d]{3}[\s-]?[\d]{4}$', 'phonenumbr')
processed = processed.str.replace(r'\d+(\.\d+)?', 'numbr')

# remove punctuations
processed = processed.str.replace(r'[^\w\d\s]', ' ')

# combine multiple spaces to single space
processed = processed.str.replace(r'\s+', ' ')

# remove spaces before and after the word
processed = processed.str.replace(r'^\s+|\s+?$', '')

# lowercase
processed = processed.str.lower()

# remove stop-words
stop_words = set(stopwords.words('english'))
processed = processed.apply(lambda x: ' '.join(term for term in x.split() if term not in stop_words))

ps = nltk.PorterStemmer()
processed = processed.apply(lambda x: ' '.join(ps.stem(term) for term in x.split()))

all_words = []
for message in processed:
    words = word_tokenize(message)
    for word in words:
        all_words.append(word)
all_words = nltk.FreqDist(all_words)

#make features
word_features = list(i[0] for i in all_words.most_common(1500))

messages = list(zip(processed, y))

# shuffle messages
np.random.shuffle(messages)

# find_features for each SMS
feature_sets = [(find_features(text, word_features), label) for (text, label) in messages]

In [9]:
# split the data into training and testing subsets
train, test = model_selection.train_test_split(feature_sets, test_size = 0.25, random_state=SEED)

# Define models to train
names = ["Naive Bayes", "Decision Tree", "Random Forest", "Logistic Regression", "SGD Classifier", "SVM Linear"]

classifiers = [
    MultinomialNB(),
    DecisionTreeClassifier(),
    RandomForestClassifier(),
    LogisticRegression(),
    SGDClassifier(max_iter = 100),
    SVC(kernel = 'linear')
]

models = zip(names, classifiers)

# train each model, compute accuracy on test set
for name, model in models:
    nltk_model = SklearnClassifier(model)
    nltk_model.train(train)
    accuracy = nltk.classify.accuracy(nltk_model, test) * 100
    print("{} accuracy: {}".format(name, accuracy))

Naive Bayes accuracy: 97.70279971284997
Decision Tree accuracy: 95.76453697056712
Random Forest accuracy: 97.70279971284997
Logistic Regression accuracy: 97.77458722182341
SGD Classifier accuracy: 97.27207465900933
SVM Linear accuracy: 97.4156496769562


In [15]:
# Combine all models into an ensemble and then evaluate it

models = list(zip(names, classifiers))

nltk_ensemble = SklearnClassifier(VotingClassifier(estimators = models, voting = 'hard', n_jobs = -1))
nltk_ensemble.train(train)

accuracy = nltk.classify.accuracy(nltk_model, test) * 100
print("Voting Classifier accuracy: {}".format(accuracy))

# make class label prediction for test set
txt_features, labels = zip(*test)

prediction = nltk_ensemble.classify_many(txt_features)

# print a confusion matrix and a classification report
print(classification_report(labels, prediction))

pd.DataFrame(
    confusion_matrix(labels, prediction),
    index = [['actual', 'actual'], ['ham', 'spam']],
    columns = [['predicted', 'predicted'], ['ham', 'spam']])

Voting Classifier accuracy: 97.4156496769562
              precision    recall  f1-score   support

           0       0.98      1.00      0.99      1209
           1       0.99      0.84      0.91       184

    accuracy                           0.98      1393
   macro avg       0.98      0.92      0.95      1393
weighted avg       0.98      0.98      0.98      1393



Unnamed: 0_level_0,Unnamed: 1_level_0,predicted,predicted
Unnamed: 0_level_1,Unnamed: 1_level_1,ham,spam
actual,ham,1207,2
actual,spam,29,155
