import sys
import nltk
import sklearn
import pandas
import numpy

print('Python: {}'.format(sys.version))
print('NLTK: {}'.format(nltk.__version__))
print('Scikit-learn: {}'.format(sklearn.__version__))
print('Pandas: {}'.format(pandas.__version__))
print('Numpy: {}'.format(numpy.__version__))

In [None]:
import pandas as pd
import numpy as np

In [None]:
df = pd.read_table('SMSSPamCollection', header=None, encoding='utf-8')

In [None]:
print(df.info())
print(df.head())

In [None]:
classes = df[0]
print(classes.value_counts())

In [None]:
from sklearn.preprocessing import LabelEncoder

In [None]:
encoder = LabelEncoder()
Y = encoder.fit_transform(classes)

In [None]:
print(Y[:10])

In [None]:
text_messages = df[1]
print(text_messages[:10])

In [None]:
processed = text_messages.str.replace(r'^.+@[^\.].*\.[a-z]{2,}$',
                                 'emailaddress')

In [None]:
print(processed[:10])

In [None]:
processed = processed.str.replace(r'^http\://[a-zA-Z0-9\-\.]+\.[a-zA-Z]{2,3}(/\S*)?$',
                                  'webaddress')

In [None]:
processed = processed.str.replace(r'£|\$', 'moneysymb')

In [None]:
processed = processed.str.replace(r'^\(?[\d]{3}\)?[\s-]?[\d]{3}[\s-]?[\d]{4}$',
                                  'phonenumbr')

In [None]:
processed = processed.str.replace(r'\d+(\.\d+)?', 'numbr')

In [None]:
processed = processed.str.replace(r'[^\w\d\s]', ' ')



In [None]:
processed = processed.str.replace(r'\s+', ' ')

In [None]:
processed = processed.str.replace(r'^\s+|\s+?$', '')

In [None]:
processed = processed.str.lower()
print(processed)

In [None]:
from nltk.corpus import stopwords

In [None]:
stop_words = set(stopwords.words('english'))

In [None]:
from nltk.corpus import stopwords


stop_words = set(stopwords.words('english'))

processed = processed.apply(lambda x: ' '.join(term for term in x.split() if term not in stop_words))

In [None]:
import nltk
 nltk.download('stopwords')

In [None]:
nltk.download('stopwords')

In [None]:
stop_words = set(stopwords.words('english'))

In [None]:
processed = processed.apply(lambda x: ' '.join(
    term for term in x.split() if term not in stop_words))

In [None]:
ps = nltk.PorterStemmer()

processed = processed.apply(lambda x: ' '.join(
    ps.stem(term) for term in x.split()))

In [None]:
print (processed)

In [None]:
from nltk.tokenize import word_tokenize

# create bag-of-words
all_words = []

for message in processed:
    words = word_tokenize(message)
    for w in words:
        all_words.append(w)
        
all_words = nltk.FreqDist(all_words)
print(all_words)
nltk.download(punkt)

In [None]:
nltk.download('punkt')

In [None]:
all_words = []

for message in processed:
    words = word_tokenize(message)
    for w in words:
        all_words.append(w)
        
all_words = nltk.FreqDist(all_words)
print(all_words)

In [None]:
print('Number of words: {}'.format(len(all_words)))
print('Most common words: {}'.format(all_words.most_common(20)))

In [None]:
word_features = list(all_words.keys())[:1500]

In [None]:
print(word_features)

In [None]:
def find_features(message):
    words = word_tokenize(message)
    features = {}
    for word in word_features:
        features[word] = (word in words)

    return features

In [None]:
features = find_features(processed[1])
for key, value in features.items():
    if value == True:
        print (key)

In [None]:
messages = list(zip(processed, Y))

In [None]:
seed = 1
np.random.seed = seed
np.random.shuffle(messages)

In [None]:
featuresets = [(find_features(text), label) for (text, label) in messages]


In [None]:
from sklearn import model_selection

training, testing = model_selection.train_test_split(featuresets, test_size = 0.25, random_state=seed)

In [None]:

print(len(training))
print(len(testing))

In [None]:
from nltk.classify.scikitlearn import SklearnClassifier
from sklearn.svm import SVC

model = SklearnClassifier(SVC(kernel = 'linear'))

model.train(training)

accuracy = nltk.classify.accuracy(model, testing)*100
print("SVC Accuracy: {}".format(accuracy))

In [None]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import SVC
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix

In [None]:
names = ["K Nearest Neighbors", "Decision Tree", "Random Forest", "Logistic Regression", "SGD Classifier",
         "Naive Bayes", "SVM Linear"]

classifiers = [
    KNeighborsClassifier(),
    DecisionTreeClassifier(),
    RandomForestClassifier(),
    LogisticRegression(),
    SGDClassifier(max_iter = 100),
    MultinomialNB(),
    SVC(kernel = 'linear')
]

models = zip(names, classifiers)

for name, model in models:
    nltk_model = SklearnClassifier(model)
    nltk_model.train(training)
    accuracy = nltk.classify.accuracy(nltk_model, testing)*100
    print("{} Accuracy: {}".format(name, accuracy))

In [None]:
from sklearn.ensemble import VotingClassifier

names = ["K Nearest Neighbors", "Decision Tree", "Random Forest", "Logistic Regression", "SGD Classifier",
         "Naive Bayes", "SVM Linear"]

classifiers = [
    KNeighborsClassifier(),
    DecisionTreeClassifier(),
    RandomForestClassifier(),
    LogisticRegression(),
    SGDClassifier(max_iter = 100),
    MultinomialNB(),
    SVC(kernel = 'linear')
]

models = list(zip(names, classifiers))

nltk_ensemble = SklearnClassifier(VotingClassifier(estimators = models, voting = 'hard', n_jobs = -1))
nltk_ensemble.train(training)
accuracy = nltk.classify.accuracy(nltk_model, testing)*100
print("Voting Classifier: Accuracy: {}".format(accuracy))

In [None]:
txt_features, labels = zip(*testing)

prediction = nltk_ensemble.classify_many(txt_features)

In [68]:
print(classification_report(labels, prediction))

pd.DataFrame(
    confusion_matrix(labels, prediction),
    index = [['actual', 'actual'], ['ham', 'spam']],
    columns = [['predicted', 'predicted'], ['ham', 'spam']])

             precision    recall  f1-score   support

          0       0.99      1.00      0.99      1208
          1       1.00      0.91      0.95       185

avg / total       0.99      0.99      0.99      1393



Unnamed: 0_level_0,Unnamed: 1_level_0,predicted,predicted
Unnamed: 0_level_1,Unnamed: 1_level_1,ham,spam
actual,ham,1208,0
actual,spam,17,168
