<font size=5>SMS Text classification</font>

<h1>Table of Contents<span class="tocSkip"></span></h1>
<div class="toc"><ul class="toc-item"><li><span><a href="#Data-Import" data-toc-modified-id="Data-Import-1">Data Import</a></span><ul class="toc-item"><li><span><a href="#Define-some-functions" data-toc-modified-id="Define-some-functions-1.1">Define some functions</a></span></li></ul></li><li><span><a href="#Preprocessing" data-toc-modified-id="Preprocessing-2">Preprocessing</a></span></li><li><span><a href="#Feature-Extraction" data-toc-modified-id="Feature-Extraction-3">Feature Extraction</a></span><ul class="toc-item"><li><span><a href="#Word-Count" data-toc-modified-id="Word-Count-3.1">Word Count</a></span></li><li><span><a href="#Tf-Idf" data-toc-modified-id="Tf-Idf-3.2">Tf-Idf</a></span></li><li><span><a href="#N-gram" data-toc-modified-id="N-gram-3.3">N-gram</a></span></li></ul></li><li><span><a href="#Text-Classification" data-toc-modified-id="Text-Classification-4">Text Classification</a></span><ul class="toc-item"><li><span><a href="#Naive-Bayes" data-toc-modified-id="Naive-Bayes-4.1"><a href="https://en.wikipedia.org/wiki/Naive_Bayes_spam_filtering" target="_blank">Naive Bayes</a></a></span></li><li><span><a href="#SVM" data-toc-modified-id="SVM-4.2">SVM</a></span></li><li><span><a href="#LogisticRegression" data-toc-modified-id="LogisticRegression-4.3">LogisticRegression</a></span></li><li><span><a href="#GBDT" data-toc-modified-id="GBDT-4.4">GBDT</a></span></li></ul></li><li><span><a href="#word2vec" data-toc-modified-id="word2vec-5">word2vec</a></span></li><li><span><a href="#Bert-TensorFlow" data-toc-modified-id="Bert-TensorFlow-6">Bert-TensorFlow</a></span></li></ul></div>

In [1]:
import warnings
warnings.filterwarnings('ignore')
import re
import itertools
import numpy as np 
import pandas as pd 
import matplotlib.pyplot as plt
import seaborn as sns
from nltk.stem import WordNetLemmatizer 
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.model_selection import train_test_split

from sklearn.linear_model import LogisticRegression
from gensim.models import Word2Vec
from sklearn.ensemble import GradientBoostingClassifier

from sklearn.metrics import confusion_matrix, accuracy_score
from sklearn.naive_bayes import GaussianNB, MultinomialNB, BernoulliNB
from sklearn import svm

In [2]:
%matplotlib inline
%config InlineBackend.figure_format = 'svg'
sns.set_style('white') 

# Data Import

In [3]:
data = pd.read_csv('../input/spam-text-message-classification/SPAM text message 20170820 - Data.csv')
#stopword_list = [k.strip() for k in open("E:/MaLearning/souhu/stopwords.txt", encoding='utf8').readlines() if k.strip() != '']
stopword_list = stopwords.words('english')

## Define some functions

In [4]:

def plot_confusion_matrix(cm, classes,
                          normalize=False,
                          title='Confusion matrix',
                          cmap=plt.cm.Blues):
    if normalize:
        cm = cm.astype('float') / cm.sum(axis=1)[:, np.newaxis]
        print("Normalized confusion matrix")
    else:
        print('Confusion matrix, without normalization')

    print(cm)

    plt.imshow(cm, interpolation='nearest', cmap=cmap)
    plt.title(title)
    plt.colorbar()
    tick_marks = np.arange(len(classes))
    plt.xticks(tick_marks, classes)
    plt.yticks(tick_marks, classes)

    fmt = '.2f' if normalize else 'd'
    thresh = cm.max() / 2.
    for i, j in itertools.product(range(cm.shape[0]), range(cm.shape[1])):
        plt.text(j, i, format(cm[i, j], fmt),
                 horizontalalignment="center",
                 color="white" if cm[i, j] > thresh else "black")

    plt.tight_layout()
    plt.ylabel('True label')
    plt.xlabel('Predicted label')

# Preprocessing

In [5]:
data.head()

Unnamed: 0,Category,Message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [6]:
data["Category"] = data["Category"].map({'ham': 0,'spam':1})

In [7]:
data.head()

Unnamed: 0,Category,Message
0,0,"Go until jurong point, crazy.. Available only ..."
1,0,Ok lar... Joking wif u oni...
2,1,Free entry in 2 a wkly comp to win FA Cup fina...
3,0,U dun say so early hor... U c already then say...
4,0,"Nah I don't think he goes to usf, he lives aro..."


#  Feature Extraction

There is several ways to extract features from text data, including word count method and tf-idf encoding. Now I will do both of them and compare their effect of predicting.

## Word Count

In [8]:
description_list = []
for article in data["Message"]:
    article = re.sub("[^a-zA-Z]"," ",article)
    article = article.lower()   # low case letter
    article = word_tokenize(article)
    lemma = WordNetLemmatizer()
    article = [ lemma.lemmatize(word) for word in article]
    article = " ".join(article)
    description_list.append(article) #we hide all word one section
    
    
def text_replace(text):
    '''some text cleaning method'''
    text = text.lower()
    text = re.sub('\[.*?\]', '', text)
    text = re.sub('https?://\S+|www\.\S+', '', text)
    text = re.sub('<.*?>+', '', text)
    text = re.sub('[%s]' % re.escape(string.punctuation), '', text)
    text = re.sub('\n', '', text)
    text = re.sub('\w*\d\w*', '', text)
    return text

<center>
    <img style="border-radius: 0.3125em;
    box-shadow: 0 2px 4px 0 rgba(34,36,38,.12),0 2px 10px 0 rgba(34,36,38,.08);" 
    src="https://i.loli.net/2019/11/18/kdH1gfSlezstUwL.png">
    <br>
    <div style="color:orange; border-bottom: 1px solid #d9d9d9;
    display: inline-block;
    color: #999;
    padding: 2px;">Word Count Vectorizer</div>
</center>

In [9]:
count_vectorizer = CountVectorizer(max_features = 100, stop_words = "english")
sparce_matrix = count_vectorizer.fit_transform(description_list).toarray()
tokens = count_vectorizer.get_feature_names()

In [10]:
print(type(sparce_matrix))
sparce_matrix = pd.DataFrame(sparce_matrix, columns=tokens)
sparce_matrix.head()

<class 'numpy.ndarray'>


Unnamed: 0,amp,ask,babe,care,cash,claim,com,come,contact,da,...,wat,way,week,win,won,work,www,yeah,year,yes
0,0,0,0,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


## Tf-Idf

 Term Frequency-Inverse Document Frequency

In [11]:
vectorizer = TfidfVectorizer(max_features = 100)
tfidfmatrix = vectorizer.fit_transform(description_list)
cname = vectorizer.get_feature_names()
tfidfmatrix = pd.DataFrame(tfidfmatrix.toarray(),columns=cname)
tfidfmatrix.head()

Unnamed: 0,about,all,am,and,any,are,at,back,be,but,...,wa,want,we,week,what,when,will,with,you,your
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [12]:
tfidfmatrix.columns

Index(['about', 'all', 'am', 'and', 'any', 'are', 'at', 'back', 'be', 'but',
       'by', 'call', 'can', 'come', 'da', 'day', 'do', 'don', 'dont', 'for',
       'free', 'from', 'get', 'go', 'going', 'good', 'got', 'gt', 'ha', 'have',
       'he', 'her', 'hi', 'home', 'how', 'if', 'in', 'is', 'it', 'just',
       'know', 'later', 'like', 'll', 'lor', 'love', 'lt', 'me', 'mobile',
       'my', 'need', 'new', 'no', 'not', 'now', 'of', 'ok', 'on', 'one',
       'only', 'or', 'our', 'out', 'phone', 'please', 'pls', 'reply', 'see',
       'send', 'she', 'so', 'sorry', 'still', 'stop', 'take', 'tell', 'text',
       'that', 'the', 'then', 'there', 'they', 'think', 'this', 'time', 'to',
       'today', 'txt', 'up', 'ur', 'wa', 'want', 'we', 'week', 'what', 'when',
       'will', 'with', 'you', 'your'],
      dtype='object')

## N-gram 

In [13]:
count_vectorizer = CountVectorizer(max_features = 100, stop_words = "english",ngram_range=(2, 2),)
sparce_matrix = count_vectorizer.fit_transform(description_list).toarray()
tokens = count_vectorizer.get_feature_names()
gram2 = pd.DataFrame(sparce_matrix, columns=tokens)
gram2.head()

Unnamed: 0,account statement,attempt contact,await collection,camcorder reply,camera phone,cash prize,chance win,claim code,claim ur,claim valid,...,urgent mobile,valid hr,wan na,want come,want new,wat doing,wat time,week just,won guaranteed,won prize
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


# Text Classification

## [Naive Bayes](https://en.wikipedia.org/wiki/Naive_Bayes_spam_filtering)

Naive Bayes gives us a baseline accuracy of predicting.

In [14]:

y = data.iloc[:,0].values   
x = sparce_matrix
tfidfx = tfidfmatrix

x_train, x_test, y_train, y_test = train_test_split(x,y, test_size = 0.3, random_state = 2019)
tf_x_train, tf_x_test, tf_y_train, tf_y_test = train_test_split(tfidfmatrix ,y,
                                                                test_size = 0.3,
                                                                random_state = 2019)

gm_x_train, gm_x_test, gm_y_train, gm_y_test = train_test_split(gram2 ,y,
                                                                test_size = 0.3,
                                                                random_state = 2019)

In [15]:
nb = GaussianNB()
nb.fit(x_train, y_train)
print('CountVectorizer Accuracy Score',nb.score(x_test,y_test))
nb.fit(tf_x_train, tf_y_train)
print('TF-IDF Vectorizer Accuracy Score',nb.score(tf_x_test,tf_y_test))
nb.fit(gm_x_train, gm_y_train)
print('bi-gram Vectorizer Accuracy Score',nb.score(gm_x_test,gm_y_test))

CountVectorizer Accuracy Score 0.9395933014354066
TF-IDF Vectorizer Accuracy Score 0.6495215311004785
bi-gram Vectorizer Accuracy Score 0.9395933014354066


In [16]:
nb = MultinomialNB()
nb.fit(x_train, y_train)
print('CountVectorizer Accuracy Score',nb.score(x_test,y_test))
nb.fit(tf_x_train, tf_y_train)
print('TF-IDF Vectorizer Accuracy Score',nb.score(tf_x_test,tf_y_test))
nb.fit(gm_x_train, gm_y_train)
print('bi-gram Vectorizer Accuracy Score',nb.score(gm_x_test,gm_y_test))

CountVectorizer Accuracy Score 0.9389952153110048
TF-IDF Vectorizer Accuracy Score 0.9449760765550239
bi-gram Vectorizer Accuracy Score 0.9389952153110048


In [17]:
nb = BernoulliNB()
nb.fit(x_train, y_train)
print('CountVectorizer Accuracy Score',nb.score(x_test,y_test))
nb.fit(tf_x_train, tf_y_train)
print('TF-IDF Vectorizer Accuracy Score',nb.score(tf_x_test,tf_y_test))
nb.fit(gm_x_train, gm_y_train)
print('bi-gram Vectorizer Accuracy Score',nb.score(gm_x_test,gm_y_test))

CountVectorizer Accuracy Score 0.9389952153110048
TF-IDF Vectorizer Accuracy Score 0.965311004784689
bi-gram Vectorizer Accuracy Score 0.9389952153110048


## SVM

In [18]:
%%time
svmmodel = svm.SVC(kernel='linear', C = 1)
svmmodel.fit(x_train, y_train)
print('CountVectorizer Accuracy Score',svmmodel.score(x_test,y_test))
svmmodel.fit(tf_x_train, tf_y_train)
print('TF-IDF Vectorizer Accuracy Score',svmmodel.score(tf_x_test,tf_y_test))
svmmodel.fit(gm_x_train, gm_y_train)
print('bi-gram Vectorizer Accuracy Score',svmmodel.score(gm_x_test,gm_y_test))

CountVectorizer Accuracy Score 0.937799043062201
TF-IDF Vectorizer Accuracy Score 0.9659090909090909
bi-gram Vectorizer Accuracy Score 0.937799043062201
CPU times: user 1.44 s, sys: 16.7 ms, total: 1.46 s
Wall time: 1.4 s


In [19]:
svmmodel = svm.SVC(kernel='linear', C = 1)
svmmodel.fit(tf_x_train, tf_y_train)
print('TF-IDF Vectorizer Accuracy Score',svmmodel.score(tf_x_test,tf_y_test))

TF-IDF Vectorizer Accuracy Score 0.9659090909090909


## LogisticRegression

In [20]:
%%time
logit = LogisticRegression(random_state=0, solver='lbfgs')
logit.fit(x_train, y_train)
print('CountVectorizer Accuracy Score',logit.score(x_test,y_test))
svmmodel.fit(tf_x_train, tf_y_train)
print('TF-IDF Vectorizer Accuracy Score',logit.score(tf_x_test,tf_y_test))
svmmodel.fit(gm_x_train, gm_y_train)
print('bi-gram Vectorizer Accuracy Score',logit.score(gm_x_test,gm_y_test))

CountVectorizer Accuracy Score 0.9264354066985646
TF-IDF Vectorizer Accuracy Score 0.6602870813397129
bi-gram Vectorizer Accuracy Score 0.9264354066985646
CPU times: user 782 ms, sys: 18.6 ms, total: 801 ms
Wall time: 729 ms


## GBDT

In [21]:
%%time
clf = GradientBoostingClassifier(n_estimators=50)
clf.fit(x_train, y_train)
print('CountVectorizer Accuracy Score',clf.score(x_test,y_test))
svmmodel.fit(tf_x_train, tf_y_train)
print('TF-IDF Vectorizer Accuracy Score',clf.score(tf_x_test,tf_y_test))
svmmodel.fit(gm_x_train, gm_y_train)
print('bi-gram Vectorizer Accuracy Score',clf.score(gm_x_test,gm_y_test))

CountVectorizer Accuracy Score 0.9043062200956937
TF-IDF Vectorizer Accuracy Score 0.7625598086124402
bi-gram Vectorizer Accuracy Score 0.9043062200956937
CPU times: user 1.05 s, sys: 25.7 ms, total: 1.08 s
Wall time: 1.04 s


# word2vec

In [22]:
description_list = []
for article in data["Message"]:
    article = re.sub("[^a-zA-Z]"," ",article)
    article = article.lower() 
    cutWords = [k for k in word_tokenize(article) if k not in stopword_list]
    cutWords = [ lemma.lemmatize(word) for word in cutWords]
    description_list.append(cutWords)
#description_list

In [23]:
def getVector_v2(cutWords, word2vec_model):
    vector_list = [word2vec_model[k] for k in cutWords if k in word2vec_model]
    vector_df = pd.DataFrame(vector_list)
    cutWord_vector = vector_df.mean(axis=0).values
    return cutWord_vector

word2vec_model = Word2Vec(description_list, size=100, iter=10, min_count=20)

In [24]:
vector_list = []
for c in description_list:
    vec = getVector_v2(c, word2vec_model)
    vector_list.append(vec)

In [25]:
X = pd.DataFrame(vector_list)
X.shape

(5572, 100)

In [26]:
Y = data["Category"]
Y = pd.DataFrame(Y)
Y.shape

(5572, 1)

In [27]:
X = X.fillna(X.mean())
Y = Y.dropna()

In [28]:
train_X, test_X, train_y, test_y = train_test_split(X, Y, test_size=0.3)
logistic_model = LogisticRegression()
logistic_model.fit(train_X, train_y)
y_predict = logistic_model.predict(test_X)

print('CountVectorizer Accuracy Score',accuracy_score(y_test, y_predict))
pd.DataFrame(confusion_matrix(y_test,y_predict))

CountVectorizer Accuracy Score 0.7936602870813397


Unnamed: 0,0,1
0,1292,163
1,182,35


In [29]:
clf = GradientBoostingClassifier(n_estimators=50)
gbdt = clf.fit(train_X, train_y)
y_predict = gbdt.predict(test_X)
print('CountVectorizer Accuracy Score',accuracy_score(y_test, y_predict))
pd.DataFrame(confusion_matrix(y_test,y_predict))

CountVectorizer Accuracy Score 0.7876794258373205


Unnamed: 0,0,1
0,1276,179
1,176,41


# Bert-TensorFlow

See this notebook: <https://www.kaggle.com/rikdifos/bert-test>