# Reading a text-based dataset into pandas

In [1]:
import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
import seaborn as sns
import plotly 
import plotly.graph_objects as go
import cufflinks as cf
from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot
init_notebook_mode(connected=True)
cf.go_offline()

In [2]:
# load the SMS spam CSV into a DataFrame and preview the first rows
# NOTE(review): decoded as latin-1 — presumably the file is not valid UTF-8; confirm
df = pd.read_csv('../input/sms-spam-collection-dataset/spam.csv', encoding='latin-1')
df.head()

Remove the `Unnamed: 2`, `Unnamed: 3` and `Unnamed: 4` columns because their entries are (almost) all null — the null counts below confirm this.

In [3]:
# count the missing (null) values in each column
df.isnull().sum()

In [4]:
# column names, dtypes and non-null counts
df.info()

In [5]:
# Drop the three unnamed filler columns.
# The result is assigned back (no inplace=True) and errors='ignore' is passed so
# the cell is idempotent: re-running it once the columns are gone no longer
# raises a KeyError.
df = df.drop(columns=['Unnamed: 2', 'Unnamed: 3', 'Unnamed: 4'], errors='ignore')

In [6]:
# summarise the message column: count, number of unique messages,
# and the most frequent message with its frequency (top / freq)
df['v2'].describe()

In [7]:
# number of messages per label (ham vs. spam)
df['v1'].value_counts()

In [8]:
# numeric copy of the label in a new column: ham -> 0, spam -> 1
df['v1_nm'] = df['v1'].map({'ham': 0, 'spam': 1})
df.head()

In [9]:
# interactive plotly histogram of the numeric v1_nm column (0 = ham, 1 = spam)
df['v1_nm'].iplot(kind='hist')

In [10]:
# new column with the message length in characters, computed from v2 with the
# vectorised .str accessor (unlike apply(len), a missing message yields NaN
# instead of raising a TypeError)
df['v2_le'] = df['v2'].str.len()
df.head()

In [11]:
# Overlaid histograms of message length, one per label (ham in blue, spam in red)
plt.figure(figsize=(12, 8))
for label, colour in (('ham', 'blue'), ('spam', 'red')):
    df.loc[df['v1'] == label, 'v2_le'].plot(kind='hist', bins=50, color=colour, label=label, alpha=0.75)
plt.legend()
plt.xlabel('Message length')

In [12]:
# summary statistics of the numeric columns, restricted to ham messages
df[df['v1']=='ham'].describe()

In [13]:
# summary statistics of the numeric columns, restricted to spam messages
df[df['v1']=='spam'].describe()

In [14]:
# summary statistics of both numeric columns (v1_nm and v2_le) over all messages
df.describe()

In [15]:
# describe() shows a maximum message length of 910 characters (v2_le counts
# characters, not words). Look the longest message up with idxmax() instead of
# hard-coding that length, so the cell keeps working if the data changes.
df.loc[df['v2_le'].idxmax(), 'v2']

# Text Pre-processing

Our main issue with our data is that it is all in text format (strings). The classification algorithms that we usually use need some sort of numerical feature vector in order to perform the classification task. There are actually many methods to convert a corpus to a vector format. The simplest is the bag-of-words approach, where each unique word in a text will be represented by one number.

In this section we'll convert the raw messages (sequence of characters) into vectors (sequences of numbers).

As a first step, let's write a function that splits a message into its individual words, removes very common words ('the', 'a', etc.), and returns the remaining words joined back into a single cleaned string. To do this we will take advantage of the NLTK library. It's pretty much the standard library in Python for processing text and has a lot of useful features. We'll only use some of the basic ones here.

Let's create a function that will process the string in the message column; then we can just use apply() in pandas to process all the text in the DataFrame.

First removing punctuation. We can just take advantage of Python's built-in string library to get a quick list of all the possible punctuation:

In [16]:
import string 
from nltk.corpus import stopwords

# SMS shorthand treated as stop words in addition to NLTK's English list.
_EXTRA_STOPWORDS = ('u', 'ü', 'ur', '4', '2', 'im', 'dont', 'doin', 'ure')

# Translation table that deletes every character in string.punctuation.
_PUNCTUATION_TABLE = str.maketrans('', '', string.punctuation)

# Stop-word set, built lazily on the first call and reused afterwards.
_STOPWORD_CACHE = None


def _stopword_set():
    """Return the combined stop-word set, building it on first use only."""
    global _STOPWORD_CACHE
    if _STOPWORD_CACHE is None:
        _STOPWORD_CACHE = frozenset(stopwords.words('english')).union(_EXTRA_STOPWORDS)
    return _STOPWORD_CACHE


def text_process(mess):
    """
    Clean a single message for bag-of-words vectorisation.

    Takes in a string of text, then performs the following:
    1. Remove all punctuation (every character in ``string.punctuation``)
    2. Remove all stop words (case-insensitive match against NLTK's English
       list plus a few SMS abbreviations)
    3. Return the remaining words joined by single spaces

    Parameters
    ----------
    mess : str
        The raw message text.

    Returns
    -------
    str
        The cleaned message. Note this is a single string, not a list; the
        kept words retain their original casing.
    """
    # Strip punctuation in one pass instead of filtering character by character.
    nopunc = mess.translate(_PUNCTUATION_TABLE)

    # Set membership is O(1); the stop-word list is no longer rebuilt per call.
    stop_words = _stopword_set()
    return ' '.join(word for word in nopunc.split() if word.lower() not in stop_words)

Now let's "tokenize" these messages. Tokenization is just the term used to describe the process of converting normal text strings into a list of tokens (the words that we actually want). Here `text_process` joins the kept tokens back into a single cleaned string per message.

In [17]:
# run text_process on every message and store the cleaned text in a new column
df['clean_msg'] = df.v2.apply(text_process)

In [18]:
# preview the frame with the new clean_msg column
df.head()

In [19]:
# check which container type NLTK returns for its English stop-word collection
type(stopwords.words('english'))

In [20]:
from collections import Counter

# lower-cased, whitespace-split tokens of every cleaned ham message
words = df.loc[df['v1'] == 'ham', 'clean_msg'].str.lower().str.split()

# tally each token across all ham messages
ham_words = Counter(token for tokens in words for token in tokens)

print(ham_words.most_common(50))

In [21]:
# lower-cased, whitespace-split tokens of every cleaned spam message
words = df.loc[df['v1'] == 'spam', 'clean_msg'].str.lower().str.split()

# tally each token across all spam messages
spam_words = Counter(token for tokens in words for token in tokens)

print(spam_words.most_common(50))

# Vectorization

Currently, we have the messages as cleaned strings of tokens (note that these are plain tokens; no lemmatization has been applied), and now we need to convert each of those messages into a vector that scikit-learn's models can work with.

Now we'll convert each cleaned message into a vector that machine learning models can understand.

We'll do that in three steps using the bag-of-words model:

*    Count how many times a word occurs in each message (known as term frequency)
*    Weigh the counts, so that frequent tokens get lower weight (inverse document frequency)
*    Normalize the vectors to unit length, to abstract from the original text length (L2 norm)

Let's begin the first step:

Each vector will have as many dimensions as there are unique words in the SMS corpus. We will first use SciKit Learn's CountVectorizer. This model will convert a collection of text documents to a matrix of token counts.

We can imagine this as a 2-dimensional matrix, where one dimension is the entire vocabulary and the other dimension is the actual documents. In scikit-learn's document-term matrix, each row is a text message and each column is a word of the vocabulary.

In [22]:
# model inputs for CountVectorizer: cleaned message text as features, 0/1 label as target
X, y = df['clean_msg'], df['v1_nm']
print(X.shape)
print(y.shape)

In [23]:
# split X and y into training and testing sets 
from sklearn.model_selection import train_test_split
# random_state=1 makes the split reproducible; no test_size is given, so
# scikit-learn's default hold-out fraction applies
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1)
for part in (X_train, X_test, y_train, y_test):
    print(part.shape)

There are a lot of arguments and parameters that can be passed to the CountVectorizer. In this case the messages have already been cleaned by our `text_process` function, so we simply use the vectorizer with its default settings:

In [24]:
from sklearn.feature_extraction.text import CountVectorizer

# instantiate the vectorizer with its default settings
vect = CountVectorizer()
# learn the vocabulary from the training messages only (X_test is not used here)
vect.fit(X_train)

In [25]:
# create the training document-term matrix using the vocabulary already learned by vect.fit
X_train_dtm = vect.transform(X_train)



# equivalently: combine fit and transform into a single step
# (this re-learns the vocabulary from X_train and overwrites X_train_dtm)
X_train_dtm = vect.fit_transform(X_train)

# examine the document-term matrix
X_train_dtm

In [26]:
# transform testing data (using the vocabulary fitted on X_train) into a document-term matrix;
# only transform() is called here, so the test messages never influence the vocabulary
X_test_dtm = vect.transform(X_test)
X_test_dtm

In [27]:
from sklearn.feature_extraction.text import TfidfTransformer

tfidf_transformer = TfidfTransformer()
# fit() returns the transformer itself, so learning the idf weights and applying
# them can be chained; the weighted matrix is only displayed, not stored
tfidf_transformer.fit(X_train_dtm).transform(X_train_dtm)

# Building and evaluating a model

We will use multinomial Naive Bayes:

*    The multinomial Naive Bayes classifier is suitable for classification with discrete features (e.g., word counts for text classification). The multinomial distribution normally requires integer feature counts. However, in practice, fractional counts such as tf-idf may also work.


In [28]:
# import and instantiate a Multinomial Naive Bayes model (no arguments: default hyper-parameters)
from sklearn.naive_bayes import MultinomialNB
nb = MultinomialNB()

In [29]:
# train the model using X_train_dtm (timing it with an IPython "magic command")
# note: X_train_dtm holds the raw CountVectorizer counts, not the tf-idf weights shown earlier
%time nb.fit(X_train_dtm, y_train)

In [30]:
# predict a class label (0 = ham, 1 = spam) for every row of X_test_dtm
y_pred_class = nb.predict(X_test_dtm)

In [31]:
# calculate accuracy of class predictions (fraction of test messages labelled correctly)
from sklearn import metrics
metrics.accuracy_score(y_test, y_pred_class)

In [32]:
# print the confusion matrix: rows are the true labels, columns the predicted labels,
# so with 0 = ham and 1 = spam the layout is [[TN, FP], [FN, TP]]
metrics.confusion_matrix(y_test, y_pred_class)

In [33]:
# print message text for false positives (ham incorrectly classified as spam)
# X_test[(y_pred_class==1) & (y_test==0)]
# with 0/1 labels, prediction > truth holds only for predicted 1 (spam) vs. actual 0 (ham)
X_test[y_pred_class > y_test]

In [34]:
# print message text for false negatives (spam incorrectly classified as ham)
# with 0/1 labels, prediction < truth holds only for predicted 0 (ham) vs. actual 1 (spam)
X_test[y_pred_class < y_test]

In [35]:
# example of a false negative; 4949 is a hard-coded index label,
# so it must be present in X_test for this lookup to succeed
X_test.loc[4949]

In [36]:
# calculate predicted probabilities for X_test_dtm (poorly calibrated)
# [:, 1] keeps the second column, i.e. the probability of class 1 (spam)
y_pred_prob = nb.predict_proba(X_test_dtm)[:, 1]
y_pred_prob

In [37]:
# calculate AUC (area under the ROC curve) from the predicted spam probabilities
metrics.roc_auc_score(y_test, y_pred_prob)

In [38]:
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.pipeline import Pipeline

# chain counting, tf-idf weighting and Naive Bayes into one estimator
# that is trained directly on the cleaned message text
pipe = Pipeline(steps=[
    ('bow', CountVectorizer()),
    ('tfid', TfidfTransformer()),
    ('model', MultinomialNB()),
])
pipe.fit(X_train, y_train)

In [39]:
# class predictions from the pipeline; it takes the message text (X_test), not a document-term matrix
y_pred = pipe.predict(X_test)

In [40]:
# accuracy of the pipeline's (tf-idf + Naive Bayes) predictions
metrics.accuracy_score(y_test, y_pred)

In [41]:
# confusion matrix of the pipeline's predictions (rows: true label, columns: predicted label)
metrics.confusion_matrix(y_test, y_pred)

# Comparing models

We will compare multinomial Naive Bayes with logistic regression:

*    Logistic regression, despite its name, is a linear model for classification rather than regression. Logistic regression is also known in the literature as logit regression, maximum-entropy classification (MaxEnt) or the log-linear classifier. In this model, the probabilities describing the possible outcomes of a single trial are modeled using a logistic function.


In [42]:
# import and instantiate a logistic regression model (using the liblinear solver)
from sklearn.linear_model import LogisticRegression
logreg = LogisticRegression(solver='liblinear')

In [43]:
# train the model using X_train_dtm (the same count matrix the Naive Bayes model was fit on), timing the fit
%time logreg.fit(X_train_dtm, y_train)

In [44]:
# make class predictions for X_test_dtm
# note: this overwrites y_pred_class, which previously held the Naive Bayes predictions
y_pred_class = logreg.predict(X_test_dtm)

In [45]:
# calculate predicted probabilities for X_test_dtm (well calibrated)
# [:, 1] keeps the probability of class 1 (spam); this overwrites the Naive Bayes y_pred_prob
y_pred_prob = logreg.predict_proba(X_test_dtm)[:, 1]
y_pred_prob

In [46]:
# calculate accuracy of the logistic regression predictions (y_pred_class was reassigned above)
metrics.accuracy_score(y_test, y_pred_class)

In [47]:
# confusion matrix of the logistic regression predictions (rows: true label, columns: predicted label)
metrics.confusion_matrix(y_test, y_pred_class)

In [48]:
# calculate AUC from the logistic regression spam probabilities
metrics.roc_auc_score(y_test, y_pred_prob)

# Tuning the vectorizer

Thus far, we have been using the default parameters of CountVectorizer:

In [49]:
# show the CountVectorizer instance created earlier with default parameters
# NOTE(review): if the repr is abbreviated, vect.get_params() lists every parameter
vect

However, the vectorizer is worth tuning, just like a model is worth tuning! Here are a few parameters that you might want to tune:

*    stop_words: string {'english'}, list, or None (default)
*        If 'english', a built-in stop word list for English is used.
*        If a list, that list is assumed to contain stop words, all of which will be removed from the resulting tokens.
*        If None, no stop words will be used.

In [50]:
# remove English stop words
# note: this rebinds vect to a new, unfitted vectorizer; it is not fitted in this cell
vect = CountVectorizer(stop_words='english')

*    ngram_range: tuple (min_n, max_n), default=(1, 1)
*        The lower and upper boundary of the range of n-values for different n-grams to be extracted.
*        All values of n such that min_n <= n <= max_n will be used.

In [51]:
# include 1-grams and 2-grams
# note: this rebinds vect to a new, unfitted vectorizer; it is not fitted in this cell
vect = CountVectorizer(ngram_range=(1, 2))

*    max_df: float in range [0.0, 1.0] or int, default=1.0
*        When building the vocabulary, ignore terms that have a document frequency strictly higher than the given threshold (corpus-specific stop words).
*        If float, the parameter represents a proportion of documents.
*        If integer, the parameter represents an absolute count.

In [52]:
# ignore terms that appear in more than 50% of the documents
# note: this rebinds vect to a new, unfitted vectorizer; it is not fitted in this cell
vect = CountVectorizer(max_df=0.5)

*    min_df: float in range [0.0, 1.0] or int, default=1
*        When building the vocabulary, ignore terms that have a document frequency strictly lower than the given threshold. (This value is also called "cut-off" in the literature.)
*        If float, the parameter represents a proportion of documents.
*        If integer, the parameter represents an absolute count.

In [53]:
# only keep terms that appear in at least 2 documents
# note: this rebinds vect to a new, unfitted vectorizer; it is not fitted in this cell
vect = CountVectorizer(min_df=2)

*    Guidelines for tuning CountVectorizer:
*        Use your knowledge of the problem and the text, and your understanding of the tuning parameters, to help you decide what parameters to tune and how to tune them.
*        Experiment, and let the data tell you the best approach!