<br><br>
<b><font size=100>Spam Detection - SMS</font></b>
<br>
<br>
By अंkur गोswami
____

In [154]:
import numpy as np
import pandas as pd
import string
import nltk
import os
import shelve as sl

# Data Acquisition:

In [155]:
#Loading the tab-separated SMS corpus; the file has no header row, so column names are supplied here.
#NOTE(review): messages may contain double quotes that the default CSV quoting could misparse - confirm the row count.
dataset=pd.read_csv(
    './data/SMSSpamCollection',
    sep='\t',
    header=None,
    names=['label','message'],
)

In [156]:
#Previewing the first rows to confirm that the label & message columns were parsed as expected.
dataset.head()

Unnamed: 0,label,message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [157]:
#Target vector: the ham/spam label of every message.
#Selected by column name (not position) so that reordering the columns cannot silently change the target.
y=dataset['label']

# Message Preprocessing Functions:

In [158]:
#Function to tokenize each & every word.
#For example: On tokenizing "wouldn't" we get "would" & "n't".

def tokenizer(message):
    """Tokenize a message and expand negative contractions.

    The message is split with nltk.word_tokenize, then:
      * every "n't" token becomes "not";
      * an "ai" token that is immediately followed by "n't" (i.e. "ain't")
        becomes "ain".

    The substitutions are made token by token. Doing them on the joined
    string would also rewrite every word that merely contains "ai"
    (for example "available" or "said").

    Parameters:
        message: raw message text (str).

    Returns:
        str: the normalised tokens joined by single spaces.
    """
    tokens=nltk.word_tokenize(message)
    normalized=[]
    for position,token in enumerate(tokens):
        if token=='n\'t':
            normalized.append('not')
        elif token=='ai' and position+1<len(tokens) and tokens[position+1]=='n\'t':
            #Only the "ai" that comes from "ain't" is expanded.
            normalized.append('ain')
        else:
            normalized.append(token)
    return ' '.join(normalized)

In [159]:
#Function to remove punctuation.

def punc_removal(message):
    """Return `message` with every character found in string.punctuation dropped.

    All other characters (letters, digits, whitespace) are kept in their
    original order.
    """
    kept=[]
    for character in message:
        if character in string.punctuation:
            continue
        kept.append(character)
    return ''.join(kept)

In [160]:
#Function to remove stopwords.
#Stopwords are words that cannot help in classifying statements on their own.
#For example: "is","am","the",etc.

def stopwords_removal(message):
    """Lower-case a message, split it on whitespace and drop English stopwords.

    Parameters:
        message: message text (str).

    Returns:
        list of str: the remaining lower-cased words, in their original order.
    """
    #Load the stopword list once per call (not once per word) and keep it in a
    #set so that each membership test is a constant-time lookup.
    stop_words=set(nltk.corpus.stopwords.words('english'))
    return [word for word in message.lower().split() if word not in stop_words]

In [161]:
#Function to reduce every word to its stem.
#Such as:
#       1. Mapping different tenses of a verb to one common form.
#       2. Mapping plural forms to the singular form.

def stemming(List):
    """Stem every word in `List` with NLTK's PorterStemmer.

    Parameters:
        List: iterable of words (str).

    Returns:
        str: the stemmed words joined by single spaces.
    """
    stemmer=nltk.stem.PorterStemmer()
    return ' '.join(map(stemmer.stem,List))

In [162]:
#Function to chain every preprocessing step on one message.

def message_operations(message):
    """Run the full cleaning chain on one raw message.

    Steps, in order: tokenizer -> punc_removal -> stopwords_removal -> stemming.
    Returns the cleaned message as a single space-separated string.
    """
    tokenized=tokenizer(message)
    unpunctuated=punc_removal(tokenized)
    content_words=stopwords_removal(unpunctuated)
    return stemming(content_words)

# Data Munging:

In [163]:
#Preprocessing each & every message.
#message_operations returns one cleaned string per SMS; .values turns the resulting Series into a NumPy array.
X=dataset['message'].apply(message_operations).values

## Vectorizing words:

In [164]:
from sklearn.feature_extraction.text import CountVectorizer

In [165]:
word_vectorize=CountVectorizer()      #Instantiating CountVectorizer object.
X=word_vectorize.fit_transform(X)     #Learning the vocabulary & building the message-by-word count matrix.
X=X.toarray()                         #Converting the sparse matrix into a dense NumPy array.

In [166]:
#Inspecting the dense count matrix (one row per message, one column per vocabulary word).
X

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]], dtype=int64)

## Re-weighting word counts with TF-IDF (down-weighting words that occur in many messages):

In [167]:
from sklearn.feature_extraction.text import TfidfTransformer

In [168]:
tfidf=TfidfTransformer()      #Instantiating TfidfTransformer object.
X=tfidf.fit_transform(X)      #Learning the IDF weights & re-weighting the counts.
X=X.toarray()                 #Converting the sparse result into a dense NumPy array.

In [169]:
#Inspecting the TF-IDF matrix; the values are now floats instead of integer counts.
X

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

# Label encoding:

In [170]:
from sklearn.preprocessing import LabelEncoder

In [171]:
label_encode=LabelEncoder()      #Instantiating LabelEncoder object.
y=label_encode.fit_transform(y)  #Fitting & transforming labels into integers (classes are sorted, so ham->0 & spam->1).

# Splitting data into training & testing sets:

In [172]:
from sklearn.model_selection import train_test_split

In [173]:
#Holding out 33% of the messages for testing; random_state is fixed so the split is reproducible.
#Note: word_vectorize & tfidf were fitted on all rows before this split, so the test rows
#already influenced the vocabulary & IDF weights.
X_train,X_test,y_train,y_test=train_test_split(
    X,
    y,
    test_size=0.33,
    random_state=0,
)

# Feeding data to ML algorithm:

In [174]:
from sklearn.naive_bayes import MultinomialNB

In [175]:
nb=MultinomialNB()         #Instantiating MultinomialNB object.
nb.fit(X_train,y_train)    #Training the classifier on the training split only.

MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

# Getting Scores:

In [176]:
#Reporting nb.score on the held-out split, the training split & the full dataset.
for score_name,features,labels in (
    ('Test Score:',X_test,y_test),
    ('Train Score:',X_train,y_train),
    ('Total Score:',X,y),
):
    print(score_name,nb.score(features,labels))

Test Score: 0.965742251223491
Train Score: 0.9721403696758639
Total Score: 0.9700287150035893


# Classification Report:

In [177]:
from sklearn.metrics import classification_report

In [178]:
#Per-class precision, recall & F1 on the held-out split.
y_pred=nb.predict(X_test)
print(classification_report(y_test,y_pred))

             precision    recall  f1-score   support

          0       0.96      1.00      0.98      1597
          1       1.00      0.74      0.85       242

avg / total       0.97      0.97      0.96      1839



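The scores and the classification report look good overall, although spam recall is only 0.74 (roughly a quarter of the spam messages in the test set are classified as ham). With that caveat, the model is ready to be applied to other data.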

# Training model with whole data:

In [179]:
#Refitting the same classifier on every labelled message (train + test) to build the final model.
#The scores reported earlier were measured before this refit.
nb.fit(X,y)

MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

# Creating Persistent object:

In [180]:
#Saving object state in "nb_obj" file.
#Only the classifier is stored; word_vectorize, tfidf & label_encode still live in the kernel only.
os.makedirs('./persist',exist_ok=True)   #shelve does not create a missing parent directory.
#writeback is not needed: assigning to a key writes the pickled object straight away.
with sl.open('./persist/nb_obj') as persist_obj:
    persist_obj['classifier']=nb

# Spam Detection Pipeline:

In [181]:
def pipeline(message):
    """Classify one raw SMS text with the persisted classifier.

    The message goes through the same steps as the training data:
    message_operations -> word_vectorize -> tfidf -> classifier.

    Parameters:
        message: raw SMS text (str).

    Returns:
        The output of label_encode.inverse_transform for the single
        prediction, i.e. a one-element array holding the original label.

    Note:
        word_vectorize, tfidf & label_encode are read from the notebook's
        globals; only the classifier comes from the shelf file.
    """
    #Opening read-only & without writeback: with writeback=True the classifier
    #would be re-pickled to disk every time the shelf is closed.
    #shelve unpickles its contents, so only open shelf files created by this notebook.
    with sl.open('./persist/nb_obj',flag='r') as persist_obj:
        classifier=persist_obj['classifier']

    cleaned=message_operations(message)
    counts=word_vectorize.transform([cleaned])   #One-row sparse count matrix.
    weighted=tfidf.transform(counts)             #Sparse input is accepted; no dense conversion needed.
    result=classifier.predict(weighted)

    return label_encode.inverse_transform(result) #Reversing label encoding & returning it.

# Examples:

In [182]:
#A promotional-style message; pipeline returns a one-element array, so * unpacks it to print just the label.
SMS='Offer valid of selected merchandise.'
print(*pipeline(SMS))

spam


  if diff:


In [183]:
#An everyday message, for comparison with the promotional example.
SMS='Best of luck.'
print(*pipeline(SMS))

ham


  if diff:


____