In [1]:
import warnings
warnings.filterwarnings('ignore')
import numpy as np
import pandas as pd
import os
from imblearn.over_sampling import SMOTE 
from sklearn.naive_bayes import MultinomialNB
from sklearn.preprocessing import MinMaxScaler, LabelBinarizer, OneHotEncoder
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import *
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.decomposition import PCA,TruncatedSVD
import hyperopt
from hyperopt import *
from hyperopt import fmin, tpe, hp, space_eval
import string
import matplotlib.pyplot as plt
%matplotlib inline 


Using TensorFlow backend.


<div class="alert alert-block alert-info">
<b>Loading dataset:</b> We load the dataset, drop the unused unnamed columns, and rename the remaining columns for use in our analysis.
</div>

In [2]:
# Location of the SMS spam CSV. The original machine-specific path is kept as the
# default; set the SPAM_CSV_PATH environment variable to point the notebook at a
# different copy of the file without editing this cell.
path_of_input_file = os.environ.get(
    'SPAM_CSV_PATH',
    'D:\\kaggle_trials\\sms-spam-collection-dataset\\spam.csv',
)
df = pd.read_csv(path_of_input_file, encoding='ISO-8859-1')
# Discard the three unnamed columns and give the two remaining columns
# descriptive names.
df = df.drop(["Unnamed: 2", "Unnamed: 3", "Unnamed: 4"], axis=1)
df.columns = ['labels', 'data']
df.head(4)

Unnamed: 0,labels,data
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...


<div class="alert alert-block alert-info">
<b>Imbalance check:</b> We expect the data to be imbalanced, because ordinary (ham) messages are far more common than spam messages. The counts below confirm this.
</div>

In [3]:
# Collect the distinct values of the label column and report how many there are.
# `num_labels` holds the distinct label values themselves (not a count); it is
# reused by the per-label count cell.
num_labels = pd.unique(df['labels'])
print('The number of labels are ', len(num_labels))

The number of labels are  2


In [4]:
# Report how many rows carry each distinct label value.
for label in num_labels:
    print('The number of ', label, ' labels are :- ', len(df[df['labels'] == label]))
print('We dont have a balanced dataset and hence we need to perform imbalanced dataset handling')

The number of  ham  labels are :-  4825
The number of  spam  labels are :-  747
We dont have a balanced dataset and hence we need to perform imbalanced dataset handling


<div class="alert alert-block alert-info">
<b>Label Binarizing:</b> We binarize the string labels into integers (ham → 0, spam → 1) so that they are easy to feed into the model.
</div>

In [5]:
# Fit a LabelBinarizer on the string labels and encode them as integers.
# The fitted binarizer is kept in `lb`.
lb = LabelBinarizer()
Y = lb.fit(df['labels'].values).transform(df['labels'].values)


<div class="alert alert-block alert-info">
<b>Text Preprocessing:</b> We preprocess the text data by removing punctuation and converting every word to lowercase. We then create a feature matrix X using a TF-IDF vectorizer. We use TF-IDF because everyday SMS conversations contain many informal words that may not have any pre-trained word embeddings associated with them.
</div>

In [6]:

def preprocess_text(statement):
    """Normalise a message: strip punctuation, lowercase, collapse whitespace.

    Every character found in ``string.punctuation`` is deleted (not replaced by
    a space), the remainder is split on whitespace, each word is lowercased,
    and the words are re-joined with single spaces.

    Parameters
    ----------
    statement : str
        Raw message text.

    Returns
    -------
    str
        The cleaned, lowercased text; an empty string if no words remain.
    """
    kept_chars = [ch for ch in statement if ch not in string.punctuation]
    tokens = "".join(kept_chars).split()
    return " ".join(map(str.lower, tokens))

In [7]:
# Store the cleaned version of every message in a new column, leaving the raw
# text in 'data' untouched.
df['preprocessed_data'] = df['data'].map(preprocess_text)

In [8]:
# Vectorise the cleaned messages into the TF-IDF feature matrix X.
# NOTE(review): the vectoriser is fitted on every row, before the train/test
# split performed later in the notebook, so rows that end up in the test set
# also contribute to the fitted vocabulary/statistics — confirm this is acceptable.
tfidf = TfidfVectorizer(decode_error='ignore')
X = tfidf.fit_transform(df['preprocessed_data'])


<div class="alert alert-block alert-info">
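<b>Removing Imbalance:</b> We oversample the minority (spam) class with SMOTE so that both classes have the same number of examples. Note that the oversampling is applied to the whole dataset before the train/test split, so synthetic examples can also end up in the test set.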
</div>

In [10]:
# Oversample with SMOTE; the fixed random_state keeps the resampled set repeatable.
# NOTE(review): resampling runs on the full feature matrix before the
# train/test split in a later cell, so resampled rows can land in the test set
# — consider splitting first and resampling only the training portion.
sm           = SMOTE(random_state=42)
X_res, Y_res = sm.fit_resample(X, Y)

In [11]:
# Class counts before and after resampling. The count taken over Y is indexed
# with [0] because it comes back as a one-element array; the count over Y_res
# is printed as-is.
print('Positive examples before Oversampling is ', (Y == [1]).sum(axis=0)[0])
print('Negative examples before Oversampling is ', (Y == [0]).sum(axis=0)[0])
print('\n')
print('Positive examples after Oversampling is ', (Y_res == [1]).sum(axis=0))
print('Negative examples after Oversampling is ', (Y_res == [0]).sum(axis=0))
print('\n')

Positive examples before Oversampling is  747
Negative examples before Oversampling is  4825


Positive examples after Oversampling is  4825
Negative examples after Oversampling is  4825




<div class="alert alert-block alert-info">
<b>Train test split:</b> We split the resampled data into training (67%) and test (33%) sets.
</div>

In [12]:
# Hold out 33% of the resampled data for testing; random_state fixes the split.
X_train, X_test, y_train, y_test = train_test_split(X_res, Y_res, test_size=0.33, random_state=42)

<div class="alert alert-block alert-info">
<b>Hyperparameter search space:</b> We now define the search space for tuning the hyperparameters of the model.
</div>

In [13]:
# hyperopt search space for MultinomialNB: `alpha` is drawn uniformly from
# [0.5, 5] and `fit_prior` is chosen between True and False.
multinomial_grid = dict(
    alpha=hp.uniform('alpha', 0.5, 5),
    fit_prior=hp.choice('fit_prior', [True, False]),
)

In [14]:
def hyperopt_train_test(params):
    """Return the mean cross-validation score of a MultinomialNB built from ``params``.

    The classifier is scored on the notebook-level ``X_train`` / ``y_train``
    with ``cross_val_score`` using its default settings.

    Parameters
    ----------
    params : dict
        Keyword arguments forwarded to ``MultinomialNB``.
    """
    candidate = MultinomialNB(**params)
    fold_scores = cross_val_score(candidate, X_train, y_train)
    return fold_scores.mean()

def function_to_minimise(params):
    """Objective handed to hyperopt: the negated mean CV score for ``params``.

    The score is negated so that minimising the returned ``loss`` corresponds
    to maximising the cross-validation score.

    Returns
    -------
    dict
        ``{'loss': <negated score>, 'status': STATUS_OK}``.
    """
    score = hyperopt_train_test(params)
    return dict(loss=-1 * score, status=STATUS_OK)


# Run 500 TPE evaluations of the objective, recording every trial, then map the
# winning point back onto concrete parameter values.
# NOTE(review): no random state is passed to fmin, so presumably the selected
# parameters can differ between runs — confirm and seed if reproducibility matters.
trials = Trials()
best = fmin(
    fn=function_to_minimise,
    space=multinomial_grid,
    algo=tpe.suggest,
    max_evals=500,
    trials=trials,
)
best_parameters = space_eval(multinomial_grid, best)
print('The best parameter tuned on training set is given by :- ', best_parameters)

100%|████████████████████████████████████████████████| 500/500 [00:06<00:00, 75.32it/s, best loss: -0.9862334199996453]
The best parameter tuned on training set is given by :-  {'alpha': 0.5031703617001609, 'fit_prior': False}


<div class="alert alert-block alert-info">
<b>Final Results and Model fitting:</b> We finally fit the model with the tuned hyperparameters and present a classification report on the test set as our analysis.
</div>

In [15]:
# Build a MultinomialNB from the tuned parameters and fit it on the training split.
model = MultinomialNB(**best_parameters)
# Left as the last expression so the fitted estimator is echoed as the cell output.
model.fit(X_train, y_train)

MultinomialNB(alpha=0.5031703617001609, class_prior=None, fit_prior=False)

In [16]:
# Predict labels for the held-out test split with the fitted model.
y_pred = model.predict(X_test)

In [17]:
# classification_report expects (y_true, y_pred). Passing the true labels first
# keeps precision and recall from being swapped and makes the support column
# count true labels rather than predictions.
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.99      0.99      0.99      1614
           1       0.99      0.99      0.99      1571

    accuracy                           0.99      3185
   macro avg       0.99      0.99      0.99      3185
weighted avg       0.99      0.99      0.99      3185

