In [2]:
#Importing the necessary libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import LabelEncoder
import nltk
from nltk.tokenize import RegexpTokenizer
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn import metrics
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report
import pickle
import socket
import re

In [3]:
#Storing the dataset into df and printing its first  values
df = pd.read_csv("Dataset/spam.csv", encoding='latin-1')
df.head()

Unnamed: 0,v1,v2,Unnamed: 2,Unnamed: 3,Unnamed: 4
0,ham,"Go until jurong point, crazy.. Available only ...",,,
1,ham,Ok lar... Joking wif u oni...,,,
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,,,
3,ham,U dun say so early hor... U c already then say...,,,
4,ham,"Nah I don't think he goes to usf, he lives aro...",,,


In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5572 entries, 0 to 5571
Data columns (total 5 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   v1          5572 non-null   object
 1   v2          5572 non-null   object
 2   Unnamed: 2  50 non-null     object
 3   Unnamed: 3  12 non-null     object
 4   Unnamed: 4  6 non-null      object
dtypes: object(5)
memory usage: 217.8+ KB


In [5]:
# Removing the unnecessary columns and renaming the remaining columns
df = df.drop(['Unnamed: 2','Unnamed: 3','Unnamed: 4'], axis=1)
df = df.rename(columns={'v1': 'Type', 'v2': 'Message'}) 
df.head()

Unnamed: 0,Type,Message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [6]:
# Storing the messages in x
x = df['Message']

# Storing the type of message in y and performing label encoding them
label = LabelEncoder()
y = label.fit_transform(df['Type'])
print(y)
label.classes_

# Creating a dataframe using x and y
df = pd.DataFrame(np.c_[x, y],columns=['Message', 'Type'])
df.head()

[0 0 1 ... 0 0 0]


Unnamed: 0,Message,Type
0,"Go until jurong point, crazy.. Available only ...",0
1,Ok lar... Joking wif u oni...,0
2,Free entry in 2 a wkly comp to win FA Cup fina...,1
3,U dun say so early hor... U c already then say...,0
4,"Nah I don't think he goes to usf, he lives aro...",0


In [7]:
corpus = []
ps = PorterStemmer()
for i in range(0, 5572):

    # Applying Regular Expression
    
    '''
    Replace email addresses with 'emailaddr'
    Replace URLs with 'httpaddr'
    Replace money symbols with 'moneysymb'
    Replace phone numbers with 'phonenumbr'
    Replace numbers with 'numbr'
    '''
    msg = df['Message'][i]
    msg = re.sub(r'\b[\w\-.]+?@\w+?\.\w{2,4}\b', 'emailaddr', df['Message'][i])
    msg = re.sub(r'(http[s]?\S+)|(\w+\.[A-Za-z]{2,4}\S*)', 'httpaddr', df['Message'][i])
    msg = re.sub(r'£|\$', 'moneysymb', df['Message'][i])
    msg = re.sub(r'\b(\+\d{1,2}\s)?\d?[\-(.]?\d{3}\)?[\s.-]?\d{3}[\s.-]?\d{4}\b', 'phonenumbr', df['Message'][i])
    msg = re.sub(r'\d+(\.\d+)?', 'numbr', df['Message'][i])
    
    # Removing all punctuations
    msg = re.sub(r'[^\w\d\s]', ' ', df['Message'][i])
    
    if i<2:
        print("\t\t\t\t MESSAGE ", i)
    
    if i<2:
        print("\n After Regular Expression - Message ", i, " : ", msg)
    
    # Lowercasing each word
    msg = msg.lower()    
    if i<2:
        print("\n Lower case Message ", i, " : ", msg)
    
    # Tokenizing the words
    msg = msg.split()    
    if i<2:
        print("\n After Splitting - Message ", i, " : ", msg)
    
    # Stemming with PorterStemmer and removing Stop Words
    msg = [ps.stem(word) for word in msg if not word in set(stopwords.words('english'))]
    if i<2:
        print("\n After Stemming - Message ", i, " : ", msg)

    # Preparing Messages with Remaining Tokens
    msg = ' '.join(msg)
    if i<2:
        print("\n Final Prepared - Message ", i, " : ", msg, "\n\n")

    # Adding messages into corpus
    corpus.append(msg)

				 MESSAGE  0

 After Regular Expression - Message  0  :  Go until jurong point  crazy   Available only in bugis n great world la e buffet    Cine there got amore wat   

 Lower case Message  0  :  go until jurong point  crazy   available only in bugis n great world la e buffet    cine there got amore wat   

 After Splitting - Message  0  :  ['go', 'until', 'jurong', 'point', 'crazy', 'available', 'only', 'in', 'bugis', 'n', 'great', 'world', 'la', 'e', 'buffet', 'cine', 'there', 'got', 'amore', 'wat']

 After Stemming - Message  0  :  ['go', 'jurong', 'point', 'crazi', 'avail', 'bugi', 'n', 'great', 'world', 'la', 'e', 'buffet', 'cine', 'got', 'amor', 'wat']

 Final Prepared - Message  0  :  go jurong point crazi avail bugi n great world la e buffet cine got amor wat 


				 MESSAGE  1

 After Regular Expression - Message  1  :  Ok lar    Joking wif u oni   

 Lower case Message  1  :  ok lar    joking wif u oni   

 After Splitting - Message  1  :  ['ok', 'lar', 'joking', 'wif', '

In [8]:
# Vectorising the data using Count Vectoriser
cv = CountVectorizer()
x = cv.fit_transform(corpus).toarray()

In [9]:
# Splitting the dataset into training data & testing data
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=2)

In [10]:
# Using Multinomial Naive Bayes Classifier to train the model
mnb_classifier = MultinomialNB()
mnb_classifier.fit(x_train, y_train)

In [11]:
# Evaluating the model using the testing data
y_pred = mnb_classifier.predict(x_test)

In [12]:
# Printing the accuracy of the trained model
print ("Accuracy : %0.5f \n\n" % accuracy_score(y_test, mnb_classifier.predict(x_test)))
print (classification_report(y_test, mnb_classifier.predict(x_test)))

Accuracy : 0.97937 


              precision    recall  f1-score   support

           0       0.99      0.99      0.99       957
           1       0.94      0.91      0.93       158

    accuracy                           0.98      1115
   macro avg       0.96      0.95      0.96      1115
weighted avg       0.98      0.98      0.98      1115



In [13]:
# Creating a confusion matrix using the actual & predicted values
cm = confusion_matrix(y_test, y_pred)
print(cm)

[[948   9]
 [ 14 144]]


In [14]:
# Saving the classification model
with open('model.pkl', 'wb') as model_file:
    pickle.dump(mnb_classifier, model_file)

# Saving the Vectoriser
with open('vectorizer.pkl', 'wb') as vec_file:
    pickle.dump(cv, vec_file)