In [34]:
# Importing and Setting up libraries
import string

import nltk
import numpy as np
import pandas as pd
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB

In [35]:
# Read the raw dataset from disk (the file is decoded as latin1)
csv_path = 'spam.csv'
df = pd.read_csv(csv_path, encoding='latin1')

# Show the first rows as a quick sanity check of the columns
df.head()

Unnamed: 0,v1,v2,Unnamed: 2,Unnamed: 3,Unnamed: 4
0,ham,"Go until jurong point, crazy.. Available only ...",,,
1,ham,Ok lar... Joking wif u oni...,,,
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,,,
3,ham,U dun say so early hor... U c already then say...,,,
4,ham,"Nah I don't think he goes to usf, he lives aro...",,,


In [36]:
# Dimensions of the loaded frame as (rows, columns)
df.shape

(5572, 5)

In [37]:
# Preprocessing: discard rows that are exact duplicates across all columns.
# The original index labels are kept, so they are no longer contiguous afterwards.
df = df.drop_duplicates()

In [38]:
# Number of missing values in each column
df.isna().sum()

v1               0
v2               0
Unnamed: 2    5126
Unnamed: 3    5159
Unnamed: 4    5164
dtype: int64

In [39]:
# Fetch the NLTK stopword corpus that process_text reads via stopwords.words('english')
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [40]:
def process_text(text, stop_words=None):
    """Tokenize a message: strip punctuation, split on whitespace, drop stopwords.

    Parameters
    ----------
    text : str
        Raw message text.
    stop_words : iterable of str, optional
        Lower-case words to discard. When omitted, NLTK's English stopword
        list is used (the ``stopwords`` corpus must already be downloaded).

    Returns
    -------
    list of str
        The remaining words, in their original order and original casing.

    Notes
    -----
    Punctuation is removed before the stopword comparison, so a contraction
    such as "don't" is compared (and returned) as "dont".
    """
    # Build the lookup once per call. Evaluating stopwords.words('english')
    # inside the comprehension condition repeats that call, plus a linear list
    # scan, for every single word of every message.
    if stop_words is None:
        stop_words = stopwords.words('english')
    stop_words = frozenset(stop_words)

    # Remove every character listed in string.punctuation.
    nopunc = ''.join(char for char in text if char not in string.punctuation)

    # The comparison is case-insensitive; the returned tokens keep their casing.
    return [word for word in nopunc.split() if word.lower() not in stop_words]

In [41]:
# Preview the tokenizer output on the first few messages
sample_messages = df['v2'].head()
sample_messages.apply(process_text)

0    [Go, jurong, point, crazy, Available, bugis, n...
1                       [Ok, lar, Joking, wif, u, oni]
2    [Free, entry, 2, wkly, comp, win, FA, Cup, fin...
3        [U, dun, say, early, hor, U, c, already, say]
4    [Nah, dont, think, goes, usf, lives, around, t...
Name: v2, dtype: object

In [42]:
# Example messages(mails)

message4 = "hello world"
message5 = "test"
print(message4)
print()


# Converting the texts into a matrix of token counts.
# Each document is passed as a plain string. Wrapping a message in its own
# list would hand process_text a list, so its per-character punctuation filter
# would test the whole message at once instead of each character.
bow4 = CountVectorizer(analyzer=process_text).fit_transform([message4, message5])
print(bow4)

# (number of documents, number of distinct tokens)
print(bow4.shape)

hello world

  (0, 0)	1
  (0, 2)	1
  (1, 1)	1
(2, 3)


In [43]:
# Build the bag-of-words count matrix for every message in the corpus.
# The fitted vectorizer is kept in a variable so that text seen later can be
# mapped onto the same vocabulary with vectorizer.transform(...).
vectorizer = CountVectorizer(analyzer=process_text)
messages_bow = vectorizer.fit_transform(df['v2'])

In [44]:
# Split the data into 80% training and 20% testing (fixed seed for repeatability)
# NOTE(review): messages_bow is vectorized from every message before this
# split, so the vocabulary also covers words that occur only in test rows.
# Consider fitting the vectorizer on the training text alone, and passing
# stratify=df['v1'] if the ham/spam ratio should be the same in both parts.
X_train, X_test, y_train, y_test = train_test_split(
    messages_bow, df['v1'], test_size=0.20, random_state=0
)

In [45]:
# (number of messages, number of distinct tokens) in the bag-of-words matrix
messages_bow.shape

(5169, 11304)

In [46]:
# Creating and training the Naive Bayes Classifier on the token-count features
# (fit returns the estimator itself, so the fitted model is bound directly)
classifier = MultinomialNB().fit(X_train, y_train)

In [47]:
# Predicted labels for the training rows, followed by the true labels
train_predictions = classifier.predict(X_train)
print(train_predictions)

print(y_train)

['ham' 'ham' 'ham' ... 'ham' 'ham' 'ham']
3794    ham
4290    ham
2603    ham
3452    ham
3132    ham
       ... 
5307    ham
3455    ham
1708    ham
2730    ham
2871    ham
Name: v1, Length: 4135, dtype: object


In [48]:
# Evaluating the training Model
# These scores are computed on the same rows the classifier was fitted on.

pred = classifier.predict(X_train)
print(classification_report(y_train, pred))
print()
# confusion_matrix(y_true, y_pred): rows are true labels, columns are predictions
print("Confusion Matrix: \n", confusion_matrix(y_train, pred))
print()
print("Accuracy: ", accuracy_score(y_train, pred))

              precision    recall  f1-score   support

         ham       1.00      1.00      1.00      3631
        spam       0.98      0.98      0.98       504

    accuracy                           1.00      4135
   macro avg       0.99      0.99      0.99      4135
weighted avg       1.00      1.00      1.00      4135


Confusion Matrix: 
 [[3623    8]
 [  11  493]]

Accuracy:  0.9954050785973397


In [49]:
# Predicted labels for the held-out rows, followed by the true labels
test_predictions = classifier.predict(X_test)
print(test_predictions)

print(y_test.values)

['ham' 'ham' 'ham' ... 'ham' 'ham' 'ham']
['ham' 'ham' 'ham' ... 'ham' 'ham' 'ham']


In [50]:
# Evaluating the Testing Model
# These scores are computed on the rows held out by train_test_split.

pred = classifier.predict(X_test)
print(classification_report(y_test, pred))
print()
# confusion_matrix(y_true, y_pred): rows are true labels, columns are predictions
print("Confusion Matrix: \n", confusion_matrix(y_test, pred))
print()
print("Accuracy: ", accuracy_score(y_test, pred))

              precision    recall  f1-score   support

         ham       0.99      0.96      0.97       885
        spam       0.80      0.93      0.86       149

    accuracy                           0.96      1034
   macro avg       0.89      0.94      0.92      1034
weighted avg       0.96      0.96      0.96      1034


Confusion Matrix: 
 [[850  35]
 [ 11 138]]

Accuracy:  0.9555125725338491
