Importing the required packages

In [1]:
import numpy as np 
import pandas as pd 
import nltk
from nltk.corpus import stopwords
import string

In [2]:
# Location of the spam dataset CSV (raw file served from GitHub)
path="https://raw.githubusercontent.com/Ashutoshrx/Natural-Language-Processing/main/data/spam.csv"

In [3]:
# Load the CSV into a DataFrame, decoding it as latin-1
# (presumably the file is not valid UTF-8 - TODO confirm)
df = pd.read_csv(path,encoding='latin-1')

In [4]:
# Preview the first rows of the loaded frame
df.head()

Unnamed: 0,v1,v2,Unnamed: 2,Unnamed: 3,Unnamed: 4
0,ham,"Go until jurong point, crazy.. Available only ...",,,
1,ham,Ok lar... Joking wif u oni...,,,
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,,,
3,ham,U dun say so early hor... U c already then say...,,,
4,ham,"Nah I don't think he goes to usf, he lives aro...",,,


In [5]:
# List the column labels
df.columns

Index(['v1', 'v2', 'Unnamed: 2', 'Unnamed: 3', 'Unnamed: 4'], dtype='object')

Checking for duplicates and removing them

In [6]:
# Drop fully duplicated rows. Rebinding `df` (instead of inplace=True) keeps the
# cell idempotent and avoids mutating an object that earlier cells displayed.
df = df.drop_duplicates()

Show the new shape (number of rows & columns)

In [7]:
# (rows, columns) after de-duplication
df.shape

(5169, 5)

Show the number of missing values (NaN / NA) in each column

In [8]:

# Count the null entries per column
df.isnull().sum()

v1               0
v2               0
Unnamed: 2    5126
Unnamed: 3    5159
Unnamed: 4    5164
dtype: int64

Download the NLTK stop-word list, which the text-cleaning function below requires

In [9]:

# Fetch the NLTK stop-word corpus; process_text calls stopwords.words('english'), which reads it
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

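**Text preprocessing steps**

1. Tokenization: each message is split into a list of tokens. The function below is used as the `analyzer` of the vectorizer.

2. Punctuation characters are ``!"#$%&'()*+,-./:;<=>?@[\]^_`{|}~``; they are removed from the text.

3. Stop words, in natural language processing, are very common words (e.g. "the", "is", "and") that carry little useful information, so they are filtered out.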

In [10]:

def process_text(text):
    """Tokenize a message: strip punctuation, then drop English stop words.

    Parameters
    ----------
    text : str
        Raw message text.

    Returns
    -------
    list of str
        The whitespace-separated tokens of ``text`` after every character in
        ``string.punctuation`` has been removed, excluding tokens whose
        lowercase form is an NLTK English stop word. The surviving tokens
        keep their original casing.
    """
    # 1. Remove punctuation in a single pass
    nopunc = text.translate(str.maketrans('', '', string.punctuation))

    # 2. Remove stop words. Build the stop-word set once per call, so the corpus
    #    lookup is not repeated for every token and membership tests are O(1).
    stop_words = set(stopwords.words('english'))
    clean_words = [word for word in nopunc.split() if word.lower() not in stop_words]

    # 3. Return the list of clean words
    return clean_words

In [11]:
# Tokenize the first few messages to preview what the analyzer produces
sample_messages = df['v2'].head()
sample_messages.apply(process_text)

0    [Go, jurong, point, crazy, Available, bugis, n...
1                       [Ok, lar, Joking, wif, u, oni]
2    [Free, entry, 2, wkly, comp, win, FA, Cup, fin...
3        [U, dun, say, early, hor, U, c, already, say]
4    [Nah, dont, think, goes, usf, lives, around, t...
Name: v2, dtype: object

In [12]:
from sklearn.feature_extraction.text import CountVectorizer

# Keep the fitted vectorizer: its vocabulary is required to transform any new
# message into the same feature space the classifier is trained on.
# NOTE(review): the vocabulary is learned from every row, before the train/test
# split in the next cell, so test-set tokens shape the feature space.
vectorizer = CountVectorizer(analyzer=process_text)
messages_bow = vectorizer.fit_transform(df['v2'])

In [13]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for testing; the fixed seed makes the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    messages_bow,
    df['v1'],
    test_size=0.2,
    random_state=0,
)

In [14]:
# Sanity-check the split: shapes of the train/test features and labels, plus the full matrix
X_train.shape,X_test.shape,y_train.shape,y_test.shape,messages_bow.shape

((4135, 11304), (1034, 11304), (4135,), (1034,), (5169, 11304))

In [15]:
from sklearn.naive_bayes import MultinomialNB
# Multinomial Naive Bayes with default hyperparameters, trained on the
# bag-of-words counts of the training split
classifier = MultinomialNB()
classifier.fit(X_train, y_train)

MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

In [16]:
# Predicted labels for the training set, followed by the true labels
train_predictions = classifier.predict(X_train)
print(train_predictions)
print(y_train.values)

['ham' 'ham' 'ham' ... 'ham' 'ham' 'ham']
['ham' 'ham' 'ham' ... 'ham' 'ham' 'ham']


In [17]:
# Score the classifier on the data it was trained on
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

train_pred = classifier.predict(X_train)
train_report = classification_report(y_train, train_pred)
train_matrix = confusion_matrix(y_train, train_pred)
train_accuracy = accuracy_score(y_train, train_pred)

print(train_report)
print('Confusion Matrix: \n', train_matrix)
print()
print('Accuracy for training data: ', train_accuracy)

              precision    recall  f1-score   support

         ham       1.00      1.00      1.00      3631
        spam       0.98      0.98      0.98       504

    accuracy                           1.00      4135
   macro avg       0.99      0.99      0.99      4135
weighted avg       1.00      1.00      1.00      4135

Confusion Matrix: 
 [[3623    8]
 [  11  493]]

Accuracy for training data:  0.9954050785973397


In [18]:
# Predicted labels for the held-out test set, followed by the true labels
test_predictions = classifier.predict(X_test)
print('Predicted value: ', test_predictions)
print('Actual value: ', y_test.values)

Predicted value:  ['ham' 'ham' 'ham' ... 'ham' 'ham' 'ham']
Actual value:  ['ham' 'ham' 'ham' ... 'ham' 'ham' 'ham']


In [26]:
# Score the classifier on the held-out test split
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

pred = classifier.predict(X_test)
test_report = classification_report(y_test, pred)
test_matrix = confusion_matrix(y_test, pred)
test_accuracy_pct = accuracy_score(y_test, pred) * 100

print(test_report)
print('Confusion Matrix: \n', test_matrix)
print()
print('Naive Bayes accuracy score in %: ', test_accuracy_pct)

              precision    recall  f1-score   support

         ham       0.99      0.96      0.97       885
        spam       0.80      0.93      0.86       149

    accuracy                           0.96      1034
   macro avg       0.89      0.94      0.92      1034
weighted avg       0.96      0.96      0.96      1034

Confusion Matrix: 
 [[850  35]
 [ 11 138]]

Naive Bayes accuracy score in %:  95.55125725338492
