In [9]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
#download nltk's 'stopwords' for removal in pre-process
import nltk
nltk.download('stopwords')
from sklearn.naive_bayes import GaussianNB
from sklearn import model_selection
from sklearn.model_selection import train_test_split,cross_validate
#next step is to pre-process text data 
#firstly import useful functions
import string
from nltk.corpus import stopwords

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\cvveljanovski\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [2]:
# Location of the SMS spam dataset (CSV).
# The original literal combined a raw-string prefix with escaped backslashes, which
# put doubled separators into the path; a raw string needs single backslashes.
# NOTE(review): this is an absolute path on one machine - consider a path relative
# to the notebook so the cell runs elsewhere.
DATA_PATH = r"C:\Users\cvveljanovski\Desktop\Exercising Naive Bayes\sms_spam.csv"
data = pd.read_csv(DATA_PATH)

In [3]:
# Preview the first five rows of the raw frame.
data.head(n=5)

Unnamed: 0,type,text
0,ham,Hope you are having a good week. Just checking in
1,ham,K..give back my thanks.
2,ham,Am also doing in cbe only. But have to pay.
3,spam,"complimentary 4 STAR Ibiza Holiday or £10,000 ..."
4,spam,okmail: Dear Dave this is your final notice to...


In [21]:
# Work on an independent deep copy so the raw frame `data` stays untouched.
df = data.copy(deep=True)

In [22]:
#    define a function to:
#    1. Remove punctuations
#    2. tokenize the terms in each text message
#    3. Remove stopwords

def text_process(content, stop_words=None):
    """Strip punctuation from a message and return its non-stopword tokens.

    Parameters
    ----------
    content : str
        Raw message text.
    stop_words : iterable of str, optional
        Lower-case words to drop. When omitted, NLTK's English stopword
        list (``stopwords.words('english')``) is used.

    Returns
    -------
    list of str
        Whitespace-separated tokens, in their original case, with every
        ``string.punctuation`` character removed and every token whose
        lower-case form is a stopword dropped.
    """
    # Resolve the stopword collection once per call. The original called
    # stopwords.words('english') again for every token and searched the
    # returned list linearly; a frozenset gives constant-time lookups.
    if stop_words is None:
        stop_words = stopwords.words('english')
    stop_words = frozenset(stop_words)

    # Delete punctuation characters outright (no space is inserted, so
    # "K..give" becomes "Kgive").
    no_punct = content.translate(str.maketrans('', '', string.punctuation))

    return [term for term in no_punct.split() if term.lower() not in stop_words]

In [23]:
# Sanity check: run text_process over the first five messages.
df['text'].iloc[:5].apply(text_process)

0                         [Hope, good, week, checking]
1                                [Kgive, back, thanks]
2                                     [also, cbe, pay]
3    [complimentary, 4, STAR, Ibiza, Holiday, £1000...
4    [okmail, Dear, Dave, final, notice, collect, 4...
Name: text, dtype: object

In [24]:
# Next step: vectorize each term, then weight it with the tf-idf model.
# First, use CountVectorizer to measure the frequency of each term.

from sklearn.feature_extraction.text import CountVectorizer
bow_process = CountVectorizer(analyzer=text_process)
bow_process.fit(df['text'])  # learn the vocabulary from the messages; fit() returns the same object

print(len(bow_process.vocabulary_))  # number of distinct terms (feature columns)

11355


In [25]:
# Encode every message as a row of term counts (documents x vocabulary terms).
df_bow = bow_process.transform(raw_documents=df['text'])

In [26]:
# Re-weight the raw term counts with tf-idf.
from sklearn.feature_extraction.text import TfidfTransformer
tfidf_trans = TfidfTransformer()
tfidf_trans.fit(df_bow)  # fit() returns the same transformer object
df_tfidf = tfidf_trans.transform(df_bow)

print(df_tfidf.shape)  # size of the weighted matrix: (messages, vocabulary terms)

(5559, 11355)


In [27]:
from sklearn.naive_bayes import MultinomialNB
# Train the multinomial Naive Bayes classifier on the tf-idf features,
# using the 'type' column (ham/spam) as labels.
naive_bayes_model = MultinomialNB().fit(X=df_tfidf, y=df['type'])

In [28]:
# Preview the first five rows of the working copy.
df.head(n=5)

Unnamed: 0,type,text
0,ham,Hope you are having a good week. Just checking in
1,ham,K..give back my thanks.
2,ham,Am also doing in cbe only. But have to pay.
3,spam,"complimentary 4 STAR Ibiza Holiday or £10,000 ..."
4,spam,okmail: Dear Dave this is your final notice to...


In [29]:
# Tokenize the first five messages again (same check as the earlier cell).
df['text'].head(n=5).apply(text_process)

0                         [Hope, good, week, checking]
1                                [Kgive, back, thanks]
2                                     [also, cbe, pay]
3    [complimentary, 4, STAR, Ibiza, Holiday, £1000...
4    [okmail, Dear, Dave, final, notice, collect, 4...
Name: text, dtype: object

In [30]:
# Re-encode the messages as term counts and tf-idf weights for prediction.
# NOTE: df['text'] is the same data the vectorizer, tf-idf transformer and
# classifier were fitted on in the cells above - no held-out test set is created
# in the visible cells, so these are the training features recomputed.
df_bow= bow_process.transform(df['text'])
df_tfidf=tfidf_trans.transform(df_bow) 
# The vectorized data can now be passed to a classifier.
# NOTE(review): the original comment mentioned "three models"; only the Naive Bayes
# model appears in the visible cells - confirm against the rest of the notebook.

print('Shape :', df_bow.shape)

Shape : (5559, 11355)


In [31]:
from sklearn.metrics import classification_report
# Predict labels with the fitted Naive Bayes model.
# Note: df_tfidf is built from the same messages the model was trained on.
NB_predict = naive_bayes_model.predict(X=df_tfidf)

In [32]:
# Per-class precision/recall/f1 of the NB model. The predictions were made on the
# data used for fitting, so these are training-set metrics.
print(classification_report(y_true=df['type'], y_pred=NB_predict))

              precision    recall  f1-score   support

         ham       0.98      1.00      0.99      4812
        spam       1.00      0.85      0.92       747

   micro avg       0.98      0.98      0.98      5559
   macro avg       0.99      0.92      0.95      5559
weighted avg       0.98      0.98      0.98      5559



In [33]:
# Bundle the pre-processing steps used above and the classifier into a single
# pipeline, so the whole workflow is stored in one object for further study.

from sklearn.pipeline import Pipeline

NB_classifier = Pipeline(steps=[
    # 1. tokenize each message with text_process and count the terms
    ('bow', CountVectorizer(analyzer=text_process)),
    # 2. weight the term counts with tf-idf
    ('tfidf', TfidfTransformer()),
    # 3. classify with multinomial Naive Bayes
    ('classifier', MultinomialNB()),
])

In [None]:
# Evaluate on held-out messages. The original fitted and predicted on the very
# same rows, which reports training-set performance and overstates how well the
# model generalizes. A stratified split keeps the ham/spam ratio equal in both
# parts; the fixed random_state makes the split reproducible.
text_train, text_test, type_train, type_test = train_test_split(
    df['text'],
    df['type'],
    test_size=0.2,
    random_state=42,
    stratify=df['type'],
)

NB_classifier.fit(text_train, type_train)          # train on the training part only
NB_prediction = NB_classifier.predict(text_test)   # predict the unseen messages
print(classification_report(type_test, NB_prediction))  # held-out evaluation report

# Refit on all labelled data so the stored pipeline ends in the same state the
# original cell left it in (trained on every message) for any later use.
NB_classifier.fit(df['text'], df['type'])