In [581]:
# imports
import pandas as pd
import numpy as np
from nltk.corpus import stopwords as nltk_stopwords
from nltk.tokenize import word_tokenize as nltk_tokenize
from collections import Counter as collections_counter
import string
from sklearn.feature_extraction.text import CountVectorizer as sk_feature_extract_text_CV
from random import randint as random_randint

Spam/Ham SMS data was obtained from the Kaggle project <br>
https://www.kaggle.com/sid321axn/sms-spam-classifier-naive-bayes-ml-algo.<br>
The data was loaded into a dataframe and the messages were extracted for cleaning.

In [582]:
# load data

_data = pd.read_csv('./spam.csv')

In [583]:
# show head

_data.head()

Unnamed: 0,Category,Message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [584]:
def preprocess(df):

    """
        Modifies data dataframe in place.
            - to lower
            - removes stop words     
            - removes punctuation       
        Parameters:
            df (dataframe): dataframe, column 1 = labels, column 2 = message text
        Returns:
            (None)
    """

    _stopwords = set(nltk_stopwords.words('english'))

    for _m in df.iterrows():
        _m[1][1] = _m[1][1].lower()
        _m[1][1] = _m[1][1].translate(str.maketrans("","", string.punctuation))
        _m[1][1] = nltk_tokenize(_m[1][1])
        _m[1][1] = [_w for _w in _m[1][1] if not _w in _stopwords]
        _m[1][1] = ' '.join(_m[1][1])




In [585]:
# preprocess

preprocess(_data)

The data needs to be separated by spam/ham category for term/document and term frequency analysis.

In [586]:
# separate based on label

_spam = _data[_data['Category'] == 'spam']
_ham = _data[_data['Category'] == 'ham']

We implemented our bag of words by creating a term/document (sparse) matrix and a term frequency (condensed) matrix in order to conduct Baysian analysis. <br>
We used a method to create term/document matrices from the following site: https://www.kaggle.com/sid321axn/sms-spam-classifier-naive-bayes-ml-algo <br>
This method invoved using the Count Vectorizer class from sklearn to create a list of counts of words found in the data as well as a list of the words themselves. <br>
The rows of the dataframe will represent word counts for a given word. <br>
The columns will represent the words found in the messages.

In [587]:
def generate_tdm(df):
    
    """
        generates a term document matrix from input data
        Parameters:
            df (DataFrame): dataframe with column 1 = labels, column 2 = messages
        Returns:
            DataFrame: a term document matrix
    """
    
    _count_vectorizer = sk_feature_extract_text_CV()
    _count_vectorizer.fit(df.iloc[:,1])
    _features = _count_vectorizer.get_feature_names_out()
    _counts = _count_vectorizer.transform(df.iloc[:,1]).toarray()
    return pd.DataFrame(_counts, columns=_features)


In [588]:
# generate tdm for spam and ham

_tdm = generate_tdm(_data)
_spam_tdm = generate_tdm(_spam)
_ham_tdm = generate_tdm(_ham)

The frequency matrices were dataframes with one row for each word found in the dataset an two columns, <br> 
one for the number of times the word appeard and the other for the word's frequency among the messages.

In [589]:
def generate_tfm(df):
    
    """
        generates a term frequency matrix from a term document matrix
        Parameters:
            df (DataFrame): should be a term document matrix
        Returns:
            DataFrame: a term frequency matrix
    """
    _counts = []
    _probability = []
    for _label, _data in df.iteritems():
        _sum = _data.sum()
        _counts.append(_sum)
    _sum_of_all_words = sum(_counts)
    for _c in _counts:
        _probability.append((_c + 1) / (_sum_of_all_words + df.shape[1] + 1)) # shape[1] is the number of unique words in the class
    _out_data = {
        'word': df.columns,
        'count': _counts,
        'probability': _probability
    }
    return pd.DataFrame().from_dict(_out_data)


In [590]:
# generated term frequency matrix

_tfm = generate_tfm(_tdm)
_spam_tfm = generate_tfm(_spam_tdm)
_ham_tfm = generate_tfm(_ham_tdm)

In [591]:
# look at data tfm

_tfm.sort_values(by=['probability'], ascending=False)

Unnamed: 0,word,count,probability
1941,call,575,0.009763
4366,im,464,0.007881
8707,ur,390,0.006627
3737,get,386,0.006559
2913,dont,287,0.004881
...,...,...,...
4042,headstart,1,0.000034
4041,headset,1,0.000034
4035,he,1,0.000034
4034,hdd,1,0.000034


In [592]:
# look at spam tfm

_spam_tfm.sort_values(by=['probability'], ascending=False)

Unnamed: 0,word,count,probability
972,call,344,0.023339
1367,free,216,0.014680
2607,txt,150,0.010215
2644,ur,144,0.009809
1825,mobile,123,0.008389
...,...,...,...
1205,disaster,1,0.000135
1203,dirty,1,0.000135
1202,dirtiest,1,0.000135
1200,dining,1,0.000135


In [593]:
# look at ham tfm

_ham_tfm.sort_values(by=['probability'], ascending=False)

Unnamed: 0,word,count,probability
3194,im,451,0.010015
2645,get,303,0.006735
3889,ltgt,276,0.006137
4574,ok,273,0.006071
1928,dont,265,0.005894
...,...,...,...
3149,hwkeep,1,0.000044
3148,hwd,1,0.000044
3145,hut,1,0.000044
3144,hustle,1,0.000044


We needed the probabilty for each class; the probability that a given message was spam or ham.

In [594]:
def calculate_class_probability(df):

    """
        spam = #_spam/#_messages; ham = #_ham/#_messages
        Parameters:
            df (DataFrame): the data set; can be pre or post processed
        Returns: 
            (dict): {spam: (float), ham: (float)}
    """

    _count = df.shape[0]
    _p = {
        'spam': df[df['Category'] == 'spam'].count()[0] / _count,
        'ham': df[df['Category'] == 'ham'].count()[0] / _count
    }
    return _p

In [595]:
# calculate probabilites for each class

_p_classes = calculate_class_probability(_data)
print(_p_classes)

{'spam': 0.13406317300789664, 'ham': 0.8659368269921034}


In [599]:
def calculate_probability(message, p_class, class_tfm):

    """
        calculates the probability that a given message belongs to 
            the class of the class_tfm
        Parameters:
            message (string): a single message from the data set
            p_class (float): probability of class in data set
            class_tfm (DataFrame): message probability calculated
                for this class's tfm
            data_tfm (DataFrame): data set's term frequency matrix
        Returns:
            float: the probability that the message belongs to the 
                label of the supplied class_tfm
    """

    _words = message.split()
    _p = 1
    for _w in _words:
        
        # word not in class_tfm
        _row = class_tfm[class_tfm['word'] == _w]
        if _row.empty:
            _p_word = 1 / (1 + class_tfm.shape[0])  # shape[0] is the number of unique words in the class

        # word in class_tfm
        else:
            _p_word = _row['probability'].iloc[0]

        # product
        _p *= _p_word

    return _p * p_class


In [597]:
# make predictions

_predictions = []
for _row in _data.iterrows():
    print(_row[1][1])
    _p_spam = calculate_probability(_row[1][1], _p_classes['spam'], _spam_tfm)
    _p_ham = calculate_probability(_row[1][1], _p_classes['ham'], _ham_tfm)
    _prediction = 'spam'
    if _p_ham > _p_spam:
        _prediction = 'ham'
    _predictions.append(_prediction)

go jurong point crazy available bugis n great world la e buffet cine got amore wat
go: 1 * 0.0021647950209714517 = 0.0021647950209714517
jurong: 0.0021647950209714517 * 0.00034722222222222224 = 7.516649378373097e-07
point: 7.516649378373097e-07 * 0.00034722222222222224 = 2.609947700823992e-10
crazy: 2.609947700823992e-10 * 0.0004058990664321472 = 1.0593753352011872e-13
available: 1.0593753352011872e-13 * 0.00027059937762143147 = 2.866663063729366e-17
bugis: 2.866663063729366e-17 * 0.00034722222222222224 = 9.953691193504744e-21
n: 9.953691193504744e-21 * 0.00034722222222222224 = 3.456142775522481e-24
great: 3.456142775522481e-24 * 0.0008117981328642944 = 2.80569025208157e-27
world: 2.80569025208157e-27 * 0.00013529968881071573 = 3.79609018005895e-31
la: 3.79609018005895e-31 * 0.00034722222222222224 = 1.3180868680760244e-34
e: 1.3180868680760244e-34 * 0.00034722222222222224 = 4.576690514152863e-38
buffet: 4.576690514152863e-38 * 0.00034722222222222224 = 1.5891286507475217e-41
cine: 1.589

In [602]:
# compare predictions

_sum = 0
for _i,_row in enumerate(_data.iterrows()):
    if _row[1][0] != _predictions[_i]:
        continue
    _sum += 1
_score = _sum / _data.shape[0]
print(_score)

0.7681263460157932
