In [16]:
# imports
import pandas as pd
import numpy as np
from nltk.corpus import stopwords as nltk_stopwords
from nltk.tokenize import word_tokenize as nltk_tokenize
from collections import Counter as collections_counter
import string
from sklearn.feature_extraction.text import CountVectorizer as sk_feature_extract_text_CV
from random import randint as random_randint

Spam/Ham SMS data was obtained from the Kaggle project <br>
https://www.kaggle.com/sid321axn/sms-spam-classifier-naive-bayes-ml-algo.<br>
The data was loaded into a dataframe and the messages were extracted for cleaning.

In [17]:
# load data
_data = pd.read_csv('./spam.csv')

In [18]:
# show head

_data.head()

Unnamed: 0,Category,Message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [19]:
def preprocess(df):

    """
        Modifies data dataframe in place.
            - to lower
            - removes stop words            
        Parameters:
            df (dataframe): dataframe, column 1 = labels, column 2 = message text
        Returns:
            (None)
    """

    for _m in df.iterrows():
        _m[1][1] = _m[1][1].lower()
        _m[1][1] = _m[1][1].translate(str.maketrans("","", string.punctuation))
        print(_m[1][1])
        return




In [20]:
preprocess(_data)

(0, Category                                                  ham
Message     go until jurong point crazy available only in ...
Name: 0, dtype: object)


In [160]:
# tokenize

for _m in _messages.iterrows():
    _m[1][0] = nltk_tokenize(_m[1][0])

In [161]:
# check tokenization

_messages.head()

Unnamed: 0,messages
0,"[go, until, jurong, point, crazy, available, o..."
1,"[ok, lar, joking, wif, u, oni]"
2,"[free, entry, in, 2, a, wkly, comp, to, win, f..."
3,"[u, dun, say, so, early, hor, u, c, already, t..."
4,"[nah, i, dont, think, he, goes, to, usf, he, l..."


In [162]:
# remove stopwords

_stopwords = set(nltk_stopwords.words('english'))
for _m in _messages.iterrows():
    _m[1][0] = [_w for _w in _m[1][0] if not _w in _stopwords]

In [163]:
# check stop word removal

_messages.head()

Unnamed: 0,messages
0,"[go, jurong, point, crazy, available, bugis, n..."
1,"[ok, lar, joking, wif, u, oni]"
2,"[free, entry, 2, wkly, comp, win, fa, cup, fin..."
3,"[u, dun, say, early, hor, u, c, already, say]"
4,"[nah, dont, think, goes, usf, lives, around, t..."


We implemented our bag of words by creating a term/document (sparse) matrix and a term frequency (condensed) matrix in order to conduct Baysian analysis. <br>
We used a method to create term/document matrices from the following site: https://www.kaggle.com/sid321axn/sms-spam-classifier-naive-bayes-ml-algo <br>
This method invoved using the Count Vectorizer class from sklearn to create a list of counts of words found in the data as well as a list of the words themselves. <br>
The rows of the dataframe will represent word counts for a given word. <br>
The columns will represent the words found in the messages.

In [164]:
# reformat data frame from lists to strings

_message_list = []
for _m in _messages.iterrows():
    _message_list.append(' '.join(_m[1][0]))
_messages = pd.DataFrame({'messages': _message_list})

In [165]:
# count vectorizer can create a term document matrix from a dataframe of strings

_count_vectorizer = sk_feature_extract_text_CV()
_count_vectorizer.fit(_messages_as_list)


CountVectorizer()

In [167]:
# get features and count listt

_features = _count_vectorizer.get_feature_names_out()
_counts = _count_vectorizer.transform(_messages_as_list).toarray()

In [168]:
# display some features

for _ in range(5):
    _p = random_randint(0, len(_features))
    print(_features[_p])

kallis
45
wotz
worldvery
listen


In [170]:
# show some counts 

for _ in range(5):
    _p = random_randint(0, len(_counts))
    print(_counts[_p])

[0 0 0 ... 0 0 0]
[0 0 0 ... 0 0 0]
[0 0 0 ... 0 0 0]
[0 0 0 ... 0 0 0]
[0 0 0 ... 0 0 0]


In [171]:
# create term/document matrix dataframe

_tdm = pd.DataFrame(_counts, columns=_features)

In [172]:
# show tdm head

_tdm.head()

Unnamed: 0,008704050406,0089my,0121,01223585236,01223585334,0125698789,02,020603,0207,02070836089,...,zeros,zhong,zindgi,zoe,zogtorius,zoom,zouk,zyada,üll,〨ud
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [173]:
# check tdm dimensions

_tdm.shape

(5572, 9426)

The frequency matrices were dataframes with one row for each word found in the dataset an two columns, <br> 
one for the number of times the word appeard and the other for the word's frequency among the messages.

In [214]:
def make_term_frequncy_matrix(df):
    
    """
        
    """

    _row_count = df.shape[0]
    _counts = []
    _frequencies = []
    for _label, _data in df.iteritems():
        _counts.append(_data.sum())
        _frequencies.append(1 - _data.value_counts()[0] / _row_count)
    _out_data = {
        'terms': df.columns,
        'count': _counts,
        'frequency': _frequencies
    }
    return pd.DataFrame().from_dict(_out_data)


In [215]:
# generated term frequency matrix

_tfm = make_term_frequncy_matrix(_tdm)

In [216]:
# look at term frequency matrix

_tfm.head()

Unnamed: 0,terms,count,frequency
0,008704050406,2,0.000359
1,0089my,1,0.000179
2,0121,1,0.000179
3,01223585236,1,0.000179
4,01223585334,2,0.000359
