In [19]:
import numpy as np # for all implementations
import pandas as pd # only for data pre-processing
import math

## Data Preprocessing

In [20]:
df = pd.read_csv('sms_spam_dataset.csv' , encoding='latin-1')
df

Unnamed: 0,v1,v2,Unnamed: 2,Unnamed: 3,Unnamed: 4
0,ham,"Go until jurong point, crazy.. Available only ...",,,
1,ham,Ok lar... Joking wif u oni...,,,
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,,,
3,ham,U dun say so early hor... U c already then say...,,,
4,ham,"Nah I don't think he goes to usf, he lives aro...",,,
...,...,...,...,...,...
5567,spam,This is the 2nd time we have tried 2 contact u...,,,
5568,ham,Will Ì_ b going to esplanade fr home?,,,
5569,ham,"Pity, * was in mood for that. So...any other s...",,,
5570,ham,The guy did some bitching but I acted like i'd...,,,


In [21]:
df2 = df[['v1', 'v2']]
df2 = df2.rename(columns={"v2": "text", "v1": "label"})
df2

Unnamed: 0,label,text
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."
...,...,...
5567,spam,This is the 2nd time we have tried 2 contact u...
5568,ham,Will Ì_ b going to esplanade fr home?
5569,ham,"Pity, * was in mood for that. So...any other s..."
5570,ham,The guy did some bitching but I acted like i'd...


In [22]:
df2['label'] = (df2['label'] == 'spam')
df2

Unnamed: 0,label,text
0,False,"Go until jurong point, crazy.. Available only ..."
1,False,Ok lar... Joking wif u oni...
2,True,Free entry in 2 a wkly comp to win FA Cup fina...
3,False,U dun say so early hor... U c already then say...
4,False,"Nah I don't think he goes to usf, he lives aro..."
...,...,...
5567,True,This is the 2nd time we have tried 2 contact u...
5568,False,Will Ì_ b going to esplanade fr home?
5569,False,"Pity, * was in mood for that. So...any other s..."
5570,False,The guy did some bitching but I acted like i'd...


In [23]:
# remove punctuations
df2['text'] = df2['text'].str.replace('[^\w\s]','')
df2

Unnamed: 0,label,text
0,False,Go until jurong point crazy Available only in ...
1,False,Ok lar Joking wif u oni
2,True,Free entry in 2 a wkly comp to win FA Cup fina...
3,False,U dun say so early hor U c already then say
4,False,Nah I dont think he goes to usf he lives aroun...
...,...,...
5567,True,This is the 2nd time we have tried 2 contact u...
5568,False,Will Ì_ b going to esplanade fr home
5569,False,Pity was in mood for that Soany other suggest...
5570,False,The guy did some bitching but I acted like id ...


In [24]:
# lowerize the strings
df2['text'] = df2['text'].apply(lambda x: " ".join(str(x).lower() for x in str(x).split()))
df2

Unnamed: 0,label,text
0,False,go until jurong point crazy available only in ...
1,False,ok lar joking wif u oni
2,True,free entry in 2 a wkly comp to win fa cup fina...
3,False,u dun say so early hor u c already then say
4,False,nah i dont think he goes to usf he lives aroun...
...,...,...
5567,True,this is the 2nd time we have tried 2 contact u...
5568,False,will ì_ b going to esplanade fr home
5569,False,pity was in mood for that soany other suggestions
5570,False,the guy did some bitching but i acted like id ...


In [25]:
#remove special characters
df2['text'].replace(regex=True, inplace=True, to_replace= r'[^0-9a-z ]', value= r'')
df2

Unnamed: 0,label,text
0,False,go until jurong point crazy available only in ...
1,False,ok lar joking wif u oni
2,True,free entry in 2 a wkly comp to win fa cup fina...
3,False,u dun say so early hor u c already then say
4,False,nah i dont think he goes to usf he lives aroun...
...,...,...
5567,True,this is the 2nd time we have tried 2 contact u...
5568,False,will b going to esplanade fr home
5569,False,pity was in mood for that soany other suggestions
5570,False,the guy did some bitching but i acted like id ...


In [26]:
# strip off extra spaces
df2['text'] = df2['text'].apply(lambda x: x.strip())
df2

Unnamed: 0,label,text
0,False,go until jurong point crazy available only in ...
1,False,ok lar joking wif u oni
2,True,free entry in 2 a wkly comp to win fa cup fina...
3,False,u dun say so early hor u c already then say
4,False,nah i dont think he goes to usf he lives aroun...
...,...,...
5567,True,this is the 2nd time we have tried 2 contact u...
5568,False,will b going to esplanade fr home
5569,False,pity was in mood for that soany other suggestions
5570,False,the guy did some bitching but i acted like id ...


In [27]:
# drop duplicate entries
df2.drop_duplicates()
df2

Unnamed: 0,label,text
0,False,go until jurong point crazy available only in ...
1,False,ok lar joking wif u oni
2,True,free entry in 2 a wkly comp to win fa cup fina...
3,False,u dun say so early hor u c already then say
4,False,nah i dont think he goes to usf he lives aroun...
...,...,...
5567,True,this is the 2nd time we have tried 2 contact u...
5568,False,will b going to esplanade fr home
5569,False,pity was in mood for that soany other suggestions
5570,False,the guy did some bitching but i acted like id ...


In [28]:
# generate word count
df2['word_count'] = df2['text'].apply(lambda x: len(str(x).split(" ")))
df2[['text','word_count']].head()
print(df2['word_count'].sum())
df2.head()

85071


Unnamed: 0,label,text,word_count
0,False,go until jurong point crazy available only in ...,20
1,False,ok lar joking wif u oni,6
2,True,free entry in 2 a wkly comp to win fa cup fina...,28
3,False,u dun say so early hor u c already then say,11
4,False,nah i dont think he goes to usf he lives aroun...,13


In [29]:
# compute the size of vocabulary
vocabulary = []
for x in df2['text']:
  vocabulary = vocabulary + x.split()
vocabulary = list(set(vocabulary))
print("Vocabulary Size:")
print(len(vocabulary))

Vocabulary Size:
9477


## Split test and train dataset

In [30]:
split_ratio = 0.75

df_train = df2.sample(frac=split_ratio,random_state=200) #random state is a seed value
df_test = df2.drop(df_train.index).sample(frac=1.0)

## Multinomial Naive Bayes Implementation

In [34]:
class MultiNomialNB(object):
    def __init__(self, df, smoothing = 1):
        spam_list = df[df['label'] == 1]
        ham_list = df[df['label'] == 0]
        
        spam_len = len(spam_list)
        ham_len = len(ham_list)
        
        self.spam_prob_dict = self.log_probs(spam_list, smoothing)
        self.ham_prob_dict = self.log_probs(ham_list, smoothing)
        self.spam_prob = self.calc_prob(spam_len, 0, spam_len + ham_len, 1)
        self.ham_prob = self.calc_prob(ham_len, 0, spam_len + ham_len, 1)
        p = math.exp(self.spam_prob)
        prob_dict = {}
        
        for key, val in self.spam_prob_dict.items():
            if key in self.ham_prob_dict.keys():
                cur_prob = p*math.exp(self.spam_prob_dict[key])
                cur_prob += (1.0 - p)*math.exp(self.ham_prob_dict[key])
                prob_dict[key] = math.log(cur_prob)

    def calc_prob(self, cur_count, alpha, total_count, V):
        numerator = cur_count + alpha + 0.000
        denominator = total_count + alpha*(V + 1.000) + 0.000
        result = math.log(numerator/ denominator)
        return result    

    def log_probs(self, df, smoothing):
        word_count = {}
        prob = {}
        for index, sms in df.iterrows():
            tokens = sms['text'].split()
            for token in tokens:
                if token not in word_count.keys():
                    word_count[token] = 1
                else:
                    word_count[token] += 1

        V = len(word_count)
        total_word_count = 0
        for word, count in word_count.items():
            total_word_count += count

        for word, count in word_count.items():
            prob[word] = self.calc_prob(word_count[word], smoothing,
            total_word_count, V)
        prob["<UNK>"] = self.calc_prob(0, smoothing,
            total_word_count, V)

        return prob

    def is_spam(self, sms_data):
        tokens = sms_data['text'].split()
        p_spam = self.spam_prob
        p_not_spam = self.ham_prob

        for word in tokens:
            if word in self.spam_prob_dict.keys():
                p_spam += self.spam_prob_dict[word]
            else:
                p_spam += self.spam_prob_dict["<UNK>"]
        
        for word in tokens:
            if word in self.ham_prob_dict.keys():
                p_not_spam += self.ham_prob_dict[word]
            else:
                p_not_spam += self.ham_prob_dict["<UNK>"]

        if p_spam >= p_not_spam:
            return True
        return False

    def accuracy(self, df):
        tot = len(df)
        freq = 0
        for index, sms in df.iterrows():
            if self.is_spam(sms) == sms['label']:
                freq += 1
        return freq * 100 / tot

In [42]:
spam_filter = MultiNomialNB(df_train, smoothing = 0.5)
print(f"Accuracy : {spam_filter.accuracy(df_test)}%")

Accuracy : 96.76956209619526%


## Multi-variate Bernoulli Naive Bayes

In [36]:
class MultiVariateBernoulliNB(object):
    def __init__(self, df, smoothing = 1):
        spam_list = df[df['label'] == 1]
        ham_list = df[df['label'] == 0]
        
        spam_len = len(spam_list)
        ham_len = len(ham_list)
        
        self.spam_prob_dict = self.log_probs(spam_list, smoothing)
        self.ham_prob_dict = self.log_probs(ham_list, smoothing)
        self.spam_prob = self.calc_prob(spam_len, 0, spam_len + ham_len, 1)
        self.ham_prob = self.calc_prob(ham_len, 0, spam_len + ham_len, 1)
        p = math.exp(self.spam_prob)
        prob_dict = {}
        
        for key, val in self.spam_prob_dict.items():
            if key in self.ham_prob_dict.keys():
                cur_prob = p*math.exp(self.spam_prob_dict[key])
                cur_prob += (1.0 - p)*math.exp(self.ham_prob_dict[key])
                prob_dict[key] = math.log(cur_prob)

    def calc_prob(self, cur_count, alpha, total_count, V):
        numerator = cur_count + alpha + 0.000
        denominator = total_count + alpha*(V + 1.000) + 0.000
        result = math.log(numerator/ denominator)
        return result    

    def log_probs(self, df, smoothing):
        word_count = {}
        prob = {}
        for index, sms in df.iterrows():
            tokens = sms['text'].split()
            tokens = set(tokens)
            for token in tokens:
                if token not in word_count.keys():
                    word_count[token] = 1
                else:
                    word_count[token] += 1

        V = len(word_count)
        total_word_count = 0
        for word, count in word_count.items():
            total_word_count += count

        for word, count in word_count.items():
            prob[word] = self.calc_prob(word_count[word], smoothing,
            total_word_count, V)
        prob["<UNK>"] = self.calc_prob(0, smoothing,
            total_word_count, V)

        return prob

    def is_spam(self, sms_data):
        tokens = sms_data['text'].split()
        tokens = set(tokens)
        p_spam = self.spam_prob
        p_not_spam = self.ham_prob

        for word in tokens:
            if word in self.spam_prob_dict.keys():
                p_spam += self.spam_prob_dict[word]
            else:
                p_spam += self.spam_prob_dict["<UNK>"]
        
        for word in tokens:
            if word in self.ham_prob_dict.keys():
                p_not_spam += self.ham_prob_dict[word]
            else:
                p_not_spam += self.ham_prob_dict["<UNK>"]

        if p_spam >= p_not_spam:
            return True
        return False

    def accuracy(self, df):
        tot = len(df)
        freq = 0
        for index, sms in df.iterrows():
            if self.is_spam(sms) == sms['label']:
                freq += 1
        return freq * 100 / tot

In [38]:
spam_filter = MultiVariateBernoulliNB(df_train, smoothing = 1)
print(f"Accuracy : {spam_filter.accuracy(df_test)}%")

Accuracy : 95.83632447954056%


## Gaussian Naive Bayes

In [31]:
def convertDataframeToMatrix(df):
  X = np.zeros((len(df), len(vocabulary)))
  for i in range(X.shape[0]):
    for j in range(X.shape[1]):
      if vocabulary[j] in df.iloc[i].text:
        X[i][j] += 1
  return X

In [32]:
class GaussianNB:
  phi = None
  mu = None
  E_inv = None
  def fit(self, X, Y):
    self.phi = Y.mean()
    
    self.mu = np.zeros((2, X.shape[1]))
    for i in range(2):
      self.mu[i] = X[Y == i].mean(axis = 0)
    
    X_central = X.copy()
    for i in range(2):
      X_central[Y == i] =  X_central[Y == i] - self.mu[i]
    E = np.dot(X_central.T, X_central) / (X.shape[0])
    self.E_inv = np.linalg.pinv(E)
    
  def getProbability(self, X, y):
    p_y = np.power(self.phi, y) * np.power(1 - self.phi, 1 - y)
    res = np.exp(-1 * np.sum( np.dot((X - self.mu[y]), self.E_inv)*(X - self.mu[y]) , axis = 1))
    res = res * p_y
    return res

  def predict(self, X):
    prob = np.zeros((2, X.shape[0]))
    for i in range(2):
      prob[i] = self.getProbability(X, i)

    return np.argmax(prob, axis = 0)
  
  def getAccuracy(self, X, Y):
    return (self.predict(X) == Y).mean()

In [33]:
# GDA_model = GaussianNB()
# GDA_model.fit(X_train,Y_train)
# print(f"Accuracy : { round(GDA_model.getAccuracy(X_test,Y_test) * 100, 2) }%")

## Analysis

Comparison and results can be found in "analysis.pdf"