In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from collections import Counter


In [2]:

data=pd.read_csv('spam_clean.csv',encoding='latin-1')
data.head()

Unnamed: 0,type,message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


# cleaning messages

In [3]:
import nltk 
import re
nltk.download('punkt_tab')
nltk.download('stopwords')
from nltk import word_tokenize,sent_tokenize
from nltk.corpus import stopwords

[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:\Users\SONY\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\SONY\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [4]:
def clean_message(sentence):
  words=nltk.word_tokenize(sentence)
  cleaned_msg=""
  for word in words:
    word=word.lower()
    word=re.sub(r'[^a-z]','',word)
    if word!="" and word not in stopwords.words('english'):
      cleaned_msg=cleaned_msg+" "+word
  return cleaned_msg.strip()
  
    

In [5]:
data['cleaned_msg']=data['message'].apply(clean_message)
data.head()

Unnamed: 0,type,message,cleaned_msg
0,ham,"Go until jurong point, crazy.. Available only ...",go jurong point crazy available bugis n great ...
1,ham,Ok lar... Joking wif u oni...,ok lar joking wif u oni
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,free entry wkly comp win fa cup final tkts st ...
3,ham,U dun say so early hor... U c already then say...,u dun say early hor u c already say
4,ham,"Nah I don't think he goes to usf, he lives aro...",nah nt think goes usf lives around though


In [6]:
data.shape

(5572, 3)

# data preparation for naive bayes

In [58]:
class NaiveBayes:
    def __init__(self):
        self.common_words=[]

    
    def fit(self,X,y,common_words=10):
        self.X=X
        self.y=y

        self.prior_prob=self.y.value_counts(normalize=True)
        ham=""
        spam=""
        ham=ham.join(self.X[self.y=='ham'])
        ham_words=ham.split()
        spam=spam.join(self.X[self.y=='spam'])
        spam_words=spam.split()

        ham_word_count=np.array(Counter(ham_words).most_common(common_words))[:,0]
        spam_word_count=np.array(Counter(spam_words).most_common(common_words))[:,0]

        self.features=ham_word_count
        self.features=np.append(self.features,spam_word_count)
        self.features=np.unique(self.features)
        

        count_matrix=pd.DataFrame(np.zeros((self.X.shape[0],self.features.shape[0])))
        count_matrix.columns=self.features

        self.word_frequency_matrix= pd.concat([self.X,count_matrix],axis=1)
        self.word_count()
        self.word_frequency_matrix.drop(columns='cleaned_msg',inplace=True)

        self.word_prob=pd.DataFrame(np.sum(self.word_frequency_matrix)/np.sum(self.word_frequency_matrix).sum(),columns=["total"])
        self.word_prob["ham"]=np.sum(self.word_frequency_matrix[self.y=="ham"])/np.sum(self.word_frequency_matrix[self.y=="ham"]).sum()
        self.word_prob["spam"]=np.sum(self.word_frequency_matrix[self.y=="spam"])/np.sum(self.word_frequency_matrix[self.y=="spam"]).sum()

    
    def predict(self,query):
        # Assuming query is a clean message
        words=self.give_me_words_from_features(query)
        spam_prob=(np.prod(self.word_prob.loc[words,"spam"])*self.prior_prob["spam"]+0.00001)/(np.prod(self.word_prob.loc[words,"total"])+2*0.00001)
        ham_prob=(np.prod(self.word_prob.loc[words,"ham"])*self.prior_prob["ham"]+0.00001)/(np.prod(self.word_prob.loc[words,"total"])+2*0.00001)

        
        return (spam_prob,ham_prob)
        
        
        
    
        
    
    def word_count(self):
        for idx,row in enumerate(self.word_frequency_matrix['cleaned_msg']):
            word_count=Counter(self.give_me_words_from_features(row))
            self.word_frequency_matrix.loc[idx,word_count.keys()]=word_count.values()

    def give_me_words_from_features(self,sentence):
        return [word for word in sentence.split() if word in self.features]

In [59]:
NB=NaiveBayes()
NB.fit(data['cleaned_msg'],data['type'])
print(NB.predict("Hi this is a free call , claim ur lt"))

(0.48940822520768307, 0.48940822520768307)


  return reduction(axis=axis, out=out, **passkwargs)
  return reduction(axis=axis, out=out, **passkwargs)


In [54]:
word_dist

Unnamed: 0,0,ham,spam
call,0.105082,0.061534,0.197183
claim,0.020438,0.0,0.063662
free,0.050642,0.015717,0.124507
get,0.070718,0.081779,0.047324
go,0.051185,0.066596,0.018592
got,0.045578,0.065264,0.003944
gt,0.057515,0.08471,0.0
like,0.044131,0.061534,0.007324
lt,0.057153,0.084177,0.0
mobile,0.02514,0.003996,0.069859
