![title](logo.jpg)

# Naive Bayes
# Spam Classification 

Import required libraries

In [316]:
import numpy as np
import pandas as pd
import nltk
import string
import sklearn
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer

In [317]:
df = pd.read_table('smsspamcollection/SMSSpamCollection',header=None, names=['label', 'sms_message'])

In [318]:
df.head()

Unnamed: 0,label,sms_message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


## Preprocessing

In [319]:
def isHam(x):
    if x == 'ham':
        return 0
    else:
        return 1

df['label'] = df['label'].apply(isHam)

In [320]:
X_train, X_test, Y_train, Y_test = train_test_split(df['sms_message'],  df['label'],  random_state=1)

### Using CountVectorizor from SKlearn

In [321]:
count_vector = CountVectorizer()

X_train = count_vector.fit_transform(X_train).toarray()

X_test = count_vector.transform(X_test).toarray()

In [322]:
X_train = (X_train >= 1)*1
X_test = (X_test >= 1)*1

Y_train = np.array(Y_train)
Y_test = np.array(Y_test)

Naive Bayes Class

In [325]:
class naiveBayes():
    
    def __init__(self):
        self.phi = None
        self.p_y_i = None
        
    def CalculateprobabilityOfC(self,Y,i):
        #return the probability of class c - p(c=i)
        return np.sum((Y == i)*1)/Y.shape[0]
    
    
    
    def EstimatePhi_i(self,X,i,target):
        #estimate the value for phi for class i
        sum_i = X[X[target] == i].shape[0]
        phi_i = (np.sum(X[X[target]==i],axis=0)+1)/(2*sum_i)
        phi_i = np.array(phi_i[0:-1]).reshape(1,-1)
        return phi_i
    
    
    
    def fit(self,X,Y):
        
        target = 'labels'
        X_ = pd.DataFrame(X)
        X_[target] =  Y
        
        Classes = np.unique(Y)
        self.phi = []
        self.p_y_i = []

        for c in Classes:
            self.phi.append(self.EstimatePhi_i(X_,c,target))
            self.p_y_i.append(self.CalculateprobabilityOfC(Y,c))
        return
    
    def predict(self,X):
        
        predections = []
        N = X.shape[0]
        
        Classes = len(self.phi)
        
        for i in range(N):
            prob = []

            for c in range(Classes):
                phi_i = self.phi[c]
                
                test = X[i].reshape(1,-1)

                p = np.zeros((phi_i.shape[0],phi_i.shape[1]))

                p[test > 0] = phi_i[test > 0]
                p[test == 0] = (1-phi_i)[test == 0]
                
                
                p_x_c_i = np.sum(np.log(p)) 
                p_c_i_x = p_x_c_i + np.log(self.p_y_i[c])

                prob.append(p_c_i_x)

            predections.append(prob.index(max(prob)))  
        return predections
    
    def accuracy(self,predections,Y_true):
        accuracy = np.sum((Y_true == predections)*1)/(Y_true.shape[0])
        print ('accuracy: ' + str(round(accuracy, 3)))

In [326]:
NB = naiveBayes()

In [327]:
NB.fit(X_train,Y_train)

In [328]:
predictions = NB.predict(X_test)

In [329]:
NB.accuracy(predictions,Y_test)

accuracy: 0.988
