In [1]:
import numpy as np
import pandas as pd

In [2]:
df = pd.read_csv('sms_spam_dataset.csv' , encoding='latin-1')
df2 = df[['v1', 'v2']]
df2 = df2.rename(columns={"v2": "text", "v1": "label"})
df2

Unnamed: 0,label,text
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."
...,...,...
5567,spam,This is the 2nd time we have tried 2 contact u...
5568,ham,Will Ì_ b going to esplanade fr home?
5569,ham,"Pity, * was in mood for that. So...any other s..."
5570,ham,The guy did some bitching but I acted like i'd...


In [3]:
# remove punctuations
df2['text'] = df2['text'].str.replace('[^\w\s]','')
df2

Unnamed: 0,label,text
0,ham,Go until jurong point crazy Available only in ...
1,ham,Ok lar Joking wif u oni
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor U c already then say
4,ham,Nah I dont think he goes to usf he lives aroun...
...,...,...
5567,spam,This is the 2nd time we have tried 2 contact u...
5568,ham,Will Ì_ b going to esplanade fr home
5569,ham,Pity was in mood for that Soany other suggest...
5570,ham,The guy did some bitching but I acted like id ...


In [4]:
df2['text'] = df2['text'].apply(lambda x: " ".join(str(x).lower() for x in str(x).split()))
df2

Unnamed: 0,label,text
0,ham,go until jurong point crazy available only in ...
1,ham,ok lar joking wif u oni
2,spam,free entry in 2 a wkly comp to win fa cup fina...
3,ham,u dun say so early hor u c already then say
4,ham,nah i dont think he goes to usf he lives aroun...
...,...,...
5567,spam,this is the 2nd time we have tried 2 contact u...
5568,ham,will ì_ b going to esplanade fr home
5569,ham,pity was in mood for that soany other suggestions
5570,ham,the guy did some bitching but i acted like id ...


In [5]:
#remove special characters
df2['text'].replace(regex=True, inplace=True, to_replace= r'[^0-9a-z ]', value= r'')
df2

Unnamed: 0,label,text
0,ham,go until jurong point crazy available only in ...
1,ham,ok lar joking wif u oni
2,spam,free entry in 2 a wkly comp to win fa cup fina...
3,ham,u dun say so early hor u c already then say
4,ham,nah i dont think he goes to usf he lives aroun...
...,...,...
5567,spam,this is the 2nd time we have tried 2 contact u...
5568,ham,will b going to esplanade fr home
5569,ham,pity was in mood for that soany other suggestions
5570,ham,the guy did some bitching but i acted like id ...


In [6]:
df2['text'] = df2['text'].apply(lambda x: x.strip())
df2

Unnamed: 0,label,text
0,ham,go until jurong point crazy available only in ...
1,ham,ok lar joking wif u oni
2,spam,free entry in 2 a wkly comp to win fa cup fina...
3,ham,u dun say so early hor u c already then say
4,ham,nah i dont think he goes to usf he lives aroun...
...,...,...
5567,spam,this is the 2nd time we have tried 2 contact u...
5568,ham,will b going to esplanade fr home
5569,ham,pity was in mood for that soany other suggestions
5570,ham,the guy did some bitching but i acted like id ...


In [7]:
# drop duplicate entries
df2.drop_duplicates()
df2

Unnamed: 0,label,text
0,ham,go until jurong point crazy available only in ...
1,ham,ok lar joking wif u oni
2,spam,free entry in 2 a wkly comp to win fa cup fina...
3,ham,u dun say so early hor u c already then say
4,ham,nah i dont think he goes to usf he lives aroun...
...,...,...
5567,spam,this is the 2nd time we have tried 2 contact u...
5568,ham,will b going to esplanade fr home
5569,ham,pity was in mood for that soany other suggestions
5570,ham,the guy did some bitching but i acted like id ...


In [8]:
df2['word_count'] = df2['text'].apply(lambda x: len(str(x).split(" ")))
df2[['text','word_count']].head()
print(df2['word_count'].sum())
df2.head()

85071


Unnamed: 0,label,text,word_count
0,ham,go until jurong point crazy available only in ...,20
1,ham,ok lar joking wif u oni,6
2,spam,free entry in 2 a wkly comp to win fa cup fina...,28
3,ham,u dun say so early hor u c already then say,11
4,ham,nah i dont think he goes to usf he lives aroun...,13


In [9]:
vocabulary = []
for x in df2['text']:
  vocabulary = vocabulary + x.split()
vocabulary = list(set(vocabulary))
print("Vocabulary Size:")
print(len(vocabulary))

Vocabulary Size:
9477


In [10]:
from sklearn.model_selection import train_test_split
X_train, X_test,y_train, y_test = train_test_split(df2.text,df2.label,train_size=0.75)

In [11]:
from sklearn.feature_extraction.text import TfidfVectorizer
vectorizer = TfidfVectorizer()
X_train_transformed = vectorizer.fit_transform(X_train).toarray()
X_test_transformed = vectorizer.transform(X_test).toarray()

In [12]:
class GausianNB(object):
  phi = None
  mu = None
  E_inv = None
  def fit(self, X, Y):
    self.phi = Y.mean()
    
    self.mu = np.zeros((2, X.shape[1]))
    for i in range(2):
      self.mu[i] = X[Y == i].mean(axis = 0)
    
    X_central = X.copy()
    for i in range(2):
      X_central[Y == i] =  X_central[Y == i] - self.mu[i]
    E = np.dot(X_central.T, X_central) / (X.shape[0])
    self.E_inv = np.linalg.pinv(E)
    
  def getProbability(self, X, y):
    p_y = np.power(self.phi, y) * np.power(1 - self.phi, 1 - y)
    res = np.exp(-1 * np.sum( np.dot((X - self.mu[y]), self.E_inv)*(X - self.mu[y]) , axis = 1))
    res = res * p_y
    return res

  def predict(self, X):
    prob = np.zeros((2, X.shape[0]))
    for i in range(2):
      prob[i] = self.getProbability(X, i)

    return np.argmax(prob, axis = 0)
  
  def accuracy_score(self, X, Y):
    return (self.predict(X) == Y).mean()
from sklearn.naive_bayes import GaussianNB

In [14]:
from sklearn.metrics import accuracy_score
spam_classifier_model = GaussianNB()

spam_classifier_model.fit(X_train_transformed,y_train)
y_predict = spam_classifier_model.predict(X_test_transformed)
print(f'Accuracy : {accuracy_score(y_test,y_predict) * 100}%')

Accuracy : 90.73941134242641%
