# Naive Bayesian

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
spam = pd.read_csv('./Dataset/spam.csv', encoding='latin-1')

spam.shape

(5572, 5)

In [3]:
spam.head()

Unnamed: 0,v1,v2,Unnamed: 2,Unnamed: 3,Unnamed: 4
0,ham,"Go until jurong point, crazy.. Available only ...",,,
1,ham,Ok lar... Joking wif u oni...,,,
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,,,
3,ham,U dun say so early hor... U c already then say...,,,
4,ham,"Nah I don't think he goes to usf, he lives aro...",,,


In [4]:
spam.duplicated().sum()

403

In [5]:
spam.isna().sum()

v1               0
v2               0
Unnamed: 2    5522
Unnamed: 3    5560
Unnamed: 4    5566
dtype: int64

In [6]:
spam['v1'].value_counts()

ham     4825
spam     747
Name: v1, dtype: int64

In [7]:
c = ' '.join(spam.loc[spam['v1'] == 'ham', 'v2'])

l = c.split(' ')
l[:10]

['Go',
 'until',
 'jurong',
 'point,',
 'crazy..',
 'Available',
 'only',
 'in',
 'bugis',
 'n']

In [8]:
from collections import Counter

In [9]:
conter1 = Counter(l)

In [10]:
conter1.most_common(20)

[('to', 1530),
 ('you', 1458),
 ('I', 1436),
 ('the', 1019),
 ('a', 969),
 ('and', 738),
 ('i', 736),
 ('in', 734),
 ('u', 645),
 ('is', 638),
 ('my', 619),
 ('', 597),
 ('me', 537),
 ('of', 498),
 ('for', 475),
 ('that', 398),
 ('it', 375),
 ('your', 373),
 ('on', 352),
 ('have', 346)]

In [11]:
df1 = pd.DataFrame(list(conter1.items()))

df1.shape

(12480, 2)

In [12]:
df1.head()

Unnamed: 0,0,1
0,Go,10
1,until,21
2,jurong,1
3,"point,",1
4,crazy..,1


In [13]:
df1.columns = ['words in non-spam', 'count']
df1.head()

Unnamed: 0,words in non-spam,count
0,Go,10
1,until,21
2,jurong,1
3,"point,",1
4,crazy..,1


In [14]:
c = ' '.join(spam.loc[spam['v1'] == 'spam', 'v2'])

l = c.split(' ')
l[:10]

['Free', 'entry', 'in', '2', 'a', 'wkly', 'comp', 'to', 'win', 'FA']

In [15]:
conter2 = Counter(l)

conter2.most_common(20)

[('to', 604),
 ('a', 358),
 ('your', 187),
 ('call', 185),
 ('or', 185),
 ('the', 178),
 ('2', 169),
 ('for', 169),
 ('you', 164),
 ('is', 143),
 ('Call', 136),
 ('on', 135),
 ('have', 128),
 ('and', 119),
 ('from', 116),
 ('ur', 107),
 ('with', 101),
 ('&', 98),
 ('4', 93),
 ('of', 93)]

In [16]:
df2 = pd.DataFrame(list(conter2.items()))
df2.shape

(4313, 2)

In [17]:
df2.head()

Unnamed: 0,0,1
0,Free,35
1,entry,25
2,in,64
3,2,169
4,a,358


In [18]:
df2.columns = ['word in spam', 'count']
df2.head()

Unnamed: 0,word in spam,count
0,Free,35
1,entry,25
2,in,64
3,2,169
4,a,358


In [19]:
from sklearn import feature_extraction

In [22]:
fe = feature_extraction.text.CountVectorizer(stop_words='english')

In [25]:
X = fe.fit_transform(spam['v2'])
X.shape

(5572, 8404)

In [29]:
y = spam['v1'].map({'spam' : 1, 'ham' : 0})
y

0       0
1       0
2       1
3       0
4       0
       ..
5567    1
5568    0
5569    0
5570    0
5571    0
Name: v1, Length: 5572, dtype: int64

## train test split

In [30]:
from sklearn.model_selection import train_test_split

In [32]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=2022, stratify=y)
X_train.shape, X_test.shape

((3900, 8404), (1672, 8404))

In [33]:
from sklearn.naive_bayes import MultinomialNB

In [35]:
clf = MultinomialNB()

clf.fit(X_train, y_train)

In [36]:
y_pred = clf.predict(X_test)

In [37]:
from sklearn.metrics import accuracy_score, precision_score

In [38]:
print(accuracy_score(y_test, y_pred))
print(precision_score(y_test, y_pred))

0.9808612440191388
0.9033613445378151


In [39]:
from sklearn import svm

In [40]:
svc = svm.SVC()

svc.fit(X_train, y_train)

In [41]:
y_pred = svc.predict(X_test)

In [43]:
print(accuracy_score(y_test, y_pred))
print(precision_score(y_test, y_pred))

0.9766746411483254
1.0
