# Problem Statement.
Spam filtering using Naive Bayes classifiers: predict whether a new mail, based on its content, can be categorized as spam or not spam.

In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.naive_bayes import BernoulliNB
import string
import matplotlib.pyplot as plt

In [2]:
# Load the spam dataset; the file is decoded as latin1 (presumably because it is not valid UTF-8 — confirm).
data=pd.read_csv("spam.csv",encoding="latin1")

In [3]:
data

Unnamed: 0,v1,v2,Unnamed: 2,Unnamed: 3,Unnamed: 4
0,ham,"Go until jurong point, crazy.. Available only ...",,,
1,ham,Ok lar... Joking wif u oni...,,,
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,,,
3,ham,U dun say so early hor... U c already then say...,,,
4,ham,"Nah I don't think he goes to usf, he lives aro...",,,
...,...,...,...,...,...
5567,spam,This is the 2nd time we have tried 2 contact u...,,,
5568,ham,Will Ì_ b going to esplanade fr home?,,,
5569,ham,"Pity, * was in mood for that. So...any other s...",,,
5570,ham,The guy did some bitching but I acted like i'd...,,,


In [4]:
# Drop the three "Unnamed" columns; only v1 (label) and v2 (message text) are used below.
# Rebinding the result (instead of inplace=True) together with errors="ignore"
# keeps this cell idempotent: re-running it once the columns are already gone
# no longer raises KeyError.
data = data.drop(columns=["Unnamed: 2", "Unnamed: 3", "Unnamed: 4"], errors="ignore")

In [5]:
data

Unnamed: 0,v1,v2
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."
...,...,...
5567,spam,This is the 2nd time we have tried 2 contact u...
5568,ham,Will Ì_ b going to esplanade fr home?
5569,ham,"Pity, * was in mood for that. So...any other s..."
5570,ham,The guy did some bitching but I acted like i'd...


In [10]:
data.head()

Unnamed: 0,v1,v2
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [9]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5572 entries, 0 to 5571
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   v1      5572 non-null   object
 1   v2      5572 non-null   object
dtypes: object(2)
memory usage: 87.2+ KB


In [6]:
# Add a column holding the number of characters in each message.
# The vectorised .str.len() accessor replaces the per-row Python call to len();
# a missing message would yield NaN here instead of raising TypeError.
data["Length"] = data["v2"].str.len()

In [7]:
data.head()

Unnamed: 0,v1,v2,Length
0,ham,"Go until jurong point, crazy.. Available only ...",111
1,ham,Ok lar... Joking wif u oni...,29
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,155
3,ham,U dun say so early hor... U c already then say...,49
4,ham,"Nah I don't think he goes to usf, he lives aro...",61


In [13]:
data.describe()

Unnamed: 0,Length
count,5572.0
mean,80.118808
std,59.690841
min,2.0
25%,36.0
50%,61.0
75%,121.0
max,910.0


In [15]:
data["v1"].value_counts()

v1
ham     4825
spam     747
Name: count, dtype: int64

# Text Pre-Processing

In [17]:
# Numeric label column: rows labelled "ham" get class 1.
data.loc[data["v1"]=="ham","class"]=1

In [18]:
# Rows labelled "spam" get class 0, matching the ham=1 / spam=0 encoding used for y below.
# (Filtering on "ham" here would overwrite every ham row with 0 and leave the
# spam rows without any class value.)
data.loc[data["v1"] == "spam", "class"] = 0

In [8]:
string.punctuation

'!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~'

In [9]:
# Step 1: remove punctuation
def remove_punctuation(text):
    """Return ``text`` with all ``string.punctuation`` characters removed.

    Punctuation is deleted outright rather than replaced by a space, so words
    separated only by punctuation are joined (``"a...b"`` becomes ``"ab"``).
    """
    non_punctuation = filter(lambda ch: ch not in string.punctuation, text)
    return "".join(non_punctuation)

In [11]:
# Quick sanity check of remove_punctuation on a sample string.
s = "data//science!!"
remove_punctuation(s)

'datascience'

In [12]:
# Strip punctuation from every message, keeping the original row order.
text = [remove_punctuation(message) for message in data["v2"]]

In [14]:
# Store the punctuation-free messages as a new column next to the raw text.
data["text_clean"]= text

In [15]:
data

Unnamed: 0,v1,v2,Length,text_clean
0,ham,"Go until jurong point, crazy.. Available only ...",111,Go until jurong point crazy Available only in ...
1,ham,Ok lar... Joking wif u oni...,29,Ok lar Joking wif u oni
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,155,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...,49,U dun say so early hor U c already then say
4,ham,"Nah I don't think he goes to usf, he lives aro...",61,Nah I dont think he goes to usf he lives aroun...
...,...,...,...,...
5567,spam,This is the 2nd time we have tried 2 contact u...,161,This is the 2nd time we have tried 2 contact u...
5568,ham,Will Ì_ b going to esplanade fr home?,37,Will Ì b going to esplanade fr home
5569,ham,"Pity, * was in mood for that. So...any other s...",57,Pity was in mood for that Soany other suggest...
5570,ham,The guy did some bitching but I acted like i'd...,125,The guy did some bitching but I acted like id ...


In [16]:
# Features: the cleaned message text as a NumPy array.
x = data["text_clean"].values
# Labels: the raw "ham"/"spam" strings (y is rebound to a numeric encoding in a later cell).
y = data["v1"].values
# Display the feature array.
x

array(['Go until jurong point crazy Available only in bugis n great world la e buffet Cine there got amore wat',
       'Ok lar Joking wif u oni',
       'Free entry in 2 a wkly comp to win FA Cup final tkts 21st May 2005 Text FA to 87121 to receive entry questionstd txt rateTCs apply 08452810075over18s',
       ..., 'Pity  was in mood for that Soany other suggestions',
       'The guy did some bitching but I acted like id be interested in buying something else next week and he gave it to us for free',
       'Rofl Its true to its name'], dtype=object)

In [45]:
# Encode the labels numerically: ham -> 1, spam -> 0.
# Series.map is used instead of replace: it does not depend on replace's
# deprecated silent downcasting of the result dtype, and an unexpected label
# becomes NaN rather than passing through unchanged as a string.
y = data["v1"].map({"ham": 1, "spam": 0})

In [52]:
y=y.astype(int)

In [53]:
# Hold out 20% of the messages for testing; the fixed random_state makes the
# partition reproducible across runs.
x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size=0.2, random_state=10
)
# Size of the training split.
x_train.shape

(4457,)

In [54]:
x_test.shape

(1115,)

# Bag of words

In [55]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
# Bag-of-words vectoriser; stop_words="english" selects scikit-learn's built-in English stop-word list.
CV=CountVectorizer(stop_words="english")

In [56]:
# Learn the vocabulary from the training messages only and encode them as token counts.
x_train_cv=CV.fit_transform(x_train)

In [57]:
import warnings
# NOTE(review): this silences every warning category for the rest of the session,
# not just a specific one — consider narrowing the filter.
warnings.filterwarnings("ignore")
# Inspect the vocabulary learned by the count vectoriser.
CV.get_feature_names_out()

array(['008704050406', '0089my', '0121', ..., 'ûïharry', 'ûò', 'ûówell'],
      dtype=object)

# Training a model

In [58]:
NB=MultinomialNB()

In [59]:
NB.fit(x_train_cv,y_train)

In [60]:
# Encode the test messages with the vocabulary fitted on the training split (transform only, no refit).
x_test_cv=CV.transform(x_test)

In [61]:
# Predicted labels for the held-out messages.
y_predict= NB.predict(x_test_cv)
y_predict

array([1, 1, 1, ..., 1, 1, 1])

In [62]:
from sklearn.metrics import classification_report

In [63]:
# Per-class precision/recall/F1; under the encoding chosen for y, label 0 is spam and label 1 is ham.
print(classification_report(y_test,y_predict))

              precision    recall  f1-score   support

           0       0.95      0.94      0.95       150
           1       0.99      0.99      0.99       965

    accuracy                           0.99      1115
   macro avg       0.97      0.97      0.97      1115
weighted avg       0.99      0.99      0.99      1115



In [64]:
# Confusion matrix: rows are the actual labels (y_test), columns are the predicted labels.
pd.crosstab(y_test,y_predict)

col_0,0,1
v1,Unnamed: 1_level_1,Unnamed: 2_level_1
0,141,9
1,7,958


In [66]:
# Initialise a Bernoulli Naive Bayes model and fit it on the training features.
bnb = BernoulliNB().fit(x_train_cv, y_train)

# Predict the held-out messages, then cross-tabulate actual (rows) against
# predicted (columns) labels.
y_hat1 = bnb.predict(x_test_cv)
pd.crosstab(y_test, y_hat1)

col_0,0,1
v1,Unnamed: 1_level_1,Unnamed: 2_level_1
0,114,36
1,0,965


In [68]:
from sklearn.feature_extraction.text import TfidfVectorizer
# TF-IDF vectoriser with default settings (unlike CV above, no stop-word list is passed).
tf=TfidfVectorizer()

In [69]:
# Fit TF-IDF on the training messages and encode both splits with it.
# NOTE(review): this rebinds x_train_cv / x_test_cv, which previously held the
# CountVectorizer features; every cell run after this one sees TF-IDF features
# under the same names. Re-running an earlier model cell afterwards would
# silently train on different features — consider distinct names.
x_train_cv=tf.fit_transform(x_train)
x_test_cv= tf.transform(x_test)

In [70]:
# Multinomial Naive Bayes trained on whatever x_train_cv currently holds
# (the TF-IDF features when the notebook is run top to bottom).
nb=MultinomialNB()
nb.fit(x_train_cv,y_train)

In [71]:
y_hat=nb.predict(x_test_cv)

In [72]:
print(classification_report(y_test,y_hat))

              precision    recall  f1-score   support

           0       1.00      0.65      0.79       150
           1       0.95      1.00      0.97       965

    accuracy                           0.95      1115
   macro avg       0.97      0.83      0.88      1115
weighted avg       0.96      0.95      0.95      1115



In [73]:
pd.crosstab(y_test,y_hat)

col_0,0,1
v1,Unnamed: 1_level_1,Unnamed: 2_level_1
0,98,52
1,0,965


In [77]:
# Fit a Bernoulli Naive Bayes model on the current training features and
# predict the held-out messages. This rebinds both nb and y_hat.
nb = BernoulliNB().fit(x_train_cv, y_train)
y_hat = nb.predict(x_test_cv)

In [78]:
y_hat

array([1, 1, 1, ..., 1, 1, 1])

In [79]:
print(classification_report(y_test,y_hat))

              precision    recall  f1-score   support

           0       1.00      0.78      0.88       150
           1       0.97      1.00      0.98       965

    accuracy                           0.97      1115
   macro avg       0.98      0.89      0.93      1115
weighted avg       0.97      0.97      0.97      1115



In [80]:
pd.crosstab(y_test,y_hat)

col_0,0,1
v1,Unnamed: 1_level_1,Unnamed: 2_level_1
0,117,33
1,0,965
