In [76]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

In [8]:
!pip install PyDrive 



In [9]:
# code to read csv from drive file to colaboratory

from pydrive.auth import GoogleAuth
from pydrive.drive import GoogleDrive
from google.colab import auth
from oauth2client.client import GoogleCredentials

In [30]:
# Display the ASCII punctuation characters provided by the standard library.
# NOTE(review): `string` is not referenced by any later cell visible here.
import string
string.punctuation

'!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~'

In [10]:
# Authentication
# Authenticate the Colab user, then give PyDrive the application-default
# credentials so that `drive` can be used to fetch files.

auth.authenticate_user()
gauth = GoogleAuth()
gauth.credentials = GoogleCredentials.get_application_default()
drive = GoogleDrive(gauth)

Now let us download the dataset. To do that, get the ID of your CSV file from Google Drive.

In [14]:
# Open the file in your Drive and click "Get link"; the file ID is part of that link.
drive_file_spec = {'id': "1ysIzIy9Sggh-dlCtVV5ZFdQ4m7wqyWIT"}
downloaded = drive.CreateFile(drive_file_spec)
# Save the Drive file's contents to a local file named spam.csv.
downloaded.GetContentFile('spam.csv')

In [17]:
# The file is read as latin-1 here. An explicit `encoding` is only needed when
# the CSV is not in pandas' default encoding (UTF-8).
df = pd.read_csv('spam.csv', encoding='latin-1')
df

Unnamed: 0,class,message,Unnamed: 2,Unnamed: 3,Unnamed: 4
0,ham,"Go until jurong point, crazy.. Available only ...",,,
1,ham,Ok lar... Joking wif u oni...,,,
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,,,
3,ham,U dun say so early hor... U c already then say...,,,
4,ham,"Nah I don't think he goes to usf, he lives aro...",,,
...,...,...,...,...,...
5567,spam,This is the 2nd time we have tried 2 contact u...,,,
5568,ham,Will Ì_ b going to esplanade fr home?,,,
5569,ham,"Pity, * was in mood for that. So...any other s...",,,
5570,ham,The guy did some bitching but I acted like i'd...,,,


Checking for null values

In [18]:
# Number of missing values in each column.
df.isna().sum()

class            0
message          0
Unnamed: 2    5522
Unnamed: 3    5560
Unnamed: 4    5566
dtype: int64

We have 3 unnecessary columns. We can remove them as we need only the first two i.e. the class and message fields.

In [36]:
# Drop the three mostly-empty "Unnamed" columns, keeping only class and message.
# `errors="ignore"` makes the cell safe to re-run: once the columns are gone,
# dropping them again is a no-op instead of raising KeyError.
df = df.drop(columns=["Unnamed: 2", "Unnamed: 3", "Unnamed: 4"], errors="ignore")
df.head()

Unnamed: 0,class,message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [38]:
# Keep a handle on the class column as it is at this point ('ham'/'spam' strings).
# Note: the later encoding cell assigns a *new* Series to df['class'], so
# `labels` continues to hold these original string values afterwards.
labels = df['class']
labels

0        ham
1        ham
2       spam
3        ham
4        ham
        ... 
5567    spam
5568     ham
5569     ham
5570     ham
5571     ham
Name: class, Length: 5572, dtype: object

There are a total of 4825 ham messages and 747 spam messages.

In [40]:
# Count how many messages fall into each class.
df['class'].value_counts()


ham     4825
spam     747
Name: class, dtype: int64

In [53]:
# Encode the class labels as integers (ham -> 0, spam -> 1).
# The identity entries (0 -> 0, 1 -> 1) make this cell idempotent: re-running it
# on an already-encoded column leaves the values unchanged instead of mapping
# every row to NaN.
df['class'] = df['class'].map({'ham': 0, 'spam': 1, 0: 0, 1: 1})

Preprocessing

In [65]:
# Split the dataset into training and test sets: 67% train / 33% test, with a
# fixed random_state so the split is reproducible.
# The target is read from df['class'] so the split uses the column's current
# (numerically encoded) values; `labels` was captured before the encoding cell
# ran and still holds the raw 'ham'/'spam' strings.
x_train, x_test, y_train, y_test = train_test_split(
    df['message'], df['class'], test_size=0.33, random_state=15
)
print(x_train.shape)
print(x_test.shape)
print(y_train.shape)
print(y_test.shape)

(3733,)
(1839,)
(3733,)
(1839,)


In [66]:
# CountVectorizer tokenizes each message and produces a sparse matrix of token
# counts (one row per message, one column per vocabulary term). That matrix is
# the input passed to the naive Bayes classifier below.
from sklearn.feature_extraction.text import CountVectorizer
vectorizer = CountVectorizer()

# Learn the vocabulary from the training messages only, then reuse that same
# vocabulary to encode the test messages.
# Note: X and Y are both feature matrices (train and test); Y is NOT a label vector.
X = vectorizer.fit_transform(x_train)
Y = vectorizer.transform(x_test)

In [67]:
# The training feature matrix (sparse token counts), not a model.
X

<3733x7062 sparse matrix of type '<class 'numpy.int64'>'
	with 49455 stored elements in Compressed Sparse Row format>

In [68]:
# The test feature matrix (sparse token counts), not a model.
Y

<1839x7062 sparse matrix of type '<class 'numpy.int64'>'
	with 22629 stored elements in Compressed Sparse Row format>

Naive Bayes classifiers are a popular statistical technique for e-mail filtering. They use tokens associated with spam and non-spam messages and apply Bayes' theorem to estimate whether a message is spam or not. scikit-learn already provides a ready-made implementation for this purpose: we will be using the `MultinomialNB` class.

In [70]:
from sklearn.naive_bayes import MultinomialNB

# Instantiate the naive Bayes classifier with its default settings.
nb = MultinomialNB()

In [71]:
# Fit the classifier on the training count matrix and its labels.
# (sample_weight is left at its default of None.)
nb.fit(X, y_train)

MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

In [74]:
# Predict a class for every row of the test count matrix Y.
y_pred_class = nb.predict(Y)

In [75]:
# Compare the predicted classes with the true test labels and report accuracy.
# Caveat: ham outnumbers spam 4825 to 747 in the full dataset, so accuracy alone
# can look high even for a weak spam detector; consider precision/recall as well.
from sklearn import metrics
metrics.accuracy_score(y_test, y_pred_class)

0.9809679173463839

As you can see, the accuracy is about 98%.