In [59]:
#importing required libraries
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.svm import LinearSVC
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score

In [60]:
# Read the spam/ham e-mail corpus from a CSV file in the working directory.
mail_data = pd.read_csv(filepath_or_buffer='spam_ham_dataset.csv')

In [61]:
# Replace every missing cell in the frame with a single-space string
# (note: a space, not an empty string); non-missing cells are kept as they are.
is_missing = pd.isnull(mail_data)
data = mail_data.mask(is_missing, " ")

In [62]:
# Row and column counts of the cleaned frame, shown as a (rows, columns) tuple
n_rows, n_cols = data.shape
(n_rows, n_cols)

(5171, 4)

In [63]:
# Preview the first five rows of the cleaned frame as a rendered DataFrame
data.head(n=5)

Unnamed: 0.1,Unnamed: 0,label,text,label_num
0,605,ham,Subject: enron methanol ; meter # : 988291\r\n...,0
1,2349,ham,"Subject: hpl nom for january 9 , 2001\r\n( see...",0
2,3624,ham,"Subject: neon retreat\r\nho ho ho , we ' re ar...",0
3,4685,spam,"Subject: photoshop , windows , office . cheap ...",1
4,2030,ham,Subject: re : indian springs\r\nthis deal is t...,0


In [64]:
# Model input: the e-mail text column
X = data.loc[:, 'text']
print(X)

0       Subject: enron methanol ; meter # : 988291\r\n...
1       Subject: hpl nom for january 9 , 2001\r\n( see...
2       Subject: neon retreat\r\nho ho ho , we ' re ar...
3       Subject: photoshop , windows , office . cheap ...
4       Subject: re : indian springs\r\nthis deal is t...
                              ...                        
5166    Subject: put the 10 on the ft\r\nthe transport...
5167    Subject: 3 / 4 / 2000 and following noms\r\nhp...
5168    Subject: calpine daily gas nomination\r\n>\r\n...
5169    Subject: industrial worksheets for august 2000...
5170    Subject: important online banking alert\r\ndea...
Name: text, Length: 5171, dtype: object


In [65]:
# Model target: the numeric label column (the sample rows show 0 for ham, 1 for spam)
Y = data.loc[:, 'label_num']
print(Y)

0       0
1       0
2       0
3       1
4       0
       ..
5166    0
5167    0
5168    0
5169    0
5170    1
Name: label_num, Length: 5171, dtype: int64


In [66]:
# Hold out 20% of the mails for testing and train on the remaining 80%.
# random_state pins the shuffle so the same split is produced on every run.
split_parts = train_test_split(
    X,
    Y,
    train_size=0.8,
    test_size=0.2,
    random_state=3,
)
X_train, X_test, Y_train, Y_test = split_parts

# Report train/test sizes, first for the text and then for the labels
for train_part, test_part in ((X_train, X_test), (Y_train, Y_test)):
    print(train_part.shape, test_part.shape)

(4136,) (1035,)
(4136,) (1035,)


In [67]:
# Turn the raw text into TF-IDF feature vectors that the SVC model can take as input.
# `lowercase` expects a real boolean: the string 'True' only worked by being truthy
# and is rejected by scikit-learn releases that validate parameter types.
feature_extraction = TfidfVectorizer(min_df=1, lowercase=True, stop_words='english')
# Fit the vocabulary and IDF weights on the training text only, then apply the
# already-fitted transformer to the test text.
X_train_feature = feature_extraction.fit_transform(X_train)
X_test_feature = feature_extraction.transform(X_test)

In [68]:
# Cast both label vectors to integer dtype in one pass
Y_train, Y_test = (labels.astype('int') for labels in (Y_train, Y_test))

In [74]:
# Build a linear support-vector classifier and fit it on the TF-IDF training features.
# LinearSVC's solver uses a pseudo-random number generator, so the seed is fixed
# (same value as the train/test split) to make the fitted model reproducible across runs.
model = LinearSVC(random_state=3)
# fit() returns the estimator itself; the re-assignment keeps `model` as the fitted classifier.
model = model.fit(X_train_feature, Y_train)

In [78]:
# Evaluating the model
# Accuracy on the training data (the same mails the model was fitted on).
train_accuracy_prediction = model.predict(X_train_feature)
# accuracy_score's documented signature is (y_true, y_pred): ground truth goes first.
train_accuracy = accuracy_score(Y_train, train_accuracy_prediction)
print("Accuracy on train data : ", train_accuracy)

Accuracy on train data :  1.0


In [79]:
# Accuracy on the held-out test data.
test_accuracy_prediction = model.predict(X_test_feature)
# accuracy_score's documented signature is (y_true, y_pred): ground truth goes first.
test_accuracy = accuracy_score(Y_test, test_accuracy_prediction)
print("Accuracy on test data : ", test_accuracy)

Accuracy on test data :  0.9864734299516909


In [84]:
# Predicting new mail(s) as SPAM or HAM
input_mail_text = ["There's so much to do in the pursuit of an enriching life. If you have an insatiable zest for life, gratify your desires now, with OlaMoney Postpaid+. It's a service that lets you buy now and pay later across platforms and apps. You can use OlaMoney Postpaid+ to shop, eat, travel and so on, on apps like Myntra, MakeMyTrip, AbhiBus, Zomato, bigbasket and 15K+ others. So don't waste time. Life is too beautiful to be left for tomorrow. Seize the day with OlaMoney Postpaid+."]
# Vectorise with the already-fitted TF-IDF transformer (transform only, no refit).
# The raw text keeps its own name; `input_mail` still ends up holding the feature
# matrix, exactly as before.
input_mail = feature_extraction.transform(input_mail_text)

prediction = model.predict(input_mail)
print(prediction)

# `prediction` holds one label per mail. Each scalar label is checked on its own:
# comparing the whole array against a list inside `if` only works for a single mail.
for predicted_label in prediction:
    if predicted_label == 0:
        print("This is a HAM mail")
    elif predicted_label == 1:
        print("This is a SPAM mail")

[1]
This is a SPAM mail
