# Importing libraries

In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import LinearSVC
from sklearn.metrics import accuracy_score

In [2]:
mail_data = pd.read_csv("G:/datasets/spam_ham_dataset.csv")

In [3]:
data = mail_data.where((pd.notnull(mail_data)), '')

In [4]:
data.shape

(5171, 4)

In [5]:
data.head()

Unnamed: 0.1,Unnamed: 0,label,text,label_num
0,605,ham,Subject: enron methanol ; meter # : 988291\r\n...,0
1,2349,ham,"Subject: hpl nom for january 9 , 2001\r\n( see...",0
2,3624,ham,"Subject: neon retreat\r\nho ho ho , we ' re ar...",0
3,4685,spam,"Subject: photoshop , windows , office . cheap ...",1
4,2030,ham,Subject: re : indian springs\r\nthis deal is t...,0


In [6]:
mdata = data.drop("Unnamed: 0", axis = 1)

In [7]:
mdata.head()

Unnamed: 0,label,text,label_num
0,ham,Subject: enron methanol ; meter # : 988291\r\n...,0
1,ham,"Subject: hpl nom for january 9 , 2001\r\n( see...",0
2,ham,"Subject: neon retreat\r\nho ho ho , we ' re ar...",0
3,spam,"Subject: photoshop , windows , office . cheap ...",1
4,ham,Subject: re : indian springs\r\nthis deal is t...,0


In [8]:
X = mdata['text']
Y = mdata['label_num']

In [9]:
print(X)
print("**************************************************")
print(Y)

0       Subject: enron methanol ; meter # : 988291\r\n...
1       Subject: hpl nom for january 9 , 2001\r\n( see...
2       Subject: neon retreat\r\nho ho ho , we ' re ar...
3       Subject: photoshop , windows , office . cheap ...
4       Subject: re : indian springs\r\nthis deal is t...
                              ...                        
5166    Subject: put the 10 on the ft\r\nthe transport...
5167    Subject: 3 / 4 / 2000 and following noms\r\nhp...
5168    Subject: calpine daily gas nomination\r\n>\r\n...
5169    Subject: industrial worksheets for august 2000...
5170    Subject: important online banking alert\r\ndea...
Name: text, Length: 5171, dtype: object
**************************************************
0       0
1       0
2       0
3       1
4       0
       ..
5166    0
5167    0
5168    0
5169    0
5170    1
Name: label_num, Length: 5171, dtype: int64


# Splitting the data

In [10]:
x_train, x_test, y_train, y_test = train_test_split(X, Y, test_size = 0.2, random_state = 3)

# feature extraction

In [11]:
feature_extraction = TfidfVectorizer(min_df = 1, stop_words = 'english', lowercase = True)

X_train_feature = feature_extraction.fit_transform(x_train)
X_test_feature = feature_extraction.transform(x_test)

y_train = y_train.astype('int')
y_test = y_test.astype('int')

# training the model --> Support Vector Machine

In [12]:
model = LinearSVC()
model.fit(X_train_feature, y_train)

LinearSVC(C=1.0, class_weight=None, dual=True, fit_intercept=True,
          intercept_scaling=1, loss='squared_hinge', max_iter=1000,
          multi_class='ovr', penalty='l2', random_state=None, tol=0.0001,
          verbose=0)

# Evaluation of model

In [13]:
prediction_on_training_data = model.predict(X_train_feature)
accuracy_on_training_data = accuracy_score(y_train, prediction_on_training_data)

print("Accuracy on training data : ", accuracy_on_training_data)

Accuracy on training data :  1.0


In [14]:
prediction_on_testing_data = model.predict(X_test_feature)
accuracy_on_testing_data = accuracy_score(y_test, prediction_on_testing_data)

print("Accuracy on testing data : ", accuracy_on_testing_data)

Accuracy on testing data :  0.9864734299516909


In [40]:
from sklearn.metrics import confusion_matrix

cm = confusion_matrix(y_test, prediction_on_testing_data)
print(cm)

[[716  13]
 [  1 305]]


# Prediction on new mail

In [15]:
input = ["congrates!! your account has been credited by 2000 rs. enjoy more exiting offers."]
input_mail = feature_extraction.transform(input)

prediction_on_new_mail = model.predict(input_mail)
print(prediction_on_new_mail)

if prediction_on_new_mail[0] == 0:
    print("Ham mail")
else:
    print("Spam mail")

[1]
Spam mail


In [23]:
from sklearn.naive_bayes import GaussianNB

clf = GaussianNB()

In [24]:
clf.fit(X_train_feature.toarray(), y_train)

GaussianNB(priors=None, var_smoothing=1e-09)

In [25]:
pred_train = model.predict(X_train_feature.toarray())
print("Acurracy on training data : ", accuracy_score(y_train, pred_train))

Acurracy on training data :  1.0


In [26]:
pred_test = model.predict(X_test_feature.toarray())
print("Acurracy on training data : ", accuracy_score(y_test, pred_test))

Acurracy on training data :  0.9864734299516909
