In [28]:
#importing the necessary libraries
import numpy as np
import pandas as pd
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split,GridSearchCV,KFold
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import accuracy_score,confusion_matrix,classification_report
import matplotlib.pyplot as plt


In [29]:
# Load the spam dataset from a CSV file in the working directory and preview it.
# NOTE(review): the preview shows an "Unnamed: 0" column, presumably a row index
# that was saved with the file -- consider index_col=0 if it is not needed.
DATASET_PATH = "spam.csv"
data = pd.read_csv(DATASET_PATH)
data.head()

Unnamed: 0,Label,EmailText
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [30]:
# x: message text (independent variable); y: ham/spam label (dependent variable).
x, y = data['EmailText'].values, data['Label'].values

In [31]:
# Hold out 20% of the messages for testing; the fixed seed makes the split reproducible.
# NOTE(review): the split is not stratified, so the ham/spam ratio may differ
# slightly between the training and test sets.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.2,
    random_state=0,
)

In [32]:
# Turn each message into a sparse vector of token counts.
# The vocabulary is learned from the training messages only and then reused
# for the test messages, so the test set does not influence the features.
# NOTE(review): x_train / x_test are overwritten with their vectorized form, so
# this cell cannot be re-run on its own without re-running the split cell first.
cv = CountVectorizer()
x_train = cv.fit_transform(x_train)
x_test = cv.transform(x_test)

In [33]:
# Ingredients for the hyper-parameter search: a base estimator, a 3-fold
# splitter (unshuffled, not stratified) and the grid of candidates to try.
model = SVC()
cval = KFold(n_splits=3)

# 20 values of C crossed with 4 kernels -> 80 candidate configurations.
params = {
    "C": [
        0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0,
        2.0, 2.1, 2.2, 2.3, 2.4, 2.5, 2.6, 2.7, 2.8, 2.9,
    ],
    "kernel": ["linear", "poly", "rbf", "sigmoid"],
}

In [34]:
# Exhaustive search over every combination in `params`, scored with the `cval` splitter.
# n_jobs=-1 spreads the independent candidate/fold fits across all CPU cores;
# it only shortens wall-clock time and does not change which parameters are selected.
gridsearch = GridSearchCV(model, params, cv=cval, n_jobs=-1)

In [35]:
# Run the grid search on the vectorized training data.
# fit() returns the fitted search object itself, so `results` is the same object as `gridsearch`.
results = gridsearch.fit(X=x_train, y=y_train)

In [37]:
# Show the hyper-parameter combination with the best mean cross-validated score.
best_params = results.best_params_
best_params

{'C': 2.2, 'kernel': 'linear'}

In [38]:
# Create the final SVM from the hyper-parameters selected by the grid search
# instead of hard-coding them, so the model stays in sync with the search
# whenever the data or the parameter grid changes.
# probability=True enables predict_proba (at the cost of an internal
# cross-validation during fit); random_state fixes the shuffling used for it.
svcl = SVC(**results.best_params_, probability=True, random_state=0)

In [39]:
# Fit the final SVM on the vectorized training messages and their labels.
svcl.fit(X=x_train, y=y_train)

SVC(C=2.2, kernel='linear', probability=True, random_state=0)

In [40]:
# Predict a label for every held-out message and preview the result.
y_pred = svcl.predict(X=x_test)
y_pred

array(['ham', 'ham', 'ham', ..., 'ham', 'ham', 'ham'], dtype=object)

In [41]:
# Confusion matrix on the test set: rows are true labels, columns are predicted
# labels, both in sorted label order.
confusion_matrix(y_true=y_test, y_pred=y_pred)

array([[946,   3],
       [ 20, 146]], dtype=int64)

In [42]:
# Determine the accuracy of the model on the test set, printed as a percentage.
accuracy = accuracy_score(y_test, y_pred)
print(accuracy * 100)

97.9372197309417


In [43]:
# Per-class precision, recall and F1 score on the test set.
report = classification_report(y_test, y_pred)
print(report)

              precision    recall  f1-score   support

         ham       0.98      1.00      0.99       949
        spam       0.98      0.88      0.93       166

    accuracy                           0.98      1115
   macro avg       0.98      0.94      0.96      1115
weighted avg       0.98      0.98      0.98      1115



In [47]:
# Classify a handful of hand-written example messages with the trained model.
text = [
    "Hey, you have won a car !!!!. Conrgratzz",
    "Dear applicant, Your CV has been recieved. Best regards",
    "You have received $1000000 to your account",
    "Join with our whatsapp group",
    "Kindly check the previous email. Kind Regards",
]
# Convert the texts into token-count vectors using the already-fitted vectorizer.
values = cv.transform(text)
svcl.predict(values)

array(['ham', 'ham', 'ham', 'ham', 'ham'], dtype=object)