In [75]:
#import Libraries

import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer

In [76]:
# Dataset
Data = pd.read_csv('/content/sample_data/spam.csv')

In [77]:
Data.head()

Unnamed: 0,Label,EmailText
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [78]:
Data=Data.replace({'Label': {'spam':1,'ham':0}})

In [79]:
Data.head()

Unnamed: 0,Label,EmailText
0,0,"Go until jurong point, crazy.. Available only ..."
1,0,Ok lar... Joking wif u oni...
2,1,Free entry in 2 a wkly comp to win FA Cup fina...
3,0,U dun say so early hor... U c already then say...
4,0,"Nah I don't think he goes to usf, he lives aro..."


In [80]:
Data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5572 entries, 0 to 5571
Data columns (total 2 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   Label      5572 non-null   int64 
 1   EmailText  5572 non-null   object
dtypes: int64(1), object(1)
memory usage: 87.2+ KB


In [81]:
#Check Null Values

Data.isnull().sum()

Label        0
EmailText    0
dtype: int64

In [82]:
Data.describe()

Unnamed: 0,Label
count,5572.0
mean,0.134063
std,0.340751
min,0.0
25%,0.0
50%,0.0
75%,0.0
max,1.0


In [83]:
# split the dataset

X =  Data['EmailText'].values
y =  Data['Label'].values

In [84]:
Data['Label'].value_counts()  #Acording to the output this data set is imbalanced dataset

0    4825
1     747
Name: Label, dtype: int64

In [85]:
#Converting text(EmailText) into integer using CountVectorizer()

CountVectorizer = CountVectorizer() 
X_new = CountVectorizer.fit_transform(X)

In [86]:
#This data set is imbalnced ,so we can use SMOTE to balanced the data set
from imblearn.over_sampling import SMOTE
smote=SMOTE()

In [87]:
X_smote,y_smote=smote.fit_resample(X_new,y)

In [88]:
# splitting dataset in to tranning and testing

X_train, X_test, y_train, y_test = train_test_split( X_smote, y_smote, test_size=0.2, random_state=0)

In [89]:
#Apply Support Vector Algorithm

from sklearn.svm import SVC
Model = SVC(kernel = 'rbf')
Model.fit(X_train, y_train)

SVC()

In [90]:
#Get Accuracy

print(Model.score(X_test,y_test))

0.9564766839378238


In [92]:
#Get the Given Mails

Given_mails = [
"Hey, you have won a car !!!!. Conrgratzz","Dear applicant, Your CV has been recieved. Best regards","You have received $1000000 to your account","Join with our whatsapp group",
"Kindly check the previous email. Kind Regards"
]

Emails = CountVectorizer.transform(Given_mails)

#Using the Model Check whether Given Emails Spam or not

prediction_value = Model.predict(Emails)
print(prediction_value)

#1 for 'Spam email'
#0 for 'ham email'

for i in prediction_value:
    if (i == 1):
      print('spam mail')
    else:
      print('ham mail')



[1 0 1 1 0]
spam mail
ham mail
spam mail
spam mail
ham mail
