In [1]:
#import the neccessary libraries
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import LinearSVC
from sklearn.metrics import accuracy_score
import warnings
warnings.filterwarnings('ignore')

In [2]:
#Reading the dataset in to the dataframe using pandas
data = pd.read_csv("sms_spam.csv")
#dispalys the first five data from the data frame
data.head()

Unnamed: 0,type,text
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [3]:
#checking for null values present in the dataframe
data.isna().sum()

type    0
text    0
dtype: int64

In [4]:
#Converting i,e labelling the type column to the numerical variable using map function ham as 1 and spam as 0
m_type = {"ham":1,"spam":0}
data["type"] = data["type"].map(m_type)
data.head()


Unnamed: 0,type,text
0,1,"Go until jurong point, crazy.. Available only ..."
1,1,Ok lar... Joking wif u oni...
2,0,Free entry in 2 a wkly comp to win FA Cup fina...
3,1,U dun say so early hor... U c already then say...
4,1,"Nah I don't think he goes to usf, he lives aro..."


In [5]:
#categorizing the dataframe independent and dependent varible into X and V
X = data["text"]
Y = data["type"]
print(X)
print(Y)

0       Go until jurong point, crazy.. Available only ...
1                           Ok lar... Joking wif u oni...
2       Free entry in 2 a wkly comp to win FA Cup fina...
3       U dun say so early hor... U c already then say...
4       Nah I don't think he goes to usf, he lives aro...
                              ...                        
5569    This is the 2nd time we have tried 2 contact u...
5570                 Will Ã¼ b going to esplanade fr home?
5571    Pity, * was in mood for that. So...any other s...
5572    The guy did some bitching but I acted like i'd...
5573                           Rofl. Its true to its name
Name: text, Length: 5574, dtype: object
0       1
1       1
2       0
3       1
4       1
       ..
5569    0
5570    1
5571    1
5572    1
5573    1
Name: type, Length: 5574, dtype: int64


In [6]:
#transform the text data to feature vectors that can be used as input to the svm model using TfidfVectorizer
#convert the text to lowercase letters
feature_extraction=TfidfVectorizer(min_df=1,stop_words='english',lowercase='True')
X=feature_extraction.fit_transform(X)
#Splitting the dataframe into test and train data
X_train,X_test,Y_train,Y_test=train_test_split(X,Y,train_size=0.8,test_size=0.2,random_state=100)

In [7]:
#convert y_train and y_test values to integers
Y_train=Y_train.astype('int')
Y_test=Y_test.astype('int')

In [8]:
#initialize the model
model = LinearSVC()
model.fit(X_train,Y_train)

LinearSVC()

In [9]:
#Predicted Y values for Xtrain data
predicted_y_train = model.predict(X_train)
#Checking the accuracy score of y_train vs predicted y_tain values
train_accuracy =accuracy_score(Y_train,predicted_y_train)
print(train_accuracy)

0.999775734469612


In [10]:
#predicting and checking the accuracy score for test data
predicted_y_test = model.predict(X_test)
test_accuracy = accuracy_score(Y_test,predicted_y_test)
print(test_accuracy)

0.989237668161435


Pediction on the new mail

In [11]:
input_mail = ["Free entry in 2 a wkly comp to win FA Cup final tkts 21st May 2005. Text FA to 87121 to receive entry question(std txt rate)T&C's apply 08452810075over18's"]

In [12]:
#conver the text to features using vertorization
input_mail_features = feature_extraction.transform(input_mail)

#making prediction
prediction_on_input = model.predict(input_mail_features)
print(prediction_on_input)

if (prediction_on_input[0] == 1):
  print("Ham mail")
else:
  print("Spam mail")


[0]
Spam mail


In [13]:
input_mail1 = ["As per your request 'Melle Melle (Oru Minnaminunginte Nurungu Vettam)' has been set as your callertune for all Callers. Press *9 to copy your friends Callertune"]

In [14]:
#conver the text to features using vertorization
input_mail_features = feature_extraction.transform(input_mail1)

#making prediction
prediction_on_input = model.predict(input_mail_features)
print(prediction_on_input)

if (prediction_on_input[0] == 1):
  print("Ham mail")
else:
  print("Spam mail")

[1]
Ham mail
