## Load Dataset using Pandas

In [1]:
import pandas as pd
import numpy as np
data = pd.read_csv("mail_data.csv")

## Change the string datas into integers and add it in the Data

In [2]:
data1 = pd.get_dummies(data['Category'], drop_first=True)

In [3]:
new_data =  pd.concat([data, data1], axis=1)

In [4]:
new_data.head()

Unnamed: 0,Category,Message,spam
0,ham,"Go until jurong point, crazy.. Available only ...",0
1,ham,Ok lar... Joking wif u oni...,0
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,1
3,ham,U dun say so early hor... U c already then say...,0
4,ham,"Nah I don't think he goes to usf, he lives aro...",0


In [5]:
new_data.pop('Category')

0        ham
1        ham
2       spam
3        ham
4        ham
        ... 
5567    spam
5568     ham
5569     ham
5570     ham
5571     ham
Name: Category, Length: 5572, dtype: object

In [6]:
new_data

Unnamed: 0,Message,spam
0,"Go until jurong point, crazy.. Available only ...",0
1,Ok lar... Joking wif u oni...,0
2,Free entry in 2 a wkly comp to win FA Cup fina...,1
3,U dun say so early hor... U c already then say...,0
4,"Nah I don't think he goes to usf, he lives aro...",0
...,...,...
5567,This is the 2nd time we have tried 2 contact u...,1
5568,Will ü b going to esplanade fr home?,0
5569,"Pity, * was in mood for that. So...any other s...",0
5570,The guy did some bitching but I acted like i'd...,0


## Split modified data into training and testing set

In [7]:
from sklearn.model_selection import train_test_split
train_set, test_set = train_test_split(new_data, test_size=0.33, random_state=6)

In [8]:
x_train = train_set['Message']
y_train = train_set['spam']
x_test = train_set['Message']
y_test = train_set['spam']

In [9]:
y_train.describe()

count    3733.000000
mean        0.136887
std         0.343774
min         0.000000
25%         0.000000
50%         0.000000
75%         0.000000
max         1.000000
Name: spam, dtype: float64

## Convert Message into vectors(from strings) for model training using vectorizer

In [10]:
from sklearn.feature_extraction.text import TfidfVectorizer
vectorizer = TfidfVectorizer()
x_train_new = vectorizer.fit_transform(x_train)
x_test_new = vectorizer.fit_transform(x_test)

In [11]:
x_test_new[0]

<1x7024 sparse matrix of type '<class 'numpy.float64'>'
	with 4 stored elements in Compressed Sparse Row format>

## Import Model and train 

In [12]:
from sklearn import svm
model = svm.SVC(kernel='rbf') 
model.fit(x_train_new, y_train)
predictions = model.predict(x_test_new)

## Check prediction accuracy of model

In [13]:
from sklearn.metrics import accuracy_score
scores = accuracy_score(y_test, predictions)

In [14]:
scores*100

99.7856951513528

## Save Model using Pickel or Joblib

In [15]:
import pickle
with open('spam.pkl', 'wb') as files:
    pickle.dump(model, files)

In [16]:
with open('spam.pkl' , 'rb') as f:
    model = pickle.load(f)