# Spam Classifier

## Importing the Libraries

In [11]:
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB, GaussianNB
from sklearn import svm
from sklearn.model_selection import GridSearchCV
from sklearn.utils import shuffle
from sklearn.model_selection import train_test_split

In [12]:
# uploading the dataset
data=pd.read_csv('spam.csv')

## Understanding the Dataset

In [13]:
# describing the datset
data.describe()

Unnamed: 0,Label,EmailText
count,5572,5572
unique,2,5169
top,ham,"Sorry, I'll call later"
freq,4825,30


In [14]:
# shape of the dataset 
data.shape

(5572, 2)

In [15]:
# column names
data.columns

Index(['Label', 'EmailText'], dtype='object')

In [16]:
# count of unique values in column name Label
data['Label'].value_counts()

ham     4825
spam     747
Name: Label, dtype: int64

### The given data set is not balanced as there is a huge difference between the count of 'spam' and 'ham'.

## Splitting the dataset into training and test 

In [28]:
x=data["EmailText"]
y=data["Label"]

In [35]:
xtrain,xtest,ytrain,ytest=train_test_split(x,y,train_size=0.25,random_state=42)

## Extracting the features

In [36]:
cv = CountVectorizer()  
features = cv.fit_transform(xtrain)

## Building the Model

In [37]:
tuned_parameters = {'kernel': ['rbf','linear'], 'gamma': [1e-3, 1e-4],
                     'C': [1, 10, 100, 1000]}

classifier = GridSearchCV(svm.SVC(), tuned_parameters)
classifier.fit(features,ytrain)

# printing the best model
classifier.best_params_

{'C': 100, 'gamma': 0.001, 'kernel': 'rbf'}

In [38]:
# model score 
classifier.score(cv.transform(xtest),ytest)

0.9748743718592965