# Spam Classifier Code using Naive Bayes and Scikit-Learn Tools

## Importing required modules

In [12]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

## Reading the data

In [13]:
# Load the tab-separated SMS corpus. header=None makes pandas read the first
# line as data, and the two column names are supplied explicitly instead.
df = pd.read_csv(
    'SMSSpamCollection',
    names=['label', 'sms_message'],
    header=None,
    sep='\t',
)

# Preview the first few rows.
df.head()

Unnamed: 0,label,sms_message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


## Changing target labels to 0 and 1

In [14]:
# Encode the text labels as integers: ham -> 0, spam -> 1.
# The guard keeps this cell safe to re-run: mapping a second time would look
# up 0/1 in the dict, find no match, and silently turn every label into NaN.
label_codes = {'ham': 0, 'spam': 1}
if df['label'].isin(list(label_codes)).any():
    df['label'] = df['label'].map(label_codes)
print(df.shape)
df.head()

(5572, 2)


Unnamed: 0,label,sms_message
0,0,"Go until jurong point, crazy.. Available only ..."
1,0,Ok lar... Joking wif u oni...
2,1,Free entry in 2 a wkly comp to win FA Cup fina...
3,0,U dun say so early hor... U c already then say...
4,0,"Nah I don't think he goes to usf, he lives aro..."


## Splitting the data into Training and Testing data

In [15]:
# Split messages and labels into training and test portions. No test_size is
# given, so scikit-learn's default proportion applies; random_state pins the
# shuffle so the same split is produced on every run.
X_train, X_test, y_train, y_test = train_test_split(
    df['sms_message'],
    df['label'],
    random_state=1,
)

# Report how many rows ended up in each portion.
print('Number of rows in the total set: {}'.format(len(df)))
print('Number of rows in the training set: {}'.format(len(X_train)))
print('Number of rows in the test set: {}'.format(len(X_test)))

Number of rows in the total set: 5572
Number of rows in the training set: 4179
Number of rows in the test set: 1393


## Bag of Words using sklearn CountVectorizer

In [8]:
# Build bag-of-words features. The vocabulary is learned from the training
# messages only; the test messages are then encoded with that same fitted
# vectorizer.
count_vector = CountVectorizer()
training_data = count_vector.fit_transform(raw_documents=X_train)
testing_data = count_vector.transform(raw_documents=X_test)

## Fitting the Naive Bayes model to the training data

In [16]:
# Create a multinomial Naive Bayes classifier with its default settings and
# fit it on the training word counts. fit() is the last expression, so the
# fitted estimator is displayed as the cell output.
naive_bayes = MultinomialNB()
naive_bayes.fit(X=training_data, y=y_train)

MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

## Predictions on Test data

In [17]:
# Predict a label for every message in the held-out test features.
predictions = naive_bayes.predict(X=testing_data)

## Evaluation of model

In [18]:
# Score the test-set predictions with each metric in turn. The caption and the
# formatted value are passed to print() as two separate arguments, so the
# printed lines keep their original spacing.
for caption, metric in (
    ('Accuracy score: ', accuracy_score),
    ('Precision score: ', precision_score),
    ('Recall score: ', recall_score),
    ('F1 score: ', f1_score),
):
    print(caption, format(metric(y_test, predictions)))

Accuracy score:  0.9885139985642498
Precision score:  0.9720670391061452
Recall score:  0.9405405405405406
F1 score:  0.9560439560439562


## Saving the Model

In [20]:
import pickle

In [22]:
# Saving
# Persist the fitted classifier to disk. The with-block closes the file even
# if pickle.dump raises, which the manual open()/close() pair did not.
# NOTE(review): only the classifier is saved. The cells below also use the
# fitted `count_vector` to turn text into features, so a fresh session would
# need that vectorizer as well - consider persisting it alongside the model.
with open('spam_classifier.pickle', 'wb') as f:
    pickle.dump(naive_bayes, f)

In [23]:
# Loading Later
# Restore the classifier from disk. The with-block closes the file even if
# unpickling fails.
# SECURITY: pickle.load can execute arbitrary code while deserializing, so
# only load pickle files that you created yourself or otherwise trust.
with open('spam_classifier.pickle', 'rb') as f:
    classifier = pickle.load(f)

In [24]:
# Run the reloaded classifier over the same test features as before.
new_predictions = classifier.predict(X=testing_data)

In [29]:
# Display the reloaded model's prediction at position 511 of the test set.
# NOTE(review): the notebook gives no reason for choosing index 511 - it looks
# like an arbitrary spot check; confirm or replace with a named constant.
new_predictions[511]

0

## Checking the model on a user-supplied message

In [46]:
# Ask the user for a message to classify. The prompt ends with ": " so the
# typed text no longer runs straight into the prompt wording.
# NOTE: everything below this cell depends on interactive input, so its
# results are not reproducible from code alone.
strr = input('Enter your message: ')

Enter your messageWin Win Win Win


In [47]:
# Echo the captured message to confirm what input() returned.
strr

'Win Win Win Win'

In [49]:
# Write the typed message to input.txt in a single step. The with-block closes
# the file even if the write raises; previously open, write and close lived in
# three separate cells, so a failure in between left the handle open.
# The encoding is set to UTF-8 explicitly because pd.read_csv, which reads this
# file back by path, assumes UTF-8 by default, whereas open() would otherwise
# use the platform's locale encoding.
with open("input.txt", "w", encoding="utf-8") as filee:
    filee.write(strr)

In [52]:
# NOTE(review): this handle is never read from or closed in the cells shown
# here; the next cell reads input.txt by path through pd.read_csv instead.
# If nothing later uses `readd`, this cell can be dropped.
readd = open("input.txt","r")

In [53]:
# Read the message file back as a tab-separated table.
# NOTE(review): header=None is not passed, so pandas takes the first line of
# input.txt as the column header. The recorded output below shows the typed
# message as the column name with no data rows. The later
# count_vector.transform(df2) call iterates over those column labels.
df2 = pd.read_csv('input.txt',
                   sep='\t')

In [54]:
# Preview the parsed frame. In the recorded run it has the message as its only
# column label and contains no rows.
df2.head()

Unnamed: 0,Win Win Win Win


In [56]:
# Vectorize the typed message itself, wrapped in a list because transform()
# expects an iterable of documents.
# Passing df2 only worked by accident: iterating a DataFrame yields its column
# labels, and read_csv had turned the message into the header row. A message
# containing tabs, quotes or newlines would have been split or altered on that
# path, so the text is taken straight from `strr` instead.
tes_data = count_vector.transform([strr])

In [57]:
# Classify the vectorized user message with the reloaded model.
new_prediction = classifier.predict(X=tes_data)

In [58]:
# Display the predicted label for the typed message. Labels were encoded
# earlier in the notebook as ham -> 0 and spam -> 1.
new_prediction[0]

1