In [2]:
#importing dataset//////////////////////////////////////
import os
import pandas as pd
# Dataset from - https://archive.ics.uci.edu/ml/datasets/SMS+Spam+Collection
df = pd.read_table(os.path.join("smsspamcollection","SMSSpamCollection"),sep="\t",names=['label','sms_message'])
# Output printing out first 5 rows
df.head()

#data preprocessing////////////////////////////////////
df['label'] = df.label.map({'ham':0,'spam':1})


#Training & testing sets//////////////////////////////
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(df['sms_message'], 
                                                    df['label'], 
                                                    random_state=1)

print('Number of rows in the total set: {}'.format(df.shape[0]))
print('Number of rows in the training set: {}'.format(X_train.shape[0]))
print('Number of rows in the test set: {}'.format(X_test.shape[0]))


#Applying bag of words processing to data//////////////
from sklearn.feature_extraction.text import CountVectorizer
count_vector = CountVectorizer()
# Fit the training data and then return the matrix
training_data = count_vector.fit_transform(X_train)
# Transform testing data and return the matrix
testing_data = count_vector.transform(X_test)


#Naive Bayes based classifier using scikit-learn//////////////////
from sklearn.naive_bayes import MultinomialNB
naive_bayes = MultinomialNB()
naive_bayes.fit(training_data,y_train)
#prediction
predictions = naive_bayes.predict(testing_data)


#performance metrics//////////////////////////////////////////////////////////////////////////////
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
print('Accuracy score: ', format(accuracy_score(y_test,predictions)))
print('Precision score: ', format(precision_score(y_test,predictions)))
print('Recall score: ', format(recall_score(y_test,predictions)))
print('F1 score: ', format(f1_score(y_test,predictions)))

Number of rows in the total set: 5572
Number of rows in the training set: 4179
Number of rows in the test set: 1393
Accuracy score:  0.9885139985642498
Precision score:  0.9720670391061452
Recall score:  0.9405405405405406
F1 score:  0.9560439560439562
