# Building a Spam Detector

In [3]:
import numpy as np
import pandas as pd

In [4]:
# Load the tab-separated SMS corpus. The file has no header row, so the two
# column names are supplied explicitly.
df = pd.read_csv(
    'smsspamcollection/SMSSpamCollection',
    sep='\t',
    header=None,
    names=['label', 'sms_message'],
)

In [5]:
# Preview the first five rows to check that both columns were parsed.
df.head()

Unnamed: 0,label,sms_message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [6]:
# Encode the labels as integers: ham -> 0, spam -> 1.
# The identity entries (0 -> 0, 1 -> 1) make this cell safe to re-run: with only
# the string keys, a second run would map the already-encoded 0/1 values to NaN
# and silently wipe out every label.
df['label'] = df['label'].map({'ham': 0, 'spam': 1, 0: 0, 1: 1})
# Fail early if the column contains anything other than the expected labels.
assert df['label'].notna().all(), 'unexpected value in label column'

In [7]:
# Preview again: the label column now holds 0/1 instead of 'ham'/'spam'.
df.head()

Unnamed: 0,label,sms_message
0,0,"Go until jurong point, crazy.. Available only ..."
1,0,Ok lar... Joking wif u oni...
2,1,Free entry in 2 a wkly comp to win FA Cup fina...
3,0,U dun say so early hor... U c already then say...
4,0,"Nah I don't think he goes to usf, he lives aro..."


In [8]:
# (rows, columns) of the loaded dataset.
df.shape

(5572, 2)

In [12]:
# Things to watch for when turning raw text into word counts
# (presumably the reason for the CountVectorizer settings used below):
#   - case:        'He' and 'he'
#   - punctuation: 'Hello!' and 'hello'
#   - stop_words
# Raw example message (row label 1) for reference:
print(df.loc[1, 'sms_message'])

Ok lar... Joking wif u oni...


In [13]:
# Tiny hand-written corpus used to demonstrate the bag-of-words steps.
documents = [
    'Hello, how are you!',
    'Win money, earn from home.',
    'Call me now',
    'Hello, call you tomorrow?',
]

In [14]:
from sklearn.feature_extraction.text import CountVectorizer

In [25]:
# Vectorizer for the toy corpus; common English stop words are excluded.
count_vector = CountVectorizer(stop_words = 'english')

In [26]:
# IPython-only help lookup (`?` is not plain Python). Leftover from exploration;
# safe to delete from the finished notebook.
count_vector?

In [27]:
# Show the vectorizer's configuration.
print(count_vector)

CountVectorizer(analyzer='word', binary=False, decode_error='strict',
                dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
                lowercase=True, max_df=1.0, max_features=None, min_df=1,
                ngram_range=(1, 1), preprocessor=None, stop_words='english',
                strip_accents=None, token_pattern='(?u)\\b\\w\\w+\\b',
                tokenizer=None, vocabulary=None)


In [28]:
# Learn the vocabulary from the toy corpus. fit() returns the vectorizer itself,
# which is why the cell echoes its repr.
count_vector.fit(documents)

CountVectorizer(analyzer='word', binary=False, decode_error='strict',
                dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
                lowercase=True, max_df=1.0, max_features=None, min_df=1,
                ngram_range=(1, 1), preprocessor=None, stop_words='english',
                strip_accents=None, token_pattern='(?u)\\b\\w\\w+\\b',
                tokenizer=None, vocabulary=None)

In [29]:
# `get_feature_names()` was deprecated in scikit-learn 1.0 and removed in 1.2 in
# favour of `get_feature_names_out()`. Prefer the new method when it exists and
# fall back to the old one so the cell still runs on older installs.
if hasattr(count_vector, 'get_feature_names_out'):
    # The new API returns a NumPy array; convert so `names` stays a plain list.
    names = count_vector.get_feature_names_out().tolist()
else:
    names = count_vector.get_feature_names()

In [30]:
# Vocabulary learned from the toy corpus; used below as the column labels of
# the count matrix.
names

['earn', 'hello', 'home', 'money', 'tomorrow', 'win']

In [31]:
# Count the vocabulary terms in each toy document, then densify the sparse
# result so it can be inspected directly.
doc_counts = count_vector.transform(documents)
doc_array = doc_counts.toarray()

In [32]:
# Dense count matrix: one row per document, one column per vocabulary term.
doc_array

array([[0, 1, 0, 0, 0, 0],
       [1, 0, 1, 1, 0, 1],
       [0, 0, 0, 0, 0, 0],
       [0, 1, 0, 0, 1, 0]])

In [33]:
# Wrap the count matrix in a DataFrame so each column carries its vocabulary term.
frequency_matrix = pd.DataFrame(doc_array, columns=names)

In [34]:
# Same counts as a labelled table (columns are the vocabulary terms).
frequency_matrix

Unnamed: 0,earn,hello,home,money,tomorrow,win
0,0,1,0,0,0,0
1,1,0,1,1,0,1
2,0,0,0,0,0,0
3,0,1,0,0,1,0


In [35]:
from sklearn.model_selection import train_test_split

In [36]:
# Hold out part of the messages for evaluation. random_state is fixed so the
# split is reproducible between runs.
x_train, x_test, y_train, y_test = train_test_split(
    df['sms_message'],
    df['label'],
    random_state=1,
)

In [37]:
# Report how the rows were divided between the training and testing sets.
split_report = (
    ('Total number of rows: {}', df),
    ('Number of rows in the training dataset: {}', x_train),
    ('Number of rows in the testing dataset: {}', x_test),
)
for template, dataset in split_report:
    print(template.format(dataset.shape[0]))

Total number of rows: 5572
Number of rows in the training dataset: 4179
Number of rows in the testing dataset: 1393


In [39]:
# NOTE: this rebinds `count_vector`, replacing the stop-word vectorizer that was
# fitted on the toy `documents` earlier. This one uses the default settings and
# is the vectorizer used for the real SMS data from here on.
count_vector = CountVectorizer()

In [67]:
# Learn the vocabulary from the training messages only and return their
# count matrix.
training_data = count_vector.fit_transform(x_train)

In [68]:
# Reuse the vocabulary learned from the training set (transform, not
# fit_transform), so the test columns line up with the training columns.
testing_data = count_vector.transform(x_test)

In [42]:
from sklearn.naive_bayes import MultinomialNB
# Multinomial Naive Bayes classifier with default settings.
naive_bayes = MultinomialNB()

In [43]:
# Train the classifier on the training count matrix and its labels.
# fit() returns the estimator, so the cell echoes its repr.
naive_bayes.fit(training_data, y_train)

MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

In [44]:
# Predict a label for every message in the held-out test set.
predictions = naive_bayes.predict(testing_data)

In [45]:
# Accuracy = Total number of correct predictions / Total number of predictions
# Precision = True Positives / (True Positives + False Positives)
# True Positives are those SMSs that were classified as spam and were actually spam
# False Positives are those SMSs that were classified as spam but were not actually spam
# Recall = True Positives / (True Positives + False Negatives)
# False Negatives are those SMSs that were classified as not spam but were actually spam
# F-Measure (F1) = 2 * Precision * Recall / (Precision + Recall)

In [54]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

In [50]:
# Share of test messages whose predicted label matches the true label.
accuracy = accuracy_score(y_test, predictions)
print('Accuracy Score: {}'.format(accuracy))

Accuracy Score: 0.9885139985642498


In [51]:
# Of the messages predicted as spam, the share that really were spam.
precision = precision_score(y_test, predictions)
print('Precision Score: {}'.format(precision))

Precision Score: 0.9720670391061452


In [52]:
# IPython-only help lookup (`?` is not plain Python). Leftover from exploration;
# safe to delete from the finished notebook.
recall_score?

In [53]:
# Of the messages that really were spam, the share the model caught.
recall = recall_score(y_test, predictions)
print('Recall Score {}'.format(recall))

Recall Score 0.9405405405405406


In [55]:
# IPython-only help lookup (`?` is not plain Python). Leftover from exploration;
# safe to delete from the finished notebook.
f1_score?

In [56]:
# Single score combining precision and recall.
f_measure = f1_score(y_test, predictions)
print('F-Measure {}'.format(f_measure))

F-Measure 0.9560439560439562


In [57]:
# `get_feature_names()` was deprecated in scikit-learn 1.0 and removed in 1.2 in
# favour of `get_feature_names_out()`. Prefer the new method when it exists and
# fall back to the old one so the cell still runs on older installs.
if hasattr(count_vector, 'get_feature_names_out'):
    # The new API returns a NumPy array; convert so `names` stays a plain list.
    names = count_vector.get_feature_names_out().tolist()
else:
    names = count_vector.get_feature_names()

In [65]:
# Vocabulary size: the number of feature names. len() replaces the manual
# counting loop and gives the same value.
count = len(names)

In [66]:
# Number of entries in `names`, i.e. the size of the vectorizer's vocabulary.
count

7456