In [25]:
import pandas as pd

# Load the tab-separated SMS corpus. header=None tells pandas not to treat the first
# row as column titles, so the two columns are named explicitly instead.
df = pd.read_csv(
    'smsSpam_Collection',
    sep='\t',
    header=None,
    names=['label', 'sms_message'],
)

In [26]:
# Preview the first rows to check that both columns were parsed.
df.head()

Unnamed: 0,label,sms_message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [27]:
# Encode the text labels as integers: 'ham' -> 0, 'spam' -> 1.
labeled = df['label'].map({'ham': 0, 'spam': 1})

In [28]:
# Show the encoded labels; the printed Series is elided in the middle ('..') because it is long.
print(labeled)

0       0
1       0
2       1
3       0
4       0
5       1
6       0
7       0
8       1
9       1
10      0
11      1
12      1
13      0
14      0
15      1
16      0
17      0
18      0
19      1
20      0
21      0
22      0
23      0
24      0
25      0
26      0
27      0
28      0
29      0
       ..
5542    0
5543    0
5544    0
5545    0
5546    0
5547    1
5548    0
5549    0
5550    0
5551    0
5552    0
5553    0
5554    0
5555    0
5556    0
5557    0
5558    0
5559    0
5560    0
5561    0
5562    0
5563    0
5564    0
5565    0
5566    1
5567    1
5568    0
5569    0
5570    0
5571    0
Name: label, Length: 5572, dtype: int64


In [29]:
# Replace the text labels in df with their numeric codes, then preview the result.
# NOTE(review): once this has run, df['label'] no longer contains 'ham'/'spam', so
# re-running the mapping cell on this df would presumably produce NaN for every row —
# reload df before re-running it.
df['label'] = labeled
df.head()

Unnamed: 0,label,sms_message
0,0,"Go until jurong point, crazy.. Available only ..."
1,0,Ok lar... Joking wif u oni...
2,1,Free entry in 2 a wkly comp to win FA Cup fina...
3,0,U dun say so early hor... U c already then say...
4,0,"Nah I don't think he goes to usf, he lives aro..."


In [52]:
# 2.2 Implementing Bag of Words from scratch
# Toy corpus used to walk through each preprocessing step by hand.
documents = [
    'Hello, how are you and and meme !',
    'Win money, win from home.',
    'Call me now.',
    'Hello, Call hello you tomorrow?',
]

# Step 1: normalise case so that 'Hello' and 'hello' are treated as the same word.
lower_case_documents = [text.lower() for text in documents]
print(lower_case_documents)

['hello, how are you and and meme !', 'win money, win from home.', 'call me now.', 'hello, call hello you tomorrow?']


In [31]:
import string

# Step 2: delete every character listed in string.punctuation from each document.
# The translation table is the same for all documents, so it is built once up front.
punctuation_remover = str.maketrans('', '', string.punctuation)
sans_punctuation_documents = [
    text.translate(punctuation_remover) for text in lower_case_documents
]
print(sans_punctuation_documents)

['hello how are you', 'win money win from home', 'call me now', 'hello call hello you tomorrow']


In [32]:
# Step 3: tokenise each cleaned document into a list of words.
# str.split() with no argument splits on runs of whitespace and ignores leading/trailing
# blanks. split(' ') would instead emit '' tokens wherever spaces repeat or trail: the first
# document becomes 'hello how are you and and meme ' once its '!' is stripped, which would
# yield a spurious '' "word" that the frequency count then tallies.
preprocessed_documents = [text.split() for text in sans_punctuation_documents]
print(preprocessed_documents)

[['hello', 'how', 'are', 'you'], ['win', 'money', 'win', 'from', 'home'], ['call', 'me', 'now'], ['hello', 'call', 'hello', 'you', 'tomorrow']]


In [33]:
import pprint
from collections import Counter

# Step 4: count how often each token occurs, producing one Counter per document.
frequency_list = [Counter(tokens) for tokens in preprocessed_documents]
pprint.pprint(frequency_list)

[Counter({'hello': 1, 'how': 1, 'are': 1, 'you': 1}),
 Counter({'win': 2, 'money': 1, 'from': 1, 'home': 1}),
 Counter({'call': 1, 'me': 1, 'now': 1}),
 Counter({'hello': 2, 'call': 1, 'you': 1, 'tomorrow': 1})]


In [59]:
# 2.3 Implementing Bag of Words in scikit-learn
from sklearn.feature_extraction.text import CountVectorizer
# stop_words='english' makes the vectorizer drop common English words
# (scikit-learn's built-in stop-word list) from the vocabulary.
count_vector = CountVectorizer(stop_words='english')

In [60]:
# Print the vectorizer to inspect the parameter values it was configured with.
print(count_vector)

CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), preprocessor=None, stop_words='english',
        strip_accents=None, token_pattern='(?u)\\b\\w\\w+\\b',
        tokenizer=None, vocabulary=None)


In [61]:
# Learn the vocabulary from the toy documents.
count_vector.fit(documents)

# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2 in favour of
# get_feature_names_out(). Use the new method when it exists and fall back otherwise, so
# the cell runs on both old and current installations. tolist() keeps the displayed value
# a plain list of str, matching what get_feature_names() returned.
if hasattr(count_vector, 'get_feature_names_out'):
    feature_names = count_vector.get_feature_names_out().tolist()
else:
    feature_names = count_vector.get_feature_names()
feature_names

['hello', 'home', 'meme', 'money', 'tomorrow', 'win']

In [62]:
# Encode every document as a row of per-word counts (one column per vocabulary word);
# toarray() turns the result of transform() into a dense NumPy array for display.
doc_array = count_vector.transform(documents).toarray()
print(documents)
doc_array

['Hello, how are you and and meme !', 'Win money, win from home.', 'Call me now.', 'Hello, Call hello you tomorrow?']


array([[1, 0, 1, 0, 0, 0],
       [0, 1, 0, 1, 0, 2],
       [0, 0, 0, 0, 0, 0],
       [2, 0, 0, 0, 1, 0]])

In [63]:
# Wrap the count array in a DataFrame whose columns are labelled with the vocabulary words.
# get_feature_names() was removed in scikit-learn 1.2; prefer get_feature_names_out()
# when the installed version provides it and fall back to the old method otherwise.
if hasattr(count_vector, 'get_feature_names_out'):
    column_names = count_vector.get_feature_names_out()
else:
    column_names = count_vector.get_feature_names()
frequency_matrix = pd.DataFrame(doc_array, columns=column_names)
frequency_matrix

Unnamed: 0,hello,home,meme,money,tomorrow,win
0,1,0,1,0,0,0
1,0,1,0,1,0,2
2,0,0,0,0,0,0
3,2,0,0,0,1,0


In [64]:
# 3.1 Training and testing sets
from sklearn.model_selection import train_test_split

# Split messages and labels together so each message stays paired with its label.
# No test_size is passed; the counts printed below show roughly a 75% / 25% split.
# random_state=1 pins the shuffle so the split is the same on every run.
X_train, X_test, y_train, y_test = train_test_split(
    df['sms_message'], df['label'], random_state=1
)
print('Number of rows in the total set: {}'.format(len(df)))
print('Number of rows in the training set: {}'.format(len(X_train)))
print('Number of rows in the test set: {}'.format(len(X_test)))

Number of rows in the total set: 5572
Number of rows in the training set: 4179
Number of rows in the test set: 1393


In [None]:
# 3.2 Applying Bag of Words processing to our dataset

# Instantiate the CountVectorizer method
count_vector = CountVectorizer()

# Learn the vocabulary from the training messages and return their document-term matrix
training_data = count_vector.fit_transform(X_train)
# The variable was originally misspelled 'training_date'; keep that name bound as an
# alias so any cell that still refers to it continues to work.
training_date = training_data

Formula for Bayes' Theorem:

P(D|Pos) = (P(D) * P(Pos|D)) / P(Pos)

The probability of getting a positive test result, P(Pos), can be calculated using the Sensitivity and Specificity as follows:
P(Pos) = [P(D) * Sensitivity] + [P(~D) * (1 - Specificity)]

In [66]:
# Worked example: probability that a diabetes test comes back positive.

# P(D): prior probability of having diabetes
p_diabetes = 0.01

# P(~D): prior probability of not having diabetes
p_no_diabetes = 0.99

# Sensitivity, i.e. P(Pos|D)
p_pos_diabetes = 0.9

# Specificity, i.e. P(Neg|~D)
p_neg_no_diabetes = 0.9

# P(Pos) = P(D) * Sensitivity + P(~D) * (1 - Specificity)
p_pos = (p_diabetes * p_pos_diabetes) + (p_no_diabetes * (1 - p_neg_no_diabetes))

# Call .format() on the template string. The original passed the format() builtin as a
# second print argument (comma instead of dot), which printed the literal '{}' followed
# by the number instead of substituting it.
print('The probability of getting a positive test result P(Pos) is: {}'.format(p_pos))

The probability of getting a positive test result P(Pos) is: {} 0.10799999999999998
