In [6]:
# Getting the data set
import pandas as pd
df = pd.read_table('SMSSpamCollection',
                  sep='\t',
                  header=None,
                  names=['label','sms_message'])
df.head()

Unnamed: 0,label,sms_message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [7]:
# Data Preprocessing 

df['label'] = df.label.map({'ham':0,'spam':1})
print(df.shape)
df.head()

(5572, 2)


Unnamed: 0,label,sms_message
0,0,"Go until jurong point, crazy.. Available only ..."
1,0,Ok lar... Joking wif u oni...
2,1,Free entry in 2 a wkly comp to win FA Cup fina...
3,0,U dun say so early hor... U c already then say...
4,0,"Nah I don't think he goes to usf, he lives aro..."


In [8]:
# Implimenting Bag of Words 

document = ['Hello, how are you?',
           'Win money, win from home!',
           'Call me now.',
           'Hello, Call hello you tomorrow!!'
           ]
lower_case_document = []
for i in document:
    lower_case_document.append(i.lower())
print(lower_case_document)

['hello, how are you?', 'win money, win from home!', 'call me now.', 'hello, call hello you tomorrow!!']


In [9]:
# Removing Punctations

punctuation_document = []
import string

for i in lower_case_document:
    punctuation_document.append(i.translate(str.maketrans('','',string.punctuation)))
print(punctuation_document)

['hello how are you', 'win money win from home', 'call me now', 'hello call hello you tomorrow']


In [10]:
# Tokenization

preprocessed_document = []
for i in punctuation_document:
    preprocessed_document.append(i.split(' '))
print(preprocessed_document)

[['hello', 'how', 'are', 'you'], ['win', 'money', 'win', 'from', 'home'], ['call', 'me', 'now'], ['hello', 'call', 'hello', 'you', 'tomorrow']]


In [11]:
# Count Frequency 

frequency_list = []
import pprint
from collections import Counter

for i in preprocessed_document:
    frequency_count = Counter(i)
    frequency_list.append(frequency_count)
pprint.pprint(frequency_list)

[Counter({'hello': 1, 'how': 1, 'are': 1, 'you': 1}),
 Counter({'win': 2, 'money': 1, 'from': 1, 'home': 1}),
 Counter({'call': 1, 'me': 1, 'now': 1}),
 Counter({'hello': 2, 'call': 1, 'you': 1, 'tomorrow': 1})]


In [12]:
# Implementing Bag of Words in scikit-Learn

from sklearn.feature_extraction.text import CountVectorizer
count_vector = CountVectorizer()
print(count_vector)

CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), preprocessor=None, stop_words=None,
        strip_accents=None, token_pattern='(?u)\\b\\w\\w+\\b',
        tokenizer=None, vocabulary=None)


In [14]:
count_vector.fit(document)
count_vector.get_feature_names()

['are',
 'call',
 'from',
 'hello',
 'home',
 'how',
 'me',
 'money',
 'now',
 'tomorrow',
 'win',
 'you']

In [15]:
doc_array = count_vector.transform(document).toarray()
doc_array

array([[1, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 1],
       [0, 0, 1, 0, 1, 0, 0, 1, 0, 0, 2, 0],
       [0, 1, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0],
       [0, 1, 0, 2, 0, 0, 0, 0, 0, 1, 0, 1]], dtype=int64)

In [16]:
frequency_matrix = pd.DataFrame(doc_array,columns=count_vector.get_feature_names())

In [17]:
frequency_matrix

Unnamed: 0,are,call,from,hello,home,how,me,money,now,tomorrow,win,you
0,1,0,0,1,0,1,0,0,0,0,0,1
1,0,0,1,0,1,0,0,1,0,0,2,0
2,0,1,0,0,0,0,1,0,1,0,0,0
3,0,1,0,2,0,0,0,0,0,1,0,1


In [18]:
from sklearn.cross_validation import train_test_split

X_train, X_test, y_train, y_test = train_test_split(df['sms_message'],
                                                   df['label'],
                                                   random_state=1)
print('Number of rows in total : {}'.format(df.shape[0]))
print('Number of rows in training set : {}'.format(X_train.shape[0]))
print('Number of rows in Testing set : {}'.format(X_test.shape[0]))

Number of rows in total : 5572
Number of rows in training set : 4179
Number of rows in Testing set : 1393


In [19]:
count_vector = CountVectorizer()
training_data = count_vector.fit_transform(X_train)
testing_data = count_vector.fit_transform(X_test)

In [20]:
from sklearn.naive_bayes import MultinomialNB
naive_bayes = MultinomialNB()
naive_bayes.fit(training_data,y_train)

MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

In [22]:
predictions = naive_bayes.predict(testing_data)

ValueError: dimension mismatch