### Classification for spam messages
Using Naive Bayes algorithm to classify messages


In [1]:
import pandas as pd

In [2]:
df = pd.read_csv('SMSSpamCollection', sep='\t', header=None, names=['Label', 'SMS'])
df.head()

Unnamed: 0,Label,SMS
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [3]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5572 entries, 0 to 5571
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   Label   5572 non-null   object
 1   SMS     5572 non-null   object
dtypes: object(2)
memory usage: 87.2+ KB


In [4]:
df.value_counts('Label', normalize=True)*100

Label
ham     86.593683
spam    13.406317
Name: proportion, dtype: float64

In [5]:
# Spliting the dataset into train (80%) and test (20%) dataset by randomizing and sampling. 
# About 80% of the dataset is 4458 while 1114 messages accounts for 20% roughly. 

randomized = df.sample( frac=1, random_state=1, ignore_index=False)
train_set = round(len(randomized)*0.8)

train_data = randomized[:train_set].reset_index(drop=True)
test_data = randomized[train_set:].reset_index(drop=True)

print(train_data.shape)
print(test_data.shape)

(4458, 2)
(1114, 2)


In [6]:
print(train_data.value_counts('Label', normalize=True)*100)
print(test_data.value_counts('Label', normalize=True)*100)

Label
ham     86.54105
spam    13.45895
Name: proportion, dtype: float64
Label
ham     86.804309
spam    13.195691
Name: proportion, dtype: float64


In [7]:
#removing punctuations from the SMS text
train_data['SMS'] = train_data['SMS'].str.replace(r'\W', ' ').str.lower()
train_data.head()

Unnamed: 0,Label,SMS
0,ham,"yep, by the pretty sculpture"
1,ham,"yes, princess. are you going to make me moan?"
2,ham,welp apparently he retired
3,ham,havent.
4,ham,i forgot 2 ask ü all smth.. there's a card on ...


In [8]:
# Extracting all the vocabulary in the SMS text
vocabulary = []
count = 0
for sms in train_data['SMS'].str.split():
    for word in sms:
        vocabulary.append(word)

print(len(vocabulary))
vocabulary = set(vocabulary)
vocabulary = list(vocabulary)
print(len(vocabulary))

69633
11860


In [9]:
for index, word in enumerate(vocabulary):
    if word == '':
        print(index, word)

In [10]:
# Transforming all the words in vocabulary into dictionary and then dataframe
word_count = {word: [0] * len(train_data['SMS']) for word in vocabulary}
count = 0
for index, sms in enumerate(train_data['SMS']):
    sms = sms.split()
    for text in sms:
        word_count[text][index] += 1

word_count_per_sms = pd.DataFrame(word_count)
word_count_per_sms.head(3)

Unnamed: 0,build,pobox12n146tf150p,cash!,told,"saturday,",simply,really.,woke,ovulate.when,"post,",...,"""margaret",arithmetic,ahhhh...just,"clothes,",join...,dem!!!,4u,haunt,lect,atten
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [11]:
word_count_per_sms['to'].value_counts()

to
0    3095
1    1038
2     247
3      53
4      17
5       5
7       1
8       1
6       1
Name: count, dtype: int64

In [12]:
# concatenating both word_count_per_sms and train_set dataframes
train_set_complete = pd.concat([train_data, word_count_per_sms], axis=1)
train_set_complete.head(3)

Unnamed: 0,Label,SMS,build,pobox12n146tf150p,cash!,told,"saturday,",simply,really.,woke,...,"""margaret",arithmetic,ahhhh...just,"clothes,",join...,dem!!!,4u,haunt,lect,atten
0,ham,"yep, by the pretty sculpture",0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,ham,"yes, princess. are you going to make me moan?",0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,ham,welp apparently he retired,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [13]:
## Calculating Constants, probability shall be represented with p, ham represents non_spam sms
## p(spam), p(ham), total words in spam sms, total words in non_spam and total vocabulary

### p(spam) and p(ham)
spam_sms = train_set_complete.loc[train_set_complete['Label'] == 'spam']
total_spam_sms = len(spam_sms)
total_non_spam_sms = len(train_set_complete) - total_spam_sms
p_spam = total_spam_sms / len(train_set_complete)
p_ham = total_non_spam_sms / len(train_set_complete)
print(p_spam)
print(p_ham)

0.13458950201884254
0.8654104979811574


In [14]:
## N_spam, N_ham, N_vocabulary, alpha is Laplace smoothing parameter for Naive Bayes
alpha = 1 
n_spam = 0
n_ham = 0

for sms in spam_sms['SMS'].str.split():
    n_spam += len(sms)

ham_sms = train_set_complete.loc[train_set_complete['Label'] == 'ham']

for sms in ham_sms['SMS'].str.split():
    n_ham += len(sms)

n_vocabulary = len(vocabulary)
      
print(n_ham, n_spam)

55376 14257


In [15]:
p_ham

0.8654104979811574

In [16]:
## Calculating parameters 
ham_parameter = {word:0 for word in vocabulary}
spam_parameter = {word : 0 for word in vocabulary}

for word in vocabulary: 
    word_given_spam = spam_sms[word].sum()
    prob_word_given_spam = (word_given_spam + alpha) / (n_spam + alpha * n_vocabulary)
    spam_parameter[word] = prob_word_given_spam
    
    word_given_ham = ham_sms[word].sum()
    prob_word_given_ham = (word_given_ham + alpha)/ (n_ham + alpha  * n_vocabulary)
    ham_parameter[word] = prob_word_given_ham


    

In [17]:
## Classifying new messages 
import re

def classify(message):

    message = re.sub(r'\W', ' ', message)
    message = message.lower()
    message = message.split()
 
    p_spam_given_message = p_spam
    p_ham_given_message = p_ham
    for word in message:
        if word in spam_parameter: 
            p_spam_given_message *= spam_parameter[word]
        if word in ham_parameter:
            p_ham_given_message *= ham_parameter[word]
        else: 
            continue

    print('P(Spam|message):', p_spam_given_message)
    print('P(Ham|message):', p_ham_given_message)

    if p_ham_given_message > p_spam_given_message:
        print('Label: Ham')
    elif p_ham_given_message < p_spam_given_message:
        print('Label: Spam')
    else:
        print('Equal proabilities, have a human classify this!')

In [18]:
word1 = 'WINNER!! This is the secret code to unlock the money: C3421.'
word2 = 'Sounds good, Tom, then see u there'

classify(word1)
classify(word2)

P(Spam|message): 1.1680023632078457e-26
P(Ham|message): 6.088544142463393e-28
Label: Spam
P(Spam|message): 2.234299283967944e-26
P(Ham|message): 8.376346103813855e-22
Label: Ham


In [19]:
## measuring spam filter's accuracy 
def classify_test_set(message):

    message = re.sub(r'\W', ' ', message)
    message = message.lower()
    message = message.split()
 
    p_spam_given_message = p_spam
    p_ham_given_message = p_ham
    for word in message:
        if word in spam_parameter: 
            p_spam_given_message *= spam_parameter[word]
        if word in ham_parameter:
            p_ham_given_message *= ham_parameter[word]
        else: 
            continue

    if p_ham_given_message > p_spam_given_message:
        return 'ham'
    elif p_ham_given_message < p_spam_given_message:
        return 'spam'
    else:
        return 'needs human classification'

test_data['predicted'] = train_data['SMS'].apply(classify_test_set)
test_data.head()

Unnamed: 0,Label,SMS,predicted
0,ham,Later i guess. I needa do mcat study too.,ham
1,ham,But i haf enuff space got like 4 mb...,ham
2,spam,Had your mobile 10 mths? Update to latest Oran...,ham
3,ham,All sounds good. Fingers . Makes it difficult ...,ham
4,ham,"All done, all handed in. Don't know if mega sh...",ham


In [20]:
## Measuring the accuracy
correct = 0 
total = test_data.shape[0]
for row in test_data.iterrows():
    row = row[1]
    if row['Label'] == row['predicted']:
        correct += 1

accuracy = correct/total 

print(accuracy)

0.7692998204667864
