# Classifier

First we open the SMS Spam Collection v.1 dataset. The corpus has been collected by Tiago Agostinho de Almeida (http://www.dt.fee.unicamp.br/~tiago) and José María Gómez Hidalgo (http://www.esp.uem.es/jmgomez), and can be found here: (https://archive.ics.uci.edu/ml/datasets/sms+spam+collection#)

We need a simple classifier example to illustrate our algorithm. We've chosen the tutorial by KDnuggets titled _Spam Filter in Python: Naive Bayes from Scratch_ for our example.

In [1758]:
import pandas as pd

sms_spam = pd.read_csv('SMSSpamCollection', sep='\t', header = None, names = ["Label", "SMS"])

print(sms_spam.shape)
sms_spam.head()

(5572, 2)


Unnamed: 0,Label,SMS
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


Now we count the number of spam and ham SMS. 

In [1759]:
sms_spam['Label'].value_counts(normalize=True)

ham     0.865937
spam    0.134063
Name: Label, dtype: float64

## Data Set Preparation
Now we randomize the data and prepare the dataset.

In [1760]:
# Randomize the dataset
data_randomized = sms_spam.sample(frac=1, random_state=1)

training_set_size_percentage = 3 / 100

# Calculate index for split
training_test_index = round(len(data_randomized) * training_set_size_percentage)

# Split into training and test sets
training_set = data_randomized[:training_test_index].reset_index(drop=True)
test_set = data_randomized[training_test_index:].reset_index(drop=True)

test_set_untouched = test_set.copy()
print(training_set.shape)
print(test_set.shape)

(167, 2)
(5405, 2)


In [1761]:
training_set['Label'].value_counts(normalize=True)

ham     0.874251
spam    0.125749
Name: Label, dtype: float64

In [1762]:
test_set['Label'].value_counts(normalize=True)

ham     0.86568
spam    0.13432
Name: Label, dtype: float64

### Data cleaning
We remove all the punctuation and make everything lowercase.

In [1763]:
training_set['SMS'] = training_set['SMS'].str.replace(
   '\W', ' ') # Removes punctuation
training_set['SMS'] = training_set['SMS'].str.lower()
training_set.head(3)

  training_set['SMS'] = training_set['SMS'].str.replace(


Unnamed: 0,Label,SMS
0,ham,yep by the pretty sculpture
1,ham,yes princess are you going to make me moan
2,ham,welp apparently he retired


### Creating the vocabulary
We make a set of all unique words in our data.

In [1764]:
training_set['SMS'] = training_set['SMS'].str.split()

vocabulary = []
for sms in training_set['SMS']:
    for word in sms:
        vocabulary.append(word)

vocabulary = list(set(vocabulary))
len(vocabulary)

1012

In [1765]:
word_counts_per_sms = {unique_word: [0] * len(training_set['SMS']) for unique_word in vocabulary}

for index, sms in enumerate(training_set['SMS']):
    for word in sms:
        word_counts_per_sms[word][index] += 1

In [1766]:
word_counts = pd.DataFrame(word_counts_per_sms)
word_counts.head()

Unnamed: 0,of,part,lays,lessons,look,help,aft,gettin,made,europe,...,very,pick,nap,rate,k61,sim,list,mgs,morning,into
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [1767]:
training_set_clean = pd.concat([training_set, word_counts], axis=1)
training_set_clean.head()

Unnamed: 0,Label,SMS,of,part,lays,lessons,look,help,aft,gettin,...,very,pick,nap,rate,k61,sim,list,mgs,morning,into
0,ham,"[yep, by, the, pretty, sculpture]",0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,ham,"[yes, princess, are, you, going, to, make, me,...",0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,ham,"[welp, apparently, he, retired]",0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,ham,[havent],0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,ham,"[i, forgot, 2, ask, ü, all, smth, there, s, a,...",0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


We use Laplace smoothing with $\alpha$ = 1

In [1768]:
# Isolating spam and ham messages first
spam_messages = training_set_clean[training_set_clean['Label'] == 'spam']
ham_messages = training_set_clean[training_set_clean['Label'] == 'ham']

# P(Spam) and P(Ham)
p_spam = len(spam_messages) / len(training_set_clean)
p_ham = len(ham_messages) / len(training_set_clean)

# N_Spam
n_words_per_spam_message = spam_messages['SMS'].apply(len)
n_spam = n_words_per_spam_message.sum()

# N_Ham
n_words_per_ham_message = ham_messages['SMS'].apply(len)
n_ham = n_words_per_ham_message.sum()

# N_Vocabulary
n_vocabulary = len(vocabulary)

# Laplace smoothing
alpha = 1

### Calculating Parameters

In [1769]:
# Initiate parameters
parameters_spam = {unique_word:0 for unique_word in vocabulary}
parameters_ham = {unique_word:0 for unique_word in vocabulary}

# Calculate parameters
for word in vocabulary:
    n_word_given_spam = spam_messages[word].sum() # spam_messages already defined
    p_word_given_spam = (n_word_given_spam + alpha) / (n_spam + alpha*n_vocabulary)
    parameters_spam[word] = p_word_given_spam

    n_word_given_ham = ham_messages[word].sum() # ham_messages already defined
    p_word_given_ham = (n_word_given_ham + alpha) / (n_ham + alpha*n_vocabulary)
    parameters_ham[word] = p_word_given_ham

In [1770]:
import re

def classify(message):
    '''
    message: a string
    '''

    message = re.sub('\W', ' ', message)
    message = message.lower().split()

    p_spam_given_message = p_spam
    p_ham_given_message = p_ham

    for word in message:
        if word in parameters_spam:
            p_spam_given_message *= parameters_spam[word]

        if word in parameters_ham: 
            p_ham_given_message *= parameters_ham[word]
    
    p_h = p_ham_given_message / (p_spam_given_message + p_ham_given_message)
    p_s = p_spam_given_message / (p_spam_given_message + p_ham_given_message)
    
    return p_s, p_h
    #return p_spam_given_message, p_ham_given_message
    
    

### Testing the classifier

In [1771]:
def classify_and_print(message):
    p_spam_given_message, p_ham_given_message = classify(message)
    
    
    
    print('P(Spam|message):', p_spam_given_message)
    print('P(Ham|message):', p_ham_given_message)
    
    
    if p_ham_given_message > p_spam_given_message:
        print('Label: Ham')
    elif p_ham_given_message < p_spam_given_message:
        print('Label: Spam')
    else:
        print('Equal proabilities, have a human classify this!')

In [1772]:
classify_and_print('WINNER!! This is the secret code to unlock the money: C3421.')

P(Spam|message): 0.016576949832007425
P(Ham|message): 0.9834230501679925
Label: Ham


In [1773]:
def classify_test_set(message):
    
    p_spam_given_message, p_ham_given_message = classify(message)
    
    
    if p_ham_given_message > p_spam_given_message:
        return 'ham'
    elif p_spam_given_message > p_ham_given_message:
        return 'spam'
    else:
        return 'needs human classification'
        
    '''
    
    p_h = p_ham_given_message / (p_spam_given_message + p_ham_given_message)
    p_s = p_spam_given_message / (p_spam_given_message + p_ham_given_message)
    
    if p_h > p_s:
        return 'ham'
    elif p_s > p_h:
        return 'spam'
    else:
        return 'needs human classification'
    '''

In [1774]:
test_set['predicted'] = test_set['SMS'].apply(classify_test_set)
test_set.head()

Unnamed: 0,Label,SMS,predicted
0,ham,"My Parents, My Kidz, My Friends n My Colleague...",ham
1,ham,"Yeah go on then, bored and depressed sittin wa...",ham
2,ham,Can you plz tell me the ans. BSLVYL sent via f...,ham
3,ham,Lol yes. Our friendship is hanging on a thread...,ham
4,ham,cool. We will have fun practicing making babies!,ham


### Calculating Accuracy

In [1775]:
correct = 0
total = test_set.shape[0]

for row in test_set.iterrows():
    row = row[1]
    if row['Label'] == row['predicted']:
        correct += 1

print('Correct:', correct)
print('Incorrect:', total - correct)
print('Accuracy:', correct/total)

Correct: 5170
Incorrect: 235
Accuracy: 0.9565217391304348


Now we calculate the number of false positives and false negatives.

In [1776]:
f_pos = 0
f_neg = 0

for row in test_set.iterrows():
    row = row[1]
    if row['Label'] == "ham" and row['predicted'] == "spam":
        f_pos += 1
    if row['Label'] == "spam" and row['predicted'] == "ham":
        f_neg += 1

print('False positives: (when a true ham is falsly flagged as spam)', f_pos)  
print('False negatives: (when a true Spam is falsly allowed as ham)', f_neg)

False positives: (when a true ham is falsly flagged as spam) 27
False negatives: (when a true Spam is falsly allowed as ham) 207


# FGC-Classify
Now we investigate the same classification when we run it through FGC-Classify algorithm. We try and reduce the false negatives by setting the utilities appropriately.

![Image: Classification Game](./game.png)

In [1777]:
# Make a copy of the original test set
test_set2 = test_set_untouched.copy()

In [1778]:
def fgc_classify_test_set(message, utility):
     
    # utilities are (HS, HH, SS, SH) tuple
    
    p_spam_given_message, p_ham_given_message = classify(message)
    
    p_h = p_ham_given_message #/ (p_spam_given_message + p_ham_given_message)
    p_s = p_spam_given_message #/ (p_spam_given_message + p_ham_given_message)
    
    #print (p_h, p_s, p_h+p_s)
    
    payoff = utility[0] * p_s, utility[1] * p_h, utility[2] * p_s, utility[3] * p_h
    
    #print(payoff)
    
    if max(payoff) == payoff[0] or max(payoff) == payoff[2]:
        return 'spam'
    elif max(payoff) == payoff[1] or max(payoff) == payoff[3]:
        return 'ham'
    else:
        return 'needs human classification'

## Testing the FGC-Classifier

In [1779]:
print(fgc_classify_test_set('WINNER!! This is the secret code to unlock the money: C3421.', (1,1,1,1)))

ham


### New Test Set

In [1780]:
test_util = (1,10,10,2)

def fgc(msg):
    return fgc_classify_test_set(msg, test_util)

test_set2['predicted'] = test_set2['SMS'].apply(fgc)
test_set2.head()

Unnamed: 0,Label,SMS,predicted
0,ham,"My Parents, My Kidz, My Friends n My Colleague...",ham
1,ham,"Yeah go on then, bored and depressed sittin wa...",ham
2,ham,Can you plz tell me the ans. BSLVYL sent via f...,ham
3,ham,Lol yes. Our friendship is hanging on a thread...,ham
4,ham,cool. We will have fun practicing making babies!,ham


In [1781]:
correct = 0
total = test_set2.shape[0]

for row in test_set2.iterrows():
    row = row[1]
    if row['Label'] == row['predicted']:
        correct += 1

print('Correct:', correct)
print('Incorrect:', total - correct)
print('Accuracy:', correct/total)

f_pos = 0
f_neg = 0

for row in test_set2.iterrows():
    row = row[1]
    if row['Label'] == "ham" and row['predicted'] == "spam":
        f_pos += 1
    if row['Label'] == "spam" and row['predicted'] == "ham":
        f_neg += 1

print('False positives: (when a true ham is falsly flagged as spam)', f_pos)  
print('False negatives: (when a true Spam is falsly allowed as ham)', f_neg)


Correct: 5170
Incorrect: 235
Accuracy: 0.9565217391304348
False positives: (when a true ham is falsly flagged as spam) 27
False negatives: (when a true Spam is falsly allowed as ham) 207


# Reducing False Positives

In [1782]:
def run_fgc_for_util(util):
    # Make a copy of the original test set
    test_set_u = test_set_untouched.copy()
    
    def fgc(msg):
        return fgc_classify_test_set(msg, util)

    test_set_u['predicted'] = test_set_u['SMS'].apply(fgc)
    test_set_u.head()
    
    correct = 0
    total = test_set_u.shape[0]
    f_pos = 0
    f_neg = 0

    for row in test_set_u.iterrows():
        row = row[1]
        if row['Label'] == row['predicted']:
            correct += 1
        if row['Label'] == "ham" and row['predicted'] == "spam":
            f_pos += 1
        if row['Label'] == "spam" and row['predicted'] == "ham":
            f_neg += 1

    return correct, total - correct, correct/total, f_pos, f_neg
    print('Correct:', correct)
    print('Incorrect:', total - correct)
    print('Accuracy:', correct/total)
    print('False positives: (when a true ham is falsly flagged as spam)', f_pos)  
    print('False negatives: (when a true Spam is falsly allowed as ham)', f_neg)
    
    

## Reducing False positives by having more utility in S,H

In [1789]:
print("Correct, Incorrect, Accuracy, F_Pos, F_Neg")
print(run_fgc_for_util((1,10,10,1)))
print(run_fgc_for_util((1,10,10,3)))
print(run_fgc_for_util((1,10,10,7)))
print(run_fgc_for_util((1,10,10,10)))
print(run_fgc_for_util((1,10,10,12)))
print(run_fgc_for_util((1,10,10,15)))
print(run_fgc_for_util((1,10,10,17)))
print(run_fgc_for_util((1,10,10,20)))
print(run_fgc_for_util((1,10,10,30)))
print(run_fgc_for_util((1,10,10,50)))
print(run_fgc_for_util((1,10,10,100)))
print(run_fgc_for_util((1,10,10,200)))

(5170, 235, 0.9565217391304348, 27, 207)
(5170, 235, 0.9565217391304348, 27, 207)
(5170, 235, 0.9565217391304348, 27, 207)
(5170, 235, 0.9565217391304348, 27, 207)
(5162, 243, 0.9550416281221091, 25, 217)
(5161, 244, 0.9548566142460685, 17, 226)
(5159, 246, 0.9544865864939871, 16, 229)
(5153, 252, 0.9533765032377428, 12, 239)
(5143, 262, 0.9515263644773359, 4, 257)
(5126, 279, 0.9483811285846439, 3, 275)
(5087, 318, 0.9411655874190564, 3, 314)
(5068, 337, 0.937650323774283, 2, 334)


### Reducing false negatives by reducing H,H vs S,H

In [1792]:
print("Correct, Incorrect, Accuracy, F_Pos, F_Neg")
print(run_fgc_for_util((1,10,100,1)))
print(run_fgc_for_util((1,10,100,3)))
print(run_fgc_for_util((1,10,1000,70)))


Correct, Incorrect, Accuracy, F_Pos, F_Neg
(5058, 347, 0.9358001850138761, 217, 129)
(5058, 347, 0.9358001850138761, 217, 129)
(4995, 410, 0.9241443108233117, 288, 121)
