In [1]:
import pandas as pd
import sklearn
import numpy as np
import matplotlib.pyplot as plt
from sklearn.utils import resample

In [2]:
from sklearn.naive_bayes import BernoulliNB

# Download the tab-separated SMS collection (no header row) and name its
# two columns: the label and the message text.
data_path = (
    "https://raw.githubusercontent.com/Thinkful-Ed/data-201-resources/"
    "master/sms_spam_collection/SMSSpamCollection"
)
sms_raw = pd.read_csv(data_path, delimiter='\t', header=None)
sms_raw.columns = ['spam', 'message']

# Baseline list of words we suspect are typical of spam.
keywords = ['click', 'offer', 'winner', 'buy', 'free', 'cash', 'urgent']

# One boolean column per keyword (case-insensitive match).
# NOTE: the keyword is padded with a space on both sides, so it is not
# matched at the very start/end of a message or when touching punctuation.
for key in keywords:
    padded_key = ' ' + key + ' '
    sms_raw[key] = sms_raw['message'].str.contains(padded_key, case=False)

# Extra feature: the message is written entirely in capitals.
sms_raw['allcaps'] = sms_raw['message'].str.isupper()
# Recode the label as a boolean: True means spam.
sms_raw['spam'] = sms_raw['spam'].eq('spam')

data = sms_raw[[*keywords, 'allcaps']]
target = sms_raw['spam']

# Fit Bernoulli naive Bayes on every row, then predict those same rows.
bnb = BernoulliNB()
bnb.fit(data, target)
y_pred = bnb.predict(data)

In [3]:
# Mean accuracy on the same rows the model was fitted on (in-sample score).
bnb.score(X=data, y=target)

0.8916008614501076

# Version 1

Oversampling minority class (spam)

In [4]:
# Per-column count of the rows labelled as not spam (the majority class).
sms_raw[sms_raw['spam'].eq(False)].count()

spam       4825
message    4825
click      4825
offer      4825
winner     4825
buy        4825
free       4825
cash       4825
urgent     4825
allcaps    4825
dtype: int64

In [5]:
# Reload the tab-separated SMS collection so this cell starts from a clean frame.
data_path = ("https://raw.githubusercontent.com/Thinkful-Ed/data-201-resources/"
             "master/sms_spam_collection/SMSSpamCollection"
            )
sms_raw = pd.read_csv(data_path, delimiter='\t', header=None)
sms_raw.columns = ['spam', 'message']

# Enumerate our spammy keywords.
keywords = ['click', 'offer', 'winner', 'buy', 'free', 'cash', 'urgent']

# One boolean column per keyword (case-insensitive match).
# NOTE: the keyword is padded with a space on both sides, so it is not
# matched at the very start/end of a message or when touching punctuation.
for key in keywords:
    sms_raw[key] = sms_raw['message'].str.contains(' ' + key + ' ', case=False)

# Extra feature: the message is written entirely in capitals.
sms_raw['allcaps'] = sms_raw['message'].str.isupper()
# Recode the label as a boolean: True means spam.
sms_raw['spam'] = (sms_raw['spam'] == 'spam')
data = sms_raw[keywords + ['allcaps']]
target = sms_raw['spam']

# Separate majority (not spam) and minority (spam) classes.
sms_majority = sms_raw[sms_raw.spam == False]
sms_minority = sms_raw[sms_raw.spam == True]

# Upsample the minority class. The target size is taken from the majority
# frame instead of a hard-coded row count, so the classes stay balanced even
# if the source data changes.
sms_minority_upsampled = resample(sms_minority,
                                  replace=True,                 # sample with replacement
                                  n_samples=len(sms_majority),  # match the majority class size
                                  random_state=123)             # reproducible results

# Combine the majority class with the upsampled minority class.
sms_upsampled = pd.concat([sms_majority, sms_minority_upsampled])

# Display new class counts.
sms_upsampled.spam.value_counts()

True     4825
False    4825
Name: spam, dtype: int64

In [6]:
from sklearn.naive_bayes import BernoulliNB

# Features and labels taken from the upsampled (class-balanced) frame.
data1 = sms_upsampled[[*keywords, 'allcaps']]
target1 = sms_upsampled['spam']

# Train a fresh model on the balanced rows, predict those same rows,
# and display the in-sample accuracy.
bnb = BernoulliNB()
bnb.fit(data1, target1)
y_pred = bnb.predict(data1)
bnb.score(data1, target1)

0.6283937823834197

In [7]:
# Refit on the upsampled data, then predict the original (unbalanced) rows.
bnb.fit(data1, target1)
y_pred = bnb.predict(data)

# Count the original rows whose prediction disagrees with the true label.
n_points = data.shape[0]
n_mislabeled = (target != y_pred).sum()
print("Number of mislabeled points out of a total {} points : {}".format(
    n_points, n_mislabeled
))

Number of mislabeled points out of a total 5572 points : 604


# Version 2 

Undersample majority class 

In [8]:
# Per-column count of the rows labelled as spam (the minority class).
sms_raw[sms_raw['spam'].eq(True)].count()

spam       747
message    747
click      747
offer      747
winner     747
buy        747
free       747
cash       747
urgent     747
allcaps    747
dtype: int64

In [9]:
# Reload the tab-separated SMS collection so this cell starts from a clean frame.
data_path = ("https://raw.githubusercontent.com/Thinkful-Ed/data-201-resources/"
             "master/sms_spam_collection/SMSSpamCollection"
            )
sms_raw = pd.read_csv(data_path, delimiter='\t', header=None)
sms_raw.columns = ['spam', 'message']

# Enumerate our spammy keywords.
keywords = ['click', 'offer', 'winner', 'buy', 'free', 'cash', 'urgent']

# One boolean column per keyword (case-insensitive match).
# NOTE: the keyword is padded with a space on both sides, so it is not
# matched at the very start/end of a message or when touching punctuation.
for key in keywords:
    sms_raw[key] = sms_raw['message'].str.contains(' ' + key + ' ', case=False)

# Extra feature: the message is written entirely in capitals.
sms_raw['allcaps'] = sms_raw['message'].str.isupper()
# Recode the label as a boolean: True means spam.
sms_raw['spam'] = (sms_raw['spam'] == 'spam')
data = sms_raw[keywords + ['allcaps']]
target = sms_raw['spam']

# Separate majority (not spam) and minority (spam) classes.
sms_majority = sms_raw[sms_raw.spam == False]
sms_minority = sms_raw[sms_raw.spam == True]

# Downsample the majority class. The target size is taken from the minority
# frame instead of a hard-coded row count.
sms_majority_downsampled = resample(sms_majority,
                                    replace=False,                # sample without replacement
                                    n_samples=len(sms_minority),  # match the minority class size
                                    random_state=123)             # reproducible results

# Combine the minority class with the downsampled majority class.
sms_downsampled = pd.concat([sms_minority, sms_majority_downsampled])

# Display new class counts.
sms_downsampled.spam.value_counts()

True     747
False    747
Name: spam, dtype: int64

In [10]:
from sklearn.naive_bayes import BernoulliNB

# Features and labels taken from the downsampled (class-balanced) frame.
data1 = sms_downsampled[[*keywords, 'allcaps']]
target1 = sms_downsampled['spam']

# Train a fresh model on the balanced rows, predict those same rows,
# and display the in-sample accuracy.
bnb = BernoulliNB()
bnb.fit(data1, target1)
y_pred = bnb.predict(data1)
bnb.score(data1, target1)

0.6305220883534136

# Version 3 

Revamp the keyword list

In [11]:
from sklearn.naive_bayes import BernoulliNB

# Download the tab-separated SMS collection (no header row) and name its
# two columns: the label and the message text.
data_path = (
    "https://raw.githubusercontent.com/Thinkful-Ed/data-201-resources/"
    "master/sms_spam_collection/SMSSpamCollection"
)
sms_raw = pd.read_csv(data_path, delimiter='\t', header=None)
sms_raw.columns = ['spam', 'message']

# Revised, longer list of words we suspect are typical of spam.
keywords = [
    'xxx', 'offer', 'ringtone', 'unsubscribe', 'sex', 'donate',
    'urgent', 'free', 'baby', 'txt', 'horny',
]

# One boolean column per keyword (case-insensitive match).
# NOTE: the keyword is padded with a space on both sides, so it is not
# matched at the very start/end of a message or when touching punctuation.
for key in keywords:
    padded_key = ' ' + key + ' '
    sms_raw[key] = sms_raw['message'].str.contains(padded_key, case=False)

# Extra feature: the message is written entirely in capitals.
sms_raw['allcaps'] = sms_raw['message'].str.isupper()
# Recode the label as a boolean: True means spam.
sms_raw['spam'] = sms_raw['spam'].eq('spam')

data = sms_raw[[*keywords, 'allcaps']]
target = sms_raw['spam']

# Fit Bernoulli naive Bayes on every row, then predict those same rows.
bnb = BernoulliNB()
bnb.fit(data, target)
y_pred = bnb.predict(data)

In [12]:
# Count the rows whose prediction disagrees with the true label.
n_points = data.shape[0]
n_mislabeled = (target != y_pred).sum()
print("Number of mislabeled points out of a total {} points : {}".format(
    n_points, n_mislabeled
))

Number of mislabeled points out of a total 5572 points : 542


In [13]:
# Mean accuracy on the same rows the model was fitted on (in-sample score).
bnb.score(X=data, y=target)

0.9027279253409907

# Version 4 

Holdout group 

In [14]:
from sklearn.model_selection import train_test_split

# Hold out 30% of the rows as a test set; the seed fixes the split.
X_train, X_test, y_train, y_test = train_test_split(
    data, target, test_size=0.3, random_state=20
)

# Fit on the training portion only and score on the held-out rows.
bnb.fit(X_train, y_train)
holdout_accuracy = bnb.score(X_test, y_test)
print('With 30% Holdout: ' + str(holdout_accuracy))

# Refit on all rows and score on those same rows for comparison.
# After this cell `bnb` is left fitted on the full data set.
bnb.fit(data, target)
in_sample_accuracy = bnb.score(data, target)
print('Testing on Sample: ' + str(in_sample_accuracy))

With 30% Holdout: 0.9001196172248804
Testing on Sample: 0.9027279253409907


# Version 5 

Cross Validation 

In [15]:
from sklearn.model_selection import cross_val_score

# 10-fold cross-validated accuracy over the full feature matrix and labels.
cross_val_score(estimator=bnb, X=data, y=target, cv=10)

array([0.91039427, 0.90501792, 0.91218638, 0.89964158, 0.91935484,
       0.90125673, 0.8940754 , 0.89208633, 0.89208633, 0.89928058])

# Sampling of Dataframe

In [211]:
# Show three spam messages as examples. The seed makes the same three rows
# come back on every run, so the displayed output is reproducible.
sms_raw[sms_raw.spam==True].sample(3, random_state=123)['message'].values

array(['<Forwarded from 448712404000>Please CALL 08712404000 immediately as there is an urgent message waiting for you.',
       'Congratulations - Thanks to a good friend U have WON the £2,000 Xmas prize. 2 claim is easy, just call 08712103738 NOW! Only 10p per minute. BT-national-rate',
       'Reply with your name and address and YOU WILL RECEIVE BY POST a weeks completely free accommodation at various global locations www.phb1.com ph:08700435505150p'],
      dtype=object)

# Keyword List 1 

In [49]:
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import BernoulliNB

# Reload the tab-separated SMS collection so this cell starts from a clean frame.
data_path = ("https://raw.githubusercontent.com/Thinkful-Ed/data-201-resources/"
             "master/sms_spam_collection/SMSSpamCollection"
            )
sms_raw = pd.read_csv(data_path, delimiter='\t', header=None)
sms_raw.columns = ['spam', 'message']

# Keyword list 1: the baseline spammy keywords.
keywords = ['click', 'offer', 'winner', 'buy', 'free', 'cash', 'urgent']

# One boolean column per keyword: case-insensitive match anywhere in the
# message, including inside longer words.
for key in keywords:
    sms_raw[key] = sms_raw['message'].str.contains(key, case=False)

# Extra feature: the message is written entirely in capitals.
sms_raw['allcaps'] = sms_raw['message'].str.isupper()
# Recode the label as a boolean: True means spam.
sms_raw['spam'] = (sms_raw['spam'] == 'spam')
data = sms_raw[keywords + ['allcaps']]
target = sms_raw['spam']

# Separate majority (not spam) and minority (spam) classes.
sms_majority = sms_raw[sms_raw.spam == False]
sms_minority = sms_raw[sms_raw.spam == True]

# Downsample the majority class. The target size is taken from the minority
# frame instead of a hard-coded row count.
sms_majority_downsampled = resample(sms_majority,
                                    replace=False,                # sample without replacement
                                    n_samples=len(sms_minority),  # match the minority class size
                                    random_state=123)             # reproducible results

# Combine the minority class with the downsampled majority class.
sms_downsampled = pd.concat([sms_minority, sms_majority_downsampled])

# Display new class counts.
print(sms_downsampled.spam.value_counts())

data1 = sms_downsampled[keywords + ['allcaps']]
target1 = sms_downsampled['spam']

# Hold out 33% of the balanced rows for evaluation; the seed fixes the split.
X_train, X_test, y_train, y_test = train_test_split(data1, target1, test_size=0.33, random_state=101)

# Fit on the training split and display accuracy on the held-out split.
bnb = BernoulliNB()
bnb.fit(X_train, y_train)
bnb.score(X_test, y_test)

True     747
False    747
Name: spam, dtype: int64


0.7186234817813765

In [50]:
from sklearn.model_selection import cross_val_score

# 10-fold cross-validated accuracy.
# NOTE(review): this runs over the held-out split only (X_test / y_test), so
# each fold refits a copy of the estimator on part of that split; confirm
# this is intended rather than cross-validating over data1 / target1.
cross_val_score(estimator=bnb, X=X_test, y=y_test, cv=10)

array([0.78      , 0.74      , 0.66      , 0.62      , 0.74      ,
       0.7       , 0.68      , 0.72916667, 0.75      , 0.77083333])

# Keyword List 2 

In [203]:
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import BernoulliNB

# Reload the tab-separated SMS collection so this cell starts from a clean frame.
data_path = ("https://raw.githubusercontent.com/Thinkful-Ed/data-201-resources/"
             "master/sms_spam_collection/SMSSpamCollection"
            )
sms_raw = pd.read_csv(data_path, delimiter='\t', header=None)
sms_raw.columns = ['spam', 'message']

# Keyword list 2.
keywords = ['xxx', 'offer', 'ringtone', 'free', 'baby', 'txt', 'horny']

# One boolean column per keyword: case-insensitive match anywhere in the
# message, including inside longer words.
for key in keywords:
    sms_raw[key] = sms_raw['message'].str.contains(key, case=False)

# Extra feature: the message is written entirely in capitals.
sms_raw['allcaps'] = sms_raw['message'].str.isupper()
# Recode the label as a boolean: True means spam.
sms_raw['spam'] = (sms_raw['spam'] == 'spam')
data = sms_raw[keywords + ['allcaps']]
target = sms_raw['spam']

# Separate majority (not spam) and minority (spam) classes.
sms_majority = sms_raw[sms_raw.spam == False]
sms_minority = sms_raw[sms_raw.spam == True]

# Downsample the majority class. The target size is taken from the minority
# frame instead of a hard-coded row count.
sms_majority_downsampled = resample(sms_majority,
                                    replace=False,                # sample without replacement
                                    n_samples=len(sms_minority),  # match the minority class size
                                    random_state=123)             # reproducible results

# Combine the minority class with the downsampled majority class.
sms_downsampled = pd.concat([sms_minority, sms_majority_downsampled])

# Display new class counts.
print(sms_downsampled.spam.value_counts())

data1 = sms_downsampled[keywords + ['allcaps']]
target1 = sms_downsampled['spam']

# Hold out 33% of the balanced rows for evaluation; the seed fixes the split.
X_train, X_test, y_train, y_test = train_test_split(data1, target1, test_size=0.33, random_state=101)

# Fit on the training split and display accuracy on the held-out split.
bnb = BernoulliNB()
bnb.fit(X_train, y_train)
bnb.score(X_test, y_test)

True     747
False    747
Name: spam, dtype: int64


0.7327935222672065

In [52]:
from sklearn.model_selection import cross_val_score

# 10-fold cross-validated accuracy.
# NOTE(review): this runs over the held-out split only (X_test / y_test), so
# each fold refits a copy of the estimator on part of that split; confirm
# this is intended rather than cross-validating over data1 / target1.
cross_val_score(estimator=bnb, X=X_test, y=y_test, cv=10)

array([0.72      , 0.68      , 0.74      , 0.7       , 0.74      ,
       0.72      , 0.72      , 0.77083333, 0.72916667, 0.8125    ])

# Keyword List 3

In [53]:
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import BernoulliNB

# Reload the tab-separated SMS collection so this cell starts from a clean frame.
data_path = ("https://raw.githubusercontent.com/Thinkful-Ed/data-201-resources/"
             "master/sms_spam_collection/SMSSpamCollection"
            )
sms_raw = pd.read_csv(data_path, delimiter='\t', header=None)
sms_raw.columns = ['spam', 'message']

# Keyword list 3.
keywords = ['xxx', 'offer', 'ringtone', 'unsubscribe', 'sex', 'donate', 'urgent']

# One boolean column per keyword: case-insensitive match anywhere in the
# message, including inside longer words.
for key in keywords:
    sms_raw[key] = sms_raw['message'].str.contains(key, case=False)

# Extra feature: the message is written entirely in capitals.
sms_raw['allcaps'] = sms_raw['message'].str.isupper()
# Recode the label as a boolean: True means spam.
sms_raw['spam'] = (sms_raw['spam'] == 'spam')
data = sms_raw[keywords + ['allcaps']]
target = sms_raw['spam']

# Separate majority (not spam) and minority (spam) classes.
sms_majority = sms_raw[sms_raw.spam == False]
sms_minority = sms_raw[sms_raw.spam == True]

# Downsample the majority class. The target size is taken from the minority
# frame instead of a hard-coded row count.
sms_majority_downsampled = resample(sms_majority,
                                    replace=False,                # sample without replacement
                                    n_samples=len(sms_minority),  # match the minority class size
                                    random_state=123)             # reproducible results

# Combine the minority class with the downsampled majority class.
sms_downsampled = pd.concat([sms_minority, sms_majority_downsampled])

# Display new class counts.
print(sms_downsampled.spam.value_counts())

data1 = sms_downsampled[keywords + ['allcaps']]
target1 = sms_downsampled['spam']

# Hold out 33% of the balanced rows for evaluation; the seed fixes the split.
X_train, X_test, y_train, y_test = train_test_split(data1, target1, test_size=0.33, random_state=101)

# Fit on the training split and display accuracy on the held-out split.
bnb = BernoulliNB()
bnb.fit(X_train, y_train)
bnb.score(X_test, y_test)

True     747
False    747
Name: spam, dtype: int64


0.611336032388664

In [54]:
from sklearn.model_selection import cross_val_score

# 10-fold cross-validated accuracy.
# NOTE(review): this runs over the held-out split only (X_test / y_test), so
# each fold refits a copy of the estimator on part of that split; confirm
# this is intended rather than cross-validating over data1 / target1.
cross_val_score(estimator=bnb, X=X_test, y=y_test, cv=10)

array([0.58      , 0.52      , 0.64      , 0.56      , 0.72      ,
       0.58      , 0.6       , 0.6875    , 0.60416667, 0.625     ])

# Keyword List 4

In [55]:
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import BernoulliNB

# Reload the tab-separated SMS collection so this cell starts from a clean frame.
data_path = ("https://raw.githubusercontent.com/Thinkful-Ed/data-201-resources/"
             "master/sms_spam_collection/SMSSpamCollection"
            )
sms_raw = pd.read_csv(data_path, delimiter='\t', header=None)
sms_raw.columns = ['spam', 'message']

# Keyword list 4.
keywords = ['xxx', 'offer', 'sex', 'donate', 'urgent', 'free', 'baby', 'txt', 'horny']

# One boolean column per keyword: case-insensitive match anywhere in the
# message, including inside longer words.
for key in keywords:
    sms_raw[key] = sms_raw['message'].str.contains(key, case=False)

# Extra feature: the message is written entirely in capitals.
sms_raw['allcaps'] = sms_raw['message'].str.isupper()
# Recode the label as a boolean: True means spam.
sms_raw['spam'] = (sms_raw['spam'] == 'spam')
data = sms_raw[keywords + ['allcaps']]
target = sms_raw['spam']

# Separate majority (not spam) and minority (spam) classes.
sms_majority = sms_raw[sms_raw.spam == False]
sms_minority = sms_raw[sms_raw.spam == True]

# Downsample the majority class. The target size is taken from the minority
# frame instead of a hard-coded row count.
sms_majority_downsampled = resample(sms_majority,
                                    replace=False,                # sample without replacement
                                    n_samples=len(sms_minority),  # match the minority class size
                                    random_state=123)             # reproducible results

# Combine the minority class with the downsampled majority class.
sms_downsampled = pd.concat([sms_minority, sms_majority_downsampled])

# Display new class counts.
print(sms_downsampled.spam.value_counts())

data1 = sms_downsampled[keywords + ['allcaps']]
target1 = sms_downsampled['spam']

# Hold out 33% of the balanced rows for evaluation; the seed fixes the split.
X_train, X_test, y_train, y_test = train_test_split(data1, target1, test_size=0.33, random_state=101)

# Fit on the training split and display accuracy on the held-out split.
bnb = BernoulliNB()
bnb.fit(X_train, y_train)
bnb.score(X_test, y_test)

True     747
False    747
Name: spam, dtype: int64


0.757085020242915

In [56]:
from sklearn.model_selection import cross_val_score

# 10-fold cross-validated accuracy.
# NOTE(review): this runs over the held-out split only (X_test / y_test), so
# each fold refits a copy of the estimator on part of that split; confirm
# this is intended rather than cross-validating over data1 / target1.
cross_val_score(estimator=bnb, X=X_test, y=y_test, cv=10)

array([0.78      , 0.7       , 0.76      , 0.68      , 0.78      ,
       0.76      , 0.68      , 0.85416667, 0.75      , 0.8125    ])

# Keyword List 5

In [64]:
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import BernoulliNB

# Reload the tab-separated SMS collection so this cell starts from a clean frame.
data_path = ("https://raw.githubusercontent.com/Thinkful-Ed/data-201-resources/"
             "master/sms_spam_collection/SMSSpamCollection"
            )
sms_raw = pd.read_csv(data_path, delimiter='\t', header=None)
sms_raw.columns = ['spam', 'message']

# Keyword list 5.
keywords = ['congrats', 'private', 'chat', 'prize',
            'new', 'contact', 'tone', 'winner']

# One boolean column per keyword: case-insensitive match anywhere in the
# message, including inside longer words.
for key in keywords:
    sms_raw[key] = sms_raw['message'].str.contains(key, case=False)

# Extra feature: the message is written entirely in capitals.
sms_raw['allcaps'] = sms_raw['message'].str.isupper()
# Recode the label as a boolean: True means spam.
sms_raw['spam'] = (sms_raw['spam'] == 'spam')
data = sms_raw[keywords + ['allcaps']]
target = sms_raw['spam']

# Separate majority (not spam) and minority (spam) classes.
sms_majority = sms_raw[sms_raw.spam == False]
sms_minority = sms_raw[sms_raw.spam == True]

# Downsample the majority class. The target size is taken from the minority
# frame instead of a hard-coded row count.
sms_majority_downsampled = resample(sms_majority,
                                    replace=False,                # sample without replacement
                                    n_samples=len(sms_minority),  # match the minority class size
                                    random_state=123)             # reproducible results

# Combine the minority class with the downsampled majority class.
sms_downsampled = pd.concat([sms_minority, sms_majority_downsampled])

# Display new class counts.
print(sms_downsampled.spam.value_counts())

data1 = sms_downsampled[keywords + ['allcaps']]
target1 = sms_downsampled['spam']

# Hold out 33% of the balanced rows for evaluation; the seed fixes the split.
X_train, X_test, y_train, y_test = train_test_split(data1, target1, test_size=0.33, random_state=101)

# Fit on the training split and display accuracy on the held-out split.
bnb = BernoulliNB()
bnb.fit(X_train, y_train)
bnb.score(X_test, y_test)

True     747
False    747
Name: spam, dtype: int64


0.7388663967611336

In [65]:
from sklearn.model_selection import cross_val_score

# 10-fold cross-validated accuracy.
# NOTE(review): this runs over the held-out split only (X_test / y_test), so
# each fold refits a copy of the estimator on part of that split; confirm
# this is intended rather than cross-validating over data1 / target1.
cross_val_score(estimator=bnb, X=X_test, y=y_test, cv=10)

array([0.82      , 0.64      , 0.78      , 0.7       , 0.76      ,
       0.84      , 0.68      , 0.8125    , 0.77083333, 0.58333333])

# Final Keyword List 

In [222]:
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import BernoulliNB

# Reload the tab-separated SMS collection so this cell starts from a clean frame.
data_path = ("https://raw.githubusercontent.com/Thinkful-Ed/data-201-resources/"
             "master/sms_spam_collection/SMSSpamCollection"
            )
sms_raw = pd.read_csv(data_path, delimiter='\t', header=None)
sms_raw.columns = ['spam', 'message']

# Final keyword list. The '0' entry fires on any message containing the
# digit zero.
keywords = ['free', 'txt', 'xxx', 'contact',
            'new', 'prize', 'customer', 'mobile',
            'play', 'caller', '0']

# One boolean column per keyword: case-insensitive match anywhere in the
# message, including inside longer words.
for key in keywords:
    sms_raw[key] = sms_raw['message'].str.contains(key, case=False)

# Extra feature: the message is written entirely in capitals.
sms_raw['allcaps'] = sms_raw['message'].str.isupper()
# Recode the label as a boolean: True means spam.
sms_raw['spam'] = (sms_raw['spam'] == 'spam')
data = sms_raw[keywords + ['allcaps']]
target = sms_raw['spam']

# Separate majority (not spam) and minority (spam) classes.
sms_majority = sms_raw[sms_raw.spam == False]
sms_minority = sms_raw[sms_raw.spam == True]

# Downsample the majority class. The target size is taken from the minority
# frame instead of a hard-coded row count.
sms_majority_downsampled = resample(sms_majority,
                                    replace=False,                # sample without replacement
                                    n_samples=len(sms_minority),  # match the minority class size
                                    random_state=123)             # reproducible results

# Combine the minority class with the downsampled majority class.
sms_downsampled = pd.concat([sms_minority, sms_majority_downsampled])

# Display new class counts.
print(sms_downsampled.spam.value_counts())

data1 = sms_downsampled[keywords + ['allcaps']]
target1 = sms_downsampled['spam']

# Hold out 33% of the balanced rows for evaluation; the seed fixes the split.
X_train, X_test, y_train, y_test = train_test_split(data1, target1, test_size=0.33, random_state=101)

# Fit on the training split and display accuracy on the held-out split.
bnb = BernoulliNB()
bnb.fit(X_train, y_train)
bnb.score(X_test, y_test)

True     747
False    747
Name: spam, dtype: int64


0.9412955465587044

In [223]:
from sklearn.model_selection import cross_val_score

# 10-fold cross-validated accuracy.
# NOTE(review): this runs over the held-out split only (X_test / y_test), so
# each fold refits a copy of the estimator on part of that split; confirm
# this is intended rather than cross-validating over data1 / target1.
cross_val_score(estimator=bnb, X=X_test, y=y_test, cv=10)

array([0.92      , 0.98      , 0.98      , 0.9       , 0.88      ,
       0.98      , 0.94      , 0.9375    , 0.9375    , 0.95833333])

In [224]:
# Keywords that were tried and then dropped from the final keyword list.
# Every entry needs its own trailing comma: without one, Python joins
# adjacent string literals (e.g. 'unsubscribe' 'cash' -> 'unsubscribecash').
words_removed = ['horny', 'baby', 'urgent', 'donate', 'sex',
                 'offer', 'winner', 'chat', 'tone',
                 'congrats', 'private', 'unsubscribe',
                 'cash', 'click', 'buy', 'win', 'msg',
                 'asap', 'attempt', 'vouchers']

In [225]:
# Mean accuracy of the final model on the full, unbalanced data set.
# NOTE(review): `data` includes the rows that ended up in X_train, so this
# figure is not a pure out-of-sample score.
bnb.score(X=data, y=target)

0.959978463747308