<a href="https://colab.research.google.com/github/Arashghsz/SMS_SPAM---Classification/blob/Master/Final_Update_NAIVEBAYES_SMSSPAM.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
from pathlib import Path
import pandas as pd
import numpy as np
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import string
from nltk import pos_tag
from nltk.stem import PorterStemmer
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('averaged_perceptron_tagger')
nltk.download('wordnet')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [None]:
dataset_path = Path("SMSSpamCollection")

In [None]:
dataSet = pd.read_csv(dataset_path, sep='\t', header=None, names=['SMS', 'Label'])

In [None]:
# Randomize the entire data set
randomized_collection = dataSet.sample(frac=1, random_state=3)

# Calculate index for split
training_test_index = round(len(randomized_collection) * 0.7)

# Training/Test split
x_train = randomized_collection[:training_test_index].reset_index(drop=True)
x_test = randomized_collection[training_test_index:].reset_index(drop=True)

print(x_train.shape)
print(x_test.shape)

(3900, 2)
(1672, 2)


In [None]:
 print(x_train['Label'].value_counts(normalize = True))
x_test['Label'].value_counts(normalize = True)

Sorry, I'll call later                                                                                                                                                                                                              0.003333
Ok...                                                                                                                                                                                                                               0.001795
I cant pick the phone right now. Pls send a message                                                                                                                                                                                 0.001282
Wen ur lovable bcums angry wid u, dnt take it seriously.. Coz being angry is d most childish n true way of showing deep affection, care n luv!.. kettoda manda... Have nice day da.                                                 0.001026
Ok                                                  

Sorry, I'll call later                                                                                                            0.010167
I cant pick the phone right now. Pls send a message                                                                               0.004187
Gud mrng dear hav a nice day                                                                                                      0.001794
Just sleeping..and surfing                                                                                                        0.001794
Ok...                                                                                                                             0.001794
                                                                                                                                    ...   
Check mail.i have mailed varma and kept copy to you regarding membership.take care.insha allah.                                   0.000598
Okey doke. I'm at home, but

In [None]:
# Replace addresses (hhtp, email), numbers (plain, phone), money symbols
x_test['SMS'] = x_test['SMS'].str.replace(r'\b[\w\-.]+?@\w+?\.\w{2,4}\b',
                                          ' ')
x_test['SMS'] = x_test['SMS'].str.replace(r'(http[s]?\S+)|(\w+\.[A-Za-z]{2,4}\S*)',
                                          ' ')
x_test['SMS'] = x_test['SMS'].str.replace(r'£|\$', ' ')    
x_test['SMS'] = x_test['SMS'].str.replace(r'\b(\+\d{1,2}\s)?\d?[\-(.]?\d{3}\)?[\s.-]?\d{3}[\s.-]?\d{4}\b',
                                          ' ')    
x_test['SMS'] = x_test['SMS'].str.replace(r'\d+(\.\d+)?', ' ')

# Remove punctuation, collapse all whitespace (spaces, line breaks, tabs) into a single space & eliminate any leading/trailing whitespace.
x_test['SMS'] = x_test['SMS'].str.replace(r'[^\w\d\s]', ' ')
x_test['SMS'] = x_test['SMS'].str.replace(r'\s+', ' ')
x_test['SMS'] = x_test['SMS'].str.replace(r'^\s+|\s+?$', '')

# Lowercase the entire corpus
x_test['SMS'] = x_test['SMS'].str.lower()

## removal stopwords 

In [None]:
stop_words = nltk.corpus.stopwords.words('english')
#stop_words[:10]

In [None]:
x_test['SMS'] = x_test['SMS'].apply(lambda x: ' '.join(
    term for term in x.split() if term not in set(stop_words))
)

## Lemmatization

In [None]:
lemmatizer = nltk.stem.WordNetLemmatizer()
x_test['SMS'] = x_test['SMS'].apply(lambda x: ' '.join(
    lemmatizer.lemmatize(term, pos='v') for term in x.split())
)

## Stemming

In [None]:
porter = nltk.PorterStemmer()
x_test['SMS'] = x_test['SMS'].apply(lambda x: ' '.join(
    porter.stem(term) for term in x.split())
)

## Tokenization

In [None]:
x_test['SMS'] = x_test['SMS'].apply(lambda sms: nltk.word_tokenize(sms))

##  Feature Extraction
### Vectorization


In [None]:
corpus = x_test['SMS'].sum()

In [None]:
len(corpus)

1672

In [None]:
# Transform the list to a set, to remove duplicates
temp_set = set(corpus)

# Revert to a list
vocabulary = list(temp_set)

In [None]:
# Create the dictionary
len_training_set = len(x_test['SMS'])
word_counts_per_sms = {unique_word: [0] * len_training_set for unique_word in vocabulary}

for index, sms in enumerate(x_test['SMS']):
    for word in sms:
        word_counts_per_sms[word][index] += 1

In [None]:
# Convert to dataframe
word_counts = pd.DataFrame(word_counts_per_sms)

In [None]:
# Concatenate with the original training set
training_set_final = pd.concat([training_set, word_counts], axis=1)


## Calculating Constants First


In [None]:
# Filter the spam and ham dataframes
spam_df = training_set_final[training_set_final['Label'] == 'spam'].copy()
ham_df = training_set_final[training_set_final['Label'] == 'ham'].copy()

In [None]:
spam_df.shape,ham_df.shape

((530, 4), (3370, 4))

In [None]:
# Calculate P(Spam) and P(Ham)
p_spam = spam_df.shape[0] / training_set_final.shape[0]
p_ham = ham_df.shape[0] / training_set_final.shape[0]

In [None]:
print('p(spam) =',p_spam)
print('p(ham) =',p_ham)

p(spam) = 0.1358974358974359
p(ham) = 0.8641025641025641


In [None]:
# # Calculate Nspam, Nham and Nvocabulary
# spam_words_per_message = spam_df['SMS'].apply(len)
# n_spam = spam_words_per_message.sum()

# ham_words_per_message = ham_df['SMS'].apply(len)
# n_ham = ham_words_per_message.sum()

# n_vocabulary = len(vocabulary)

In [None]:
n_spam,n_ham,n_vocabulary

(7932, 27339, 5103)

In [None]:
# Opting for the Laplace smoothing
alpha = 1

In [None]:

parameters_spam = {unique_word: 0 for unique_word in vocabulary}
parameters_ham = {unique_word: 0 for unique_word in vocabulary}

# calculate P(wi|Spam) and P(wi|Ham)
for unique_word in vocabulary:
    p_unique_word_spam = (spam_df[unique_word].sum() + alpha) / (n_spam + alpha * n_vocabulary)
    p_unique_word_ham = (ham_df[unique_word].sum() + alpha) / (n_ham + alpha * n_vocabulary)
    
    # Update the calculated propabilities to the dictionaries
    parameters_spam[unique_word] = p_unique_word_spam
    parameters_ham[unique_word] = p_unique_word_ham

In [None]:

def sms_classify(message):
    '''
    Takes in as input a new sms (w1, w2, ..., wn),
    calculates P(Spam|w1, w2, ..., wn) and P(Ham|w1, w2, ..., wn),
    compares them and outcomes whether the message is spam or not.
    '''
    
    # Replace addresses (hhtp, email), numbers (plain, phone), money symbols
    message = message.replace(r'\b[\w\-.]+?@\w+?\.\w{2,4}\b', ' ')
    message = message.replace(r'(http[s]?\S+)|(\w+\.[A-Za-z]{2,4}\S*)', ' ')
    message = message.replace(r'£|\$', ' ')    
    message = message.replace(r'\b(\+\d{1,2}\s)?\d?[\-(.]?\d{3}\)?[\s.-]?\d{3}[\s.-]?\d{4}\b', ' ')    
    message = message.replace(r'\d+(\.\d+)?', ' ')

    # Remove punctuation, collapse all whitespace (spaces, line breaks, tabs) into a single space & eliminate any leading/trailing whitespace.
    message = message.replace(r'[^\w\d\s]', ' ')
    message = message.replace(r'\s+', ' ')
    message = message.replace(r'^\s+|\s+?$', '')

    # Lowercase the entire corpus
    message = message.lower()

    # Remove stop words    
    terms = []
    for term in message.split():
        if term not in set(stop_words):
            terms.append(term)
            message = ' '.join(terms)

    # Lemmatization
    message = ' '.join(lemmatizer.lemmatize(term, pos='v') for term in message.split())            
            
    # Stemming
    message = ' '.join(porter.stem(term) for term in message.split())  
    
    # Tokenization
    message = message.split()
    
    p_spam_given_message = p_spam
    p_ham_given_message = p_ham
    
    for word in message:
        if word in parameters_spam:
            p_spam_given_message *= parameters_spam[word]
    
        if word in parameters_ham:
            p_ham_given_message *= parameters_ham[word]
    
    print('P(Spam|message):', p_spam_given_message)
    print('P(Ham|message):', p_ham_given_message)

    if p_ham_given_message > p_spam_given_message:
        print('Label: Ham')
    elif p_ham_given_message < p_spam_given_message:
        print('Label: Spam')
    else:
        print('Equal')

In [None]:
sms_classify('''Hey, Sign up with this promo code and get your card for amazing
                exchange fees abroad and £5 to spend anywhere! Promocode: D48KV7BN''')

P(Spam|message): 0.1358974358974359
P(Ham|message): 0.8641025641025641
Label: Ham


In [None]:
# define the classify () function again, this time returning the outcomes
def sms_classify_test_set(message):
    '''
    Takes in as input a new sms (w1, w2, ..., wn),
    calculates P(Spam|w1, w2, ..., wn) and P(Ham|w1, w2, ..., wn),
    compares them and returns the spam or ham label
    '''
    
    # Replace addresses (hhtp, email), numbers (plain, phone), money symbols
    message = message.replace(r'\b[\w\-.]+?@\w+?\.\w{2,4}\b', ' ')
    message = message.replace(r'(http[s]?\S+)|(\w+\.[A-Za-z]{2,4}\S*)', ' ')
    message = message.replace(r'£|\$', ' ')    
    message = message.replace(r'\b(\+\d{1,2}\s)?\d?[\-(.]?\d{3}\)?[\s.-]?\d{3}[\s.-]?\d{4}\b', ' ')    
    message = message.replace(r'\d+(\.\d+)?', ' ')

    # Remove punctuation, collapse all whitespace (spaces, line breaks, tabs) into a single space & eliminate any leading/trailing whitespace.
    message = message.replace(r'[^\w\d\s]', ' ')
    message = message.replace(r'\s+', ' ')
    message = message.replace(r'^\s+|\s+?$', '')

    # Lowercase the entire corpus
    message = message.lower()
    
    # Remove stop words    
    terms = []
    for term in message.split():
        if term not in set(stop_words):
            terms.append(term)
            message = ' '.join(terms)
    
    # Lemmatization
    message = ' '.join(lemmatizer.lemmatize(term, pos='v') for term in message.split())
    
    # Stemming
    message = ' '.join(porter.stem(term) for term in message.split())
    
    # Tokenization
    message = message.split()

    p_spam_given_message = p_spam
    p_ham_given_message = p_ham

    for word in message:
        if word in parameters_spam:
            p_spam_given_message *= parameters_spam[word]

        if word in parameters_ham:
            p_ham_given_message *= parameters_ham[word]

    if p_ham_given_message > p_spam_given_message:
        return 'ham'
    elif p_spam_given_message > p_ham_given_message:
        return 'spam'
    else:
        return 'needs human classification'

In [None]:
test_set['sms_predicted'] = test_set['SMS'].apply(sms_classify_test_set)

In [None]:
#accuracy
tp = 0
n = test_set.shape[0]

for row in test_set.iterrows():
    row = row[1]
    if row['Label'] == row['sms_predicted']:
        tp += 1

print('TP:', tp)
print('error:', n - tp)
print('Accuracy:', tp / total)

TP: 1455
error: 217
Accuracy: 0.8702153110047847
