### Spam SMS data analysis

Dataset info: https://www.kaggle.com/datasets/uciml/sms-spam-collection-dataset

In [50]:
import pandas as pd
import nltk
import spacy
from collections import Counter
#TODO: install pytorch?
nltk.download('punkt_tab')

[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:\Users\erez1\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


True

In [51]:
# load the data
spam_data = pd.read_csv('spam.csv', encoding='latin-1')
spam_data.head(10)

Unnamed: 0,v1,v2,Unnamed: 2,Unnamed: 3,Unnamed: 4
0,ham,"Go until jurong point, crazy.. Available only ...",,,
1,ham,Ok lar... Joking wif u oni...,,,
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,,,
3,ham,U dun say so early hor... U c already then say...,,,
4,ham,"Nah I don't think he goes to usf, he lives aro...",,,
5,spam,FreeMsg Hey there darling it's been 3 week's n...,,,
6,ham,Even my brother is not like to speak with me. ...,,,
7,ham,As per your request 'Melle Melle (Oru Minnamin...,,,
8,spam,WINNER!! As a valued network customer you have...,,,
9,spam,Had your mobile 11 months or more? U R entitle...,,,


In [52]:
# drop unnecessary columns
spam_data = spam_data[['v1', 'v2']]

In [53]:
# Number of sms messages
num_messages = spam_data.shape[0]
print("We have a total of {} messages".format(num_messages))
# map ham to 0 and spam to 1
spam_data['v1'] = spam_data['v1'].map({'ham': 0, 'spam': 1})
spam_data.head(10)

We have a total of 5572 messages


Unnamed: 0,v1,v2
0,0,"Go until jurong point, crazy.. Available only ..."
1,0,Ok lar... Joking wif u oni...
2,1,Free entry in 2 a wkly comp to win FA Cup fina...
3,0,U dun say so early hor... U c already then say...
4,0,"Nah I don't think he goes to usf, he lives aro..."
5,1,FreeMsg Hey there darling it's been 3 week's n...
6,0,Even my brother is not like to speak with me. ...
7,0,As per your request 'Melle Melle (Oru Minnamin...
8,1,WINNER!! As a valued network customer you have...
9,1,Had your mobile 11 months or more? U R entitle...


In [54]:
# Compute number of spam messages
num_spam = spam_data['v1'].sum()
print("We have a total of {} spam messages".format(num_spam))
messages = spam_data['v2'].values
num_of_words = sum([len(message.split(" ")) for message in messages])
print("We have a total of {} words".format(num_of_words))
avg_num_words = num_of_words / num_messages
print("On average, we have {} words per message".format(avg_num_words))

We have a total of 747 spam messages
We have a total of 86961 words
On average, we have 15.60678391959799 words per message


In [55]:
# all words counter
words_counter = {}
for message in messages:
    for word in message.split(" "):
        if word not in words_counter:
            words_counter[word] = 1
        else:
            words_counter[word] += 1
# print 5 most common words
print("5 most common words:")
print(sorted(words_counter.items(), key=lambda x: x[1], reverse=True)[:5])

appear_once_words = [word for word, count in words_counter.items() if count == 1]
print("Number of words that appear once: {}".format(len(appear_once_words)))

5 most common words:
[('to', 2134), ('you', 1622), ('I', 1466), ('a', 1327), ('the', 1197)]
Number of words that appear once: 9270


### Tokenization: NLTK VS Spacy

In [61]:
all_messages = " ".join(messages)
# perform tokenization using nltk
res = nltk.word_tokenize(all_messages)
print(res[0:120])

['Go', 'until', 'jurong', 'point', ',', 'crazy', '..', 'Available', 'only', 'in', 'bugis', 'n', 'great', 'world', 'la', 'e', 'buffet', '...', 'Cine', 'there', 'got', 'amore', 'wat', '...', 'Ok', 'lar', '...', 'Joking', 'wif', 'u', 'oni', '...', 'Free', 'entry', 'in', '2', 'a', 'wkly', 'comp', 'to', 'win', 'FA', 'Cup', 'final', 'tkts', '21st', 'May', '2005', '.', 'Text', 'FA', 'to', '87121', 'to', 'receive', 'entry', 'question', '(', 'std', 'txt', 'rate', ')', 'T', '&', 'C', "'s", 'apply', '08452810075over18', "'s", 'U', 'dun', 'say', 'so', 'early', 'hor', '...', 'U', 'c', 'already', 'then', 'say', '...', 'Nah', 'I', 'do', "n't", 'think', 'he', 'goes', 'to', 'usf', ',', 'he', 'lives', 'around', 'here', 'though', 'FreeMsg', 'Hey', 'there', 'darling', 'it', "'s", 'been', '3', 'week', "'s", 'now', 'and', 'no', 'word', 'back', '!', 'I', "'d", 'like', 'some', 'fun', 'you', 'up']


In [62]:
from spacy.tokenizer import Tokenizer
from spacy.lang.en import English
nlp = English()

tokenizer = Tokenizer(nlp.vocab)
tokens = tokenizer(all_messages)
print(list(tokens)[0:120])

[Go, until, jurong, point,, crazy.., Available, only, in, bugis, n, great, world, la, e, buffet..., Cine, there, got, amore, wat..., Ok, lar..., Joking, wif, u, oni..., Free, entry, in, 2, a, wkly, comp, to, win, FA, Cup, final, tkts, 21st, May, 2005., Text, FA, to, 87121, to, receive, entry, question(std, txt, rate)T&C's, apply, 08452810075over18's, U, dun, say, so, early, hor..., U, c, already, then, say..., Nah, I, don't, think, he, goes, to, usf,, he, lives, around, here, though, FreeMsg, Hey, there, darling, it's, been, 3, week's, now, and, no, word, back!, I'd, like, some, fun, you, up, for, it, still?, Tb, ok!, XxX, std, chgs, to, send,, å£1.50, to, rcv, Even, my, brother, is, not, like, to, speak, with, me.]


### Question 5
The main differences we have spotted are:
1. nltk tokenizer takes punctuation as separated tokens, while spacy's tokenizer concat them to the nearest word
2. spacy's tokenizer chose to remain words with ' like "don't" as a whole token, while nltk tokenizer separated it to 'do' and "n't"

### lemmatization: NLTK VS Spacy

In [None]:
# perform lemmatization using nltk
from nltk.stem import WordNetLemmatizer
lemmatizer = WordNetLemmatizer()
lemmatized_words_nltk = [lemmatizer.lemmatize(token) for token in res]

['Go', 'until', 'jurong', 'point', ',', 'crazy', '..', 'Available', 'only', 'in', 'bugis', 'n', 'great', 'world', 'la', 'e', 'buffet', '...', 'Cine', 'there', 'got', 'amore', 'wat', '...', 'Ok', 'lar', '...', 'Joking', 'wif', 'u', 'oni', '...', 'Free', 'entry', 'in', '2', 'a', 'wkly', 'comp', 'to', 'win', 'FA', 'Cup', 'final', 'tkts', '21st', 'May', '2005', '.', 'Text', 'FA', 'to', '87121', 'to', 'receive', 'entry', 'question', '(', 'std', 'txt', 'rate', ')', 'T', '&', 'C', "'s", 'apply', '08452810075over18', "'s", 'U', 'dun', 'say', 'so', 'early', 'hor', '...', 'U', 'c', 'already', 'then', 'say', '...', 'Nah', 'I', 'do', "n't", 'think', 'he', 'go', 'to', 'usf', ',', 'he', 'life', 'around', 'here', 'though', 'FreeMsg', 'Hey', 'there', 'darling', 'it', "'s", 'been', '3', 'week', "'s", 'now', 'and', 'no', 'word', 'back', '!', 'I', "'d", 'like', 'some', 'fun', 'you', 'up']


In [None]:
# perform lemmatization using spacy
nlp = spacy.load('en_core_web_sm')
lemmatized_words_spacy = [
    token.lemma_
    for message in messages
    for token in nlp(message)
]

['go', 'until', 'jurong', 'point', ',', 'crazy', '..', 'available', 'only', 'in', 'bugis', 'n', 'great', 'world', 'la', 'e', 'buffet', '...', 'Cine', 'there', 'get', 'amore', 'wat', '...', 'ok', 'lar', '...', 'joke', 'wif', 'u', 'oni', '...', 'free', 'entry', 'in', '2', 'a', 'wkly', 'comp', 'to', 'win', 'FA', 'Cup', 'final', 'tkts', '21st', 'May', '2005', '.', 'text', 'fa', 'to', '87121', 'to', 'receive', 'entry', 'question(std', 'txt', 'rate)T&C', "'s", 'apply', '08452810075over18', "'s", 'u', 'dun', 'say', 'so', 'early', 'hor', '...', 'u', 'c', 'already', 'then', 'say', '...', 'Nah', 'I', 'do', 'not', 'think', 'he', 'go', 'to', 'usf', ',', 'he', 'live', 'around', 'here', 'though', 'FreeMsg', 'hey', 'there', 'darle', 'it', 'be', 'be', '3', 'week', "'s", 'now', 'and', 'no', 'word', 'back', '!', 'I', 'would', 'like', 'some', 'fun', 'you', 'up', 'for', 'it', 'still', '?', 'tb', 'ok']


In [80]:
print("NLTK lemmatized words:")
print(lemmatized_words_nltk[:120])
print("Length of NLTK lemmatized words:")
print(len(lemmatized_words_nltk))
print("Spacy lemmatized words:")
print(lemmatized_words_spacy[:120])
print("Length of Spacy lemmatized words:")
print(len(lemmatized_words_spacy))

NLTK lemmatized words:
['Go', 'until', 'jurong', 'point', ',', 'crazy', '..', 'Available', 'only', 'in', 'bugis', 'n', 'great', 'world', 'la', 'e', 'buffet', '...', 'Cine', 'there', 'got', 'amore', 'wat', '...', 'Ok', 'lar', '...', 'Joking', 'wif', 'u', 'oni', '...', 'Free', 'entry', 'in', '2', 'a', 'wkly', 'comp', 'to', 'win', 'FA', 'Cup', 'final', 'tkts', '21st', 'May', '2005', '.', 'Text', 'FA', 'to', '87121', 'to', 'receive', 'entry', 'question', '(', 'std', 'txt', 'rate', ')', 'T', '&', 'C', "'s", 'apply', '08452810075over18', "'s", 'U', 'dun', 'say', 'so', 'early', 'hor', '...', 'U', 'c', 'already', 'then', 'say', '...', 'Nah', 'I', 'do', "n't", 'think', 'he', 'go', 'to', 'usf', ',', 'he', 'life', 'around', 'here', 'though', 'FreeMsg', 'Hey', 'there', 'darling', 'it', "'s", 'been', '3', 'week', "'s", 'now', 'and', 'no', 'word', 'back', '!', 'I', "'d", 'like', 'some', 'fun', 'you', 'up']
Length of NLTK lemmatized words:
104164
Spacy lemmatized words:
['go', 'until', 'jurong', 'poi

### Question 6
The main differnces are: 
1. Running time: Spacy is much slower due to running ML models in order to return a more accurate results. For instance, the POS tagging, which predicts the grammatical role of each word (noun, verb, etc..)
2. nltk lemmatizer returned words like "it's" while spacy splitted it to "it", "be" ('s -> is -> be).
3. nltk's lemmatizer didn't lemmatize the work 'joking' (stayed the same) while spacy's lemmatizer change it to 'joke'. 
4. nltk's returned the 'me' while spacy returned 'i'.

### Stemming: NLTK VS Spacy

In [88]:
ls = nltk.LancasterStemmer()
# nltk stemming
stemmed_words_nltk = [ls.stem(word) for word in lemmatized_words_nltk]
#spacy stemming
stemmed_words_spacy = [ls.stem(word) for word in lemmatized_words_spacy]

In [89]:
print("NLTK stemming")
print(stemmed_words_nltk[:120])
print("Length of NLTK stemmed words: {}".format(len(stemmed_words_nltk)))
print("Spacy stemming")
print(stemmed_words_spacy[:120])
print("Length of Spacy stemmed words: {}".format(len(stemmed_words_spacy)))

NLTK stemming
['go', 'until', 'jurong', 'point', ',', 'crazy', '..', 'avail', 'on', 'in', 'bug', 'n', 'gre', 'world', 'la', 'e', 'buffet', '...', 'cin', 'ther', 'got', 'am', 'wat', '...', 'ok', 'lar', '...', 'jok', 'wif', 'u', 'on', '...', 'fre', 'entry', 'in', '2', 'a', 'wkly', 'comp', 'to', 'win', 'fa', 'cup', 'fin', 'tkts', '21st', 'may', '2005', '.', 'text', 'fa', 'to', '87121', 'to', 'receiv', 'entry', 'quest', '(', 'std', 'txt', 'rat', ')', 't', '&', 'c', "'s", 'apply', '08452810075over18', "'s", 'u', 'dun', 'say', 'so', 'ear', 'hor', '...', 'u', 'c', 'already', 'then', 'say', '...', 'nah', 'i', 'do', "n't", 'think', 'he', 'go', 'to', 'usf', ',', 'he', 'lif', 'around', 'her', 'though', 'freemsg', 'hey', 'ther', 'darl', 'it', "'s", 'been', '3', 'week', "'s", 'now', 'and', 'no', 'word', 'back', '!', 'i', "'d", 'lik', 'som', 'fun', 'you', 'up']
Length of NLTK stemmed words: 104164
Spacy stemming
['go', 'until', 'jurong', 'point', ',', 'crazy', '..', 'avail', 'on', 'in', 'bug', 'n', 

### Question 7
There is no significant differences between the two results. 
We have noticed that the nltk's LancasterStemmer has lower cased all word's first letter. 

### Question 8
As we can see from out results, the length

In [108]:
# Identify a spam message where its removal from the spam.csv dataset would:
# a) Reduce the total number of stemmed tokens
# b) Maintain the exact same number of lemmatized tokens
count = 0
spam_messages = spam_data[spam_data['v1'] == 1]['v2'].values
for message in spam_messages:
    stem_message = ls.stem(message)
    stem_len = len(stem_message)
    llematized_message = lemmatizer.lemmatize(message)
    llem_len = len(llematized_message)
    if(llem_len != stem_len):
        count += 1
        print("Lemmatized message:")
        print(llematized_message)
        print("Stemmed message:")
        print(stem_message)
        print("The stemmed message length is: {}".format(stem_len))
        print("The lemmatized message length is: {}".format(llem_len))

print("Number of spam messages where the number of lemmatized tokens is different from the stemmed tokens: {}".format(count))

Lemmatized message:
Sunshine Quiz Wkly Q! Win a top Sony DVD player if u know which country the Algarve is in? Txt ansr to 82277. å£1.50 SP:Tyrone
Stemmed message:
sunshine quiz wkly q! win a top sony dvd player if u know which country the algarve is in? txt ansr to 82277. å£1.50 sp:tyron
The stemmed message length is: 125
The lemmatized message length is: 126
Lemmatized message:
UpgrdCentre Orange customer, you may now claim your FREE CAMERA PHONE upgrade for your loyalty. Call now on 0207 153 9153. Offer ends 26th July. T&C's apply. Opt-out available
Stemmed message:
upgrdcentre orange customer, you may now claim your free camera phone upgrade for your loyalty. call now on 0207 153 9153. offer ends 26th july. t&c's apply. opt-out availabl
The stemmed message length is: 174
The lemmatized message length is: 175
Lemmatized message:
Boltblue tones for 150p Reply POLY# or MONO# eg POLY3 1. Cha Cha Slide 2. Yeah 3. Slow Jamz 6. Toxic 8. Come With Me or STOP 4 more tones txt MORE
Stemmed m

In [109]:
# Loop over all the mesages if the message isnt spam - than add all the tokens of llematized and stemming that didnt appear in the arrays(stemming tokens and llematized tokens)
# if the message is spam - than check if all tokens are in the stemming and lemmatized arrays