### Spam SMS data analysis

Dataset info: https://www.kaggle.com/datasets/uciml/sms-spam-collection-dataset

In [1]:
import pandas as pd
import nltk
import spacy
from collections import Counter
#TODO: install pytorch?
nltk.download('punkt_tab')

[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:\Users\erez1\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


True

In [2]:
# load the data
spam_data = pd.read_csv('spam.csv', encoding='latin-1')
spam_data.head(10)

Unnamed: 0,v1,v2,Unnamed: 2,Unnamed: 3,Unnamed: 4
0,ham,"Go until jurong point, crazy.. Available only ...",,,
1,ham,Ok lar... Joking wif u oni...,,,
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,,,
3,ham,U dun say so early hor... U c already then say...,,,
4,ham,"Nah I don't think he goes to usf, he lives aro...",,,
5,spam,FreeMsg Hey there darling it's been 3 week's n...,,,
6,ham,Even my brother is not like to speak with me. ...,,,
7,ham,As per your request 'Melle Melle (Oru Minnamin...,,,
8,spam,WINNER!! As a valued network customer you have...,,,
9,spam,Had your mobile 11 months or more? U R entitle...,,,


In [3]:
# drop unnecessary columns
spam_data = spam_data[['v1', 'v2']]

In [4]:
# Number of sms messages
num_messages = spam_data.shape[0]
print("We have a total of {} messages".format(num_messages))
# map ham to 0 and spam to 1
spam_data['v1'] = spam_data['v1'].map({'ham': 0, 'spam': 1})
spam_data.head(10)

We have a total of 5572 messages


Unnamed: 0,v1,v2
0,0,"Go until jurong point, crazy.. Available only ..."
1,0,Ok lar... Joking wif u oni...
2,1,Free entry in 2 a wkly comp to win FA Cup fina...
3,0,U dun say so early hor... U c already then say...
4,0,"Nah I don't think he goes to usf, he lives aro..."
5,1,FreeMsg Hey there darling it's been 3 week's n...
6,0,Even my brother is not like to speak with me. ...
7,0,As per your request 'Melle Melle (Oru Minnamin...
8,1,WINNER!! As a valued network customer you have...
9,1,Had your mobile 11 months or more? U R entitle...


In [5]:
# Compute number of spam messages
num_spam = spam_data['v1'].sum()
print("We have a total of {} spam messages".format(num_spam))
messages = spam_data['v2'].values
num_of_words = sum([len(message.split(" ")) for message in messages])
print("We have a total of {} words".format(num_of_words))
avg_num_words = num_of_words / num_messages
print("On average, we have {} words per message".format(avg_num_words))

We have a total of 747 spam messages
We have a total of 86961 words
On average, we have 15.60678391959799 words per message


In [6]:
# all words counter
words_counter = {}
for message in messages:
    for word in message.split(" "):
        if word not in words_counter:
            words_counter[word] = 1
        else:
            words_counter[word] += 1
# print 5 most common words
print("5 most common words:")
print(sorted(words_counter.items(), key=lambda x: x[1], reverse=True)[:5])

appear_once_words = [word for word, count in words_counter.items() if count == 1]
print("Number of words that appear once: {}".format(len(appear_once_words)))

5 most common words:
[('to', 2134), ('you', 1622), ('I', 1466), ('a', 1327), ('the', 1197)]
Number of words that appear once: 9270


### Tokenization: NLTK VS Spacy

In [7]:
all_messages = " ".join(messages)
# perform tokenization using nltk
res = nltk.word_tokenize(all_messages)
print(res[0:100])

['Go', 'until', 'jurong', 'point', ',', 'crazy', '..', 'Available', 'only', 'in', 'bugis', 'n', 'great', 'world', 'la', 'e', 'buffet', '...', 'Cine', 'there', 'got', 'amore', 'wat', '...', 'Ok', 'lar', '...', 'Joking', 'wif', 'u', 'oni', '...', 'Free', 'entry', 'in', '2', 'a', 'wkly', 'comp', 'to', 'win', 'FA', 'Cup', 'final', 'tkts', '21st', 'May', '2005', '.', 'Text', 'FA', 'to', '87121', 'to', 'receive', 'entry', 'question', '(', 'std', 'txt', 'rate', ')', 'T', '&', 'C', "'s", 'apply', '08452810075over18', "'s", 'U', 'dun', 'say', 'so', 'early', 'hor', '...', 'U', 'c', 'already', 'then', 'say', '...', 'Nah', 'I', 'do', "n't", 'think', 'he', 'goes', 'to', 'usf', ',', 'he', 'lives', 'around', 'here', 'though', 'FreeMsg', 'Hey', 'there']


In [8]:
from spacy.tokenizer import Tokenizer
from spacy.lang.en import English
nlp = English()

tokenizer = Tokenizer(nlp.vocab)
tokens = tokenizer(all_messages)
print(list(tokens)[0:100])

[Go, until, jurong, point,, crazy.., Available, only, in, bugis, n, great, world, la, e, buffet..., Cine, there, got, amore, wat..., Ok, lar..., Joking, wif, u, oni..., Free, entry, in, 2, a, wkly, comp, to, win, FA, Cup, final, tkts, 21st, May, 2005., Text, FA, to, 87121, to, receive, entry, question(std, txt, rate)T&C's, apply, 08452810075over18's, U, dun, say, so, early, hor..., U, c, already, then, say..., Nah, I, don't, think, he, goes, to, usf,, he, lives, around, here, though, FreeMsg, Hey, there, darling, it's, been, 3, week's, now, and, no, word, back!, I'd, like, some, fun, you, up, for, it, still?]


### Question 5
The main differences we have spotted are:
1. nltk tokenizer takes punctuation as separated tokens, while spacy's tokenizer concat them to the nearest word
2. spacy's tokenizer chose to remain words with ' like "don't" as a whole token, while nltk tokenizer separated it to 'do' and "n't"

### lemmatization: NLTK VS Spacy

In [18]:
# perform lemmatization using nltk
from nltk.stem import WordNetLemmatizer
from nltk.corpus import wordnet
lemmatizer = WordNetLemmatizer()
words_per_message = [message.split() for message in messages]
words = [word for message_words in words_per_message for word in message_words]
lemmatized_words = [lemmatizer.lemmatize(word, pos=wordnet.VERB) for word in words]
print(messages)
print(lemmatized_words[0:120])

['Go until jurong point, crazy.. Available only in bugis n great world la e buffet... Cine there got amore wat...'
 'Ok lar... Joking wif u oni...'
 "Free entry in 2 a wkly comp to win FA Cup final tkts 21st May 2005. Text FA to 87121 to receive entry question(std txt rate)T&C's apply 08452810075over18's"
 ... 'Pity, * was in mood for that. So...any other suggestions?'
 "The guy did some bitching but I acted like i'd be interested in buying something else next week and he gave it to us for free"
 'Rofl. Its true to its name']
['Go', 'until', 'jurong', 'point,', 'crazy..', 'Available', 'only', 'in', 'bugis', 'n', 'great', 'world', 'la', 'e', 'buffet...', 'Cine', 'there', 'get', 'amore', 'wat...', 'Ok', 'lar...', 'Joking', 'wif', 'u', 'oni...', 'Free', 'entry', 'in', '2', 'a', 'wkly', 'comp', 'to', 'win', 'FA', 'Cup', 'final', 'tkts', '21st', 'May', '2005.', 'Text', 'FA', 'to', '87121', 'to', 'receive', 'entry', 'question(std', 'txt', "rate)T&C's", 'apply', "08452810075over18's", 'U', 'd

In [19]:
# perform lemmatization using spacy
nlp = spacy.load('en_core_web_sm')
lemmatized_words_spacy = [
    token.lemma_
    for message in messages
    for token in nlp(message)
]

print(lemmatized_words_spacy[:100])

KeyboardInterrupt: 

### Question 6
The main differnces are: 
1. Running time: Spacy is much slower due to running ML models in order to return a more accurate results. For instance, the POS tagging, which predicts the grammatical role of each word (noun, verb, etc..)
2. Accuracy: 