In [86]:
import random
import requests
import zipfile 
import io 
import re
import math 
from typing import TypeVar, List, Tuple, NamedTuple, Set, Dict, Iterable
from collections import defaultdict

# Collecting data

In [57]:
url = 'https://archive.ics.uci.edu/ml/machine-learning-databases/00228/smsspamcollection.zip' #collection of SMS messages
response = requests.get(url, stream = True)
z = zipfile.ZipFile(io.BytesIO(response.content))
z.extractall()

In [37]:
file = open('SMSSpamCollection', 'r')
print(file.read())

ham	Go until jurong point, crazy.. Available only in bugis n great world la e buffet... Cine there got amore wat...
ham	Ok lar... Joking wif u oni...
spam	Free entry in 2 a wkly comp to win FA Cup final tkts 21st May 2005. Text FA to 87121 to receive entry question(std txt rate)T&C's apply 08452810075over18's
ham	U dun say so early hor... U c already then say...
ham	Nah I don't think he goes to usf, he lives around here though
spam	FreeMsg Hey there darling it's been 3 week's now and no word back! I'd like some fun you up for it still? Tb ok! XxX std chgs to send, Â£1.50 to rcv
ham	Even my brother is not like to speak with me. They treat me like aids patent.
ham	As per your request 'Melle Melle (Oru Minnaminunginte Nurungu Vettam)' has been set as your callertune for all Callers. Press *9 to copy your friends Callertune
spam	WINNER!! As a valued network customer you have been selected to receivea Â£900 prize reward! To claim call 09061701461. Claim code KL341. Valid 12 hours only.
spam

In [59]:
class Message(NamedTuple):
    text: str
    is_spam: bool 

data:List[Message] = []

with open('SMSSpamCollection', 'r') as f:
    for line in f.readlines():
        message = line.split("\t") 
        if message[0] == 'ham':
            data.append(Message(message[1], False))
        else:
            data.append(Message(message[1], True))
            
data[:10]

[Message(text='Go until jurong point, crazy.. Available only in bugis n great world la e buffet... Cine there got amore wat...\n', is_spam=False),
 Message(text='Ok lar... Joking wif u oni...\n', is_spam=False),
 Message(text="Free entry in 2 a wkly comp to win FA Cup final tkts 21st May 2005. Text FA to 87121 to receive entry question(std txt rate)T&C's apply 08452810075over18's\n", is_spam=True),
 Message(text='U dun say so early hor... U c already then say...\n', is_spam=False),
 Message(text="Nah I don't think he goes to usf, he lives around here though\n", is_spam=False),
 Message(text="FreeMsg Hey there darling it's been 3 week's now and no word back! I'd like some fun you up for it still? Tb ok! XxX std chgs to send, Â£1.50 to rcv\n", is_spam=True),
 Message(text='Even my brother is not like to speak with me. They treat me like aids patent.\n', is_spam=False),
 Message(text="As per your request 'Melle Melle (Oru Minnaminunginte Nurungu Vettam)' has been set as your callertune fo

# Split data

In [61]:
X = TypeVar('X')
Y = TypeVar('Y')

def split_data(data:List[X], pct:float) -> Tuple[List[X], List[X]]:
    c_data = data[:]
    random.shuffle(c_data)
    cut = int(len(c_data) * pct)
    return c_data[:cut], c_data[cut:] 

def train_test_split(x:List[X], y:List[Y], test_pct:float) -> Tuple[List[X], List[X], List[Y], List[Y]]:
    idxs = [i for i in range(len(xs))] #make idxs
    train_idxs, test_idxs = split_data(idxs, 1 - test_pct)
    return ([x[i] for i in train_idxs], [x[i] for i in test_idxs], [y[i] for i in train_idxs], [y[i] for i in test_idxs])

In [66]:
random.seed(0)
print(len(data))
train_msgs, test_msgs = split_data(data, 0.75) 
print(len(train_msgs))
print(len(test_msgs))
print(train_msgs[0])
print(test_msgs[0]) 

5574
4180
1394
Message(text='One of best dialogue in cute reltnship..!! "Wen i Die, Dont Come Near My Body..!! Bcoz My Hands May Not Come 2 Wipe Ur Tears Off That Time..!Gud ni8\n', is_spam=False)
Message(text='Got it. Seventeen pounds for seven hundred ml â€“ hope ok.\n', is_spam=False)


# Preprocessing (Tokenization)

In [85]:
def tokenize(text:str) -> Set:
    text = text.lower()
    all_words = re.findall("[a-z0-9']+", text)
    return set(all_words)

for msg in test_msgs[:5]: print(msg.is_spam, tokenize(msg.text))

False {'pounds', 'hundred', 'hope', 'seven', 'seventeen', 'ok', 'it', 'ml', 'for', 'got'}
False {'my', 'dear', "wat's", 'doing', 'ah', 'sleeping'}
False {'huai', 'auntie', 'pick', 'up', 'her', 'phone', 'never', 'juan'}
False {'are', 'aathi', 'you', 'dear', 'where'}
False {'are', 'gt', 'you', 'x2', 'to', 'that', 'lt', 'going', 'get'}


# Naive Bayes Module

In [128]:
class NaiveBayesClassifier:
    
    def __init__(self, k:float = 0.5) -> None:
        self.k = k #smoothing parameter
        self.tokens:Set[str] = set() #create an empty set
        self.token_spam_counts:Dict[str, int] = defaultdict(int)
        self.token_ham_counts:Dict[str, int] = defaultdict(int)
        self.spam_messages = 0
        self.ham_messages = 0
        self.messages:List[Message]
        self.score:float
    
    def train(self, messages:Iterable[Message]) -> None:
        self.messages = messages
        for m in messages:
            if m.is_spam:
                self.spam_messages += 1
            else:
                self.ham_messages += 1
                
            for token in tokenize(m.text):
                self.tokens.add(token)
                if m.is_spam:
                    self.token_spam_counts[token] += 1
                else:
                    self.token_ham_counts[token] += 1 
        self._score()
                    
    def _prob(self, token:str) -> Tuple[float, float]:
        spam = self.token_spam_counts[token]
        ham = self.token_ham_counts[token]
        p_token_spam = (spam + self.k) / (self.spam_messages + 2*self.k)
        p_token_ham = (ham + self.k) / (self.ham_messages + 2*self.k) 
        return p_token_spam, p_token_ham
    
    def predict(self, text:str) -> float:
        text_tokens = tokenize(text)
        log_prob_if_spam = 0
        log_prob_if_ham = 0
        for token in self.tokens:
            prob_if_spam, prob_if_ham = self._prob(token)
            #If token appears in the message, add the prob of seeing it
            if token in text_tokens:
                log_prob_if_spam += math.log(prob_if_spam)
                log_prob_if_ham += math.log(prob_if_ham)
            #Otherwise add the log prob of not seeing it
            else:
                log_prob_if_spam += math.log(1.0 - prob_if_spam)
                log_prob_if_ham += math.log(1.0 - prob_if_ham)
        
        prob_if_spam = math.exp(log_prob_if_spam)
        prob_if_ham = math.exp(log_prob_if_ham)
        return prob_if_spam / (prob_if_spam + prob_if_ham) #We assume that any message is equally likely to be spam or not
    
    def _score(self) -> None:
        tp_tn = 0
        for m in self.messages:
            if m.is_spam == (self.predict(m.text) >= 0.5):
                tp_tn += 1 
        self.score = tp_tn / len(self.messages)

# Training

In [129]:
model = NaiveBayesClassifier(k = 0.5)
model.train(train_msgs)
print(model.score)

0.994976076555024


In [127]:
print('# of spam, ham :', model.spam_messages,',', model.ham_messages)
print(model.token_spam_counts)
print(model.token_ham_counts)

# of spam, ham : 559 , 3621


# Testing

In [131]:
score = 0
for msg in test_msgs:
    result = round(model.predict(msg.text))
    if result == msg.is_spam:
        score += 1
print('Test accuracy :', score / len(test_msgs))

Test accuracy : 0.9856527977044476
