In [1]:
import pandas as pd
import re
from collections import defaultdict
import math

In [2]:
# Load the raw CSV; its "v1" and "v2" columns are read when the working frame is built.
# NOTE(review): the decoded rows displayed in this notebook contain Cyrillic characters
# (e.g. "М_"), which suggests the file may not really be windows-1251 — confirm the encoding.
dataset = pd.read_csv("datasets/spam.csv", encoding="windows-1251")

In [3]:
# Working frame: "text" holds the message body taken from column "v2"; "spam" is 0
# when the "v1" label equals "ham" and 1 for any other label.
df = pd.DataFrame(
    {
        "text": dataset["v2"].tolist(),
        "spam": [int(label != "ham") for label in dataset["v1"]],
    }
)
df

Unnamed: 0,text,spam
0,"Go until jurong point, crazy.. Available only ...",0
1,Ok lar... Joking wif u oni...,0
2,Free entry in 2 a wkly comp to win FA Cup fina...,1
3,U dun say so early hor... U c already then say...,0
4,"Nah I don't think he goes to usf, he lives aro...",0
...,...,...
5567,This is the 2nd time we have tried 2 contact u...,1
5568,Will М_ b going to esplanade fr home?,0
5569,"Pity, * was in mood for that. So...any other s...",0
5570,The guy did some bitching but I acted like i'd...,0


In [4]:
def tokenize(message):
    """Return the set of distinct lowercase tokens found in ``message``.

    The text is lowercased first; every maximal run of the characters
    ``a-z`` and ``0-9`` then becomes one token, and any other character acts
    as a separator (so ``"don't"`` yields ``"don"`` and ``"t"``). Because the
    result is a set, a word repeated inside one message appears only once.
    """
    lowered = message.lower()
    return {match.group() for match in re.finditer("[a-z0-9]+", lowered)}

In [5]:
def count_words(data):
    """Count, for every word, how many spam and non-spam messages contain it.

    Args:
        data: DataFrame with a ``'text'`` column (message strings) and a
            ``'spam'`` column whose truthiness marks a message as spam.

    Returns:
        A ``defaultdict`` mapping each word to a two-element list
        ``[spam_count, not_spam_count]``. A word is counted at most once per
        message because ``tokenize`` returns a set.
    """
    counts = defaultdict(lambda: [0, 0])
    # Walk the two columns in lockstep instead of using iterrows(), which has
    # to build a Series object for every single row.
    for text, is_spam in zip(data['text'], data['spam']):
        # Slot 0 accumulates spam messages, slot 1 everything else.
        slot = 0 if is_spam else 1
        for word in tokenize(text):
            counts[word][slot] += 1
    return counts

In [6]:
# Build the word -> count-pair table from every row of df, then display it.
freq = count_words(df)
freq

defaultdict(<function __main__.count_words.<locals>.<lambda>()>,
            {'available': [3, 15],
             'only': [84, 126],
             'world': [1, 34],
             'until': [5, 22],
             'wat': [1, 101],
             'bugis': [0, 7],
             'jurong': [0, 1],
             'n': [11, 115],
             'point': [0, 13],
             'crazy': [4, 10],
             'buffet': [0, 2],
             'great': [11, 97],
             'in': [70, 735],
             'la': [0, 7],
             'cine': [0, 7],
             'got': [7, 221],
             'e': [10, 76],
             'amore': [0, 1],
             'go': [31, 231],
             'there': [16, 197],
             'lar': [0, 37],
             'wif': [0, 26],
             'u': [130, 696],
             'ok': [5, 274],
             'joking': [0, 6],
             'oni': [0, 4],
             '87121': [4, 0],
             '2': [156, 265],
             'may': [7, 41],
             'rate': [30, 2],
             'win': [62, 7],


In [7]:
# Number of messages in each class: rows whose 'spam' flag is 1 and rows whose flag is 0.
all_spam = (df['spam'] == 1).sum()
all_not_spam = (df['spam'] == 0).sum()

In [8]:
def prob(freq, all_spam, all_not_spam, k=1):
    """Convert per-word message counts into smoothed per-class probabilities.

    Args:
        freq: mapping ``word -> counts`` where ``counts[0]`` is the number of
            spam messages containing the word and ``counts[1]`` the number of
            non-spam messages containing it.
        all_spam: total number of spam messages.
        all_not_spam: total number of non-spam messages.
        k: smoothing pseudo-count added to each count; ``2*k`` is added to
            each class total.

    Returns:
        A list of ``(word, prob_if_spam, prob_if_not_spam)`` tuples, one per
        entry of ``freq`` and in the same iteration order.
    """
    estimates = []
    for word, frequency in freq.items():
        in_spam = (frequency[0] + k) / (all_spam + 2*k)
        in_not_spam = (frequency[1] + k) / (all_not_spam + 2*k)
        estimates.append((word, in_spam, in_not_spam))
    return estimates

In [9]:
# Turn the raw counts into per-word probability triples (prob's default k is used), then display them.
word_probs = prob(freq, all_spam, all_not_spam)
word_probs

[('available', 0.0053404539385847796, 0.0033146882121400456),
 ('only', 0.11348464619492657, 0.026310337683861613),
 ('world', 0.0026702269692923898, 0.00725088046405635),
 ('until', 0.00801068090787717, 0.004764864304951316),
 ('wat', 0.0026702269692923898, 0.02113113735239279),
 ('bugis', 0.0013351134846461949, 0.0016573441060700228),
 ('jurong', 0.0013351134846461949, 0.0004143360265175057),
 ('n', 0.01602136181575434, 0.02403148953801533),
 ('point', 0.0013351134846461949, 0.00290035218562254),
 ('crazy', 0.006675567423230975, 0.0022788481458462812),
 ('buffet', 0.0013351134846461949, 0.0006215040397762585),
 ('great', 0.01602136181575434, 0.02030246529935778),
 ('in', 0.09479305740987984, 0.1524756577584421),
 ('la', 0.0013351134846461949, 0.0016573441060700228),
 ('cine', 0.0013351134846461949, 0.0016573441060700228),
 ('got', 0.010680907877169559, 0.04599129894344313),
 ('e', 0.014686248331108143, 0.01595193702092397),
 ('amore', 0.0013351134846461949, 0.0004143360265175057),
 (

In [10]:
def GaussNC(word_probs, message):
    """Classify ``message`` as spam or not spam with a naive Bayes model.

    Every vocabulary word contributes its "present" probability when it occurs
    in the message and the complementary "absent" probability otherwise, i.e.
    a Bernoulli model over word presence (no Gaussian likelihood is involved,
    despite the function name). Both classes get equal prior weight.

    Args:
        word_probs: iterable of ``(word, prob_if_spam, prob_if_not_spam)``
            tuples. Each probability must lie strictly between 0 and 1,
            otherwise ``math.log`` raises ``ValueError``.
        message: text to classify; it is split into words with ``tokenize``.

    Returns:
        ``"Spam with prob: <p>"`` when the spam probability ``p`` is at least
        0.5, otherwise ``"Not spam with prob: <1 - p>"``.
    """
    # The message is tokenized with the same helper used when counting words.
    message_words = tokenize(message)
    # Log-likelihood of the message under each class; both start at log(1) = 0.
    spam_log_prob = not_spam_log_prob = 0.0
    # Accumulate the contribution of every vocabulary word.
    for word, prob_if_spam, prob_if_not_spam in word_probs:
        if word in message_words:
            # Word is present in the message: use its estimated probability.
            spam_log_prob += math.log(prob_if_spam)
            not_spam_log_prob += math.log(prob_if_not_spam)
        else:
            # Word is absent: use the complementary event, 1 - probability.
            spam_log_prob += math.log(1.0 - prob_if_spam)
            not_spam_log_prob += math.log(1.0 - prob_if_not_spam)
    # Exponentiating the raw sums directly underflows to 0.0 once they drop
    # below roughly -745, which made the division below raise
    # ZeroDivisionError. Subtracting the larger sum from both leaves the ratio
    # unchanged while guaranteeing that one term is exactly exp(0) == 1.
    shift = max(spam_log_prob, not_spam_log_prob)
    spam_weight = math.exp(spam_log_prob - shift)
    not_spam_weight = math.exp(not_spam_log_prob - shift)
    # Bayes' theorem with equal class priors.
    spam_posterior = spam_weight / (spam_weight + not_spam_weight)
    if spam_posterior < 0.5:
        return f"Not spam with prob: {1 - spam_posterior}"
    return f"Spam with prob: {spam_posterior}"
    

In [11]:
# Classify an ordinary personal message and display the returned verdict string.
result = GaussNC(word_probs, "Hi John, just letting you know I'll be a bit late for dinner tonight. See you soon!")
result

'Not spam with prob: 0.9999999999999388'

In [12]:
# Classify a promotional "you've won a prize" message and display the returned verdict string.
result = GaussNC(word_probs, "Congratulations! You've won a free vacation to an exotic destination! Click here to claim your prize now!")
result

'Spam with prob: 0.9999936955184914'

In [17]:
# Classify an empty message: it yields no tokens, so every vocabulary word takes the "absent" branch.
result = GaussNC(word_probs, "")
result

'Not spam with prob: 0.9999999906194862'