In [1]:
import zipfile
import requests
from collections import Counter
import re
import math

# data load

In [2]:
# Download the SMS Spam Collection archive and unpack it into the working directory.
response = requests.get(
    'http://www.dt.fee.unicamp.br/~tiago/smsspamcollection/smsspamcollection.zip',
    timeout=30,  # do not hang the kernel forever if the server is unreachable
)
# Fail loudly on an HTTP error instead of saving an error page as 'spam.zip'.
response.raise_for_status()

# Context managers make sure both the written file and the archive are closed.
with open('spam.zip', 'wb') as archive_file:
    archive_file.write(response.content)

with zipfile.ZipFile('spam.zip') as archive:
    archive.extractall('./')

In [3]:
# Parse the tab-separated corpus into [label, message] pairs.
# The encoding is spelled out so the result does not depend on the platform
# default; undecodable bytes are replaced instead of aborting the load.
with open('SMSSpamCollection.txt', "r", encoding='utf-8', errors='replace') as f:
    data = [
        fields
        # Split only on the first tab so a tab inside the message text stays
        # part of the message rather than producing a third field.
        for fields in (line.strip().split('\t', 1) for line in f)
        # Skip blank or malformed lines; later cells unpack every row as
        # exactly (label, message).
        if len(fields) == 2
    ]

# 데이터 나누기

In [4]:
# Sequential split: the first 80% of the rows are used for training and the
# remaining rows are held out for evaluation (no shuffling is applied).
train_size = int(len(data) * 0.8)
test_size = len(data) - train_size

train, test = data[:train_size], data[train_size:]

# 사전 확률 구하기

In [5]:
# Class priors: the share of training messages carrying each label.
count = Counter(label for label, _ in train)

labelled_total = count['spam'] + count['ham']
prior_spam = count['spam'] / labelled_total
prior_ham = count['ham'] / labelled_total

In [6]:
# Show both class priors, one per line.
for caption, prior in (('prior_spam:', prior_spam), ('prior_ham: ', prior_ham)):
    print(caption, prior)

prior_spam: 0.13500784929356358
prior_ham:  0.8649921507064364


# Word Count

In [7]:
# Count how many times each token occurs in the spam training messages.
spam_words = Counter(
    token
    for label, message in train
    if label == "spam"
    for token in re.findall('[0-9a-z_]+', message.lower())
)

# Count how many times each token occurs in the ham (regular) training messages.
ham_words = Counter(
    token
    for label, message in train
    if label == "ham"
    for token in re.findall('[0-9a-z_]+', message.lower())
)

In [8]:
# Vocabulary size: number of distinct tokens across all training messages.
num_unique_words = len({
    token
    for _, message in train
    for token in re.findall('[0-9a-z_]+', message.lower())
})

# Total number of token occurrences in spam messages.
num_spam_words = sum(spam_words.values())

# Total number of token occurrences in ham (regular) messages.
num_ham_words = sum(ham_words.values())

# Predict() 함수 정의

- naive bayes 방법에 따라 spam/ham 점수를 구하여 비교
- 라플라스 스무딩 적용
- 로그 합산

In [9]:
def predict(text, k = 0.5):
    """Decide whether a message is spam using naive Bayes log scores.

    The text is lower-cased and tokenised with the same pattern used to build
    the word counts. For each class the score is the log prior plus the sum of
    the log smoothed word likelihoods
    ``(count(word) + k) / (total_words_in_class + vocabulary_size * k)``.

    Reads the module-level statistics ``prior_spam``, ``prior_ham``,
    ``spam_words``, ``ham_words``, ``num_spam_words``, ``num_ham_words`` and
    ``num_unique_words``.

    Args:
        text: The message to classify.
        k: Additive smoothing constant. With ``k == 0`` a word never seen in a
            class contributes ``-inf`` to that class's score rather than
            raising a math domain error.

    Returns:
        True when the spam score is strictly greater than the ham score,
        otherwise False (ties are reported as ham).
    """
    words = re.findall('[0-9a-z_]+', text.lower())

    # The smoothed denominators do not depend on the word: compute them once.
    denom_spam = num_spam_words + num_unique_words * k
    denom_ham = num_ham_words + num_unique_words * k

    def log_likelihood(word_count, denom):
        # A zero (or non-positive) probability has no finite logarithm;
        # math.log would raise, so represent it as log(0) = -inf.
        numerator = word_count + k
        if numerator <= 0 or denom <= 0:
            return float('-inf')
        return math.log(numerator / denom)

    spam_score = math.log(prior_spam)
    ham_score = math.log(prior_ham)
    for w in words:
        spam_score += log_likelihood(spam_words[w], denom_spam)
        ham_score += log_likelihood(ham_words[w], denom_ham)

    return spam_score > ham_score

# 테스트 해보기

In [10]:
# Confusion-matrix tallies: true positives, true negatives,
# false positives and false negatives all start at zero.
tp = tn = fp = fn = 0

In [11]:
# Zero the tallies in this cell as well, so that re-running it recomputes the
# confusion matrix instead of adding to the counts from a previous run.
tp = tn = fp = fn = 0

# Classify every held-out message and tally the outcome against its label.
for s, c in test:
    pred = predict(c)
    actual_spam = (s == 'spam')

    if pred and actual_spam:
        tp += 1  # predicted spam, labelled spam
    elif pred:
        fp += 1  # predicted spam, labelled otherwise
    elif actual_spam:
        fn += 1  # predicted not spam, labelled spam
    else:
        tn += 1  # predicted not spam, labelled otherwise

In [12]:
# Confusion-matrix counts, space-separated, in the order TP TN FP FN.
print(*(tp, tn, fp, fn))

140 956 14 5


In [13]:
# Accuracy, precision, recall and F1 score from the confusion-matrix counts.
# Each ratio falls back to 0.0 when its denominator is zero (for example when
# nothing was predicted as spam) instead of raising ZeroDivisionError.
total = tp + fp + tn + fn
accuracy = (tp + tn) / total if total else 0.0
precision = tp / (tp + fp) if (tp + fp) else 0.0
recall = tp / (tp + fn) if (tp + fn) else 0.0
# F1 is the harmonic mean of precision and recall; it is 0 when either is 0.
f1_score = 2 / (1 / precision + 1 / recall) if precision and recall else 0.0

print('accuracy: ', accuracy)

print('Precision: ', precision)

print('Recall: ', recall)

print('F1_Score: ', f1_score)

accuracy:  0.9829596412556054
Precision:  0.9090909090909091
Recall:  0.9655172413793104
F1_Score:  0.9364548494983278
