### data 받아오기, zip풀기

- 모듈 불러오기

In [1]:
import requests
import zipfile
import re
import random
import math
from collections import Counter

-  데이터 다운로드, 압축풀기

In [2]:
r = requests.get("https://www.dt.fee.unicamp.br/~tiago/smsspamcollection/smsspamcollection.zip")
with open("spam.zip", "wb") as f:
    f.write(r.content)
#사용방법
zipfile.ZipFile("spam.zip").extractall("./")

- data loading

In [3]:
data = []
with open("SMSSpamCollection.txt", "r") as f:
    for line in f:
        cls, txt = line.strip().split("\t")
        txt.lower()
        bow = set(re.findall("[0-9a-z_]+", txt.lower()))
        data.append([cls, bow])

UnicodeDecodeError: 'cp949' codec can't decode byte 0xe2 in position 2363: illegal multibyte sequence

- 데이터 나누기

In [None]:
random.shuffle(data)
train_size = int(len(data)*0.8)
test_size = len(data) - train_size
train = data[:train_size]
test = data[train_size:]

train_spam = [bow for cls, bow in train if cls == 'spam']
train_ham = [bow for cls, bow in train if cls == 'ham']

### 값 계산

In [None]:
# train 데이터에서 naive bayes 계산을 위한 값들을 미리 다 계산해놓기
prior_spam = math.log(len(train_spam) / len(train))
prior_ham = math.log(len(train_ham) / len(train))

In [None]:
# 단어 w의 spam일 가능도 = P(w|spam) = log((w를 포함하는 스팸수 + a)/ (스팸수 + a))
spam_words = Counter(word for bow in train_spam for word in bow)
ham_words = Counter(word for bow in train_ham for word in bow)

In [None]:
# naive bayes 방법에 따라 spam/ham 점수를 구하여 비교, 라플라스 스무딩 적용, 로그 합산
def predict(bow, spam_words, ham_words, n_spams, n_hams, alpha):
    spam_score = prior_spam + \
        sum( math.log((spam_words[word] + alpha)/(n_spams + alpha)) for word in bow)
    ham_score = prior_ham + \
        sum( math.log((ham_words[word] + alpha)/(n_hams + alpha)) for word in bow)

    if spam_score < ham_score: return 'ham'
    else: return 'spam'

### test

In [None]:
# 잘 맞추는지 확인
tp, tn, fp, fn = 0, 0, 0, 0
for cls, bow in test:
    pred = predict(bow, spam_words, ham_words, len(train_spam), len(train_ham), 1e-4)
    if cls == 'spam' and pred == 'spam': tp += 1
    elif cls == 'ham' and pred == 'ham': tn += 1
    elif cls == 'ham' and pred == 'spam': fp += 1
    else: fn += 1

In [None]:
# 테스트해보기 (confusion matrix)
accuracy = (tp + tn) / (tp + tn + fp + fn)
precision = tp / (tp + fp)
recall = tp / (tp + fn)
f1 = 2 * precision * recall / (precision + recall)

print(f' accuracy: {accuracy}\n precision: {precision}\n recall: {recall}\n f1 {f1}')

### END