In [46]:
# data 받아오기, zip 풀기
import requests
import zipfile
import pandas as pd
import re
import random

In [47]:
# data download, unzip

r = requests.get("https://www.dt.fee.unicamp.br/~tiago/smsspamcollection/smsspamcollection.zip")
with open("sms.zip","wb") as f:
  f.write(r.content)
zipfile.ZipFile("sms.zip").extractall("./")

In [48]:
# loading data

data = []
with open("SMSSpamCollection.txt", "r") as f:
  for line in f:
    cls, txt = line.strip().split("\t")
    bow = set(re.findall("[0-9a-z_]+", txt.lower()))
    data.append([cls,bow])

In [49]:
# split data into train and test
random.shuffle(data)
train_size = int(len(data) * 0.8)
test_size = len(data) - train_size
train = data[:train_size]
test = data[train_size:]
print(train_size, test_size)

4459 1115


In [50]:
# train 데이터에서, naive bayes 계산을 위한 값들을 미리 다 계산해놓기
# prior prob, P(spam), P(ham)
# P(spam) = spam 문자 수 / 전체 문자 수
"""
  라플라스 스무딩 : 관측한 메일에서 한번도 등장하지 않은 단어가 있으면,
  예측할 때 결과가 좀 이상해지는 문제
  가상의 spam문자, ham문자를 하나씩 추가. 추가한 spam, ham 문자에는 세상 존재하는 모든 단어가 있음
  가상의 spam문자, ham문자의 영향력이 지나치면 곤란하기 떄문에 가중치를 줄 수 있다. alpha
  시공 폭풍 ㄱㄱ -> P(ㄱㄱ|sapm) = 0
  P(spam) * P(시공|spam) * P(폭풍|spam) * P(ㄱㄱ|spam)

"""

alpha = 0.5

n_total = train_size
n_spam = sum(1 for cls, bow in train if cls == 'spam')
n_ham = sum(1 for cls, bow in train if cls == 'ham')

prior_spam = (n_spam+alpha) / (n_total+2*alpha)
prior_ham = (n_ham+alpha) / (n_total+2*alpha)

print(prior_spam, prior_ham)

0.13329596412556055 0.8667040358744394


In [51]:
# 모든 단어의 likelihood, P(W|spam), P(W|ham)
# P(w|spam) = w를 포함하는 spam 문자 수 + alpha / 전체 spam 문자 수 + alpha
# P(w|ham = w를 포함하는 ham 문자 수 + alpha / 전체 ham 문자 수 + alpha

# dic
# spam_words = {}
# ham_words = {}
# for cls, bow in train :
#   for word in bow : 
#     if cls == "spam" :
#       if word not in spam_words :
#         spam_words[word] = 0
#       spam_words[word] += 1
#     else :
#       if word not in ham_words :
#         ham_words[word] = 0
#       ham_words[word] += 1

# counter 활용
from collections import Counter

spam_words = {}
ham_words = {}

spam_words = Counter(word for cls, bow in train
 for word in bow 
 if cls =="spam")

ham_words = Counter(word for cls, bow in train
 for word in bow 
 if cls =="ham")



In [52]:
def predict(bow) :
  # spam_scroe = P(spam) * P(w1 | spam) * P(w2 | spam)....
  # ham_score = P(ham) * P(w1 | ham) * P(w2 | ham)....

  spam_score = prior_spam
  ham_score = prior_ham

  for word in bow :
    spam_score *= (spam_words[word] + alpha)/(n_spam + alpha)
    ham_score *= (ham_words[word] + alpha)/(n_ham + alpha)

  if spam_score < ham_score :
    return "ham"
  else :
    return "spam"


In [53]:
tp, tn, fp, fn = 0,0,0,0

for ans, bow in test :
  pred = predict(bow)
  if pred == 'spam' :
    if ans == "spam" :
      tp += 1
    else :
      fp += 1
  else :
    if ans == "ham" :
      tn += 1
    else :
      fn += 1
    
print(f"tp: {tp}, tn: {tn}, fp: {fp}, fn: {fn}")


tp: 153, tn: 802, fp: 160, fn: 0


In [55]:
# accuracy = 맞힌 수 / 전체 테스트 수

acc = (tp + tn) / (tp + tn + fp + fn)

# precision = 스펨이야 : 중 실제 스팸인 경우

prec = tp / (tp + fp)

# recall = 실제 스팸중에 그중 몇개나 찾아 냈는지

recall = tp / (tp + fn)

# f1 = precision과 recall 의 조화 평균

f1 = 2 * prec * recall / (prec + recall)

print(f"acc: {acc:.3f}, prec: {prec:.3f}, recall: {recall:.3f}, f1: {f1:.3f}")


acc: 0.857, prec: 0.489, recall: 1.000, f1: 0.657
