In [4]:
# SMS Spam Collection Dataset
# 문자 메세지 데이터
# spam과 ham으로 분류

# 데이터 준비
!wget https://archive.ics.uci.edu/static/public/228/sms+spam+collection.zip
!unzip sms+spam+collection.zip

--2024-12-02 01:58:54--  https://archive.ics.uci.edu/static/public/228/sms+spam+collection.zip
Resolving archive.ics.uci.edu (archive.ics.uci.edu)... 128.195.10.252
Connecting to archive.ics.uci.edu (archive.ics.uci.edu)|128.195.10.252|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: unspecified
Saving to: ‘sms+spam+collection.zip’

sms+spam+collection     [  <=>               ] 198.65K   650KB/s    in 0.3s    

2024-12-02 01:58:55 (650 KB/s) - ‘sms+spam+collection.zip’ saved [203415]

Archive:  sms+spam+collection.zip
  inflating: SMSSpamCollection       
  inflating: readme                  


In [7]:
# 데이터 전처리
# SMS을 Bag-of-Words 형태로 변환
# train, test 데이터 분리
import re
import random

data = [line.strip().split("\t") for line in open("SMSSpamCollection", "r")]
for d in data:
  d[1] = set(re.findall("[a-z0-1_]+", d[1].lower()))

random.shuffle(data)

train_len = int(len(data) * 0.8)
test_len = len(data) - train_len

train = data[:train_len]
test = data[train_len:]

In [8]:
# 데이터 나누기
# Train 데이터에서 spam과 ham 구분하기

train_spam = [bow for cls, bow in train if cls == 'spam']
train_ham = [bow for cls, bow in train if cls == 'ham']

In [9]:
# 사전확률 구하기
# 스팸 메시지와 일반 메시지의 개수를 세어 사전확률 구하기(로그 적용)
import math

prior_spam = math.log(len(train_spam) / len(train))
prior_ham = math.log(len(train_ham) / len(train))

In [10]:
# word count
# 각 단어별로 몇 개의 스팸 메세지와 일반 메세지에 등장했는지 세기
# 단어 w의 spam일 가능도 = p(w|spam) = log((w를 포함하는 스팸수 + a) / (스팸수 + a))
from collections import Counter

spam_words = Counter(word for bow in train_spam for word in bow)
ham_words = Counter(word for bow in train_ham for word in bow)

In [11]:
# Predict()
# naive bayes 방법에 따라 spam/ham 점수를 구하여 비교
# 라플라스 스무딩 적용
# 로그 합산

def predict(bow, spam_words, ham_words, n_spams, n_hams, alpha):
  spam_score = prior_spam + sum(math.log((spam_words[word] + alpha) / (n_spams + alpha)) for word in bow)
  ham_score = prior_ham + sum(math.log((ham_words[word] + alpha) / (n_hams + alpha)) for word in bow)

  if spam_score <ham_score:
    return 'ham'
  else:
    return 'spam'

In [13]:
# 테스트 해보기

tp, tn, fp, fn = 0, 0, 0, 0

for cls, bow in test:
  pred = predict(bow, spam_words, ham_words, len(train_spam), len(train_ham), 1e-4)
  if cls == 'spam' and pred == 'spam': tp += 1
  elif cls == 'ham' and pred == 'ham': tn += 1
  elif cls == 'ham' and pred == 'spam': fp += 1
  else: fn += 1

# precision, recall, f1 score, accuracy

accuracy = (tp + tn) / (tp + tn + fp + fn)
precision = tp / (tp + fp)
recall = tp / (tp + fn)
f1 = 2 * precision * recall / (precision + recall)

print(accuracy, precision, recall, f1)

0.9757847533632287 0.8620689655172413 0.946969696969697 0.9025270758122743
