# Naive Bayes 분류

In [5]:
training_set = [['me free lottery', 1],
                ['free get free you', 1],
                ['you free scholarship', 0],
                ['free to contact me', 0],
                ['you won award', 0],
                ['you ticket lottery', 1]]

In [15]:
# 토큰 빈도수 및 문서별 토큰 수 계산

from collections import defaultdict

# 1 : spam, 0 : normal
doccnt0 = 0
doccnt1 = 0

wordfreq = defaultdict(lambda : [0,0])
for doc, label in training_set :
  w_ls = doc.split()
  for w in w_ls:
    wordfreq[w][label] += 1

for k, (cnt0, cnt1) in wordfreq.items() :
  doccnt0 += cnt0
  doccnt1 += cnt1

In [16]:
wordfreq

defaultdict(<function __main__.<lambda>()>,
            {'me': [1, 1],
             'free': [2, 3],
             'lottery': [0, 2],
             'get': [0, 1],
             'you': [2, 2],
             'scholarship': [1, 0],
             'to': [1, 0],
             'contact': [1, 0],
             'won': [1, 0],
             'award': [1, 0],
             'ticket': [0, 1]})

In [18]:
k = 0.5

wordprobs = defaultdict(lambda:[0,0])

for key, (cnt0, cnt1) in wordfreq.items() :
  wordprobs[key][0] = (cnt0+k) / (doccnt0 + 2*k)
  wordprobs[key][1] = (cnt1+k) / (doccnt1 + 2*k)

In [19]:
wordfreq

defaultdict(<function __main__.<lambda>()>,
            {'me': [1, 1],
             'free': [2, 3],
             'lottery': [0, 2],
             'get': [0, 1],
             'you': [2, 2],
             'scholarship': [1, 0],
             'to': [1, 0],
             'contact': [1, 0],
             'won': [1, 0],
             'award': [1, 0],
             'ticket': [0, 1]})

In [20]:
import math

# 신규 문서 스팸 확률 계산
doc = "free lottery"
tokens = doc.split()

log_prob0 = log_prob1 = 0

for w, (p0, p1) in wordprobs.items():
  if w in tokens:
    log_prob0 += math.log(p0)
    log_prob1 += math.log(p1)

log_prob0 += math.log(doccnt0 / (doccnt0 + doccnt1))
log_prob1 += math.log(doccnt1 / (doccnt0 + doccnt1))

prob0 = math.exp(log_prob0)
prob1 = math.exp(log_prob1)

print('정상 확률 : {}%'.format(prob0/(prob0+prob1)))
print('스팸 확률 : {}%'.format(prob1/(prob0+prob1)))

정상 확률 : 0.12500000000000008%
스팸 확률 : 0.8749999999999999%


In [23]:
log_prob1

-3.3198840257871636

In [24]:
log_prob0

-5.265794174842476

# sklearn 활용
영문 뉴스 분류

In [27]:
from sklearn.datasets import fetch_20newsgroups
twenty_train = fetch_20newsgroups(subset='train', shuffle=True)
print(twenty_train.target_names)
print(twenty_train.data[0])

['alt.atheism', 'comp.graphics', 'comp.os.ms-windows.misc', 'comp.sys.ibm.pc.hardware', 'comp.sys.mac.hardware', 'comp.windows.x', 'misc.forsale', 'rec.autos', 'rec.motorcycles', 'rec.sport.baseball', 'rec.sport.hockey', 'sci.crypt', 'sci.electronics', 'sci.med', 'sci.space', 'soc.religion.christian', 'talk.politics.guns', 'talk.politics.mideast', 'talk.politics.misc', 'talk.religion.misc']
From: lerxst@wam.umd.edu (where's my thing)
Subject: WHAT car is this!?
Nntp-Posting-Host: rac3.wam.umd.edu
Organization: University of Maryland, College Park
Lines: 15

 I was wondering if anyone out there could enlighten me on this car I saw
the other day. It was a 2-door sports car, looked to be from the late 60s/
early 70s. It was called a Bricklin. The doors were really small. In addition,
the front bumper was separate from the rest of the body. This is 
all I know. If anyone can tellme a model name, engine specs, years
of production, where this car is made, history, or whatever info you
have o

In [30]:
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.naive_bayes import MultinomialNB

text_clf = Pipeline([('vect',CountVectorizer()),
                     ('tfidf', TfidfTransformer()),
                      ('clf', MultinomialNB())])

text_clf.fit(twenty_train.data, twenty_train.target)

text_clf = text_clf.fit(twenty_train.data, twenty_train.target)

In [34]:
import numpy as np

test = fetch_20newsgroups(subset='test', shuffle=True)
predicted = text_clf.predict(test.data)
np.mean(predicted == test.target)

0.7738980350504514