# 나이브 베이즈 분류기 (NB)

스팸인가 아닌가?

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split

In [2]:
# Load the SMS spam dataset.
# The file is tab-delimited (the default separator of read_table) and has no
# header row, so the two column names are supplied explicitly.
df = pd.read_table(
    'data-files/SMSSpamCollection.tsv',
    header=None,
    names=['target', 'message'],
)
df.head()

Unnamed: 0,target,message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [3]:
# Convert the categorical target into a numeric column.
# The target is the label (y), so it gets a label encoding rather than a
# one-hot encoding.
from sklearn.preprocessing import LabelEncoder

# Single brackets (df['target']) select a 1-D Series; double brackets would
# select a 2-D DataFrame.
label_encoder = LabelEncoder()
df['label'] = label_encoder.fit_transform(df['target'])
df.head()

Unnamed: 0,target,message,label
0,ham,"Go until jurong point, crazy.. Available only ...",0
1,ham,Ok lar... Joking wif u oni...,0
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,1
3,ham,U dun say so early hor... U c already then say...,0
4,ham,"Nah I don't think he goes to usf, he lives aro...",0


In [4]:
# Message preprocessing, step 1: remove every character that is neither a word
# character nor whitespace (i.e. punctuation and other symbols).

import re   # regular-expression support

# Inside [...], \w matches letters, digits and underscore, \s matches
# whitespace, and the leading ^ negates the set, so the pattern matches
# everything else. The pattern is written as a raw string so the backslashes
# reach the regex engine untouched; the non-raw form "[^\w\s]" contains invalid
# string escapes and triggers a SyntaxWarning on recent Python versions.
df['message2'] = df['message'].map(lambda v: re.sub(r"[^\w\s]", '', v))
df.head(1)

  df['message2'] = df['message'].map(lambda v: re.sub("[^\w\s]", '', v))


Unnamed: 0,target,message,label,message2
0,ham,"Go until jurong point, crazy.. Available only ...",0,Go until jurong point crazy Available only in ...


In [5]:
# Message preprocessing, step 2: convert everything to lowercase.

# Letter case is treated as irrelevant to the spam/ham decision, so every
# message (already stripped of punctuation in message2) is lowercased.
df['message3'] = df['message2'].str.lower()
df.head(3)

Unnamed: 0,target,message,label,message2,message3
0,ham,"Go until jurong point, crazy.. Available only ...",0,Go until jurong point crazy Available only in ...,go until jurong point crazy available only in ...
1,ham,Ok lar... Joking wif u oni...,0,Ok lar Joking wif u oni,ok lar joking wif u oni
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,1,Free entry in 2 a wkly comp to win FA Cup fina...,free entry in 2 a wkly comp to win fa cup fina...


In [None]:
# !pip install nltk
# NLTK (Natural Language Toolkit) supplies the word tokenizer and stemmer used in this notebook

Defaulting to user installation because normal site-packages is not writeable


In [9]:
import nltk

# Download the NLTK tokenizer resources. Presumably these are what
# nltk.word_tokenize (called in the tokenizing step of this notebook) relies
# on -- confirm against the NLTK documentation for the installed version.
nltk.download('punkt')
nltk.download('punkt_tab')


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\human\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:\Users\human\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping tokenizers\punkt_tab.zip.


True

In [None]:
# Message preprocessing, step 3: turn each message into a list of tokens.
# word_tokenize splits the text into word-level tokens. The function itself is
# handed over as the per-row callable, so no lambda wrapper is needed.
df['message4'] = df['message3'].apply(nltk.word_tokenize)
df.head()

Unnamed: 0,target,message,label,message2,message3,message4
0,ham,"Go until jurong point, crazy.. Available only ...",0,Go until jurong point crazy Available only in ...,go until jurong point crazy available only in ...,"[go, until, jurong, point, crazy, available, o..."
1,ham,Ok lar... Joking wif u oni...,0,Ok lar Joking wif u oni,ok lar joking wif u oni,"[ok, lar, joking, wif, u, oni]"
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,1,Free entry in 2 a wkly comp to win FA Cup fina...,free entry in 2 a wkly comp to win fa cup fina...,"[free, entry, in, 2, a, wkly, comp, to, win, f..."
3,ham,U dun say so early hor... U c already then say...,0,U dun say so early hor U c already then say,u dun say so early hor u c already then say,"[u, dun, say, so, early, hor, u, c, already, t..."
4,ham,"Nah I don't think he goes to usf, he lives aro...",0,Nah I dont think he goes to usf he lives aroun...,nah i dont think he goes to usf he lives aroun...,"[nah, i, dont, think, he, goes, to, usf, he, l..."


In [22]:
# Message preprocessing, step 4: stemming (normalize inflected word forms to a
# common stem).
from nltk.stem import PorterStemmer

stemmer = PorterStemmer()
# Quick demonstration of the stemmer on a few inflected forms.
print(*map(stemmer.stem, ('apples', 'apple')))
print(*map(stemmer.stem, ('go', 'goes')))

# Stem every token of every message, producing one list of stems per row.
df['message5'] = df['message4'].map(lambda tokens: list(map(stemmer.stem, tokens)))
df.head()

appl appl
go goe


Unnamed: 0,target,message,label,message2,message3,message4,message5
0,ham,"Go until jurong point, crazy.. Available only ...",0,Go until jurong point crazy Available only in ...,go until jurong point crazy available only in ...,"[go, until, jurong, point, crazy, available, o...","[go, until, jurong, point, crazi, avail, onli,..."
1,ham,Ok lar... Joking wif u oni...,0,Ok lar Joking wif u oni,ok lar joking wif u oni,"[ok, lar, joking, wif, u, oni]","[ok, lar, joke, wif, u, oni]"
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,1,Free entry in 2 a wkly comp to win FA Cup fina...,free entry in 2 a wkly comp to win fa cup fina...,"[free, entry, in, 2, a, wkly, comp, to, win, f...","[free, entri, in, 2, a, wkli, comp, to, win, f..."
3,ham,U dun say so early hor... U c already then say...,0,U dun say so early hor U c already then say,u dun say so early hor u c already then say,"[u, dun, say, so, early, hor, u, c, already, t...","[u, dun, say, so, earli, hor, u, c, alreadi, t..."
4,ham,"Nah I don't think he goes to usf, he lives aro...",0,Nah I dont think he goes to usf he lives aroun...,nah i dont think he goes to usf he lives aroun...,"[nah, i, dont, think, he, goes, to, usf, he, l...","[nah, i, dont, think, he, goe, to, usf, he, li..."
