In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# Seed NumPy's global random number generator with a fixed value.
# Note: the train/test split further down passes its own random_state.
np.random.seed(2022)

In [2]:
# Data Load
# Read the SMS dataset from a CSV file given by a path relative to the
# current working directory.
spam = pd.read_csv("sms_spam.csv")

In [3]:
# Pull the message body ("text") and its class ("type") out of the frame
# as two separate columns.
text, label = spam["text"], spam["type"]

In [4]:
# Data EDA
# Look at the first raw message.
text[0]

'Go until jurong point, crazy.. Available only in bugis n great world la e buffet... Cine there got amore wat...'

In [5]:
# Class label of the first message.
label[0]

'ham'

In [6]:
# Number of messages per class.
label.value_counts()

ham     4827
spam     747
Name: type, dtype: int64

In [7]:
# Data Cleaning
# Encode the class: ham (normal message) -> 0, spam -> 1.
# The mapping is applied to the raw "type" column instead of to `label`
# itself so that this cell is idempotent: re-running `label = label.map(...)`
# on already-encoded values would map every 0/1 to NaN.
label = spam["type"].map({"ham": 0, "spam": 1})

# Any value other than "ham"/"spam" would silently become NaN; fail loudly instead.
assert label.notna().all(), "unexpected class value in spam['type']"

In [8]:
# Re-check the class counts after encoding.
label.value_counts()

0    4827
1     747
Name: type, dtype: int64

In [9]:
# Reduce the text to plain characters only:
# remove everything except English letters, digits and spaces.
# The pattern matches any single character that is NOT one of those, so
# replacing matches with "" strips punctuation and other symbols.
# Written as a raw string with a plain space: in a normal string literal
# `\ ` is an invalid escape sequence (it triggers a warning on recent
# Python versions), and a space needs no escaping inside a character class.
re_pattern = r"[^a-zA-Z0-9 ]"

In [10]:
# First message before the pattern is applied, for comparison.
text[0]

'Go until jurong point, crazy.. Available only in bugis n great world la e buffet... Cine there got amore wat...'

In [11]:
# Try the pattern on the first row only before applying it to every message.
text.iloc[:1].str.replace(re_pattern, "", regex=True)[0]

'Go until jurong point crazy Available only in bugis n great world la e buffet Cine there got amore wat'

In [12]:
# Apply the pattern to every message, deleting all matched characters.
text = text.str.replace(re_pattern, "", regex=True)

In [13]:
# Convert all uppercase letters to lowercase
# (the first message is shown here before lowercasing, for comparison)
text[0]

'Go until jurong point crazy Available only in bugis n great world la e buffet Cine there got amore wat'

In [14]:
# Try lowercasing on the first row only.
text.iloc[:1].str.lower()[0]

'go until jurong point crazy available only in bugis n great world la e buffet cine there got amore wat'

In [15]:
# Lowercase every message.
text = text.str.lower()

In [16]:
# First message after both cleaning steps.
text[0]

'go until jurong point crazy available only in bugis n great world la e buffet cine there got amore wat'

In [17]:
# Data Split
from sklearn.model_selection import train_test_split

# 70% of the messages go to training, the rest to testing; the fixed
# random_state makes the split repeatable.
# NOTE(review): the classes are imbalanced (4827 ham vs 747 spam). Consider
# passing stratify=label to keep the same class ratio in both parts — this
# would change which rows end up in each split.
split_parts = train_test_split(text, label, train_size=0.7, random_state=2021)
train_text, test_text, train_label, test_label = split_parts

In [18]:
# Show the absolute size of each split and its share of the whole dataset.
size_report = [
    f"train_data size: {len(train_label)}, {len(train_label)/len(text):.2f}",
    f"test_data size: {len(test_label)}, {len(test_label)/len(text):.2f}",
]
print("\n".join(size_report))

train_data size: 3901, 0.70
test_data size: 1673, 0.30


In [19]:
# Count Vectorize
# Word tokenize
import nltk
from nltk import word_tokenize

# Download the 'punkt' tokenizer package into the local nltk_data directory.
# NOTE(review): newer NLTK releases may require the 'punkt_tab' resource
# instead — confirm against the installed NLTK version.
nltk.download('punkt')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\sclab\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping tokenizers\punkt.zip.


True

In [20]:
# First message of the training split (positional access, since the split shuffled the index).
train_text.iloc[0]

'am only searching for good dual sim mobile pa'

In [21]:
# Split that message into a list of word tokens.
word_tokenize(train_text.iloc[0])

['am', 'only', 'searching', 'for', 'good', 'dual', 'sim', 'mobile', 'pa']

In [22]:
# Count vectorize
from sklearn.feature_extraction.text import CountVectorizer

In [23]:
# Example: the first two training messages, used for a small demonstration
train_text.iloc[:2].values

array(['am only searching for good dual sim mobile pa',
       'excellent ill see what rileys plans are'], dtype=object)

In [24]:
# Count vectorizer that tokenizes with NLTK's word_tokenize.
# token_pattern=None makes explicit that the default token regex is not used
# (a custom tokenizer overrides it) and avoids scikit-learn's
# "token_pattern will not be used" warning when fitting.
cnt_vectorizer = CountVectorizer(tokenizer=word_tokenize, token_pattern=None)

In [25]:
# Learn the vocabulary from the two example sentences only.
cnt_vectorizer.fit(train_text.iloc[:2])



CountVectorizer(tokenizer=<function word_tokenize at 0x0000014633454EE0>)

In [26]:
# Learned mapping from token to integer index.
cnt_vectorizer.vocabulary_

{'am': 0,
 'only': 8,
 'searching': 12,
 'for': 4,
 'good': 5,
 'dual': 2,
 'sim': 14,
 'mobile': 7,
 'pa': 9,
 'excellent': 3,
 'ill': 6,
 'see': 13,
 'what': 15,
 'rileys': 11,
 'plans': 10,
 'are': 1}

In [27]:
# List the tokens ordered by the integer index the vectorizer assigned to them.
token_to_index = cnt_vectorizer.vocabulary_
vocab = sorted(token_to_index, key=token_to_index.get)
vocab

['am',
 'are',
 'dual',
 'excellent',
 'for',
 'good',
 'ill',
 'mobile',
 'only',
 'pa',
 'plans',
 'rileys',
 'searching',
 'see',
 'sim',
 'what']

In [28]:
# Number of times each vocabulary word occurs in each sentence.
sample_transformed = cnt_vectorizer.transform(train_text.iloc[:2])
sample_cnt_vector = sample_transformed.toarray()  # plain array, easier to inspect
sample_cnt_vector

array([[1, 0, 1, 0, 1, 1, 0, 1, 1, 1, 0, 0, 1, 0, 1, 0],
       [0, 1, 0, 1, 0, 0, 1, 0, 0, 0, 1, 1, 0, 1, 0, 1]], dtype=int64)

In [29]:
# The two example sentences again, to compare against the count vectors above.
train_text.iloc[:2].values

array(['am only searching for good dual sim mobile pa',
       'excellent ill see what rileys plans are'], dtype=object)

In [30]:
# Show the counts as a table with one column per vocabulary word.
pd.DataFrame(sample_cnt_vector, columns=vocab)

Unnamed: 0,am,are,dual,excellent,for,good,ill,mobile,only,pa,plans,rileys,searching,see,sim,what
0,1,0,1,0,1,1,0,1,1,1,0,0,1,0,1,0
1,0,1,0,1,0,0,1,0,0,0,1,1,0,1,0,1


In [31]:
# Fit on the real data: learn the vocabulary from the whole training split.
# token_pattern=None makes explicit that the default token regex is not used
# (a custom tokenizer overrides it) and avoids scikit-learn's
# "token_pattern will not be used" warning when fitting.
cnt_vectorizer = CountVectorizer(tokenizer=word_tokenize, token_pattern=None)
cnt_vectorizer.fit(train_text)



CountVectorizer(tokenizer=<function word_tokenize at 0x0000014633454EE0>)

In [32]:
len(cnt_vectorizer.vocabulary_)
# Total number of distinct tokens in the learned vocabulary

7908

In [33]:
# Convert the train and test messages into count matrices with the
# vectorizer fitted on the training split.
train_matrix, test_matrix = map(cnt_vectorizer.transform, (train_text, test_text))

In [34]:
# Transform a made-up word and sum its count vector; the recorded result is 0,
# i.e. a word missing from the vocabulary contributes no counts.
cnt_vectorizer.transform(["notavailblewordforcnt"]).toarray().sum()

0

In [35]:
# Naive Bayes
from sklearn.naive_bayes import BernoulliNB

# Bernoulli Naive Bayes classifier with default settings.
# NOTE(review): BernoulliNB models binary features, so the word counts are
# presumably reduced to present/absent by its default binarization — confirm.
naive_bayes = BernoulliNB()

In [36]:
# Training: fit the classifier on the training count matrix and labels
naive_bayes.fit(train_matrix, train_label)

BernoulliNB()

In [37]:
# Prediction: classify both the training and the test matrices
train_pred, test_pred = (
    naive_bayes.predict(features) for features in (train_matrix, test_matrix)
)

In [38]:
# Evaluation
from sklearn.metrics import accuracy_score

# Accuracy of the predictions against the true labels, per split.
train_acc, test_acc = (
    accuracy_score(true_labels, predicted)
    for true_labels, predicted in ((train_label, train_pred), (test_label, test_pred))
)

In [39]:
# Report both accuracies with four decimal places.
accuracy_report = [
    f"Train Accuracy is {train_acc:.4f}",
    f"Test Accuracy is {test_acc:.4f}",
]
print("\n".join(accuracy_report))

Train Accuracy is 0.9854
Test Accuracy is 0.9767
