#### 모듈

In [1]:
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Embedding, Conv1D, GlobalMaxPooling1D, Flatten

#### 데이터 불러오기

In [2]:
import pandas as pd
import numpy as np

spam_data = pd.read_csv('./data/spam.csv', encoding='latin1')
spam_data.head(5)

Unnamed: 0,v1,v2,Unnamed: 2,Unnamed: 3,Unnamed: 4
0,ham,"Go until jurong point, crazy.. Available only ...",,,
1,ham,Ok lar... Joking wif u oni...,,,
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,,,
3,ham,U dun say so early hor... U c already then say...,,,
4,ham,"Nah I don't think he goes to usf, he lives aro...",,,


#### 불필요한 결측치 제거

In [3]:
spam_data = spam_data.dropna(axis=1)
spam_data.head(5)

Unnamed: 0,v1,v2
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [5]:
spam_data.columns = ['label', 'content']
spam_data.head(5)

Unnamed: 0,label,content
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [6]:
spam_data['label'] = spam_data['label'].replace('ham', 1)
spam_data['label'] = spam_data['label'].replace('spam', 0)
spam_data.head(5)

Unnamed: 0,label,content
0,1,"Go until jurong point, crazy.. Available only ..."
1,1,Ok lar... Joking wif u oni...
2,0,Free entry in 2 a wkly comp to win FA Cup fina...
3,1,U dun say so early hor... U c already then say...
4,1,"Nah I don't think he goes to usf, he lives aro..."


#### 정규화를 통해 문자 외 처리

In [7]:
spam_data['content'] = spam_data['content'].str.replace("[^\w]", " ")
spam_data.head(5)

Unnamed: 0,label,content
0,1,Go until jurong point crazy Available only ...
1,1,Ok lar Joking wif u oni
2,0,Free entry in 2 a wkly comp to win FA Cup fina...
3,1,U dun say so early hor U c already then say
4,1,Nah I don t think he goes to usf he lives aro...


#### 빈공백 처리

In [8]:
spam_data['label'] = spam_data['label'].replace("", np.nan)
spam_data['content'] = spam_data['content'].replace("", np.nan)
spam_data = spam_data.dropna(how='any')

#### train / test

In [9]:
from sklearn.model_selection import train_test_split

content_train, content_test, y_train, y_test = train_test_split(spam_data['content'], spam_data['label'])
content_train.shape, content_test.shape, y_train.shape, y_test.shape

((4179,), (1393,), (4179,), (1393,))

#### stopwords

In [10]:
stopwords = ['a', 'an']

X_train = []
for stc in content_train:
    token = []
    words = stc.split()
    for word in words:
        if word not in stopwords:
            token.append(word)
    X_train.append(token)
    
X_test = []
for stc in content_test:
    token = []
    words = stc.split()
    for word in words:
        if word not in stopwords:
            token.append(word)
    X_test.append(token)

#### 정수화

In [11]:
from tensorflow.keras.preprocessing.text import Tokenizer

tokenizer = Tokenizer(4000)
tokenizer.fit_on_texts(content_train)

X_train = tokenizer.texts_to_sequences(X_train)
X_test = tokenizer.texts_to_sequences(X_test)

In [26]:
X_train

array([[   0,    0,    0, ...,   64,   63,   64],
       [   0,    0,    0, ..., 3616,  212, 3617],
       [   0,    0,    0, ...,  175,  251,   79],
       ...,
       [   0,    0,    0, ...,    0,    0,    0],
       [   0,    0,    0, ...,  198,   76,  111],
       [   0,    0,    0, ...,   52,   85,  214]])

In [12]:
low_count = 0

for word, word_count in tokenizer.word_counts.items():
    if word_count > 1:
        low_count += 1
print(len(tokenizer.word_index))
print(low_count)

7510
3615


In [13]:
max_length = 0

for data in X_train:
    if max_length < len(data):
        max_length = len(data)
print(max_length)

182


#### 패딩을 위한 사이즈 탐색
- 최대길이 : 최대길이에 맞추게 되면 데이터 손실은 없지만 코드상 비효율적이게 된다.
- 평균길이 : 데이터 손실은 조금 있겠지만 효율은 올라간다.

In [14]:
max_len = 179
X_train = pad_sequences(X_train, maxlen=max_length)
X_test = pad_sequences(X_test, maxlen=max_length)

In [15]:
X_train

array([[   0,    0,    0, ...,   64,   63,   64],
       [   0,    0,    0, ..., 3616,  212, 3617],
       [   0,    0,    0, ...,  175,  251,   79],
       ...,
       [   0,    0,    0, ...,    0,    0,    0],
       [   0,    0,    0, ...,  198,   76,  111],
       [   0,    0,    0, ...,   52,   85,  214]])

In [25]:
X_train[0].shape

(182,)

In [17]:
X_train.shape

(4179, 182)

In [64]:
X_train = np.asarray(X_train)
X_test = np.asarray(X_test)
y_train = np.asarray(y_train)
y_test = np.asarray(y_test)

#### CNN

In [65]:
model = Sequential()
model.add(Embedding(4000, 32))
model.add(Conv1D(256, 3, padding='valid', activation='relu'))
model.add(GlobalMaxPooling1D())
model.add(Flatten())
model.add(Dense(1, activation='sigmoid'))

model.compile(loss='binary_crossentropy', optimizer='adam', metrix=['acc'])
model.fit(X_train, y_train, validation_data=(X_test, y_test), epochs=4, batch_size=64)

Train on 4179 samples, validate on 1393 samples
Epoch 1/4
Epoch 2/4
Epoch 3/4
Epoch 4/4


<tensorflow.python.keras.callbacks.History at 0x20cc756b288>