In [1]:
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, LSTM, Embedding
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint

In [2]:
import pandas as pd
import numpy as np

mail = pd.read_csv('./data/spam.csv', encoding='latin1')
mail.head(5)

Unnamed: 0,v1,v2,Unnamed: 2,Unnamed: 3,Unnamed: 4
0,ham,"Go until jurong point, crazy.. Available only ...",,,
1,ham,Ok lar... Joking wif u oni...,,,
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,,,
3,ham,U dun say so early hor... U c already then say...,,,
4,ham,"Nah I don't think he goes to usf, he lives aro...",,,


In [3]:
mail = mail.dropna(axis=1)
mail.head(5)

Unnamed: 0,v1,v2
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [4]:
mail['v1'] = mail['v1'].replace('ham', 1)
mail['v1'] = mail['v1'].replace('spam', 0)
mail.head(5)

Unnamed: 0,v1,v2
0,1,"Go until jurong point, crazy.. Available only ..."
1,1,Ok lar... Joking wif u oni...
2,0,Free entry in 2 a wkly comp to win FA Cup fina...
3,1,U dun say so early hor... U c already then say...
4,1,"Nah I don't think he goes to usf, he lives aro..."


In [5]:
mail['v2'] = mail['v2'].str.replace("[^\w]|br", " ")
mail.head(5)

Unnamed: 0,v1,v2
0,1,Go until jurong point crazy Available only ...
1,1,Ok lar Joking wif u oni
2,0,Free entry in 2 a wkly comp to win FA Cup fina...
3,1,U dun say so early hor U c already then say
4,1,Nah I don t think he goes to usf he lives aro...


In [6]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(mail['v2'], mail['v1'], test_size = 0.25, shuffle=True, random_state=3)
X_train.shape, X_test.shape, y_train.shape, y_test.shape

((4179,), (1393,), (4179,), (1393,))

In [7]:
X_train

5398    Hi  Hope you had a good day  Have a better night 
3428    I don t know jack shit about anything or i d s...
1968    2 laptop    I noe infra but too slow lar    I ...
334     Valentines Day Special  Win over å 1000 in our...
5550          Cool  what time you think you can get here 
                              ...                        
789                          Gud mrng dear hav a nice day
968             Are you willing to go for aptitude class 
1667    So now my dad is gonna call after he gets out ...
3321    Ok darlin i supose it was ok i just worry too ...
1688                     Nan sonathaya soladha  Why boss 
Name: v2, Length: 4179, dtype: object

In [8]:
stopwords = ['a', 'an']

X1_train = []
for stc in X_train:
    token = []
    words = stc.split()
    for word in words:
        if word not in stopwords:
            token.append(word)
    X1_train.append(token)
    
X1_test = []
for stc in X_test:
    token = []
    words = stc.split()
    for word in word:
        if word not in stopwords:
            token.append(word)
    X1_test.append(token)    

#### 정수인코딩

In [9]:
from tensorflow.keras.preprocessing.text import Tokenizer

# 빈도수가 높은것부터 4200개는 넣겠다.
tokenizer = Tokenizer(4200)
tokenizer.fit_on_texts(X1_train)

X1_train = tokenizer.texts_to_sequences(X1_train)
X1_test = tokenizer.texts_to_sequences(X1_test)

#### 전체 단어의 개수

In [10]:
print(len(tokenizer.word_index))

7517


#### 빈도수가 2이상인 단어들이 몇개인지 찾는다

In [11]:
low_count = 0
for word, word_count in tokenizer.word_counts.items():
    if word_count >= 2:
        low_count += 1
print(low_count)

5601


In [12]:
print(X1_train[:2])

[[112, 126, 2, 140, 59, 70, 18, 304, 127], [1, 80, 23, 57, 3577, 384, 83, 180, 29, 1, 103, 147, 152, 194, 3578, 25, 36, 2, 72, 2, 26, 3579, 17, 1, 113, 6, 40, 78, 9, 703, 7, 1917, 3, 4, 3580]]


#### 패딩을 위한 사이즈 탐색
- 최대길이 : 최대길이에 맞추게 되면 데이터 손실은 없지만 코드상 비효율적이게 된다.
- 평균길이 : 데이터 손실은 조금 있겠지만 효율은 올라간다.

In [13]:
max_length = 0
for data in X1_train:
    if max_length < len(data):
        max_length = len(data)
print(max_length)

181


In [14]:
max_len = 181
X1_train = pad_sequences(X1_train, maxlen=max_len)
X1_test = pad_sequences(X1_test, maxlen=max_len)

#### 넘파이배열로 넣어라 해서 추가함

In [15]:
X1_train = np.asarray(X1_train)
X1_test = np.asarray(X1_test)
y_train = np.asarray(y_train)
y_test = np.asarray(y_test)

#### SimpleRNN 

In [None]:
model = Sequential()
model.add(Embedding(4200, 32)) 
model.add(SimpleRNN(32)) 
model.add(Dense(1, activation='sigmoid')) 

#### LSTM

In [16]:
model = Sequential()
model.add(Embedding(4200, 120)) 
model.add(LSTM(120)) 
model.add(Dense(1, activation='sigmoid')) 

early_stop = EarlyStopping(monitor='val_loss', mode='min', verbose=1, patience=5) 
# model_check = ModelCheckpoint('the_best.h5',monitor='val_acc', mode='max', verbose=1, save_best_only=True) 

model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['acc'])
#  model.compile(loss='binary_crossentropy', optimizer='rmsprop', metrics=['acc'])
model.fit(X1_train, y_train, validation_data=(X1_test, y_test), epochs=10, batch_size=64, callbacks=[early_stop])

Train on 4179 samples, validate on 1393 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<tensorflow.python.keras.callbacks.History at 0x1fee489c048>