# 1D CNN으로 IMDB 리뷰 분류하기
- https://wikidocs.net/80783

## Import

In [1]:
import numpy as np
import collections

from tensorflow.keras import datasets
from tensorflow.keras.preprocessing.sequence import pad_sequences

from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, Dropout, Conv1D, GlobalMaxPooling1D, Dense
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint

## Load dataset

In [2]:
vocab_size = 10000
(x_train, y_train), (x_test, y_test) = datasets.imdb.load_data(num_words=vocab_size)
x_train.shape, y_train.shape, x_test.shape, y_test.shape

((25000,), (25000,), (25000,), (25000,))

## Text preprocessing

In [15]:
collections.Counter(y_train), collections.Counter(y_test)

(Counter({1: 12500, 0: 12500}), Counter({0: 12500, 1: 12500}))

### padding

In [6]:
length_of_text = [len(x) for x in x_train]
print("Length of texts")
print(f"- max: {max(length_of_text)}")
print(f"- mean: {np.mean(length_of_text)}")
print(f"- median: {np.median(length_of_text)}")
print(f"- min: {min(length_of_text)}")

Length of texts
- max: 2494
- mean: 238.71364
- median: 178.0
- min: 11


In [7]:
max_len = 200

x_train = pad_sequences(x_train, maxlen=max_len)
x_test = pad_sequences(x_test, maxlen=max_len)
x_train.shape, x_test.shape

((25000, 200), (25000, 200))

## Define model

In [18]:
model = Sequential()
model.add(Embedding(vocab_size, 256))
model.add(Dropout(0.3))
model.add(Conv1D(256, 3, padding="valid", activation="relu"))
model.add(GlobalMaxPooling1D())
model.add(Dense(128, activation="relu"))
model.add(Dropout(0.5))
model.add(Dense(1, activation="sigmoid"))
model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, None, 256)         2560000   
_________________________________________________________________
dropout (Dropout)            (None, None, 256)         0         
_________________________________________________________________
conv1d (Conv1D)              (None, None, 256)         196864    
_________________________________________________________________
global_max_pooling1d (Global (None, 256)               0         
_________________________________________________________________
dense (Dense)                (None, 128)               32896     
_________________________________________________________________
dropout_1 (Dropout)          (None, 128)               0         
_________________________________________________________________
dense_1 (Dense)              (None, 1)                 1

In [19]:
es = EarlyStopping(monitor = 'val_loss', mode = 'min', verbose = 1, patience = 3)
mc = ModelCheckpoint('best_model.h5', monitor = 'val_acc', mode = 'max', verbose = 1, save_best_only = True)

In [20]:
model.compile(
    optimizer="adam",
    loss="binary_crossentropy",
    metrics=["acc"]
)

## Train

In [21]:
history = model.fit(
    x_train,
    y_train,
    epochs=20,
    validation_data=(x_test, y_test),
    callbacks=[es, mc]
)

Train on 25000 samples, validate on 25000 samples
Epoch 1/20
Epoch 00001: val_acc improved from -inf to 0.88324, saving model to best_model.h5
Epoch 2/20
Epoch 00002: val_acc improved from 0.88324 to 0.88460, saving model to best_model.h5
Epoch 3/20
Epoch 00003: val_acc did not improve from 0.88460
Epoch 4/20
Epoch 00004: val_acc improved from 0.88460 to 0.88632, saving model to best_model.h5
Epoch 00004: early stopping


In [22]:
print(f"Accuracy: {model.evaluate(x_test, y_test, verbose=2)[1]}")

25000/1 - 12s - loss: 0.4975 - acc: 0.8863
Accuracy: 0.8863199949264526
