<a href="https://colab.research.google.com/github/0jipy/192kbps_colab_JIPY/blob/main/52_IMDB%EA%B0%90%EC%84%B1%EB%B6%84%EC%84%9D_Conv1D.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

IMDB 영화 리뷰 감성 분석 - Conv1D

In [None]:
# NumPy, matplotlib's pyplot interface, and the Keras IMDB dataset module.
import numpy as np
import matplotlib.pyplot as plt
from tensorflow.keras.datasets import imdb
# Suppress every Python warning for the rest of the session (applies to all
# libraries, so deprecation notices will not be shown either).
import warnings
warnings.filterwarnings('ignore')

### Conv1D로 IMDB 리뷰 감성 분석
    사용 단어 수: 빈도 상위 10,000개 (전체 88,584개)
    문장의 단어 수: 500개로 맞춤 (최대 2,494개)
    test data 중 40%(10,000개)는 검증용으로 사용

In [None]:
import tensorflow as tf

# One shared seed for the TensorFlow and NumPy global random generators.
# `seed` is also reused later as the random_state of the test/validation split.
seed = 2022
tf.random.set_seed(seed)
np.random.seed(seed)

In [None]:
from tensorflow.keras.models import Sequential, load_model
from tensorflow.keras.layers import Embedding, Dense, Dropout
from tensorflow.keras.layers import Conv1D, MaxPooling1D, GlobalMaxPooling1D
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.callbacks import ModelCheckpoint, EarlyStopping

In [None]:
# Load IMDB with the default (uncapped) vocabulary, just to inspect its size.
(X_train, y_train), (X_test, y_test) = imdb.load_data()
# Cell output: a tuple with the shape of each listed array.
tuple(arr.shape for arr in (X_train, X_test, y_train))

Downloading data from https://storage.googleapis.com/tensorflow/tf-keras-datasets/imdb.npz


((25000,), (25000,), (25000,))

In [None]:
# Reload the dataset with the vocabulary capped at 10,000 words; this matches
# the input dimension of the Embedding layers defined further down.
(X_train, y_train), (X_test, y_test) = imdb.load_data(num_words=10000)
# Cell output: a tuple with the shape of each listed array.
tuple(arr.shape for arr in (X_train, X_test, y_train))

((25000,), (25000,), (25000,))

In [None]:
# Bring every review to exactly `max_len` tokens (pad_sequences defaults) so
# both splits become dense 2-D arrays.
# NOTE: this cell overwrites X_train / X_test in place of the raw sequences.
max_len = 500
X_train, X_test = (
    pad_sequences(reviews, maxlen=max_len) for reviews in (X_train, X_test)
)
X_train.shape, X_test.shape

((25000, 500), (25000, 500))

In [None]:
from sklearn.model_selection import train_test_split

# Carve 40% of the original test set off as a validation set, keeping the
# label ratio (stratify) and fixing the shuffle with the shared seed.
# NOTE: X_test / y_test are overwritten, so re-running this cell without
# reloading the data splits the already-reduced test set again.
X_test, X_valid, y_test, y_valid = train_test_split(
    X_test,
    y_test,
    test_size=0.4,
    stratify=y_test,
    random_state=seed,
)
tuple(part.shape for part in (X_test, X_valid, y_test, y_valid))

((15000, 500), (15000,), (10000, 500), (10000,))

- Case 1) Conv1D x 2, MaxPooling1D x 2, Dropout, GlobalMaxPooling1D
    - embedding dim: 100

In [None]:
# Case 1: embedding -> dropout -> two Conv1D/MaxPooling1D stages ->
# global max pooling -> single sigmoid unit for the binary sentiment label.
model1 = Sequential()
model1.add(Embedding(input_dim=10000, output_dim=100, input_length=max_len))
model1.add(Dropout(rate=0.5))
model1.add(Conv1D(filters=64, kernel_size=7, activation='relu'))
model1.add(MaxPooling1D(pool_size=7))
model1.add(Conv1D(filters=64, kernel_size=5, activation='relu'))
model1.add(MaxPooling1D(pool_size=5))
model1.add(GlobalMaxPooling1D())
model1.add(Dense(units=1, activation='sigmoid'))
model1.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 500, 100)          1000000   
                                                                 
 dropout (Dropout)           (None, 500, 100)          0         
                                                                 
 conv1d (Conv1D)             (None, 494, 64)           44864     
                                                                 
 max_pooling1d (MaxPooling1D  (None, 70, 64)           0         
 )                                                               
                                                                 
 conv1d_1 (Conv1D)           (None, 66, 64)            20544     
                                                                 
 max_pooling1d_1 (MaxPooling  (None, 13, 64)           0         
 1D)                                                    

In [None]:
# Pass optimizer / loss / metrics by keyword: the positional order of
# compile() differs between Keras versions (in Keras 3 the third positional
# parameter is `loss_weights`), so a positional ['accuracy'] can be misbound.
model1.compile(
    optimizer='adam',
    loss='binary_crossentropy',
    metrics=['accuracy'],
)

# Checkpoint file for this model; only the best epoch is kept on disk.
model_path = 'best-imdb-conv1d.h5'
mc = ModelCheckpoint(model_path, save_best_only=True, verbose=1)
# Stop training after 5 consecutive epochs without improvement.
es = EarlyStopping(patience=5)

In [None]:
# Validate on the explicit hold-out set only. `validation_split` is not
# passed because Keras ignores it whenever `validation_data` is supplied,
# and `validation_data` is given as a tuple, the documented form.
hist1 = model1.fit(
    X_train,
    y_train,
    validation_data=(X_valid, y_valid),
    epochs=30,
    batch_size=128,
    callbacks=[mc, es],
)

Epoch 1/30
Epoch 00001: val_loss improved from inf to 0.35214, saving model to best-imdb-conv1d.h5
Epoch 2/30
Epoch 00002: val_loss improved from 0.35214 to 0.32490, saving model to best-imdb-conv1d.h5
Epoch 3/30
Epoch 00003: val_loss did not improve from 0.32490
Epoch 4/30
Epoch 00004: val_loss did not improve from 0.32490
Epoch 5/30
Epoch 00005: val_loss did not improve from 0.32490
Epoch 6/30
Epoch 00006: val_loss did not improve from 0.32490
Epoch 7/30
Epoch 00007: val_loss did not improve from 0.32490


In [None]:
# Reload the checkpoint currently referenced by `model_path` and score it on
# the held-out test split; the cell displays [loss, accuracy].
best_model = load_model(filepath=model_path)
best_model.evaluate(x=X_test, y=y_test)



[0.33736398816108704, 0.8513333201408386]

- Case 2) Conv1D + LSTM

In [None]:
from tensorflow.keras.layers import LSTM

In [None]:
# Case 2: a single Conv1D/MaxPooling1D stage shortens the sequence before it
# is fed to an LSTM, followed by a sigmoid output unit.
model2 = Sequential()
model2.add(Embedding(input_dim=10000, output_dim=100, input_length=max_len))
model2.add(Dropout(rate=0.5))
model2.add(Conv1D(filters=64, kernel_size=5, activation='relu'))
model2.add(MaxPooling1D(pool_size=4))
model2.add(LSTM(units=100))
model2.add(Dense(units=1, activation='sigmoid'))
model2.summary()

Model: "sequential_3"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_3 (Embedding)     (None, 500, 100)          1000000   
                                                                 
 dropout_3 (Dropout)         (None, 500, 100)          0         
                                                                 
 conv1d_4 (Conv1D)           (None, 496, 64)           32064     
                                                                 
 max_pooling1d_4 (MaxPooling  (None, 124, 64)          0         
 1D)                                                             
                                                                 
 lstm_2 (LSTM)               (None, 100)               66000     
                                                                 
 dense_3 (Dense)             (None, 1)                 101       
                                                      

In [None]:
# Pass optimizer / loss / metrics by keyword: the positional order of
# compile() differs between Keras versions (in Keras 3 the third positional
# parameter is `loss_weights`), so a positional ['accuracy'] can be misbound.
model2.compile(
    optimizer='adam',
    loss='binary_crossentropy',
    metrics=['accuracy'],
)

# Checkpoint file for this model; only the best epoch is kept on disk.
model_path = 'best-imdb-conv1d-lstm.h5'
mc = ModelCheckpoint(model_path, save_best_only=True, verbose=1)
# Stop training after 5 consecutive epochs without improvement.
es = EarlyStopping(patience=5)

In [None]:
# Validate on the explicit hold-out set only. `validation_split` is not
# passed because Keras ignores it whenever `validation_data` is supplied,
# and `validation_data` is given as a tuple, the documented form.
hist2 = model2.fit(
    X_train,
    y_train,
    validation_data=(X_valid, y_valid),
    epochs=30,
    batch_size=128,
    callbacks=[mc, es],
)

Epoch 1/30
Epoch 00001: val_loss improved from inf to 0.30810, saving model to best-imdb-conv1d-lstm.h5
Epoch 2/30
Epoch 00002: val_loss improved from 0.30810 to 0.26381, saving model to best-imdb-conv1d-lstm.h5
Epoch 3/30
Epoch 00003: val_loss did not improve from 0.26381
Epoch 4/30
Epoch 00004: val_loss did not improve from 0.26381
Epoch 5/30
Epoch 00005: val_loss did not improve from 0.26381
Epoch 6/30
Epoch 00006: val_loss did not improve from 0.26381
Epoch 7/30
Epoch 00007: val_loss did not improve from 0.26381


In [None]:
# Reload the checkpoint currently referenced by `model_path` and score it on
# the held-out test split; the cell displays [loss, accuracy].
best_model = load_model(filepath=model_path)
best_model.evaluate(x=X_test, y=y_test)



[0.278827428817749, 0.8855999708175659]

- Case 3) 은닉층을 하나 더 두면 어떨까?
- Conv1D + Dense

In [None]:
# Case 3: Conv1D/MaxPooling1D stage, global max pooling, then an extra
# 100-unit ReLU hidden layer in front of the sigmoid output unit.
model3 = Sequential()
model3.add(Embedding(input_dim=10000, output_dim=100, input_length=max_len))
model3.add(Dropout(rate=0.5))
model3.add(Conv1D(filters=64, kernel_size=5, activation='relu'))
model3.add(MaxPooling1D(pool_size=4))
model3.add(GlobalMaxPooling1D())
model3.add(Dense(units=100, activation='relu'))
model3.add(Dense(units=1, activation='sigmoid'))
model3.summary()

Model: "sequential_4"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_4 (Embedding)     (None, 500, 100)          1000000   
                                                                 
 dropout_4 (Dropout)         (None, 500, 100)          0         
                                                                 
 conv1d_5 (Conv1D)           (None, 496, 64)           32064     
                                                                 
 max_pooling1d_5 (MaxPooling  (None, 124, 64)          0         
 1D)                                                             
                                                                 
 global_max_pooling1d_1 (Glo  (None, 64)               0         
 balMaxPooling1D)                                                
                                                                 
 dense_4 (Dense)             (None, 100)              

In [None]:
# Pass optimizer / loss / metrics by keyword: the positional order of
# compile() differs between Keras versions (in Keras 3 the third positional
# parameter is `loss_weights`), so a positional ['accuracy'] can be misbound.
model3.compile(
    optimizer='adam',
    loss='binary_crossentropy',
    metrics=['accuracy'],
)

# Checkpoint file for this model; only the best epoch is kept on disk.
model_path = 'best-imdb-conv1d-fcn.h5'
mc = ModelCheckpoint(model_path, save_best_only=True, verbose=1)
# Stop training after 5 consecutive epochs without improvement.
es = EarlyStopping(patience=5)

In [None]:
# Validate on the explicit hold-out set only. `validation_split` is not
# passed because Keras ignores it whenever `validation_data` is supplied,
# and `validation_data` is given as a tuple, the documented form.
hist3 = model3.fit(
    X_train,
    y_train,
    validation_data=(X_valid, y_valid),
    epochs=30,
    batch_size=128,
    callbacks=[mc, es],
)

Epoch 1/30
Epoch 00001: val_loss improved from inf to 0.41544, saving model to best-imdb-conv1d-fcn.h5
Epoch 2/30
Epoch 00002: val_loss improved from 0.41544 to 0.31502, saving model to best-imdb-conv1d-fcn.h5
Epoch 3/30
Epoch 00003: val_loss improved from 0.31502 to 0.27834, saving model to best-imdb-conv1d-fcn.h5
Epoch 4/30
Epoch 00004: val_loss improved from 0.27834 to 0.26838, saving model to best-imdb-conv1d-fcn.h5
Epoch 5/30
Epoch 00005: val_loss did not improve from 0.26838
Epoch 6/30
Epoch 00006: val_loss did not improve from 0.26838
Epoch 7/30
Epoch 00007: val_loss did not improve from 0.26838
Epoch 8/30
Epoch 00008: val_loss did not improve from 0.26838
Epoch 9/30
Epoch 00009: val_loss did not improve from 0.26838


In [None]:
# Reload the checkpoint currently referenced by `model_path` and score it on
# the held-out test split; the cell displays [loss, accuracy].
best_model = load_model(filepath=model_path)
best_model.evaluate(x=X_test, y=y_test)



[0.2854938209056854, 0.8852666616439819]