# layer

## GlobalMaxPooling1D

In [1]:
import tensorflow as tf
from tensorflow.keras.layers import MaxPooling1D,GlobalMaxPooling1D
# 3x3 float matrix used as the input for the pooling demos below.
matrix_rows = [[1., 2., 3.],
               [4., 5., 6.],
               [7., 8., 9.]]
x = tf.constant(matrix_rows)
# Show the shape first, then the tensor itself, one per line.
print(x.shape, x, sep='\n')

(3, 3)
tf.Tensor(
[[1. 2. 3.]
 [4. 5. 6.]
 [7. 8. 9.]], shape=(3, 3), dtype=float32)


In [2]:
# Give the matrix a trailing length-1 axis: (3, 3) -> (3, 3, 1).
x = tf.reshape(x, shape=(3, 3, 1))
print(x.shape, x, sep='\n')

(3, 3, 1)
tf.Tensor(
[[[1.]
  [2.]
  [3.]]

 [[4.]
  [5.]
  [6.]]

 [[7.]
  [8.]
  [9.]]], shape=(3, 3, 1), dtype=float32)


In [3]:
# Global max pooling over the middle axis of the (3, 3, 1) tensor: each group
# of three values is reduced to its maximum, giving a (3, 1) result.
max_pool_1d = GlobalMaxPooling1D()
# Bare last expression so the notebook displays the pooled tensor.
max_pool_1d(x)

<tf.Tensor: shape=(3, 1), dtype=float32, numpy=
array([[3.],
       [6.],
       [9.]], dtype=float32)>

## MaxPooling1D

In [4]:
# Five consecutive values arranged as a single (1, 5, 1) tensor.
values = [1., 2., 3., 4., 5.]
x = tf.reshape(tf.constant(values), (1, 5, 1))
print(x)

tf.Tensor(
[[[1.]
  [2.]
  [3.]
  [4.]
  [5.]]], shape=(1, 5, 1), dtype=float32)


In [5]:
# Window of 2 moved one step at a time with no padding, so the 5 input steps
# yield 4 outputs (the max of each adjacent pair).
pool_options = dict(pool_size=2, strides=1, padding='valid')
max_pool_1d = MaxPooling1D(**pool_options)
max_pool_1d(x)

<tf.Tensor: shape=(1, 4, 1), dtype=float32, numpy=
array([[[2.],
        [3.],
        [4.],
        [5.]]], dtype=float32)>

# Modeling

In [6]:
from google.colab import drive
drive.mount('/content/drive')

import os
import pandas as pd
import json
# Set this to the data folder inside your own Google Drive.
dir_path = '/content/drive/MyDrive/lecture/진행 중/서울산업진흥원(SBA)/02. 딥러닝/05. NLP/data'
# Change the working directory: later cells open 'tokenizer.pickle' by a
# relative name, so they depend on the cwd being the data folder.
os.chdir(dir_path)

# Data that has already been preprocessed and vectorized.
# NOTE(security): read_pickle unpickles the file and can execute arbitrary
# code; only load pickles that you produced yourself or otherwise trust.
X_train = pd.read_pickle(f'{dir_path}/tf_train_inputs.pkl')
X_test = pd.read_pickle(f'{dir_path}/tf_test_inputs.pkl')
# The test inputs were originally bound to the misspelled name `X_text`;
# keep that name as an alias so any code still using it continues to work.
X_text = X_test
y_train = pd.read_pickle(f'{dir_path}/tf_train_labels.pkl')
y_test = pd.read_pickle(f'{dir_path}/tf_test_labels.pkl')

Mounted at /content/drive


In [7]:
# Read the preprocessing config. The context manager closes the file handle
# deterministically instead of leaving it to the garbage collector.
with open(f'{dir_path}/config.json', 'r', encoding='utf-8') as config_file:
    config = json.load(config_file)
vocab = config['vocab']
vocab_size = config['vocab_size']

In [8]:
# `activations` is imported from the public tf.keras namespace; the original
# `keras.api._v2.keras` path is an internal module layout, not a stable API.
from tensorflow.keras import activations
from tensorflow.keras.layers import Embedding, Dense, Conv1D, Dropout, MaxPooling1D, GlobalMaxPooling1D
from tensorflow.keras.models import Sequential, load_model
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint

embedding_dim = 100

# 1D-CNN binary classifier:
#   token ids -> 100-dim embeddings -> dropout -> 32 conv filters of width 5
#   -> max over the sequence axis -> dropout -> single sigmoid unit.
model = Sequential()
model.add(Embedding(vocab_size, embedding_dim))
model.add(Dropout(0.3))
model.add(Conv1D(32, 5, padding = 'valid', activation = 'relu'))
model.add(GlobalMaxPooling1D())
model.add(Dropout(0.3))
model.add(Dense(1, activation = 'sigmoid'))

# Stop training once val_loss has failed to improve for 4 consecutive epochs.
es = EarlyStopping(monitor = 'val_loss', mode = 'min', verbose = 1, patience = 4)
# Keep only the weights with the best val_acc. The checkpoint is written to
# the same absolute location the later load cell reads
# (f'{dir_path}/best_1dcnn.h5') rather than relying on the current directory.
mc = ModelCheckpoint(f'{dir_path}/best_1dcnn.h5', monitor = 'val_acc', mode = 'max', verbose = 1, save_best_only = True)

model.compile(optimizer = 'rmsprop', loss = 'binary_crossentropy', metrics = ['acc'])
# validation_split = 0.2 holds out 20% of the training data for the
# val_loss / val_acc values the two callbacks monitor.
history = model.fit(X_train, y_train,
                    epochs = 30, callbacks = [es, mc],
                    batch_size = 64,validation_split = 0.2)

Epoch 1/30
Epoch 1: val_acc improved from -inf to 0.81136, saving model to best_1dcnn.h5
Epoch 2/30
Epoch 2: val_acc improved from 0.81136 to 0.81723, saving model to best_1dcnn.h5
Epoch 3/30
Epoch 3: val_acc improved from 0.81723 to 0.81986, saving model to best_1dcnn.h5
Epoch 4/30
Epoch 4: val_acc improved from 0.81986 to 0.82006, saving model to best_1dcnn.h5
Epoch 5/30
Epoch 5: val_acc improved from 0.82006 to 0.82116, saving model to best_1dcnn.h5
Epoch 6/30
Epoch 6: val_acc did not improve from 0.82116
Epoch 7/30
Epoch 7: val_acc improved from 0.82116 to 0.82143, saving model to best_1dcnn.h5
Epoch 8/30
Epoch 8: val_acc did not improve from 0.82143
Epoch 9/30
Epoch 9: val_acc did not improve from 0.82143
Epoch 9: early stopping


In [9]:
import pickle
# Restore the fitted tokenizer that was pickled earlier. The path is relative,
# so it is resolved against the current working directory.
# NOTE(security): unpickling can run arbitrary code; only load trusted files.
with open('tokenizer.pickle', 'rb') as tokenizer_file:
    tokenizer = pickle.load(tokenizer_file)

In [10]:
!pip install konlpy

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting konlpy
  Downloading konlpy-0.6.0-py2.py3-none-any.whl (19.4 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m19.4/19.4 MB[0m [31m74.7 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting JPype1>=0.7.0
  Downloading JPype1-1.4.1-cp38-cp38-manylinux_2_12_x86_64.manylinux2010_x86_64.whl (465 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m465.6/465.6 KB[0m [31m48.1 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: JPype1, konlpy
Successfully installed JPype1-1.4.1 konlpy-0.6.0


In [11]:
# The assignment below rebinds the global name `load_model` from the Keras
# loader function to the loaded model, so running this cell a second time
# used to fail ("object is not callable"). Importing the loader under a
# private alias keeps the cell re-runnable.
from tensorflow.keras.models import load_model as _keras_load_model

best_model = _keras_load_model(f'{dir_path}/best_1dcnn.h5')
# predict_text reads the model through the global name `load_model`, so that
# binding is preserved.
load_model = best_model

In [12]:
# Function that takes a text and predicts the review's sentiment from the
# model's output probability.
# Sample review string; the visible cells never read this module-level `text`
# (predict_text receives its own `text` argument).
text = '진짜 너무 재밌다.'
from tqdm import tqdm
from konlpy.tag import Okt
import re
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.text import Tokenizer

def predict_text(text, threshold=0.7, max_len=8):
  """Print and return the model's sentiment score for a single review.

  The text is stripped of every character that is not Hangul or a space,
  split into stemmed morphemes with Okt, filtered against a stopword list,
  converted to an index sequence with the global `tokenizer`, post-padded to
  `max_len`, and scored with the global `load_model` (the loaded Keras model).

  Args:
    text: Raw review string.
    threshold: Scores strictly above this value are reported as positive.
      Defaults to the previously hard-coded 0.7.
    max_len: `maxlen` passed to `pad_sequences`. Defaults to the previously
      hard-coded 8 -- presumably the length used at training time; TODO
      confirm against the preprocessing notebook.

  Returns:
    The first row of `load_model.predict(...)`: a one-element array holding
    the sigmoid output (the positive-class score).
  """
  # Stopwords removed after morpheme analysis (a set for O(1) membership).
  stopwords = {'의', '를', '와', '과', '으로', '것', '잘', '되', '주', '보', '수',
               '좀', '이', '있', '은', '한', '아', '도', '에', '하', '는', '자',
               '하다', '가', '들', '걍', '등'}

  # Keep only Hangul jamo/syllables and spaces.
  text = re.sub('[^ㄱ-ㅎㅏ-ㅣ가-힣 ]','', text)

  okt = Okt()
  tokens = [token for token in okt.morphs(text, stem = True)
            if token not in stopwords]

  # Vectorize: the tokenizer expects a batch, so wrap the single token list.
  sequences = tokenizer.texts_to_sequences([tokens])
  inputs = pad_sequences(sequences, maxlen = max_len, padding = 'post')

  score = load_model.predict(inputs)[0]
  positive_prob = score[0]
  if positive_prob > threshold:
    print(f'{round(positive_prob * 100, 2)}% 긍정 리뷰 입니다.')
  else:
    # The sigmoid output is the positive-class score, so the confidence in a
    # negative label is its complement; the original printed the positive
    # score here (e.g. "60% negative" for a score of 0.6).
    print(f'{round((1 - positive_prob) * 100, 2)}% 부정 리뷰 입니다.')
  return score


In [13]:
# Score one short sample review; the returned score array is the cell output.
sample_review = '와 진짜 꿀잼'
predict_text(sample_review)

97.6% 긍정 리뷰 입니다.


array([0.9760081], dtype=float32)