In [12]:
import pandas as pd
import tensorflow as tf
from tensorflow import keras
from keras import preprocessing
from keras.layers import Input,Embedding,Dense, Dropout, convolutional,Conv1D, GlobalMaxPooling1D, concatenate

In [None]:
# CNN -> 합성곱 신경망 챗봇 데이터 훈련
# 라벨 0:일상, 1:이별, 2:사랑

In [3]:
data = pd.read_csv("./data/chatbot_data.csv", delimiter=",")
feature = data['Q'].to_list()
label = data['label'].to_list()

In [7]:
# 단어 토크나이징
# 단어 시퀸스 : 단어를 토큰화해서 순서대로 리스트에 담는 것

corpus = [preprocessing.text.text_to_word_sequence(text) for text in feature]
tokenizer = preprocessing.text.Tokenizer()
tokenizer.fit_on_texts(corpus)

suqunce = tokenizer.texts_to_sequences(corpus)
word_index = tokenizer.word_index

# 패딩 이용해서 입력데이터 길이 맞추기
pad_seq = keras.utils.pad_sequences(suqunce, maxlen=15, padding='post')

In [9]:
# 훈련세트, 테스트세트, 검증세트

ds = tf.data.Dataset.from_tensor_slices((pad_seq, label))
ds = ds.shuffle(len(feature))

train_size = int(len(pad_seq)*0.7)
val_size = int(len(pad_seq)*0.2)
test_size = int(len(pad_seq)*0.1)

# trai_ds -> input, target, batch
train_ds = ds.take(train_size).batch(20)
val_ds = ds.skip(train_size).take(val_size).batch(20)
test_ds = ds.skip(train_size + val_size).take(test_size).batch(20)

In [10]:
# 모델

# 하이퍼 파라미터
EMB_SIZE = 128 # 임베딩 벡터의 길이
EPOCH = 5
VOCA_SIZE = len(word_index) + 1 # 전체 단어의 갯수

In [None]:
# CNN 함수형으로

In [11]:
# input layer
input_layer = Input(shape=(15,))

# embedding layer
embedding_layer = Embedding(VOCA_SIZE, EMB_SIZE, input_length=15)(input_layer)

# dropout layer
dropout_emb = Dropout(rate=0.5)(embedding_layer) 

In [13]:
# 합성곱(3-gram, 4-gram, 5-gram)
conv1 = Conv1D(filters=128, kernel_size=3, padding='valid', activation='relu')(dropout_emb)
pool1 = GlobalMaxPooling1D()(conv1)

conv2 = Conv1D(filters=128, kernel_size=4, padding='valid', activation='relu')(dropout_emb)
pool2 = GlobalMaxPooling1D()(conv2)

conv3 = Conv1D(filters=128, kernel_size=5, padding='valid', activation='relu')(dropout_emb)
pool3 = GlobalMaxPooling1D()(conv3)

concat = concatenate([pool1, pool2, pool3])

In [14]:
# 밀집층
hidden = Dense(128, activation='relu')(concat)

# 드롭아웃
dropout_hidden= Dropout(rate= 0.5)(hidden)

# 밀집층
pred = Dense(3, activation='softmax')(dropout_hidden)

In [15]:
model = keras.models.Model(input_layer, pred)
model.summary()

Model: "model"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 input_1 (InputLayer)           [(None, 15)]         0           []                               
                                                                                                  
 embedding (Embedding)          (None, 15, 128)      1715072     ['input_1[0][0]']                
                                                                                                  
 dropout (Dropout)              (None, 15, 128)      0           ['embedding[0][0]']              
                                                                                                  
 conv1d (Conv1D)                (None, 13, 128)      49280       ['dropout[0][0]']                
                                                                                              

In [27]:
model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics='accuracy')

In [28]:
model.fit(train_ds, epochs=EPOCH, validation_data=val_ds, verbose=1)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<keras.callbacks.History at 0x2266ec5b670>

In [29]:
# 모델 평가
model.evaluate(test_ds, verbose=1)



[0.06201406195759773, 0.9830795526504517]

In [30]:
model.save('cnn.h5')

In [31]:
# 감정 분류
# 테스트용 데이터셋
ds = tf.data.Dataset.from_tensor_slices((pad_seq, label))
ds = ds.shuffle(len(feature))
test_ds = ds.take(2000).batch(20)

In [32]:
model = keras.models.load_model('cnn.h5')
model.evaluate(test_ds)



[0.06409832835197449, 0.9815000295639038]

In [33]:
# 실제 데이터셋
# 답러닝 모델을 거친 예측값 비교

# 단어 시퀀스
corpus[11238]

['좋아하는', '여자의', '단점을', '들었는데', '어떻게', '해야할지', '모르겠어', '답답해']

In [22]:
# 단어 인덱스 시퀀스
pad_seq[11238]

array([    2,  3970,  4570, 12909,    12,   794,    71,   631,     0,
           0,     0,     0,     0,     0,     0])

In [23]:
label[11238]

2

In [35]:
pred = model.predict(pad_seq)
pred[11238]

# softmax
# 2번 클래스일 확률 0.9990



array([3.6500855e-06, 4.3452792e-06, 9.9999201e-01], dtype=float32)