In [1]:
from tensorflow.keras.utils import to_categorical
import pandas as pd

In [2]:
data = pd.read_csv("./train_final.csv")
X_train_txt = data["Sentence"]
y_train_num = data["Category"]

In [3]:
X_train_txt = pd.Series.tolist(X_train_txt)
y_train_num = pd.Series.tolist(y_train_num)

In [4]:
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
import numpy as np

In [5]:
t = Tokenizer()
t.fit_on_texts(X_train_txt)
vocab_size = len(t.word_index) + 1

print(vocab_size)

15338


In [6]:
X_encoded = t.texts_to_sequences(X_train_txt)
print(X_encoded)

[[45, 1, 13, 46, 3351, 1, 2936, 4, 683, 10, 139, 2, 1966, 684, 608, 994, 9, 26, 1, 189, 8, 7, 437, 5, 10660, 456, 1, 13, 6, 32, 19, 867, 477, 7134, 42, 898, 2144], [10661, 10662, 5643], [8, 6, 141, 344, 443], [12, 30, 2363, 8, 7, 2, 16, 9, 225, 322, 76, 1049], [8, 7, 80, 97, 14, 18, 2634, 175, 10663, 1472, 9, 241, 186, 27, 35, 12, 548], [67, 518, 5644, 237, 2635, 2, 5645, 9, 1, 5646, 74, 609, 37, 1967], [485, 1573, 1811, 6, 2, 722, 61, 9, 7135, 1, 536, 3, 7136, 1, 72, 2364], [1, 16, 6, 49, 2937, 14, 19, 108, 3890, 1968, 5, 84, 51, 58, 284, 951], [45, 2365, 46, 52, 22, 39, 58, 2636, 10, 37, 149, 11, 7137, 8], [10664, 143], [49, 565, 3, 10665, 2637, 5, 21, 494, 7138], [2938, 7, 367, 27, 1, 3352, 3, 2939, 4, 18, 345, 68, 1367, 1162, 16], [11, 1, 10666, 5647, 3891, 287, 40, 1969], [304, 326, 3, 995, 53, 25, 4, 1, 10667, 1368, 2, 801, 63, 181, 103], [899, 12, 2, 212, 4, 143], [96, 2940, 5, 1, 660, 29, 1473, 900, 7139, 1574, 40, 623, 519, 2941, 4, 216, 61, 6, 2, 4576, 361, 2638, 4, 412, 685,

In [7]:
max_len=max(len(l) for l in X_encoded)
print(max_len)

49


In [8]:
print('리뷰의 최대 길이 : {}'.format(max(len(l) for l in X_encoded)))
print('리뷰의 평균 길이 : {}'.format(sum(map(len, X_encoded))/len(X_encoded)))

리뷰의 최대 길이 : 49
리뷰의 평균 길이 : 17.41398128898129


In [9]:
X_train=pad_sequences(X_encoded, maxlen=int(max_len/2+6), padding='post')
num = np.unique(y_train_num, axis=0)
num = num.shape[0]
y_train = np.eye(num)[y_train_num]
print(X_train.shape)
print(y_train.shape)
print(X_train)
print(y_train)

(11544, 30)
(11544, 5)
[[    4   683    10 ...    42   898  2144]
 [10661 10662  5643 ...     0     0     0]
 [    8     6   141 ...     0     0     0]
 ...
 [   30    35   136 ...     0     0     0]
 [  114    50    31 ...     0     0     0]
 [   18   682    16 ...   226   105     0]]
[[0. 0. 0. 1. 0.]
 [0. 0. 1. 0. 0.]
 [0. 0. 0. 0. 1.]
 ...
 [0. 0. 0. 0. 1.]
 [0. 0. 0. 0. 1.]
 [0. 0. 0. 0. 1.]]


In [10]:
int(max_len/2+6)

30

In [11]:
embedding_dict = dict()
f = open('glove.6B.100d.txt', encoding="utf8")

for line in f:
    word_vector = line.split()
    word = word_vector[0]
    word_vector_arr = np.asarray(word_vector[1:], dtype='float32') # 100개의 값을 가지는 array로 변환
    embedding_dict[word] = word_vector_arr
f.close()
print('%s개의 Embedding vector가 있습니다.' % len(embedding_dict))

400000개의 Embedding vector가 있습니다.


In [12]:
embedding_matrix = np.zeros((vocab_size, 100))
# 단어 집합 크기의 행과 100개의 열을 가지는 행렬 생성. 값은 전부 0으로 채워진다.
np.shape(embedding_matrix)

(15338, 100)

In [13]:
print(t.word_index.items())



In [14]:
for word, i in t.word_index.items(): # 훈련 데이터의 단어 집합에서 단어를 1개씩 꺼내온다.
    temp = embedding_dict.get(word) # 단어(key) 해당되는 임베딩 벡터의 100개의 값(value)를 임시 변수에 저장
    if temp is not None:
        embedding_matrix[i] = temp # 임수 변수의 값을 단어와 맵핑되는 인덱스의 행에 삽입

In [15]:
import tensorflow as tf

In [16]:
class BahdanauAttention(tf.keras.Model):
    def __init__(self, units):
        super(BahdanauAttention, self).__init__()
        self.W1 = Dense(units)
        self.W2 = Dense(units)
        self.V = Dense(1)

    def call(self, values, query): # 단, key와 value는 같음
        # query shape == (batch_size, hidden size)
        # hidden_with_time_axis shape == (batch_size, 1, hidden size)
        # score 계산을 위해 뒤에서 할 덧셈을 위해서 차원을 변경해줍니다.
        hidden_with_time_axis = tf.expand_dims(query, 1)

        # score shape == (batch_size, max_length, 1)
        # we get 1 at the last axis because we are applying score to self.V
        # the shape of the tensor before applying self.V is (batch_size, max_length, units)
        score = self.V(tf.nn.tanh(self.W1(values) + self.W2(hidden_with_time_axis)))

        # attention_weights shape == (batch_size, max_length, 1)
        attention_weights = tf.nn.softmax(score, axis=1)

        # context_vector shape after sum == (batch_size, hidden_size)
        context_vector = attention_weights * values
        context_vector = tf.reduce_sum(context_vector, axis=1)

        return context_vector, attention_weights

In [17]:
from tensorflow.keras.layers import Dense, Embedding, Bidirectional, LSTM, Concatenate, Dropout
from tensorflow.keras import Input, Model
from tensorflow.keras import optimizers
import os

In [81]:
sequence_input = Input(shape=(int(max_len/2+6),), dtype='int32')
embedded_sequences = Embedding(vocab_size, 100, weights=[embedding_matrix], input_length=max_len, trainable=False)(sequence_input)
# embedded_sequences = Embedding(vocab_size, 100, input_length=int(max_len/2+6), mask_zero = True)(sequence_input)

In [82]:
lstm = Bidirectional(LSTM(50, dropout=0.5, return_sequences = True))(embedded_sequences)

In [83]:
lstm, forward_h, forward_c, backward_h, backward_c = Bidirectional(LSTM(50, dropout=0.5, return_sequences=True, return_state=True))(lstm)

In [84]:
print(lstm.shape, forward_h.shape, forward_c.shape, backward_h.shape, backward_c.shape)

(None, 30, 100) (None, 50) (None, 50) (None, 50) (None, 50)


In [85]:
state_h = Concatenate()([forward_h, backward_h]) # 은닉 상태
state_c = Concatenate()([forward_c, backward_c]) # 셀 상태

In [86]:
attention = BahdanauAttention(50) # 가중치 크기 정의
context_vector, attention_weights = attention(lstm, state_h)

In [87]:
dense1 = Dense(20, activation="relu")(context_vector)
dropout = Dropout(0.5)(dense1)
output = Dense(5, activation="sigmoid")(dropout)
model = Model(inputs=sequence_input, outputs=output)

In [88]:
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

In [89]:
history = model.fit(X_train, y_train, epochs = 3, batch_size = 100, verbose=1)

Epoch 1/3


UnknownError:    Fail to find the dnn implementation.
	 [[{{node CudnnRNN}}]]
	 [[model_7/bidirectional_13/forward_lstm_13/PartitionedCall]] [Op:__inference_train_function_235677]

Function call stack:
train_function -> train_function -> train_function
