# LSTM을 이용한 감성분석
---
RNN(순환신경망)의 대표 모형인 LSTM을 이용한 아마존 리뷰 감성 분석

### 데이터 불러오기

In [1]:
import tensorflow.keras
import pandas as pd

In [2]:
df = pd.read_csv('amazon_cells_labelled.txt', sep='\t', header=None)
df.head()

Unnamed: 0,0,1
0,So there is no way for me to plug it in here i...,0
1,"Good case, Excellent value.",1
2,Great for the jawbone.,1
3,Tied to charger for conversations lasting more...,0
4,The mic is great.,1


### 토큰화

In [3]:
# TDM이 아닌 토큰의 리스트로 토큰화함
# 한국어를 처리할 때는 사전에 형태소분석을 진행해야 함.
tok = tensorflow.keras.preprocessing.text.Tokenizer()
# 텍스트가 있는 표의 0번째 열 df[0]을 바탕으로 텍스트의 단어 종류를 학습
# 단어마다 고유 번호를 붙임
tok.fit_on_texts(df[0])

In [4]:
tok.word_index['plug']

155

In [5]:
tok.index_word[155]

'plug'

In [6]:
seq = tok.texts_to_sequences(df[0])

In [7]:
seq[0]

[33,
 117,
 5,
 53,
 214,
 11,
 47,
 8,
 155,
 4,
 19,
 337,
 19,
 1,
 546,
 416,
 2,
 241,
 190,
 6,
 812]

In [8]:
' '.join(tok.index_word[i] for i in seq[0])

'so there is no way for me to plug it in here in the us unless i go by a converter'

### 패딩
---
패딩을 통해 텍스트 길이를 맞춰줌

In [9]:
MAXLEN = max(len(s) for s in seq)

In [10]:
pad = tensorflow.keras.preprocessing.sequence.pad_sequences(seq, MAXLEN)
pad[0]

array([  0,   0,   0,   0,   0,   0,   0,   0,   0,  33, 117,   5,  53,
       214,  11,  47,   8, 155,   4,  19, 337,  19,   1, 546, 416,   2,
       241, 190,   6, 812])

### 데이터 분할

In [11]:
from sklearn.model_selection import train_test_split

In [12]:
X_train, X_test, y_train, y_test = train_test_split(pad, df[1], test_size=0.2, random_state=1234)

### 모형 만들기

In [13]:
from keras.models import Sequential
from keras.layers import Dense, Embedding, LSTM

In [14]:
NUM_WORDS = len(tok.index_word) + 1    #패딩 과정에서 0이 더해지기 때문에 +1 해줘야 함.

In [15]:
rnn = Sequential()

# 단어 별 고유번호를 벡터(좌표)로 변환 아래 코드에서는 8차원 벡터
# 벡터를 통해 유사한 의미의 단어 분석에 활용
# mask_zero = True를 통해 0으로 패딩된 값을 학습에서 제외
rnn.add(Embedding(
                  input_dim = NUM_WORDS, 
                  output_dim=8, 
                  input_length=MAXLEN, 
                  mask_zero=True))
# 노드 수 16, 가장 마지막 토큰에만 다음 레이어로 출력을 내보냄
rnn.add(LSTM(16, return_sequences=False))

rnn.add(Dense(1, activation='sigmoid'))

rnn.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 30, 8)             15032     
                                                                 
 lstm (LSTM)                 (None, 16)                1600      
                                                                 
 dense (Dense)               (None, 1)                 17        
                                                                 
Total params: 16,649
Trainable params: 16,649
Non-trainable params: 0
_________________________________________________________________


### 학습하기

In [16]:
from keras.optimizers import Adam

In [17]:
rnn.compile(optimizer=Adam(), loss='binary_crossentropy', metrics=['acc'])
rnn.fit(X_train, y_train, epochs=10)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x250f5a7bfd0>

In [18]:
from sklearn.metrics import accuracy_score

In [19]:
y_pred = (rnn.predict(X_test) > 0.5).astype("int32")
accuracy_score(y_test, y_pred)



0.81

## 역방향 RNN

In [20]:
rnn2 = Sequential()
rnn2.add(Embedding(input_dim = NUM_WORDS, output_dim=8, input_length=MAXLEN, 
                  mask_zero=True))
# 순환신경망 레이어에 go_backwords = True를 추가하면 역방향으로 단어 처리
rnn2.add(LSTM(16, return_sequences=False, go_backwards = True))
rnn2.add(Dense(1, activation='sigmoid'))
rnn2.summary()

Model: "sequential_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_1 (Embedding)     (None, 30, 8)             15032     
                                                                 
 lstm_1 (LSTM)               (None, 16)                1600      
                                                                 
 dense_1 (Dense)             (None, 1)                 17        
                                                                 
Total params: 16,649
Trainable params: 16,649
Non-trainable params: 0
_________________________________________________________________


In [21]:
rnn2.compile(optimizer=Adam(), loss='binary_crossentropy', metrics=['acc'])
rnn2.fit(X_train, y_train, epochs=10)
y_pred2 = (rnn.predict(X_test) > 0.5).astype("int32")
accuracy_score(y_test, y_pred2)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


0.81

## 양방향 RNN

In [22]:
from keras.layers import Bidirectional

In [23]:
rnn3 = Sequential()
rnn3.add(Embedding(input_dim = NUM_WORDS, output_dim=8, input_length=MAXLEN, 
                  mask_zero=True))
rnn3.add(Bidirectional(LSTM(16, return_sequences=False)))
rnn3.add(Dense(1, activation='sigmoid'))
rnn3.summary()

Model: "sequential_2"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_2 (Embedding)     (None, 30, 8)             15032     
                                                                 
 bidirectional (Bidirectiona  (None, 32)               3200      
 l)                                                              
                                                                 
 dense_2 (Dense)             (None, 1)                 33        
                                                                 
Total params: 18,265
Trainable params: 18,265
Non-trainable params: 0
_________________________________________________________________


In [24]:
rnn3.compile(optimizer=Adam(), loss='binary_crossentropy', metrics=['acc'])
rnn3.fit(X_train, y_train, epochs=10)
y_pred3 = (rnn.predict(X_test) > 0.5).astype("int32")
accuracy_score(y_test, y_pred3)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


0.81