In [8]:
import tensorflow as tf
import numpy as np
import pandas as pd
import tensorflow_datasets as tfds

In [2]:
# Load the movie-review CSV into a DataFrame; its 'sentiment' column is
# used as the label when the tf.data.Dataset is built in the next cell.
df = pd.read_csv('Dataset/movie_data.csv', encoding='utf-8')

In [3]:
## Step 1: build a tf.data.Dataset of (review columns, sentiment label) pairs.
# Read the label column without popping it, so this cell can be re-run safely:
# popping removes 'sentiment' from `df`, and a second run then fails with KeyError.
target = df['sentiment']
ds_raw = tf.data.Dataset.from_tensor_slices(
    (df.drop(columns='sentiment').values, target.values))

In [4]:
## Sanity check: print the first 50 bytes of three reviews next to their labels.
for review, label in ds_raw.take(3):
    tf.print(review.numpy()[0][:50], label)

b'In 1974, the teenager Martha Moxley (Maggie Grace)' 1
b'OK... so... I really like Kris Kristofferson and h' 0
b'***SPOILER*** Do not read this, if you think about' 0


In [5]:
# The full dataset holds 50,000 samples:
#   - the first 25,000 are set aside for testing,
#   - the next 20,000 are used for training,
#   - the remaining 5,000 are used for validation.
# reshuffle_each_iteration=False keeps the shuffled order the same on every
# pass over the dataset, which the take/skip splits below rely on.
tf.random.set_seed(1)
ds_raw = ds_raw.shuffle(buffer_size=50000, reshuffle_each_iteration=False)

ds_raw_train_valid = ds_raw.skip(25000)
ds_raw_test = ds_raw.take(25000)
ds_raw_valid = ds_raw_train_valid.skip(20000)
ds_raw_train = ds_raw_train_valid.take(20000)

In [12]:
## Step 2: count the unique tokens that occur in the training reviews.
from collections import Counter

tokenizer = tfds.features.text.Tokenizer()
token_counts = Counter()

# Each element is a (review, label) pair; only the review text is tokenized.
for review, label in ds_raw_train:
    token_counts.update(tokenizer.tokenize(review.numpy()[0]))

print(f'어휘 사전 크기 : {len(token_counts)}')

어휘 사전 크기 : 87007


In [13]:
## Step 3: encode each unique token as an integer.
# The sample sentence is only used to eyeball the encoder's output.
example_str = 'This is an example'
encoder = tfds.features.text.TokenTextEncoder(token_counts)
print(f'{encoder.encode(example_str)}')

[232, 9, 270, 1123]


In [14]:
## Step 3-A: define the per-example transformation.
def encode(text_tensor, label):
    """Turn one raw review into its integer token ids.

    Args:
        text_tensor: eager tensor; element 0 of its ``.numpy()`` value is
            taken as the review text.
        label: passed through unchanged.

    Returns:
        ``(token_ids, label)`` where ``token_ids`` is whatever the
        module-level ``encoder.encode`` returns for the review text.
    """
    review = text_tensor.numpy()[0]
    return encoder.encode(review), label

In [15]:
## Step 3-B: wrap the Python function as a TensorFlow op.
def encode_map_fn(text, label):
    """Run ``encode`` through ``tf.py_function`` so it can be used in Dataset.map.

    Returns a pair of int64 tensors: the encoded review and its label.
    """
    return tf.py_function(
        func=encode,
        inp=[text, label],
        Tout=(tf.int64, tf.int64),
    )

In [17]:
# Apply the integer encoding to each of the three splits.
ds_train, ds_valid, ds_test = (
    split.map(encode_map_fn)
    for split in (ds_raw_train, ds_raw_valid, ds_raw_test)
)

# Check the lengths of a few encoded samples (they vary per review).
tf.random.set_seed(1)
for tokens, label in ds_train.shuffle(1000).take(5):
    print(f'시퀸스 길이 : {tokens.shape}')

시퀸스 길이 : (24,)
시퀸스 길이 : (179,)
시퀸스 길이 : (262,)
시퀸스 길이 : (535,)
시퀸스 길이 : (130,)


In [18]:
## Split all three datasets into mini-batches of 32 using padded_batch:
## the token sequence is padded along its only axis ([-1]); labels are scalars ([]).
train_data = ds_train.padded_batch(32, padded_shapes=([-1],[]))
valid_data = ds_valid.padded_batch(32, padded_shapes=([-1],[]))
# Must be spelled `test_data`: the evaluation cell calls
# `bi_1stm_model.evaluate(test_data)`, and a misspelled name here
# leaves that call failing with NameError.
test_data = ds_test.padded_batch(32, padded_shapes=([-1],[]))

In [21]:
from tensorflow.keras.layers import Embedding

# Minimal Embedding demo: 100 possible token ids, each mapped to a
# 6-dimensional vector, for input sequences of length 20.
model = tf.keras.Sequential([
    Embedding(
        input_dim=100,
        output_dim=6,
        input_length=20,
        name='embed-layer',
    ),
])
model.summary()

Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embed-layer (Embedding)      (None, 20, 6)             600       
Total params: 600
Trainable params: 600
Non-trainable params: 0
_________________________________________________________________


In [22]:
from tensorflow.keras import Sequential
from tensorflow.keras.layers import Dense, Embedding, SimpleRNN

# Stacked-RNN demo: the first SimpleRNN returns the full sequence so that
# the second SimpleRNN receives one input per time step.
model = Sequential([
    Embedding(input_dim=1000, output_dim=32),
    SimpleRNN(32, return_sequences=True),
    SimpleRNN(32),
    Dense(1),
])

model.summary()

Model: "sequential_2"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, None, 32)          32000     
_________________________________________________________________
simple_rnn (SimpleRNN)       (None, None, 32)          2080      
_________________________________________________________________
simple_rnn_1 (SimpleRNN)     (None, 32)                2080      
_________________________________________________________________
dense (Dense)                (None, 1)                 33        
Total params: 36,193
Trainable params: 36,193
Non-trainable params: 0
_________________________________________________________________


In [25]:
embedding_dim = 20
# presumably the +2 reserves ids for padding and out-of-vocabulary tokens
# — TODO confirm against the TokenTextEncoder id layout
vocab_size = len(token_counts) + 2
tf.random.set_seed(1)

# Build the sentiment model layer by layer:
# embedding -> bidirectional LSTM -> dense ReLU -> sigmoid output.
bi_1stm_model = tf.keras.Sequential()
bi_1stm_model.add(
    tf.keras.layers.Embedding(
        input_dim=vocab_size,
        output_dim=embedding_dim,
        name='embed-layer',
    )
)
bi_1stm_model.add(
    tf.keras.layers.Bidirectional(
        tf.keras.layers.LSTM(64, name='1stm-layer'),
        name='bidir-1stm',
    )
)
bi_1stm_model.add(tf.keras.layers.Dense(64, activation='relu'))
bi_1stm_model.add(tf.keras.layers.Dense(1, activation='sigmoid'))

In [26]:
# Print the layer-by-layer summary of the bidirectional LSTM model.
bi_1stm_model.summary()

Model: "sequential_4"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embed-layer (Embedding)      (None, None, 20)          1740180   
_________________________________________________________________
bidir-1stm (Bidirectional)   (None, 128)               43520     
_________________________________________________________________
dense_1 (Dense)              (None, 64)                8256      
_________________________________________________________________
dense_2 (Dense)              (None, 1)                 65        
Total params: 1,792,021
Trainable params: 1,792,021
Non-trainable params: 0
_________________________________________________________________


In [30]:
## Compile and train
# from_logits=False because the model's last Dense layer already applies a
# sigmoid, so the loss receives probabilities rather than raw logits.
bi_1stm_model.compile(
    optimizer=tf.keras.optimizers.Adam(1e-3),
    loss=tf.keras.losses.BinaryCrossentropy(from_logits=False),
    metrics=['accuracy']
)

# Train for 10 epochs on the padded training batches, validating on valid_data.
history = bi_1stm_model.fit(
    train_data,
    validation_data = valid_data,
    epochs = 10)

# Evaluate on the held-out test batches.
test_results = bi_1stm_model.evaluate(test_data)

# Index 1 is presumably the accuracy metric (index 0 the loss), given
# metrics=['accuracy'] above — verify against Model.evaluate's return order.
print(f'테스트 정확도 : {test_results[1]*100:.2f}')

Epoch 1/10
     25/Unknown - 57s 2s/step - loss: 0.5395 - accuracy: 0.7891

KeyboardInterrupt: 