In [4]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

import tensorflow as tf
from tensorflow import keras as tf_keras

In [7]:
# Load the IMDB reviews from disk as batched datasets (32 samples per batch).
train_dataset = tf_keras.utils.text_dataset_from_directory(
    directory="data_files/aclImdb/train",
    batch_size=32,
)
test_dataset = tf_keras.utils.text_dataset_from_directory(
    directory="data_files/aclImdb/test",
    batch_size=32,
)

# Every train_dataset element is a (review, label) pair; keep only the review text.
review_only_dataset = train_dataset.map(lambda review, _label: review)

Found 25000 files belonging to 2 classes.
Found 25000 files belonging to 2 classes.


In [31]:
# Show the concrete class of the dataset object returned by the loader.
type(train_dataset)

tensorflow.python.data.ops.batch_op._BatchDataset

In [8]:
# Peek at a single batch: X receives the reviews, y receives the labels.
for X, y in train_dataset.take(1):
    print(X.shape, y.shape)
    print(X[0])
    print(y[0])

(32,) (32,)
tf.Tensor(b'Blood Surf AKA Krocodylus is a fair film that has an okay cast which includes Dax Miller, Taryn Reif, Kate Fischer, Duncan Regehr, Joel West, Matt Borlenghi, Maureen Larrazabal, Cris Vertido, Susan Africa, Archie Adamos, Rolando Santo Domingo, and Malecio Amayao. The acting by the actors is fairly good. The thrills are fairly good and some of it is surprising. The movie is filmed fairly good as well. Same thing goes for the music The film is fairly interesting and the movie does keeps you going until the end. This is a fairly thrilling film. If you the the cast in the film, Monsters, Giant Animal films, Horror, Thrillers, Mystery, and interesting films then I recommend you to see this film today!', shape=(), dtype=string)
tf.Tensor(0, shape=(), dtype=int32)


In [9]:
# 문장(보통 document라 함. ) (단어 집합) -> 숫자 집합 : encoding
# max_tokens=단어 사전 크기 혹은 총 단어 개수다. 이걸 넘는 단어는 '모르는 단어' 취급함
# output_mode = 출력 유형
# output_sequence_length = 한 문장에 들어가는 단어 수. 모자르면 패딩, 넘으면 자름
text_vectorizer = tf_keras.layers.TextVectorization(max_tokens=20000, 
                                                    output_mode='int', 
                                                    output_sequence_length=300)

text_vectorizer.adapt(review_only_dataset)
# review_only_dataset 이 데이터를 가지고 변환기(vectorizer)에 단어 사전 만들기





In [10]:
# Smoke-test the (sentence -> list of integers) vectorizer on a single batch.
for X, y in train_dataset.take(1):
    # batch_size=32 yields 32 reviews per batch, and the vectorizer was configured
    # for 300 tokens per review: X of shape (32,) becomes d of shape (32, 300).
    d = text_vectorizer(X)
    print(d.shape)
    print(d)

(32, 300)
tf.Tensor(
[[  57 1198  132 ...    0    0    0]
 [  86    6  856 ...   15    4  812]
 [  84  225   84 ...    0    0    0]
 ...
 [2452 1360   20 ...    0    0    0]
 [   8    2 2346 ...    0    0    0]
 [  11   44  203 ...    0    0    0]], shape=(32, 300), dtype=int64)


In [11]:
# Inspect the vocabulary learned by the vectorizer
dictionary = text_vectorizer.get_vocabulary()
print(len(dictionary))
# Words that are not in the vocabulary are shown as UNK (unknown), also called OOV (out of vocabulary)
dictionary[10:20]

20000


['i', 'this', 'that', 'br', 'was', 'as', 'for', 'with', 'movie', 'but']

In [12]:
# Restore an integer-encoded review to its words.
# The row is converted to a NumPy array once: looping over the tensor itself creates a
# new eager tensor (plus an eager comparison) for every single token.
token_ids = d[1].numpy()
print(token_ids[:13])    # first 13 token ids; the NumPy array prints as plain numbers
# Token id 0 is skipped (it shows up as trailing filler in the vectorized output).
for token_id in token_ids[token_ids != 0]:
    print(dictionary[token_id], end=" ")

[  86    6  856    1   13  527  934   56 1336   86   10  153  888]
first to die [UNK] br ill admit my mistake first i didnt realize this was a made for tv movie i was thrown off by the r [UNK] the plot is strong but the movie is about 40 minutes too long the direction and continuity were excellent for the most part the cast was exceptional and did a good job with their characters the down side of the movie is that it definitely falls into the chick flick genre although there are some violent scenes none of the violence should call for an r rating there is no nudity or gratuitous sex scenes actually there are no sex scenes [UNK] [UNK] who is absolutely beautiful [UNK] [UNK] [UNK] [UNK] and [UNK] davies were all guests on the sg1 series but this movie did nothing to advance their careers since they were all used as low level supporting actresses robert patrick was fantastic as he usually is and mitch [UNK] made me think of a modern day lee marvin the very talented megan [UNK] who i came 

In [13]:
# Embedding model: turns every token id into a vector, so a review becomes a sequence of vectors.
# It cannot do anything useful on its own; it has to be built and then included in a training process.

# Named `inputs`/`outputs` so the Python builtin `input` is not shadowed for the rest of the notebook.
inputs = tf_keras.layers.Input(shape=(None, ))
# input_dim  = vocabulary size, read from the adapted vectorizer so it cannot drift from max_tokens
# output_dim = number of values (vector length) that represent the meaning of one token
outputs = tf_keras.layers.Embedding(input_dim=text_vectorizer.vocabulary_size(), output_dim=100)(inputs)

embedding_model = tf_keras.models.Model(inputs, outputs)

In [None]:
# Push one batch of raw reviews through both stages:
#   text_vectorizer : 1 word   -> 1 number
#   embedding_model : 1 number -> 100 numbers
# The vectorizer's output is what the embedding model receives as its input.
for review in review_only_dataset.take(1):
    vectorized_review = text_vectorizer(review)
    embedded_review = embedding_model(vectorized_review)

In [16]:
vectorized_review.shape, embedded_review.shape
# (TensorShape([32, 300]), TensorShape([32, 300, 100]))
# This structure stores the data in a larger space so that tokens can carry different information depending on context.

(TensorShape([32, 300]), TensorShape([32, 300, 100]))

In [17]:
# Convert every string (review) of the training data to numbers.
def _vectorize_review(review, label):
    """Encode the review text with text_vectorizer and pass the label through unchanged."""
    return text_vectorizer(review), label


vectorized_train_dataset = train_dataset.map(_vectorize_review)

In [18]:
# Verify the conversion by printing the reviews of one vectorized batch.
for X, y in vectorized_train_dataset.take(1):
    print(X)

tf.Tensor(
[[ 131   11    7 ...    0    0    0]
 [ 147    9  457 ...    0    0    0]
 [1210 1492 1770 ...  287  560 8370]
 ...
 [  10  237  485 ...    0    0    0]
 [7925    7   22 ...    0    0    0]
 [  15  513   15 ...    0    0    0]], shape=(32, 300), dtype=int64)


# RNN

In [27]:
# Model architecture: a recurrent neural network for processing text data.

# Named `inputs`/`outputs` so the Python builtin `input` is not shadowed for the rest of the notebook.
inputs = tf_keras.layers.Input(shape=(None,))
# The vocabulary size is read from the adapted vectorizer so it cannot drift from max_tokens.
x = tf_keras.layers.Embedding(input_dim=text_vectorizer.vocabulary_size(), output_dim=100)(inputs)   # None, 300, 100
x = tf_keras.layers.LSTM(units=16)(x)
# Single sigmoid unit: one value between 0 and 1 per review.
outputs = tf_keras.layers.Dense(units=1, activation='sigmoid')(x)

model = tf_keras.models.Model(inputs, outputs)

model.summary()

Model: "model_2"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_4 (InputLayer)        [(None, None)]            0         
                                                                 
 embedding_3 (Embedding)     (None, None, 100)         2000000   
                                                                 
 lstm_2 (LSTM)               (None, 16)                7488      
                                                                 
 dense_2 (Dense)             (None, 1)                 17        
                                                                 
Total params: 2007505 (7.66 MB)
Trainable params: 2007505 (7.66 MB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________


In [28]:
# Training setup: Adam optimizer, binary cross-entropy loss, accuracy as the reported metric.
model.compile(
    optimizer='adam',
    loss='binary_crossentropy',
    metrics=['accuracy'],
)

In [29]:
# Train the model on the vectorized training batches for 10 epochs.
fit_history = model.fit(
    x=vectorized_train_dataset,
    epochs=10,
)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [None]:
# Evaluate on the test split. evaluate() needs data to run on, and the model consumes
# integer token ids rather than raw strings, so the test reviews are vectorized the same
# way as the training reviews before being passed in.
vectorized_test_dataset = test_dataset.map(lambda review, label: (text_vectorizer(review), label))
model.evaluate(vectorized_test_dataset)