![word embeddings vs. one hot encoding](https://s3.amazonaws.com/book.keras.io/img/ch6/word_embeddings.png)

In [0]:
from keras.layers import Embedding
# embedding층은 최소 2개의 매개 변수를 받음
# 마찬가지로 여기에서도 index 0은 사용하지 않음
embedding_layer = Embedding(1000, 64) # samples, sequence length

In [21]:
# imdb 준비해서 리뷰 빈도 1만 개까지 가져오고 리뷰에서 20개 이후 단어는 버림.
# 1만 개의 단어에 대해 8차원의 임베딩을 학습하여 정수(2d int)를 임베딩 시퀀스(3d float)로 바꿈
# 다음 이 텐서를 2d로 바꾸고 펼치고 분류를 위한 denses층을 훈련함

from keras.datasets import imdb
from keras import preprocessing

# 특성으로 사용할 단어 수
max_features = 10000

# 사용할 텍스트 길이
maxlen = 20

(x_train, y_train), (x_test, y_test) = imdb.load_data(num_words = max_features)

print(x_train.shape, y_train.shape, x_test.shape, y_test.shape)

(25000,) (25000,) (25000,) (25000,)


In [22]:
x_train[0][:10]

[1, 14, 22, 16, 43, 530, 973, 1622, 1385, 65]

In [0]:
# 리스트를 (sample, maxlen) 크기의 2d 정수 텐서로 변환.
x_train = preprocessing.sequence.pad_sequences(x_train, maxlen = maxlen)
x_test = preprocessing.sequence.pad_sequences(x_test, maxlen = maxlen)

In [24]:
x_train.shape, x_train[0][:10]

((25000, 20),
 array([  65,   16,   38, 1334,   88,   12,   16,  283,    5,   16],
       dtype=int32))

In [0]:
from keras.models import Sequential
from keras.layers import Flatten, Dense, Embedding
model = Sequential()

# 나중에 임베딩된 입력을 Flatten할 거라 임베딩층에 imput_length를 지정
model.add(Embedding(10000, 8, input_length = maxlen))
# 출력은 (samples, maxlen, 8)이 나옴

In [26]:
model.output

<tf.Tensor 'embedding_5/embedding_lookup/Identity:0' shape=(?, 20, 8) dtype=float32>

In [27]:
model.add(Flatten())
model.output

<tf.Tensor 'flatten_2/Reshape:0' shape=(?, ?) dtype=float32>

In [28]:
model.add(Dense(1, activation = 'sigmoid'))
model.output

<tf.Tensor 'dense_2/Sigmoid:0' shape=(?, 1) dtype=float32>

In [29]:
model.compile(optimizer = 'rmsprop', loss = 'binary_crossentropy', metrics = ['acc'])
model.summary()

Model: "sequential_3"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_5 (Embedding)      (None, 20, 8)             80000     
_________________________________________________________________
flatten_2 (Flatten)          (None, 160)               0         
_________________________________________________________________
dense_2 (Dense)              (None, 1)                 161       
Total params: 80,161
Trainable params: 80,161
Non-trainable params: 0
_________________________________________________________________


In [30]:
history = model.fit(x_train, y_train,
                    epochs = 10,
                    batch_size = 32,
                    validation_split = 0.2)

Train on 20000 samples, validate on 5000 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


## 사전 훈련된 단어 임베딩 사용하기

In [0]:
# 사전 훈련된 conv net 사용하는 것과 같음
# 충분한 데이터가 없어서 학습시킬 여건이 되지 않지만 일반적인 특성이 필요할 때
# 리뷰 하나를 문자열로 만들어보자

In [3]:
import zipfile
 
try:
    with zipfile.ZipFile('/content/drive/My Drive/dataset/aclImdb.zip') as zf:
        zf.extractall('/content/dataset/')
        print("uncompress success")
 
except:
    print("uncompress fail")



uncompress success


In [4]:
import os
imdb_dir = '/content/dataset/aclImdb'
train_dir = os.path.join(imdb_dir, 'train')

labels = []
texts = []

for label_type in ['neg', 'pos']:
  dir_name = os.path.join(train_dir, label_type)
  print(dir_name)
  for fname in os.listdir(dir_name):
    if fname[-4:] == '.txt': # txt 파일이라면
      f = open(os.path.join(dir_name, fname), encoding = 'utf8')
      texts.append(f.read()) # 열어서 텍스트만 저장
      f.close()

      # 부정이면 0, 긍정이면 1 label에 저장
      if label_type == 'neg':
        labels.append(0)
      else:
        labels.append(1)

/content/dataset/aclImdb/train/neg
/content/dataset/aclImdb/train/pos


In [5]:
a = '232231231.txt'
a[-4:]

'.txt'

In [6]:
len(labels), len(texts)

(25000, 25000)

In [7]:
# data tokenize
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
import numpy as np

maxlen = 100 # 100개 이후 단어는 버림
training_samples = 200 # 샘플은 200개
validation_samples = 10000 # 검증 샘플 10000개
max_words = 1000 # 데이터셋에서 가장 빈도가 높은 10000개의 단어만 사용함

tokenizer = Tokenizer(num_words = max_words)
tokenizer.fit_on_texts(texts)
sequences = tokenizer.texts_to_sequences(texts); sequences

Using TensorFlow backend.


[[11,
  6,
  28,
  4,
  1,
  246,
  99,
  204,
  123,
  107,
  10,
  216,
  9,
  30,
  1,
  30,
  2,
  13,
  573,
  682,
  1,
  164,
  693,
  114,
  41,
  304,
  2,
  57,
  298,
  14,
  73,
  30,
  1,
  11,
  6,
  3,
  450,
  18,
  81,
  68,
  466,
  30,
  86,
  951,
  1,
  19,
  13,
  195,
  112,
  3,
  49,
  1,
  61,
  68,
  2,
  14,
  33,
  68,
  160,
  134,
  1,
  357,
  4,
  1,
  17,
  559,
  407,
  96,
  612,
  1,
  951,
  4,
  13,
  40,
  1,
  812,
  13,
  318,
  187,
  2,
  13,
  239,
  1,
  115,
  170,
  4,
  1,
  17,
  891,
  332,
  28,
  4,
  145,
  929,
  448,
  89,
  16,
  11,
  28],
 [11,
  194,
  193,
  413,
  506,
  515,
  263,
  20,
  1,
  35,
  10,
  69,
  3,
  70,
  4,
  261,
  1,
  113,
  6,
  391,
  2,
  1,
  62,
  344,
  6,
  18,
  9,
  124,
  25,
  24,
  385,
  10,
  101,
  81,
  34,
  11,
  28,
  79,
  691,
  1,
  4,
  9,
  9,
  13,
  90,
  31,
  3,
  434,
  15,
  32,
  361,
  349,
  18,
  15,
  69,
  26,
  158,
  1,
  275,
  20,
  18,
  20,
  1,
  94,
  53,
  1

In [8]:
word_index = tokenizer.word_index; word_index

{'the': 1,
 'and': 2,
 'a': 3,
 'of': 4,
 'to': 5,
 'is': 6,
 'br': 7,
 'in': 8,
 'it': 9,
 'i': 10,
 'this': 11,
 'that': 12,
 'was': 13,
 'as': 14,
 'for': 15,
 'with': 16,
 'movie': 17,
 'but': 18,
 'film': 19,
 'on': 20,
 'not': 21,
 'you': 22,
 'are': 23,
 'his': 24,
 'have': 25,
 'he': 26,
 'be': 27,
 'one': 28,
 'all': 29,
 'at': 30,
 'by': 31,
 'an': 32,
 'they': 33,
 'who': 34,
 'so': 35,
 'from': 36,
 'like': 37,
 'her': 38,
 'or': 39,
 'just': 40,
 'about': 41,
 "it's": 42,
 'out': 43,
 'if': 44,
 'has': 45,
 'some': 46,
 'there': 47,
 'what': 48,
 'good': 49,
 'more': 50,
 'when': 51,
 'very': 52,
 'up': 53,
 'no': 54,
 'time': 55,
 'she': 56,
 'even': 57,
 'my': 58,
 'would': 59,
 'which': 60,
 'only': 61,
 'story': 62,
 'really': 63,
 'see': 64,
 'their': 65,
 'had': 66,
 'can': 67,
 'were': 68,
 'me': 69,
 'well': 70,
 'than': 71,
 'we': 72,
 'much': 73,
 'been': 74,
 'bad': 75,
 'get': 76,
 'will': 77,
 'do': 78,
 'also': 79,
 'into': 80,
 'people': 81,
 'other': 82,
 '

In [9]:
print('%s개의 고유한 토큰을 찾았습니다.' % len(word_index))
data = pad_sequences(sequences, maxlen = maxlen)
data

88582개의 고유한 토큰을 찾았습니다.


array([[  0,  11,   6, ...,  16,  11,  28],
       [117, 713,   8, ..., 224,   9,  22],
       [492,   4,  12, ..., 155,  36,  69],
       ...,
       [ 24, 105, 398, ...,   4,   5, 513],
       [ 23, 197, 275, ..., 137,  64,   9],
       [758,   1,  34, ...,  67,  25, 138]], dtype=int32)

In [10]:
labels = np.asarray(labels)
print('data tensor size: ', data.shape)
print('label tensor size: ', labels.shape)

data tensor size:  (25000, 100)
label tensor size:  (25000,)


In [0]:
# 데이터를 훈련 세트와 검증 세트로 분할
# 샘플이 순서대로 있어서 섞어줘야 함(부정 후 긍정이 나옴)
indices = np.arange(data.shape[0])
np.random.shuffle(indices)
data = data[indices]
labels = labels[indices]

x_train = data[:training_samples]
y_train = labels[:training_samples]
x_val = data[training_samples: training_samples + validation_samples]
y_val = labels[training_samples: training_samples + validation_samples]

In [12]:
x_train.shape, y_train.shape, x_val.shape, y_val.shape

((200, 100), (200,), (10000, 100), (10000,))

In [0]:
# Glove 내려받기
# http://bit.ly/2NJwdb
import zipfile
 
try:
    with zipfile.ZipFile('/content/drive/My Drive/dataset/glove-6b.zip') as zf:
        zf.extractall('/content/dataset/')
        print("uncompress success")
 
except:
    print("uncompress fail")