
# [TensorFlow in Practice] Course 3 - 요약


---


#### - 학습데이터 loading

*   tfds에서 load
*   wget으로 download

#### - 전처리 

*  1.Tokenizer
*  2.Text to Sequence
*  3.Padding  
*  4.numpy array로 바꾸기
*  이미 전처리되어 있는 데이터(imdb_subwords8k)를 loading할 때의 처리 방법


#### - 모델링

*   Embedding
*   LSTM
*   GRU
*   Conv1D


#### - Task

*   이진 감성분석
*   다음 단어 예측하기 (문장생성)


#### - 임베딩 Transfer Learning











## imdb_review / 기본전처리 / 이진분류

In [None]:
import tensorflow as tf
import numpy as np
print(tf.__version__)

####### DATA LOAD
import tensorflow_datasets as tfds

# IMDB movie reviews (positive / negative); as_supervised yields (text, label) pairs.
imdb, info = tfds.load("imdb_reviews", with_info=True, as_supervised=True)
train_data = imdb['train']
test_data = imdb['test']

training_sentences, training_labels = [], []
testing_sentences, testing_labels = [], []

# Pull every example out of the datasets: text becomes a Python str,
# the label is taken as the tensor's numpy value.
for text, label in train_data:
    training_sentences.append(text.numpy().decode('utf8'))
    training_labels.append(label.numpy())

for text, label in test_data:
    testing_sentences.append(text.numpy().decode('utf8'))
    testing_labels.append(label.numpy())

# Labels as numpy arrays, which is the form passed to model.fit later on.
training_labels_final = np.array(training_labels)
testing_labels_final = np.array(testing_labels)

####### Configuration
vocab_size = 10000       # passed to Tokenizer(num_words=...) and to the Embedding layer
embedding_dim = 16
max_length = 120         # every review is padded / truncated to this many tokens
trunc_type = 'post'      # when truncating, drop tokens from the end of the review
oov_tok = "<OOV>"        # placeholder token for out-of-vocabulary words

from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

####### Tokenizer
# The vocabulary is fit on the training sentences only.
tokenizer = Tokenizer(num_words=vocab_size, oov_token=oov_tok)
tokenizer.fit_on_texts(training_sentences)

####### Text to Sequence
sequences = tokenizer.texts_to_sequences(training_sentences)
testing_sequences = tokenizer.texts_to_sequences(testing_sentences)

####### Padding Sequence
# Both splits are padded to max_length because the Embedding layer of the
# model is declared with input_length=max_length (the training split used a
# hard-coded maxlen=2 before, which did not match the model input).
# The same truncation side is used for both splits so that train and test
# data go through identical preprocessing.
padded = pad_sequences(sequences, maxlen=max_length, truncating=trunc_type)
testing_padded = pad_sequences(testing_sequences, maxlen=max_length, truncating=trunc_type)

####### MODELING
# Embedding -> Flatten -> small Dense -> sigmoid output for binary classification.
# The embedding weight matrix has shape (vocab_size, embedding_dim).
model = tf.keras.Sequential()
model.add(tf.keras.layers.Embedding(vocab_size, embedding_dim, input_length=max_length))
model.add(tf.keras.layers.Flatten())
model.add(tf.keras.layers.Dense(6, activation='relu'))
model.add(tf.keras.layers.Dense(1, activation='sigmoid'))

####### COMPILE
model.compile(
    loss='binary_crossentropy',
    optimizer='adam',
    metrics=['accuracy'],
)
model.summary()

####### FIT
# The test split doubles as validation data.
model.fit(
    padded,
    training_labels_final,
    epochs=10,
    validation_data=(testing_padded, testing_labels_final),
)

## sarcasm / 기본전처리 / 이진분류 / 여러모델


In [None]:

# Notebook-only: `%tensorflow_version 2.x` is an IPython line magic (used on
# Colab to select TensorFlow 2.x); it is not valid syntax in a plain .py file.
# Any exception raised by the magic is deliberately ignored so the cell keeps
# running with whatever TensorFlow version is already installed.
try:
  %tensorflow_version 2.x
except Exception:
  pass

import json
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

vocab_size = 10000
embedding_dim = 16
max_length = 100
trunc_type='post'
padding_type='post'
oov_tok = "<OOV>"
training_size = 20000

####### DATA DOWNLOAD
!wget --no-check-certificate \
    https://storage.googleapis.com/laurencemoroney-blog.appspot.com/sarcasm.json \
    -O /tmp/sarcasm.json

with open("/tmp/sarcasm.json", 'r') as f:
    datastore = json.load(f)

sentences = []
labels = []

for item in datastore:
    sentences.append(item['headline'])
    labels.append(item['is_sarcastic'])


####### Split train / test
training_sentences = sentences[0:training_size]
testing_sentences = sentences[training_size:]
training_labels = labels[0:training_size]
testing_labels = labels[training_size:]

####### Tokenizer
# Vocabulary is fit on the training sentences only.
tokenizer = Tokenizer(num_words=vocab_size, oov_token=oov_tok)
tokenizer.fit_on_texts(training_sentences)

####### Text to Sequence
training_sequences = tokenizer.texts_to_sequences(training_sentences)
testing_sequences = tokenizer.texts_to_sequences(testing_sentences)

####### Padding + conversion to numpy
# Inputs and labels are wrapped in np.array: this is needed to get the
# cell to work with TensorFlow 2.x.
import numpy as np

training_padded = np.array(
    pad_sequences(
        training_sequences,
        maxlen=max_length,
        padding=padding_type,
        truncating=trunc_type,
    )
)
testing_padded = np.array(
    pad_sequences(
        testing_sequences,
        maxlen=max_length,
        padding=padding_type,
        truncating=trunc_type,
    )
)
training_labels = np.array(training_labels)
testing_labels = np.array(testing_labels)

####### MODELING
# Five alternative architectures for the same binary classification task.
# Each one is wrapped in a builder function so that only the selected model is
# constructed; previously all five were built in sequence, each overwriting
# `model`, and only the last one (GRU) was ever compiled and trained.


def _embedding_layer():
    """Return a fresh Embedding layer configured from the cell's hyperparameters."""
    return tf.keras.layers.Embedding(vocab_size, embedding_dim, input_length=max_length)


def _build_avg_pool_model():
    """MODELING 1: embedding averaged over the sequence, then a Dense head."""
    return tf.keras.Sequential([
        _embedding_layer(),
        tf.keras.layers.GlobalAveragePooling1D(),
        tf.keras.layers.Dense(24, activation='relu'),
        tf.keras.layers.Dense(1, activation='sigmoid'),
    ])


def _build_lstm_model():
    """MODELING 2: single bidirectional LSTM layer."""
    return tf.keras.Sequential([
        _embedding_layer(),
        tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(32)),
        tf.keras.layers.Dense(24, activation='relu'),
        tf.keras.layers.Dense(1, activation='sigmoid'),
    ])


def _build_stacked_lstm_model():
    """MODELING 3: two stacked bidirectional LSTM layers.

    The first LSTM uses return_sequences=True so that the second LSTM
    receives the full sequence of outputs rather than only the last one.
    """
    return tf.keras.Sequential([
        _embedding_layer(),
        tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(64, return_sequences=True)),
        tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(32)),
        tf.keras.layers.Dense(64, activation='relu'),
        tf.keras.layers.Dense(1, activation='sigmoid'),
    ])


def _build_conv1d_model():
    """MODELING 4: 1-D convolution followed by global max pooling."""
    return tf.keras.Sequential([
        _embedding_layer(),
        tf.keras.layers.Conv1D(128, 5, activation='relu'),
        tf.keras.layers.GlobalMaxPooling1D(),
        tf.keras.layers.Dense(24, activation='relu'),
        tf.keras.layers.Dense(1, activation='sigmoid'),
    ])


def _build_gru_model():
    """MODELING 5: single bidirectional GRU layer."""
    return tf.keras.Sequential([
        _embedding_layer(),
        tf.keras.layers.Bidirectional(tf.keras.layers.GRU(32)),
        tf.keras.layers.Dense(6, activation='relu'),
        tf.keras.layers.Dense(1, activation='sigmoid'),
    ])


MODEL_BUILDERS = {
    'avg_pool': _build_avg_pool_model,
    'lstm': _build_lstm_model,
    'stacked_lstm': _build_stacked_lstm_model,
    'conv1d': _build_conv1d_model,
    'gru': _build_gru_model,
}

# 'gru' keeps the behaviour of the original cell, whose last assignment to
# `model` was the GRU network. Change the key to try another architecture.
MODEL_KIND = 'gru'
model = MODEL_BUILDERS[MODEL_KIND]()

####### COMPILE
model.compile(
    loss='binary_crossentropy',
    optimizer='adam',
    metrics=['accuracy'],
)
model.summary()

####### Fit
history = model.fit(
    training_padded,
    training_labels,
    epochs=30,
    validation_data=(testing_padded, testing_labels),
    verbose=2,
)

####### Predict
# New sentences go through the same pipeline as the training data:
# tokenizer -> sequences -> padding, and only then model.predict.
sentence = [
    "granny starting to fear spiders in the garden might be real",
    "game of thrones season finale showing this sunday night",
]
sequences = tokenizer.texts_to_sequences(sentence)
padded = pad_sequences(
    sequences,
    maxlen=max_length,
    padding=padding_type,
    truncating=trunc_type,
)
print(model.predict(padded))

## imdb_subwords8k / 전처리 좀 다름 / 이진분류

In [None]:


import tensorflow as tf
print(tf.__version__)

####### Data Download
import tensorflow_datasets as tfds

imdb, info = tfds.load("imdb_reviews/subwords8k", with_info=True, as_supervised=True)
train_data = imdb['train']
test_data = imdb['test']

# This variant of the dataset comes with its own encoder, exposed through the
# 'text' feature metadata; no Tokenizer is fit in this cell.
tokenizer = info.features['text'].encoder

BUFFER_SIZE = 10000
BATCH_SIZE = 64

# Training data is shuffled and then batched with padding; the padded shapes
# are taken from the dataset's own output shapes. Test data is only batched.
train_dataset = train_data.shuffle(BUFFER_SIZE).padded_batch(
    BATCH_SIZE,
    tf.compat.v1.data.get_output_shapes(train_data),
)
test_dataset = test_data.padded_batch(
    BATCH_SIZE,
    tf.compat.v1.data.get_output_shapes(test_data),
)

####### Modeling
# The vocabulary size comes from the dataset's encoder rather than a constant.
embedding_dim = 64
model = tf.keras.Sequential()
model.add(tf.keras.layers.Embedding(tokenizer.vocab_size, embedding_dim))
model.add(tf.keras.layers.GlobalAveragePooling1D())
model.add(tf.keras.layers.Dense(6, activation='relu'))
model.add(tf.keras.layers.Dense(1, activation='sigmoid'))

####### Modeling 2 (alternative, disabled): stacked bidirectional LSTMs
# model = tf.keras.Sequential([
#     tf.keras.layers.Embedding(tokenizer.vocab_size, embedding_dim),
#     tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(64, return_sequences=True)),
#     tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(32)),
#     tf.keras.layers.Dense(64, activation='relu'),
#     tf.keras.layers.Dense(1, activation='sigmoid')
# ])

####### Compile
model.compile(
    loss='binary_crossentropy',
    optimizer='adam',
    metrics=['accuracy'],
)
model.summary()

####### Fit
# The batched datasets already yield (inputs, labels), so no separate label
# argument is passed.
history = model.fit(
    train_dataset,
    epochs=10,
    validation_data=test_dataset,
)

####### History Plotting
import matplotlib.pyplot as plt
def plot_graphs(history, string):
    """Plot one metric's training and validation curves against the epoch index.

    history: object whose ``history`` attribute maps metric names to
        per-epoch value lists (here, the value returned by ``model.fit``).
    string: name of the training metric, e.g. "accuracy" or "loss"; the
        validation series is looked up under ``'val_' + string``.
    """
    val_key = 'val_' + string
    for key in (string, val_key):
        plt.plot(history.history[key])
    plt.xlabel("Epochs")
    plt.ylabel(string)
    plt.legend([string, val_key])
    plt.show()


plot_graphs(history, "accuracy")
plot_graphs(history, "loss")

## irish-lyrics / 다음 단어 예측하기 (문장 생성) / 학습데이터 만드는 전처리

In [None]:
import tensorflow as tf

from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.layers import Embedding, LSTM, Dense, Bidirectional
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.models import Sequential
from tensorflow.keras.optimizers import Adam
import numpy as np 


###### Data Download
!wget --no-check-certificate \
    https://storage.googleapis.com/laurencemoroney-blog.appspot.com/irish-lyrics-eof.txt \
    -O /tmp/irish-lyrics-eof.txt

data = open('/tmp/irish-lyrics-eof.txt').read()
corpus = data.lower().split("\n")

###### Tokenizer
tokenizer = Tokenizer()
tokenizer.fit_on_texts(corpus)
total_words = len(tokenizer.word_index) + 1

###### Build the training data for the next-word prediction task
# Each line is converted to token ids, and every prefix of length >= 2 becomes
# one training sequence (its last token will serve as the label).
input_sequences = []
for line in corpus:
    token_list = tokenizer.texts_to_sequences([line])[0]
    input_sequences.extend(
        token_list[:end] for end in range(2, len(token_list) + 1)
    )

###### Padding
# Left-pad every sequence to the length of the longest one.
max_sequence_len = max(len(seq) for seq in input_sequences)
input_sequences = np.array(
    pad_sequences(input_sequences, maxlen=max_sequence_len, padding='pre')
)

###### Create predictors and labels
# All tokens but the last are the input; the last token is the target.
xs = input_sequences[:, :-1]
labels = input_sequences[:, -1]

###### One-hot encoding of the target word
ys = tf.keras.utils.to_categorical(labels, num_classes=total_words)

###### Modeling
# Inputs have max_sequence_len - 1 tokens because the last token of every
# training sequence was split off as the label. The output layer is a softmax
# over the vocabulary (total_words classes).
model = Sequential([
    Embedding(total_words, 100, input_length=max_sequence_len - 1),
    Bidirectional(LSTM(150)),
    Dense(total_words, activation='softmax'),
])

###### Compile
# `learning_rate` is the supported keyword; the old `lr` alias is deprecated.
adam = Adam(learning_rate=0.01)
# categorical_crossentropy matches the one-hot encoded targets in `ys`.
model.compile(loss='categorical_crossentropy', optimizer=adam, metrics=['accuracy'])

###### Fit
# No validation data is passed here, so the history only contains training metrics.
history = model.fit(xs, ys, epochs=10, verbose=1)


###### History
import matplotlib.pyplot as plt
def plot_graphs(history, string):
    """Plot a single training metric against the epoch index.

    history: object whose ``history`` attribute maps metric names to
        per-epoch value lists (here, the value returned by ``model.fit``).
    string: name of the metric to plot, e.g. 'accuracy'.
    """
    series = history.history[string]
    plt.plot(series)
    plt.xlabel("Epochs")
    plt.ylabel(string)
    plt.show()


plot_graphs(history, 'accuracy')


###### Generate a sentence by repeated next-word prediction
# Predict the word that follows the seed text, append it to the seed, and
# repeat with the extended seed.
seed_text = "I've got a bad feeling about this"
next_words = 100

for _ in range(next_words):
    token_list = tokenizer.texts_to_sequences([seed_text])[0]
    token_list = pad_sequences([token_list], maxlen=max_sequence_len-1, padding='pre')
    # Sequential.predict_classes was removed from Keras; taking the argmax of
    # the softmax output is the equivalent replacement.
    probabilities = model.predict(token_list, verbose=0)
    predicted = int(np.argmax(probabilities, axis=-1)[0])
    # index_word is the Tokenizer's id -> word mapping, so no scan over
    # word_index is needed. Ids without a word (e.g. 0) yield an empty
    # string, as before.
    output_word = tokenizer.index_word.get(predicted, "")
    seed_text += " " + output_word
print(seed_text)

## Transfer Learning (GloVe : Global Vectors for Word Representation)
 임베딩 layer의 weight(및 dimension)를 처음부터 학습시키는 것이 아니라, 사전 학습된 임베딩(GloVe)을 다운받아 그대로 넣어주는 방식

In [None]:
###### Download Pretrained Embedding (100 dimension version of GloVe from Stanford)

!wget --no-check-certificate \
    https://storage.googleapis.com/laurencemoroney-blog.appspot.com/glove.6B.100d.txt \
    -O /tmp/glove.6B.100d.txt

embeddings_index = {};
with open('/tmp/glove.6B.100d.txt') as f:
    for line in f:
        values = line.split();
        word = values[0];
        coefs = np.asarray(values[1:], dtype='float32');
        embeddings_index[word] = coefs;

embeddings_matrix = np.zeros((vocab_size+1, embedding_dim));
for word, i in word_index.items():
    embedding_vector = embeddings_index.get(word);
    if embedding_vector is not None:
        embeddings_matrix[i] = embedding_vector;


###### Modeling
model = tf.keras.Sequential()
# The embedding layer is initialised from embeddings_matrix and frozen
# (trainable=False), so the pretrained vectors are not updated during training.
model.add(
    tf.keras.layers.Embedding(
        vocab_size+1,
        embedding_dim,
        input_length=max_length,
        weights=[embeddings_matrix],
        trainable=False,
    )
)
model.add(tf.keras.layers.Dropout(0.2))
model.add(tf.keras.layers.Conv1D(64, 5, activation='relu'))
model.add(tf.keras.layers.MaxPooling1D(pool_size=4))
model.add(tf.keras.layers.LSTM(64))
model.add(tf.keras.layers.Dense(1, activation='sigmoid'))

###### Compile
model.compile(
    loss='binary_crossentropy',
    optimizer='adam',
    metrics=['accuracy'],
)
model.summary()

###### Fit
history = model.fit(
    training_padded,
    training_labels,
    epochs=10,
    validation_data=(testing_padded, testing_labels),
    verbose=2,
)
