# Embeddings from Language Model, ELMo
- https://wikidocs.net/33930
- 사전 훈련된 언어 모델(Pre-trained language model)을 사용
- biLM(Bidirectional Language Model)으로 사전 훈련
- ELMo representation - 어떤 단어 w에 대해 biLM 각 층의 순방향·역방향 출력값을 concatenate한 뒤, 층별 가중치로 weighted average(가중합)한 결과
- ELMo representation을 입력값으로 사용해 목표하던 태스크를 수행. 사전 훈련된 모델의 가중치는 고정하지만, 각 층의 벡터들을 weighted average할 때 필요한 가중치는 학습하게 됨

## Import

In [15]:
import tensorflow_hub as hub
import tensorflow as tf
from keras import backend as K
from keras.models import Model
from keras.layers import Dense, Lambda, Input
import urllib.request
import pandas as pd
import numpy as np

Using TensorFlow backend.


In [2]:
print(tf.__version__) # use a TensorFlow 1.x release

1.14.0


## Load dataset

In [3]:
# Download the SMS spam CSV into the working directory, then load it with pandas.
spam_csv_url = "https://raw.githubusercontent.com/mohitgupta-omg/Kaggle-SMS-Spam-Collection-Dataset-/master/spam.csv"
spam_csv_path = "spam.csv"
urllib.request.urlretrieve(spam_csv_url, filename=spam_csv_path)
data = pd.read_csv(spam_csv_path, encoding="latin-1")
# Preview the first five rows.
data.head()

Unnamed: 0,v1,v2,Unnamed: 2,Unnamed: 3,Unnamed: 4
0,ham,"Go until jurong point, crazy.. Available only ...",,,
1,ham,Ok lar... Joking wif u oni...,,,
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,,,
3,ham,U dun say so early hor... U c already then say...,,,
4,ham,"Nah I don't think he goes to usf, he lives aro...",,,


In [4]:
# Encode the labels as integers (ham -> 0, spam -> 1), then pull texts and
# labels out of the frame as plain Python lists.
label_names, label_ids = ["ham", "spam"], [0, 1]
data["v1"] = data["v1"].replace(label_names, label_ids)
x_data = list(data["v2"])
y_data = list(data["v1"])
# Sanity check: both lists must have the same length.
len(x_data), len(y_data)

(5572, 5572)

In [5]:
x_data[0]

'Go until jurong point, crazy.. Available only in bugis n great world la e buffet... Cine there got amore wat...'

In [6]:
# 80/20 split: the first 80% of the samples are for training, the remainder for testing.
num_train = int(0.8 * len(x_data))
num_test = len(x_data) - num_train
print(f"Num of dataset: {len(x_data)}, Num of train set: {num_train}, Num of test set: {num_test}")

Num of dataset: 5572, Num of train set: 4457, Num of test set: 1115


In [7]:
# Cut texts and labels at the same index so each sentence stays paired with its label.
x_train, x_test = np.asarray(x_data[:num_train]), np.asarray(x_data[num_train:])
y_train, y_test = np.asarray(y_data[:num_train]), np.asarray(y_data[num_train:])

## Load ELMo

In [16]:
# Load the pre-trained ELMo module (version 1) from TensorFlow Hub.
# NOTE(review): trainable=True presumably exposes only the layer-mixing weights
# mentioned in the notes at the top of the notebook — confirm against the module docs.
elmo = hub.Module("https://tfhub.dev/google/elmo/1", trainable=True)

In [17]:
# Create a TF1 session and register it as the session Keras will use.
sess = tf.Session()
K.set_session(sess)
# Run the initializers for global variables and lookup tables in that session.
sess.run(tf.global_variables_initializer())
sess.run(tf.tables_initializer())

## Build model

In [18]:
def ELMoEmbedding(x):
    """Embed a batch of raw sentences with the ELMo module.

    Bridges Keras -> TensorFlow -> Keras so the function can be wrapped in a
    Keras ``Lambda`` layer.

    Args:
        x: Tensor of sentences coming from the Keras input layer; the model in
            this notebook feeds it with shape (batch, 1).

    Returns:
        The "default" output of the ELMo module, one vector per sentence
        (declared as 1024 values by the ``Lambda`` layer that wraps this
        function).
    """
    # Flatten to a 1-D batch of strings. A bare tf.squeeze() would also drop
    # the batch axis when the batch holds a single sentence, handing the
    # module a scalar instead of a 1-D batch.
    sentences = tf.reshape(tf.cast(x, tf.string), [-1])
    return elmo(sentences, as_dict=True, signature="default")["default"]

In [None]:
# Classifier: raw sentence -> ELMo vector -> 256-unit ReLU layer -> sigmoid score.
input_text = Input(shape=(1,), dtype=tf.string)  # one raw string per sample

# Wrap the ELMo call so Keras can treat it as a layer producing 1024 features.
elmo_lambda = Lambda(ELMoEmbedding, output_shape=(1024, ))
embedding_layer = elmo_lambda(input_text)

hidden_dense = Dense(256, activation='relu')
hidden_layer = hidden_dense(embedding_layer)

output_dense = Dense(1, activation='sigmoid')
output_layer = output_dense(hidden_layer)

model = Model(inputs=[input_text], outputs=output_layer)

In [None]:
# Configure training: binary cross-entropy loss, Adam optimizer, accuracy as the reported metric.
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

In [21]:
# Train for a single epoch in mini-batches of 60 samples; `history` holds what fit() returns.
history = model.fit(x_train, y_train, epochs=1, batch_size=60)



In [22]:
# Evaluate on the held-out set; index 1 of the result is the accuracy metric
# requested at compile time (index 0 is the loss).
eval_scores = model.evaluate(x_test, y_test)
print("\n 테스트 정확도: %.4f" % (eval_scores[1]))


 테스트 정확도: 0.9821
