In [1]:
# Load the data saved earlier: a tokenizer object and the (xs, ys) pair
# that is later passed to lm.fit.
# NOTE: joblib.load unpickles arbitrary Python objects — only load files
# that come from a trusted source.
import joblib
tk = joblib.load('tokenizer.pkl')
xs, ys = joblib.load('Im-data.pkl')

In [2]:
import tensorflow as tf 

In [3]:
# Padding fills sequences with 0 to equalize their lengths, so one extra
# index is needed on top of the tokenizer's num_words: hence the +1.
NUM_WORD = tk.num_words + 1

In [4]:
# Small language model: look up an 8-dimensional embedding per input index,
# average the embeddings over the sequence axis, pass the result through an
# 8-unit ReLU layer, and emit one score per vocabulary entry.
# The embedding layer is bound to its own name (emb1) so that its weights
# can be read back after training.
emb1 = tf.keras.layers.Embedding(input_dim=NUM_WORD, output_dim=8)

lm = tf.keras.Sequential()
lm.add(emb1)
lm.add(tf.keras.layers.GlobalAveragePooling1D())
lm.add(tf.keras.layers.Dense(units=8, activation='relu'))
# No activation on the output layer: the loss is configured with
# from_logits=True, so raw scores are expected here.
lm.add(tf.keras.layers.Dense(units=NUM_WORD))

In [5]:
# Print the layer-by-layer summary (output shapes and parameter counts).
lm.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, None, 8)           16008     
_________________________________________________________________
global_average_pooling1d (Gl (None, 8)                 0         
_________________________________________________________________
dense (Dense)                (None, 8)                 72        
_________________________________________________________________
dense_1 (Dense)              (None, 2001)              18009     
Total params: 34,089
Trainable params: 34,089
Non-trainable params: 0
_________________________________________________________________


In [6]:
# The model's last layer has no softmax, so the loss is asked to apply it
# itself by treating the model outputs as logits (from_logits=True).
lm.compile(
    optimizer='adam',
    loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True),
    metrics=['accuracy'],
)

In [7]:
# Train for a single epoch; the returned History object is the cell output.
lm.fit(x=xs, y=ys, epochs=1)



<tensorflow.python.keras.callbacks.History at 0x16e6cd14e08>

In [18]:
# Persist the trained model. The recorded output shows assets being written
# under lm.krs\assets, i.e. the save produced a directory rather than a
# single file.
# NOTE(review): this cell's execution count ([18]) is out of sequence with
# its neighbours — confirm the notebook still works under Restart & Run All.
# NOTE(review): newer Keras releases appear to require a `.keras` or `.h5`
# suffix for Model.save — confirm before upgrading TensorFlow/Keras.
lm.save('lm.krs')

INFO:tensorflow:Assets written to: lm.krs\assets


In [8]:
# Extract the learned embedding weights (one row per word index) from the
# embedding layer as a NumPy array.
e = emb1.embeddings.numpy()

In [9]:
# Display the embedding matrix (NumPy abbreviates large arrays with "...").
e

array([[-1.1373151e-02, -1.7229237e-02,  2.6770387e-02, ...,
         3.3860866e-02, -2.6928341e-02, -4.1893832e-03],
       [-3.2359004e-01,  3.7504098e-01, -3.5376418e-01, ...,
        -3.5844690e-01, -3.4970707e-01, -3.6107150e-01],
       [-3.7775823e-01,  2.7817586e-01, -3.0898097e-01, ...,
        -3.2194078e-01, -3.4206164e-01, -2.9082528e-01],
       ...,
       [-1.7133202e-02,  5.3317398e-02, -1.7587679e-02, ...,
         1.8634712e-02, -7.2970882e-04, -3.1927627e-02],
       [-1.4605530e-02,  6.4230822e-02, -7.1551763e-02, ...,
        -1.3523963e-02, -2.3572671e-04, -2.9528910e-02],
       [-6.0259216e-03, -3.0688215e-02,  4.4484735e-03, ...,
         2.4091601e-03,  3.4332920e-02, -3.8394891e-02]], dtype=float32)

In [10]:
# Shape is (rows, embedding width); the width matches the 8 passed to Embedding.
e.shape

(2001, 8)

In [13]:
# View the embedding matrix as a table: one row per index, one column per
# embedding dimension.
# NOTE(review): this import sits mid-notebook and the cell's execution count
# ([13]) precedes [11] below — consider moving imports to the first cell.
import pandas as pd
pd.DataFrame(e)

Unnamed: 0,0,1,2,3,4,5,6,7
0,-0.011373,-0.017229,0.026770,0.020647,0.027772,0.033861,-0.026928,-0.004189
1,-0.323590,0.375041,-0.353764,-0.396250,-0.371020,-0.358447,-0.349707,-0.361071
2,-0.377758,0.278176,-0.308981,-0.330008,-0.285581,-0.321941,-0.342062,-0.290825
3,-0.260480,0.303689,-0.336651,-0.309811,-0.257016,-0.332278,-0.305837,-0.327077
4,-0.271880,0.248783,-0.275293,-0.227506,-0.268566,-0.336588,-0.329116,-0.295599
...,...,...,...,...,...,...,...,...
1996,-0.060670,0.041595,-0.055225,-0.022057,0.022843,-0.065610,-0.057935,-0.068505
1997,0.005306,-0.023956,0.041421,-0.028890,0.011541,0.003380,-0.070174,0.011634
1998,-0.017133,0.053317,-0.017588,-0.041229,-0.080367,0.018635,-0.000730,-0.031928
1999,-0.014606,0.064231,-0.071552,-0.012588,-0.011700,-0.013524,-0.000236,-0.029529


In [11]:
# Alternative way to read the same weights: get_weights() returns a list of
# arrays, and element 0 is taken here as the embedding matrix.
# numpy is imported here for the comparison and save cells further down.
import numpy as np
emb1.get_weights()[0]

array([[-1.1373151e-02, -1.7229237e-02,  2.6770387e-02, ...,
         3.3860866e-02, -2.6928341e-02, -4.1893832e-03],
       [-3.2359004e-01,  3.7504098e-01, -3.5376418e-01, ...,
        -3.5844690e-01, -3.4970707e-01, -3.6107150e-01],
       [-3.7775823e-01,  2.7817586e-01, -3.0898097e-01, ...,
        -3.2194078e-01, -3.4206164e-01, -2.9082528e-01],
       ...,
       [-1.7133202e-02,  5.3317398e-02, -1.7587679e-02, ...,
         1.8634712e-02, -7.2970882e-04, -3.1927627e-02],
       [-1.4605530e-02,  6.4230822e-02, -7.1551763e-02, ...,
        -1.3523963e-02, -2.3572671e-04, -2.9528910e-02],
       [-6.0259216e-03, -3.0688215e-02,  4.4484735e-03, ...,
         2.4091601e-03,  3.4332920e-02, -3.8394891e-02]], dtype=float32)

In [28]:
# Keep the get_weights() version of the embedding matrix for comparison with `e`.
w = emb1.get_weights()[0]

In [29]:
np.array_equal(e, w) # check that the two arrays are identical

True

In [30]:
# Save the NumPy array without compression; it is stored in the .npz
# archive under the key 'emb'.
np.savez('word-emb.npz', emb=e)