In [97]:
import os
import tensorflow as tf
import numpy as np

In [98]:
vocab_size = 32000
sequence_length = 64

In [99]:
ckpt_dir = tf.train.latest_checkpoint('./model-64-32k-100k/ckpt/') 
ckpt_dir

'./model-64-32k-100k/ckpt/w2v_embedding_64-1'

In [100]:
w2v_layer = tf.keras.layers.Embedding(vocab_size,
                                      sequence_length,
                                      input_length=1,
                                      name="w2v_embedding")


In [101]:
w2v_layer.weights

[]

In [102]:
model = tf.keras.Sequential([
    w2v_layer,
])
model.compile(optimizer='adam')

In [103]:
w2v_layer.weights[0]

<tf.Variable 'w2v_embedding/embeddings:0' shape=(32000, 64) dtype=float32, numpy=
array([[ 0.0321597 ,  0.01368393, -0.02549728, ..., -0.0174865 ,
        -0.03071706,  0.03416361],
       [ 0.00035464,  0.02307502, -0.01652487, ...,  0.03385301,
         0.02494711,  0.03013298],
       [-0.01363196,  0.02859947,  0.0180531 , ..., -0.04400091,
        -0.04487803, -0.0372056 ],
       ...,
       [ 0.02246389,  0.01087012, -0.03487611, ..., -0.00122337,
        -0.02905982, -0.04636227],
       [ 0.04016094, -0.01431141,  0.04110011, ..., -0.02397348,
        -0.03966268,  0.00605388],
       [-0.00015162,  0.01626218,  0.03629174, ...,  0.00712913,
         0.04331812, -0.00718949]], dtype=float32)>

In [104]:
checkpoint = tf.train.Checkpoint(layer=w2v_layer)
checkpoint

<tensorflow.python.training.tracking.util.Checkpoint at 0x2bdd66c72e0>

In [105]:
checkpoint.restore(ckpt_dir).assert_consumed()

<tensorflow.python.training.tracking.util.CheckpointLoadStatus at 0x2bdd124fd90>

In [106]:
w2v_layer.weights[0]

<tf.Variable 'w2v_embedding/embeddings:0' shape=(32000, 64) dtype=float32, numpy=
array([[-0.02971295,  0.00410122, -0.0094048 , ...,  0.02149712,
         0.00644837,  0.01568511],
       [-0.2899739 , -0.02694705, -0.13824375, ...,  0.16128096,
         0.15405932,  0.12712628],
       [-0.1654938 ,  0.05598264,  0.0244594 , ..., -0.17019223,
        -0.35384795,  0.14580142],
       ...,
       [-0.21116109, -0.5072469 ,  2.1389928 , ..., -0.7406144 ,
         1.152412  ,  0.15577023],
       [-0.4351952 ,  0.3172441 ,  0.8916813 , ..., -0.07095525,
         0.547772  , -0.8520549 ],
       [ 0.47548616, -0.36260402,  0.41800928, ...,  0.8716261 ,
        -0.32832038, -0.88099927]], dtype=float32)>

In [107]:
MODEL_PATH = os.path.join('..', 'word2vec', 'model-64-32k-100k')
MODEL_PATH

'..\\word2vec\\model-64-32k-100k'

In [108]:
vocab_ds = tf.data.TextLineDataset(os.path.join(MODEL_PATH, 'metadata.tsv')).filter(
        # ignore [UNK] token
        lambda text: tf.cast(not tf.strings.regex_full_match(text, '\[UNK\]'), bool))


In [109]:
# Define the vocabulary size and the number of words in a sequence.
vocab_size = 32000
sequence_length = 64

vectorize_layer = tf.keras.layers.TextVectorization(
        max_tokens=vocab_size,
        output_sequence_length=sequence_length,
        # add vocab
        vocabulary=tf.constant(
            [text.numpy() for text in vocab_ds]))

In [110]:
vocab = vectorize_layer.get_vocabulary()
vocab[:5], vocab[-5:]

(['', '[UNK]', 'dan', 'yang', 'di'],
 ['cip', 'charly', 'channing', 'casper', 'capacity'])

In [111]:
def similarity(a, b):
    a = vectorize_layer(tf.constant(a))
    b = vectorize_layer(tf.constant(b))
    vec_a = w2v_layer(a)
    print("Vector A:", vec_a)
    vec_b = w2v_layer(b)
    print("Vector B:", vec_b)
    cosine_similarities = np.dot(a, b) / (np.linalg.norm(a)* np.linalg.norm(b))
    print("Similarity:", cosine_similarities)


In [112]:
similarity("jakarta", "ibukota")

Vector A: tf.Tensor(
[[-0.17076534  0.09946204 -0.03591677 ...  0.63685346 -0.05752937
   0.21611916]
 [-0.02971295  0.00410122 -0.0094048  ...  0.02149712  0.00644837
   0.01568511]
 [-0.02971295  0.00410122 -0.0094048  ...  0.02149712  0.00644837
   0.01568511]
 ...
 [-0.02971295  0.00410122 -0.0094048  ...  0.02149712  0.00644837
   0.01568511]
 [-0.02971295  0.00410122 -0.0094048  ...  0.02149712  0.00644837
   0.01568511]
 [-0.02971295  0.00410122 -0.0094048  ...  0.02149712  0.00644837
   0.01568511]], shape=(64, 64), dtype=float32)
Vector B: tf.Tensor(
[[ 0.10179348  0.22054295 -0.58075035 ...  0.20306358 -0.7312863
   0.7635352 ]
 [-0.02971295  0.00410122 -0.0094048  ...  0.02149712  0.00644837
   0.01568511]
 [-0.02971295  0.00410122 -0.0094048  ...  0.02149712  0.00644837
   0.01568511]
 ...
 [-0.02971295  0.00410122 -0.0094048  ...  0.02149712  0.00644837
   0.01568511]
 [-0.02971295  0.00410122 -0.0094048  ...  0.02149712  0.00644837
   0.01568511]
 [-0.02971295  0.00410122

In [113]:
similarity("teman", "sahabat")

Vector A: tf.Tensor(
[[-0.3711237   0.6030821   0.5711786  ... -0.05788724  1.2050811
   0.19838865]
 [-0.02971295  0.00410122 -0.0094048  ...  0.02149712  0.00644837
   0.01568511]
 [-0.02971295  0.00410122 -0.0094048  ...  0.02149712  0.00644837
   0.01568511]
 ...
 [-0.02971295  0.00410122 -0.0094048  ...  0.02149712  0.00644837
   0.01568511]
 [-0.02971295  0.00410122 -0.0094048  ...  0.02149712  0.00644837
   0.01568511]
 [-0.02971295  0.00410122 -0.0094048  ...  0.02149712  0.00644837
   0.01568511]], shape=(64, 64), dtype=float32)
Vector B: tf.Tensor(
[[-0.55585283  0.5244362  -0.4527037  ... -0.65651584  1.2318995
  -0.27024126]
 [-0.02971295  0.00410122 -0.0094048  ...  0.02149712  0.00644837
   0.01568511]
 [-0.02971295  0.00410122 -0.0094048  ...  0.02149712  0.00644837
   0.01568511]
 ...
 [-0.02971295  0.00410122 -0.0094048  ...  0.02149712  0.00644837
   0.01568511]
 [-0.02971295  0.00410122 -0.0094048  ...  0.02149712  0.00644837
   0.01568511]
 [-0.02971295  0.00410122 