In [1]:
import json
import numpy as np
from DataLoader import TFRecordLoader, SPARSES, VARLENS
import tensorflow as tf
from tensorflow.keras import Input, layers
from tensorflow.keras.experimental import SequenceFeatures
from tensorflow.keras.utils import plot_model
feature_column = tf.feature_column

In [2]:
col_type = {
    'm1': 'int64',
    'bid': 'int64',
    'adid': 'int64',
    'adspaceid': 'int64',
    'adtype': 'int64',
    'nt': 'int64',
    'appid': 'int64',
    'osv': 'int64',
    'flag': 'int64',
    'p_city': 'int64',
    'install_pkgs': 'int64List',
    'click_adids': 'int64List'
}
label_name = 'flag'
batch_size = 3
embedding_size = 4
epochs = 2
train_path = 'Toydataset/Dataset/ProcessedDataset/train/*.tfrecord'
valid_path = 'Toydataset/Dataset/ProcessedDataset/valid/*.tfrecord'
sparses = [f for f in col_type if col_type[f] in SPARSES and f != label_name]
varlens = [f for f in col_type if col_type[f] in VARLENS and f != label_name]

In [3]:
with open("Config/config_toy.json") as f:
    sparse_len_dic, varlen_len_dic, \
         varlen_maxlen_f, len_train, len_valid = json.load(f)

In [4]:
train = TFRecordLoader(train_path, col_type, label_name=label_name, batch_size=batch_size, 
                        block_length=batch_size,).load()
valid = TFRecordLoader(valid_path, col_type, label_name=label_name, batch_size=batch_size, 
                        block_length=batch_size,).load()

In [5]:
print(*train.take(1))

({'click_adids': <tf.Tensor: shape=(3, 5156), dtype=int64, numpy=
array([[ 1,  2,  3, ...,  0,  0,  0],
       [49, 50, 51, ...,  0,  0,  0],
       [70, 71,  9, ...,  0,  0,  0]])>, 'install_pkgs': <tf.Tensor: shape=(3, 257), dtype=int64, numpy=
array([[ 1,  2,  3,  4,  5,  6,  7,  0,  0,  0,  0,  0,  0,  0,  0,  0,
         0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
         0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
         0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
         0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
         0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
         0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
         0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
         0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
         0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
         0,  0,  0,  0,  0,  0,  

In [6]:
[(example, exmp_label)] = [*train.take(1)]
print(exmp_label)
print(example)

tf.Tensor([1 0 0], shape=(3,), dtype=int64)
{'click_adids': <tf.Tensor: shape=(3, 5156), dtype=int64, numpy=
array([[ 1,  2,  3, ...,  0,  0,  0],
       [49, 50, 51, ...,  0,  0,  0],
       [70, 71,  9, ...,  0,  0,  0]])>, 'install_pkgs': <tf.Tensor: shape=(3, 257), dtype=int64, numpy=
array([[ 1,  2,  3,  4,  5,  6,  7,  0,  0,  0,  0,  0,  0,  0,  0,  0,
         0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
         0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
         0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
         0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
         0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
         0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
         0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
         0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
         0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0

In [7]:
adid = feature_column.categorical_column_with_identity(
    "adid", num_buckets=sparse_len_dic['adid'])
adid_embedding = feature_column.embedding_column(adid, embedding_size, 
                        initializer=tf.keras.initializers.he_normal(seed=None))
adid_layer = layers.DenseFeatures(adid_embedding, name="adid_emb")
adid_layer(example)

<tf.Tensor: shape=(3, 4), dtype=float32, numpy=
array([[ 0.09940365, -0.19973306,  0.21417359,  0.39449045],
       [-0.05586148, -0.36310735,  0.11235967, -0.08208282],
       [-0.08816505,  0.09650055, -0.22900821, -0.2994369 ]],
      dtype=float32)>

In [8]:
def sparse_embeding_layer(key, embedding_size, vocab_size, name=None, initializer=None):
    cat_col = feature_column.categorical_column_with_identity(
                                    key, num_buckets=vocab_size)
    col_embedding = feature_column.embedding_column(
                        cat_col, embedding_size, initializer=initializer)
    emblayer = layers.DenseFeatures(col_embedding, name=name if name is not None else key + "_emb")
    return emblayer

In [9]:
adid_layer = sparse_embeding_layer('adid', embedding_size, sparse_len_dic['adid'])

In [10]:
adid_layer(example)

<tf.Tensor: shape=(3, 4), dtype=float32, numpy=
array([[ 0.37513614, -0.14682907,  0.14456867,  0.20159087],
       [ 0.12208718,  0.8399437 , -0.17844664,  0.74585474],
       [ 0.19035807, -0.4753918 , -0.48422232,  0.6382411 ]],
      dtype=float32)>

In [11]:
varlen_click = feature_column.\
        sequence_categorical_column_with_identity("click_adids", varlen_len_dic["click_adids"])
varlen_click_emb = feature_column.embedding_column(varlen_click, embedding_size, initializer=None)
click_layer = SequenceFeatures(varlen_click_emb)
seq, seq_len = click_layer(example)
seq_mask = tf.sequence_mask(seq_len)
print(f"sequence:\n{seq}\nsequence_length:\n{seq_mask}")

sequence:
[[[ 1.91125631e-01  9.27818954e-01 -4.34757888e-01 -3.48323584e-01]
  [ 1.06005296e-01 -6.73660576e-01 -3.49974722e-01  4.21379745e-01]
  [ 2.06368893e-01 -8.66405904e-01 -7.84736723e-02  1.89559069e-02]
  ...
  [ 3.20280671e-01 -3.31606328e-01  2.20045343e-01 -9.46716487e-01]
  [ 3.20280671e-01 -3.31606328e-01  2.20045343e-01 -9.46716487e-01]
  [ 3.20280671e-01 -3.31606328e-01  2.20045343e-01 -9.46716487e-01]]

 [[-3.58405769e-01  6.08041845e-02 -4.43093687e-01 -1.77497104e-01]
  [ 3.54928851e-01 -6.34709120e-01  7.63670325e-01  5.87668478e-01]
  [-2.70873494e-02  1.77359134e-01  8.59055966e-02  3.56608517e-02]
  ...
  [ 3.20280671e-01 -3.31606328e-01  2.20045343e-01 -9.46716487e-01]
  [ 3.20280671e-01 -3.31606328e-01  2.20045343e-01 -9.46716487e-01]
  [ 3.20280671e-01 -3.31606328e-01  2.20045343e-01 -9.46716487e-01]]

 [[ 6.85032725e-01  1.74501628e-01  8.99260107e-04 -2.77342558e-01]
  [-4.87223119e-01  8.08861032e-02  8.38968337e-01 -2.25558579e-01]
  [-1.28099084e-01  2.

In [12]:
def varlen_embedding_layer(key, embedding_size, vocab_size, name=None, initializer=None):
    varlen_col = feature_column.\
            sequence_categorical_column_with_identity(key, vocab_size)
    varlen_emb = feature_column.embedding_column(varlen_col, embedding_size, initializer=None)
    seq_layer = SequenceFeatures(varlen_emb, name = name if name is not None else key + '_seq_emb')
    return seq_layer

In [13]:
click_layer = varlen_embedding_layer("click_adids", embedding_size, varlen_len_dic["click_adids"])
seq, seq_len = click_layer(example)
seq_mask = tf.sequence_mask(seq_len)
print(f"sequence:\n{seq}\nsequence_length:\n{seq_mask}")

sequence:
[[[ 0.16454943  0.50537026  0.20934807  0.1092468 ]
  [-0.14189373  0.1253067   0.05022483 -0.01403708]
  [-0.0906689  -0.8214567   0.8211525   0.8974337 ]
  ...
  [-0.52956736  0.2305574   0.29842514 -0.8706792 ]
  [-0.52956736  0.2305574   0.29842514 -0.8706792 ]
  [-0.52956736  0.2305574   0.29842514 -0.8706792 ]]

 [[ 0.07587458 -0.3194842  -0.30769327 -0.06336888]
  [-0.04007056 -0.04407697  0.5631268  -0.2552955 ]
  [-0.8628741   0.27576137  0.47795132  0.43498763]
  ...
  [-0.52956736  0.2305574   0.29842514 -0.8706792 ]
  [-0.52956736  0.2305574   0.29842514 -0.8706792 ]
  [-0.52956736  0.2305574   0.29842514 -0.8706792 ]]

 [[ 0.00316084 -0.38499662 -0.08968228  0.5187847 ]
  [ 0.48632544 -0.0363537  -0.39114013 -0.19388679]
  [-0.20211916  0.37838978  0.13891615  0.60927117]
  ...
  [-0.52956736  0.2305574   0.29842514 -0.8706792 ]
  [-0.52956736  0.2305574   0.29842514 -0.8706792 ]
  [-0.52956736  0.2305574   0.29842514 -0.8706792 ]]]
sequence_length:
[[ True  True

In [14]:
sparse_embedding_size_dic = {fe: 6 * int(pow(sparse_len_dic[fe], 0.25)) for fe in sparse_len_dic}
varlen_embedding_size_dic = {fe: 6 * int(pow(varlen_len_dic[fe], 0.25)) for fe in varlen_len_dic}
print(sparse_embedding_size_dic)
print(varlen_embedding_size_dic)

{'m1': 54, 'bid': 54, 'adid': 12, 'adspaceid': 6, 'adtype': 6, 'nt': 6, 'osv': 6, 'p_city': 24, 'appid': 6}
{'click_adids': 78, 'install_pkgs': 54}


In [15]:
k = tf.constant([[[1. , 2.], [2., 3.]], [[1., 3.], [2., 4.]]])
tf.math.reduce_mean(k, axis=1,)

<tf.Tensor: shape=(2, 2), dtype=float32, numpy=
array([[1.5, 2.5],
       [1.5, 3.5]], dtype=float32)>

In [16]:
def DeepVec(sparse_feature_name, varlen_feature_name, target_item_name,
            sparse_len_dic, varlen_len_dic, varlen_maxlen, item_embedding_size = 8,
            hidden_unit_user_info = (80, 12), hidden_unit_target = (80, 12), embedding_size = "auto", combiner='sum'):
    
    # process the embedding size parameter.
    if embedding_size == "auto":
        sparse_embedding_size_dic = {fe: 6 * int(pow(sparse_len_dic[fe], 0.25)) \
                                        for fe in sparse_len_dic}
        varlen_embedding_size_dic = {fe: 6 * int(pow(varlen_len_dic[fe], 0.25)) \
                                        for fe in varlen_len_dic}
    elif isinstance(embedding_size, int):
        sparse_embedding_size_dic = {fe: embedding_size for fe in sparse_len_dic}
        varlen_embedding_size_dic = {fe: embedding_size for fe in varlen_len_dic}
    elif isinstance(embedding_size, dict):
        sparse_embedding_size_dic = {fe: embedding_size[fe] for fe in sparse_len_dic}
        varlen_embedding_size_dic = {fe: embedding_size[fe] for fe in varlen_len_dic}
    else:
        raise ValueError()
    
    if len(sparse_feature_name + varlen_feature_name) == 0:
        raise ValueError("sparse_feature_name and varlen_feature_name are not None.")

    # build the input layers.
    inputs_dic = {}
    for spa in sparse_feature_name:
        inputs_dic.update({spa: Input(shape=(1,), name = spa)})
        
    for var in varlen_feature_name:
        inputs_dic.update({var: Input(shape=(varlen_maxlen[spa],), name = var)})

    # build the user and context info vector part.
    sparse_emb_layers = []
    for spa in sparse_feature_name:
        if spa != target_item_name:
            sparse_emb_layer = sparse_embeding_layer(spa, 
                                embedding_size=sparse_embedding_size_dic[spa],
                                vocab_size=sparse_len_dic[spa],
                                )

            sparse_emb_layers.append(sparse_emb_layer(inputs_dic[spa]))
    
    varlen_emb_layers = []
    for var in varlen_feature_name:
        varlen_emb_layer = varlen_embedding_layer(var, 
                              embedding_size=varlen_embedding_size_dic[var],
                              vocab_size=varlen_len_dic[var],
                            )
        varlen_seq, seq_len = varlen_emb_layer(inputs_dic[var])
        sumpooling_layer = SumPooling(name=var)
        varlen_emb_layers.append(sumpooling_layer(varlen_seq, mask=seq_len, mode=combiner))
    
    concat = layers.Concatenate(axis=-1)(sparse_emb_layers + varlen_emb_layers)

    DNN = concat
    for i, u in enumerate(hidden_unit_user_info):
        DNN = layers.Dense(u, name=f"DNN_{i}_user_info")(DNN)
    user_info = DNN

    target_vec = sparse_embeding_layer(target_item_name, embedding_size[target_item_name],
                            vocab_size = sparse_embedding_size_dic[spa],
                            name="target_emb_layer")

    DNN = target_vec(inputs_dic[target_item_name])
    for i, u in enumerate(hidden_unit_target):
        DNN = layers.Dense(u, name=f"DNN_{i}_target")(DNN)
    target_info = DNN

    outputs = layers.Dot(axes=-1, normalize=True)([user_info, target_info])

    model = tf.keras.Model(inputs=inputs_dic, outputs=outputs)

    return model



In [17]:
model = DeepVec(sparses, varlens, "adid", sparse_len_dic, 
                varlen_len_dic, varlen_maxlen_f)
model.summary()
plot_model(model, "DeepVec.png", show_shapes=True)

KeyError: 'p_city'