In [1]:
import time
start = time.time()
import tensorflow as tf
from tensorflow import keras
import numpy as np
from keras import layers
tf.random.set_seed(0)
np.random.seed(0)
import os
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics.pairwise import euclidean_distances, cosine_similarity
from sklearn.preprocessing import normalize

In [2]:
def read_multivariate_dataset(root_dir, dataset_name, train_size=None):
    """Load a multivariate time-series dataset, zero-pad it and split it.

    Reads ``<dataset_name>.npy`` (the samples) and ``<dataset_name>_label.txt``
    (the labels) from ``root_dir``. Every sample is indexed as
    ``(dim, length)``; shorter samples are right-padded with zeros up to the
    longest length found. Labels are re-encoded as consecutive integers
    ``0..n_classes-1`` following the sorted order of the original label values.
    The first ``train_size`` samples (in file order, no shuffling) form the
    training split and the remainder the test split. The size of each split is
    printed.

    Parameters
    ----------
    root_dir : str
        Directory containing the two dataset files.
    dataset_name : str
        Base name of the dataset files.
    train_size : int, optional
        Number of leading samples used for training. When omitted, the size
        is looked up in the built-in table of known datasets.

    Returns
    -------
    x_train, y_train, x_test, y_test : numpy.ndarray
        Samples are float32 arrays of shape ``(n, dim, max_length)``; labels
        are integer arrays of shape ``(n,)``.

    Raises
    ------
    KeyError
        If ``train_size`` is omitted and ``dataset_name`` is not in the table.
    ValueError
        If the dataset file contains no samples.
    """
    known_train_sizes = {
        'CharacterTrajectories_train_size': 1422,
        'CharacterTrajectories_eq_train_size': 1422,
        'PhonemeSpectra_train_size': 3315,
        'Handwriting_train_size': 150,
        'RacketSports_train_size': 151,
    }

    # Resolve the split size first so an unsupported dataset name fails fast,
    # before any file is read or padded.
    if train_size is None:
        size_key = str(dataset_name) + '_train_size'
        if size_key not in known_train_sizes:
            supported = sorted(k[:-len('_train_size')] for k in known_train_sizes)
            raise KeyError(
                "No train size known for dataset %r; pass train_size explicitly "
                "or use one of %s" % (dataset_name, supported)
            )
        train_size = known_train_sizes[size_key]

    # NOTE(review): allow_pickle=True unpickles arbitrary objects; only load
    # dataset files that come from a trusted source.
    X = np.load(os.path.join(root_dir, dataset_name + ".npy"), allow_pickle=True)
    y = np.loadtxt(os.path.join(root_dir, dataset_name + '_label.txt'))
    y = y.astype(np.int64)

    if len(X) == 0:
        raise ValueError("Dataset %r contains no samples" % (dataset_name,))

    # Zero-pad every sample along its second axis to the longest length.
    dim = X[0].shape[0]
    max_length = max(sample.shape[1] for sample in X)
    padded = np.zeros((len(X), dim, max_length), dtype=np.float32)
    for i, sample in enumerate(X):
        padded[i, :, :sample.shape[1]] = sample
    X = padded

    # Map the original label values to 0..n_classes-1 (sorted order of the
    # unique values), which is what LabelEncoder().fit(y).transform(y) yields.
    _, y = np.unique(y, return_inverse=True)

    idx = np.arange(len(X))

    # np.random.shuffle(idx)
    train_idx, test_idx = idx[:train_size], idx[train_size:]

    print(len(train_idx))
    print(len(test_idx))

    x_train = X[train_idx]
    y_train = y[train_idx]

    x_test = X[test_idx]
    y_test = y[test_idx]

    return x_train, y_train, x_test, y_test


In [3]:
# Dataset files are looked up relative to the current working directory.
root_url = os.getcwd()
folder = "/datasets/multivariate/"
dataset = "Handwriting"   # Handwriting, CharacterTrajectories, PhonemeSpectra
x_train, y_train, x_test, y_test = read_multivariate_dataset(root_url+folder, dataset)

# Count classes over BOTH splits: the labels are encoded over the whole
# dataset, so a class absent from the training split would otherwise make the
# classifier's output layer too small for the largest label id.
n_classes = len(np.unique(np.concatenate((y_train, y_test))))

# The loader returns (samples, dim, length); swap the last two axes so the
# model receives (samples, length, dim).
x_train = x_train.transpose(0,2,1)
x_test = x_test.transpose(0,2,1)

150
850


In [4]:
def transformer_encoder(inputs, head_size, num_heads, ff_dim, dropout=0):
    """Apply one encoder block: self-attention followed by a feed-forward part.

    Both sub-blocks follow the same pattern: transform, dropout, layer
    normalization, then add the sub-block's input back as a residual.

    Parameters
    ----------
    inputs : tensor
        Block input; it is used as both query and value of the attention layer.
    head_size : int
        ``key_dim`` passed to ``MultiHeadAttention``.
    num_heads : int
        Number of attention heads.
    ff_dim : int
        Number of filters of the first (ReLU) kernel-size-1 convolution.
    dropout : float
        Rate used inside the attention layer and by both Dropout layers.

    Returns
    -------
    tuple
        ``(block_output, att_scores)`` where ``att_scores`` are the attention
        scores returned by this block's ``MultiHeadAttention`` layer.
    """
    # --- Self-attention sub-block ---
    self_attention = layers.MultiHeadAttention(
        num_heads=num_heads, key_dim=head_size, dropout=dropout
    )
    attended, att_scores = self_attention(
        inputs, inputs, return_attention_scores=True
    )
    attended = layers.Dropout(dropout)(attended)
    attended = layers.LayerNormalization(epsilon=1e-6)(attended)
    attention_block = attended + inputs

    # --- Feed-forward sub-block ---
    # The second convolution projects back to the input's channel count so the
    # residual addition below is shape-compatible.
    n_channels = inputs.shape[-1]
    hidden = layers.Conv1D(filters=ff_dim, kernel_size=1, activation="relu")(attention_block)
    hidden = layers.Dropout(dropout)(hidden)
    projected = layers.Conv1D(filters=n_channels, kernel_size=1)(hidden)
    projected = layers.LayerNormalization(epsilon=1e-6)(projected)
    return projected + attention_block, att_scores

In [5]:
def build_model(
    input_shape,
    head_size,
    num_heads,
    ff_dim,
    num_transformer_blocks,
    mlp_units,
    dropout=0,
    mlp_dropout=0,
    num_classes=None,
):
    """Build a transformer-encoder classifier.

    Stacks ``num_transformer_blocks`` encoder blocks, pools the result, runs
    it through a ReLU MLP and ends with a softmax layer.

    Parameters
    ----------
    input_shape : tuple
        Shape of one sample (without the batch axis).
    head_size, num_heads, ff_dim, dropout
        Forwarded unchanged to ``transformer_encoder`` for every block.
    num_transformer_blocks : int
        Number of encoder blocks to stack.
    mlp_units : iterable of int
        Width of each Dense layer of the MLP head, in order.
    mlp_dropout : float
        Dropout rate applied after each MLP Dense layer.
    num_classes : int, optional
        Number of units of the softmax output layer. When omitted, the
        notebook-level global ``n_classes`` is used.

    Returns
    -------
    tuple
        ``(model, att_scores)``: the Keras model and the attention-score
        tensor of the LAST encoder block (``None`` when no block is built).
    """
    if num_classes is None:
        # Backward-compatible fallback on the global defined by the data cell.
        num_classes = n_classes

    inputs = keras.Input(shape=input_shape)
    x = inputs
    # Defined up front so that num_transformer_blocks == 0 does not leave the
    # returned name unbound.
    att_scores = None
    for _ in range(num_transformer_blocks):
        x, att_scores = transformer_encoder(x, head_size, num_heads, ff_dim, dropout)

    # NOTE(review): with data_format="channels_first" the pooling averages over
    # the LAST axis. If samples are (steps, features), as the notebook's
    # transpose suggests, this averages across features rather than across
    # time steps -- confirm this is intended.
    x = layers.GlobalAveragePooling1D(data_format="channels_first")(x)
    for dim in mlp_units:
        x = layers.Dense(dim, activation="relu")(x)
        x = layers.Dropout(mlp_dropout)(x)
    outputs = layers.Dense(num_classes, activation="softmax")(x)
    return keras.Model(inputs, outputs), att_scores

In [6]:
# Shape of one sample without the batch axis: (time steps, features).
input_shape = tuple(x_train.shape[1:])

# `att_scores` is the attention-score tensor of the last encoder block.
model, att_scores = build_model(
    input_shape,
    num_transformer_blocks=4,
    num_heads=4,
    head_size=256,
    ff_dim=4,
    mlp_units=[128],
    dropout=0.25,
    mlp_dropout=0.4,
)

model.compile(
    optimizer=keras.optimizers.Adam(learning_rate=5e-4),   # 1e-4 and 1e-3
    loss="sparse_categorical_crossentropy",
    metrics=["sparse_categorical_accuracy"],
)
model.summary()

# Stop once the monitored validation metric has not improved for 10 epochs
# and roll the model back to its best weights.
callbacks = [keras.callbacks.EarlyStopping(restore_best_weights=True, patience=10)]

# Kept as the cell's last expression so the History object is still displayed.
model.fit(
    x=x_train,
    y=y_train,
    batch_size=128,
    epochs=200,
    validation_split=0.2,
    callbacks=callbacks,
)

# model.evaluate(x_test, y_test, verbose=1)

Model: "model"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_1 (InputLayer)            [(None, 152, 3)]     0                                            
__________________________________________________________________________________________________
multi_head_attention (MultiHead ((None, 152, 3), (No 15363       input_1[0][0]                    
                                                                 input_1[0][0]                    
__________________________________________________________________________________________________
dropout (Dropout)               (None, 152, 3)       0           multi_head_attention[0][0]       
__________________________________________________________________________________________________
layer_normalization (LayerNorma (None, 152, 3)       6           dropout[0][0]                

<keras.callbacks.History at 0x1a264a98a60>

# Full Dataset

In [7]:
# Symbolic outputs of the layer named 'multi_head_attention':
# the attention output tensor and the attention-score tensor.
model.get_layer(name='multi_head_attention').output

(<KerasTensor: shape=(None, 152, 3) dtype=float32 (created by layer 'multi_head_attention')>,
 <KerasTensor: shape=(None, 4, 152, 152) dtype=float32 (created by layer 'multi_head_attention')>)

In [8]:
# Stack both splits (train first, then test) so attention is extracted for
# the whole dataset in one pass.
X = np.concatenate((x_train, x_test))
y = np.concatenate((y_train, y_test))

# Find the first attention layer by type instead of by its auto-generated
# name: Keras appends a counter to layer names ('multi_head_attention_4', ...)
# whenever the model is rebuilt in the same kernel, which would break a
# hard-coded name lookup.
first_mha_layer = next(
    layer for layer in model.layers if isinstance(layer, layers.MultiHeadAttention)
)

# Probe model with two outputs:
#   1. the first attention layer's (attention output, attention scores) pair
#   2. the attention scores of the last encoder block (`att_scores`)
mha = keras.models.Model(inputs=model.input, outputs=[first_mha_layer.output, att_scores])
out, att_scores_mha3 = mha.predict(X)
attention_outputs, attention_scores = out

# Attention Scores

In [9]:
# Report the shapes of the arrays extracted from the first attention layer.
print('attention_scores dims:', np.shape(attention_scores))
print('attention_outputs dims:', np.shape(attention_outputs))


attention_scores dims: (1000, 4, 152, 152)
attention_outputs dims: (1000, 152, 3)


In [10]:
# Average the attention maps over axis 1 (the heads axis of the
# (samples, heads, seq, seq) scores array).
mean_attention_scores = attention_scores.mean(axis=1)    # attention_scores or out[1]

# One row vector per time series: the flattened mean attention map.
flattened_scores = mean_attention_scores.reshape(X.shape[0], -1)
# Normalized copies of the rows, used for the dot-product similarity below.
normalized_scores = normalize(flattened_scores, axis=1)    # For dot product

# Pairwise cosine similarity and Euclidean distance between the rows.
similarity_matrix_cos = cosine_similarity(flattened_scores)
similarity_matrix_euc = euclidean_distances(flattened_scores)
# Turn distances into a similarity in [0, 1]: 1 on the diagonal, 0 for the
# most distant pair.
similarity_matrix_euc2 = 1 - similarity_matrix_euc / similarity_matrix_euc.max()
# Dot product of the normalized rows.
similarity_matrix_dotp = normalized_scores.dot(normalized_scores.T)

print("Similarity Matrix (Cosine Similarity):\n", similarity_matrix_cos)
print("Similarity Matrix (Euclidean Distance):\n", similarity_matrix_euc)
print("Similarity Matrix (Euclidean Distance2):\n", similarity_matrix_euc2)
print("Similarity Matrix (Dot Product Similarity):\n", similarity_matrix_dotp)

Similarity Matrix (Cosine Similarity):
 [[1.0000092  0.9999824  0.9999682  ... 0.99997365 0.9999483  0.9999755 ]
 [0.9999824  1.0000196  0.99997514 ... 0.9999855  0.99996156 0.99998754]
 [0.9999682  0.99997514 0.9999805  ... 0.9999697  0.99994564 0.9999677 ]
 ...
 [0.99997365 0.9999855  0.9999697  ... 1.0000155  0.99995327 0.999978  ]
 [0.9999483  0.99996156 0.99994564 ... 0.99995327 0.99999213 0.9999508 ]
 [0.9999755  0.99998754 0.9999677  ... 0.999978   0.9999508  1.0000153 ]]
Similarity Matrix (Euclidean Distance):
 [[0.         0.00807354 0.00737326 ... 0.00884928 0.01024653 0.00854472]
 [0.00807354 0.         0.00708857 ... 0.00794619 0.00935293 0.00792544]
 [0.00737326 0.00708857 0.         ... 0.00744552 0.00894186 0.00768978]
 ...
 [0.00884928 0.00794619 0.00744552 ... 0.         0.00998135 0.00865518]
 [0.01024653 0.00935293 0.00894186 ... 0.00998135 0.         0.0102925 ]
 [0.00854472 0.00792544 0.00768978 ... 0.00865518 0.0102925  0.        ]]
Similarity Matrix (Euclidean Di

# Attention Weights

In [11]:
# Softmax over each time series' flattened mean attention scores.
# NOTE(review): the printed distances between these weight vectors are on the
# order of 1e-7, i.e. the rows are almost identical -- confirm that applying a
# second softmax to attention scores is intended.
attention_weights = np.exp(flattened_scores) / np.sum(np.exp(flattened_scores), axis=1, keepdims=True)
normalized_weights = normalize(attention_weights, axis=1)

# Compute cosine similarity, euclidean distance, dot-product
similarity_matrix_cos_aw = cosine_similarity(attention_weights)
similarity_matrix_euc_aw = euclidean_distances(attention_weights)
# Fixed copy-paste bug: this previously rescaled `similarity_matrix_euc` (the
# attention-SCORES distances) instead of the attention-WEIGHTS distances.
similarity_matrix_euc2_aw = 1 - similarity_matrix_euc_aw / np.max(similarity_matrix_euc_aw)
similarity_matrix_dotp_aw = np.dot(normalized_weights, normalized_weights.T)

print("Similarity Matrix (Cosine Similarity):\n", similarity_matrix_cos_aw)
print("Similarity Matrix (Euclidean Distance):\n", similarity_matrix_euc_aw)
print("Similarity Matrix (Euclidean Distance2):\n", similarity_matrix_euc2_aw)
print("Similarity Matrix (Dot Product Similarity):\n", similarity_matrix_dotp_aw)

Similarity Matrix (Cosine Similarity):
 [[0.9999516  0.99995023 0.9999505  ... 0.9999504  0.9999501  0.999951  ]
 [0.99995023 0.99994814 0.9999489  ... 0.9999486  0.9999484  0.99994963]
 [0.9999505  0.9999489  0.9999497  ... 0.9999493  0.99994856 0.9999499 ]
 ...
 [0.9999504  0.9999486  0.9999493  ... 0.9999493  0.9999487  0.9999499 ]
 [0.9999501  0.9999484  0.99994856 ... 0.9999487  0.99994797 0.99994946]
 [0.999951   0.99994963 0.9999499  ... 0.9999499  0.99994946 0.99995035]]
Similarity Matrix (Euclidean Distance):
 [[0.0000000e+00 3.4944989e-07 3.1912901e-07 ... 3.8303170e-07
  4.4348832e-07 3.6983525e-07]
 [3.4944989e-07 0.0000000e+00 3.0682921e-07 ... 3.4396083e-07
  4.0481515e-07 3.4304708e-07]
 [3.1912904e-07 3.0682921e-07 0.0000000e+00 ... 3.2229426e-07
  3.8702808e-07 3.3284653e-07]
 ...
 [3.8303170e-07 3.4396083e-07 3.2229426e-07 ... 0.0000000e+00
  4.3202778e-07 3.7463684e-07]
 [4.4348832e-07 4.0481515e-07 3.8702808e-07 ... 4.3202778e-07
  0.0000000e+00 4.4547809e-07]
 [3.6

# Attention Outputs

In [12]:
# Shape of the first attention layer's outputs for the full dataset.
np.shape(attention_outputs)  # att_outputs

(1000, 152, 3)

In [13]:
# Flatten each time series' attention output into a single row vector of
# length seq_len * embed_dim.
# (Averaging over axis 1 instead was tried and noted as giving a bad result:
#  mean_attention_outputs = np.mean(attention_outputs, axis=1))
reshaped_att_outputs = attention_outputs.reshape(X.shape[0], -1)
# Normalized copies of the rows, used for the dot-product similarity below.
normalized_outputs = normalize(reshaped_att_outputs, axis=1)

# Pairwise cosine similarity and Euclidean distance between the rows.
similarity_matrix_cos_ao = cosine_similarity(reshaped_att_outputs)
similarity_matrix_euc_ao = euclidean_distances(reshaped_att_outputs)
# Turn distances into a similarity in [0, 1]: 1 on the diagonal, 0 for the
# most distant pair.
similarity_matrix_euc2_ao = 1 - similarity_matrix_euc_ao / similarity_matrix_euc_ao.max()
# Dot product of the normalized rows.
similarity_matrix_dotp_ao = normalized_outputs.dot(normalized_outputs.T)

print("Similarity Matrix (Cosine Similarity):\n", similarity_matrix_cos_ao)
print("Similarity Matrix (Euclidean Distance):\n", similarity_matrix_euc_ao)
print("Similarity Matrix (Euclidean Distance2):\n", similarity_matrix_euc2_ao)
print("Similarity Matrix (Dot Product Similarity):\n", similarity_matrix_dotp_ao)

Similarity Matrix (Cosine Similarity):
 [[1.0000001  0.9979965  0.9986291  ... 0.99843097 0.9980059  0.9995406 ]
 [0.9979965  1.0000004  0.99769616 ... 0.9978208  0.99836934 0.99840117]
 [0.9986291  0.99769616 0.9999993  ... 0.99926984 0.99774843 0.99868834]
 ...
 [0.99843097 0.9978208  0.99926984 ... 1.0000004  0.9975196  0.9986215 ]
 [0.9980059  0.99836934 0.99774843 ... 0.9975196  0.9999998  0.99815845]
 [0.9995406  0.99840117 0.99868834 ... 0.9986215  0.99815845 0.99999994]]
Similarity Matrix (Euclidean Distance):
 [[0.         0.13626182 0.11271083 ... 0.12064176 0.13613436 0.0652716 ]
 [0.13626182 0.         0.1458841  ... 0.14207463 0.12318584 0.1216918 ]
 [0.11271083 0.1458841  0.         ... 0.08227803 0.14465623 0.11021324]
 ...
 [0.12064176 0.14207463 0.08227803 ... 0.         0.15182087 0.11303952]
 [0.13613436 0.12318584 0.14465623 ... 0.15182087 0.         0.13082868]
 [0.0652716  0.1216918  0.11021324 ... 0.11303952 0.13082868 0.        ]]
Similarity Matrix (Euclidean Di

# Save Similarity Matrices

In [22]:
# Persist one similarity/distance matrix per attention representation under
# <root_url>/att_outputs/.
output_dir = os.path.join(root_url, "att_outputs")
# np.save does not create missing directories, so make sure the target exists.
os.makedirs(output_dir, exist_ok=True)

np.save(os.path.join(output_dir, str(dataset) + "_att_scores.npy"), similarity_matrix_euc)    # attention scores (Euclidean distance)
np.save(os.path.join(output_dir, str(dataset) + "_att_weights.npy"), similarity_matrix_euc_aw)    # attention weights (Euclidean distance)
# NOTE(review): unlike the two files above, this one stores the dot-product
# similarity rather than the Euclidean distance -- confirm this is intended.
np.save(os.path.join(output_dir, str(dataset) + "_att_outputs.npy"), similarity_matrix_dotp_ao)  # attention outputs (dot-product similarity)

In [14]:
# Wall-clock seconds elapsed since `start` was recorded in the first cell.
end = time.time()
elapsed_seconds = end - start
print(elapsed_seconds)

30.78857922554016
