In [1]:
from bglog import BGLog, get_embedding_layer
import numpy as np
import tensorflow as tf
tf.random.set_seed(123)

In [2]:
bglog = BGLog(save_padded_num_sequences=False, load_from_pkl=True)

In [3]:
train_test = bglog.get_tensor_train_test(ablation=1000)
train_data, test_data = train_test

padded_num_seq_df loaded from data\bgl_padded_num_seq_df.pkl
trained tokenizer, tk, loaded from data\bgltk.pkl
train_0:, 800
test_0:, 200
train_1:, 800
test_1:, 200
train_2:, 800
test_2:, 200
train_3:, 800
test_3:, 102
4 class does not have 800 records, it has only 628 records
test_4:, 0
5 class does not have 800 records, it has only 165 records
5 class does not have 200 records, it has only 165 records
6 class does not have 800 records, it has only 75 records
6 class does not have 200 records, it has only 75 records
[[1. 0. 0. 0.]
 [1. 0. 0. 0.]]
<BatchDataset shapes: ((32, 32, 64), (32, 4)), types: (tf.int32, tf.float32)>
<BatchDataset shapes: ((32, 32, 64), (32, 4)), types: (tf.int32, tf.float32)>


In [4]:
def model(conv1d_set1 = 3, conv1d_set2 = 3, dense_neurons=2048, filters=64,
            kernel_size=3,maxpool_1=True,epochs=25, dense_activation='relu'):
    embedding_weights, vocab_size, char_onehot = get_embedding_layer(bglog)
    B = train_data.element_spec[0].shape[0]
#     inputs = tf.keras.layers.Input(batch_shape=(B, train_data.element_spec[0].shape[1], train_data.element_spec[0].shape[2]), dtype='float64' )
    inputs = tf.keras.layers.Input(shape=(train_data.element_spec[0].shape[1], train_data.element_spec[0].shape[2]), dtype='float64' )
    x = tf.keras.layers.Embedding(input_dim=vocab_size+1,
                                    output_dim=vocab_size,
                                    input_length=train_data.element_spec[0].shape[2],
                                    weights = [embedding_weights],
                                    )(inputs)
    for _ in range(conv1d_set1):
        x = tf.keras.layers.Conv1D(filters=filters, kernel_size=kernel_size, padding='same')(x)
    if maxpool_1:
        x = tf.keras.layers.MaxPooling2D(pool_size=(1, train_data.element_spec[0].shape[2]))(x)
        x = tf.reshape(x, (B, train_data.element_spec[0].shape[1], filters))        
        for _ in range(conv1d_set2):
            x = tf.keras.layers.Conv1D(filters=filters, kernel_size=kernel_size, padding='same')(x)
        x = tf.keras.layers.MaxPooling1D(pool_size=(train_data.element_spec[0].shape[1]) )(x)
        x = tf.reshape(x, (B, filters))
    if not maxpool_1:
        x = tf.keras.layers.Flatten()(x)
    if dense_activation is None:
        x = tf.keras.layers.Dense(dense_neurons)(x)
    else:
        x = tf.keras.layers.Dense(dense_neurons, activation=dense_activation)(x)
    outputs = tf.keras.layers.Dense(train_data.element_spec[1].shape[1], activation='softmax')(x)
    model = tf.keras.Model(inputs=inputs, outputs=outputs)
    print(model.summary())
    model.compile(optimizer='adam', 
                  loss='categorical_crossentropy',
              metrics=['accuracy', tf.keras.metrics.Precision(), tf.keras.metrics.Recall()])
    hist = model.fit(train_data, validation_data=test_data, epochs=epochs) 
    return model, hist

In [5]:
# we  feed  xi  to  a dense layer h to get the log-sequence representation zi∈RD:
#     zi= h(xi) =σ(Whxi+bh) ............................(2)
# in our case zi can be obtained from the dense layer before the softmax
# Lets see how to ger it from the train mode

In [6]:
# we pre-train the model with labeled known intent samples. 
# In order to better reflect the effectiveness of the learned decision boundary, 
# we learn the feature representation zi with the simple softmax loss Ls to perform classification:

trained_model, hist = model(epochs=6,)

vocab_size: 50
Model: "functional_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_1 (InputLayer)         [(None, 32, 64)]          0         
_________________________________________________________________
embedding (Embedding)        (None, 32, 64, 50)        2550      
_________________________________________________________________
conv1d (Conv1D)              (None, 32, 64, 64)        9664      
_________________________________________________________________
conv1d_1 (Conv1D)            (None, 32, 64, 64)        12352     
_________________________________________________________________
conv1d_2 (Conv1D)            (None, 32, 64, 64)        12352     
_________________________________________________________________
max_pooling2d (MaxPooling2D) (None, 32, 1, 64)         0         
_________________________________________________________________
tf_op_layer_Reshape (TensorF [(32, 32, 

In [7]:
trained_model, hist = model(epochs=6, dense_neurons=64)

vocab_size: 50
Model: "functional_3"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_2 (InputLayer)         [(None, 32, 64)]          0         
_________________________________________________________________
embedding_1 (Embedding)      (None, 32, 64, 50)        2550      
_________________________________________________________________
conv1d_6 (Conv1D)            (None, 32, 64, 64)        9664      
_________________________________________________________________
conv1d_7 (Conv1D)            (None, 32, 64, 64)        12352     
_________________________________________________________________
conv1d_8 (Conv1D)            (None, 32, 64, 64)        12352     
_________________________________________________________________
max_pooling2d_1 (MaxPooling2 (None, 32, 1, 64)         0         
_________________________________________________________________
tf_op_layer_Reshape_2 (Tenso [(32, 32, 

In [8]:
# Learn the decision boundary of each class constraining the known labels within a ball area
# how to get zi and how to know that zi belongs to which yi ?
# from there we will have to calculate the Ck , centroid for the class k

In [9]:
trained_model.layers

[<tensorflow.python.keras.engine.input_layer.InputLayer at 0x23361f141c0>,
 <tensorflow.python.keras.layers.embeddings.Embedding at 0x23302378d60>,
 <tensorflow.python.keras.layers.convolutional.Conv1D at 0x23302378d00>,
 <tensorflow.python.keras.layers.convolutional.Conv1D at 0x2330237fdf0>,
 <tensorflow.python.keras.layers.convolutional.Conv1D at 0x23302385f40>,
 <tensorflow.python.keras.layers.pooling.MaxPooling2D at 0x23302ad28b0>,
 <tensorflow.python.keras.engine.base_layer.TensorFlowOpLayer at 0x23302af58e0>,
 <tensorflow.python.keras.layers.convolutional.Conv1D at 0x23302ae5940>,
 <tensorflow.python.keras.layers.convolutional.Conv1D at 0x23302af51f0>,
 <tensorflow.python.keras.layers.convolutional.Conv1D at 0x23302afc940>,
 <tensorflow.python.keras.layers.pooling.MaxPooling1D at 0x23302b01ac0>,
 <tensorflow.python.keras.engine.base_layer.TensorFlowOpLayer at 0x23303bd1730>,
 <tensorflow.python.keras.layers.core.Dense at 0x23302b018b0>,
 <tensorflow.python.keras.layers.core.Dense

In [10]:
dense_6 = trained_model.get_layer(index=(len(trained_model.layers)-1))
print(dense_6)

<tensorflow.python.keras.layers.core.Dense object at 0x0000023303BD1790>


In [11]:
#This is the log sequence embedding from the last layer
# we can treat this as the features from the logs
dense_6.output

<tf.Tensor 'dense_3/Softmax:0' shape=(32, 4) dtype=float32>

In [12]:
# Then, we use the pre-trained model to extract intent features for 
# learning the decision boundary

In [13]:
class LogLineEncoder(tf.keras.Model):
    def __init__(self, num_of_conv1d=3,  
                 filters=64,
                 kernel_size=3, ):
        super().__init__()            
        self.num_of_conv1d = num_of_conv1d       
        self.filters = filters
        self.kernel_size = kernel_size           
        self.embedding_weights, self.vocab_size, self.char_onehot = get_embedding_layer(bglog)       
        
        self.embedding = tf.keras.layers.Embedding(input_dim=self.vocab_size+1,
                                    output_dim=self.vocab_size,
                                    input_length=train_data.element_spec[0].shape[2],
                                    weights = [self.embedding_weights],
                                    )
        self.conv1d_layers = [tf.keras.layers.Conv1D(filters=filters, 
                                                kernel_size=kernel_size, 
                                                padding='same')  
                       for _ in range(self.num_of_conv1d)]
        self.maxpool2d = tf.keras.layers.MaxPooling2D(
            pool_size=(1, train_data.element_spec[0].shape[2]))
                  
        
    def call(self, inputs):
        x = self.embedding(inputs)
        for conv1d_layer in self.conv1d_layers:
            x = conv1d_layer(x)
        x = self.maxpool2d(x)
        x = tf.reshape(x, (inputs.shape[0], inputs.shape[1], self.filters))
        return x
    
    

# 
line_encoder =   LogLineEncoder()
# the model doesn't have a state unless it is called at least once
# in order to initialize the model we need a sample data 
sample_train_data = next(iter(train_data))
sample_x_train = sample_train_data[0]
print('sample_x_train.shape:', sample_x_train.shape)
# now we will initialize the model with the sample data
loglineEmbedding = line_encoder(sample_x_train)
print('loglineEmbedding.shape:', loglineEmbedding.shape)
# Now the model have a state and can be inspected        
line_encoder.summary()

vocab_size: 50
sample_x_train.shape: (32, 32, 64)
loglineEmbedding.shape: (32, 32, 64)
Model: "log_line_encoder"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_2 (Embedding)      multiple                  2550      
_________________________________________________________________
conv1d_12 (Conv1D)           multiple                  9664      
_________________________________________________________________
conv1d_13 (Conv1D)           multiple                  12352     
_________________________________________________________________
conv1d_14 (Conv1D)           multiple                  12352     
_________________________________________________________________
max_pooling2d_2 (MaxPooling2 multiple                  0         
Total params: 36,918
Trainable params: 36,918
Non-trainable params: 0
_________________________________________________________________


LOG SEQUENCE EMBEDDING TAKES LOGLINE EMBEDDING AS INPUT

In [14]:
class LogSeqEncoder(tf.keras.Model):
    
    def __init__(self, num_of_conv1d=3,  filters=64,
                 kernel_size=3, maxpool_1=True,
                 dense_neurons=16, dense_activation='relu',):
        super().__init__()
        self.num_of_conv1d = num_of_conv1d
        self.dense_neurons = dense_neurons
        self.filters = filters
        self.kernel_size = kernel_size
        self.maxpool_1 = maxpool_1
        self.dense_activation = dense_activation
        self.conv1d_layers = [tf.keras.layers.Conv1D(filters=filters, 
                                                kernel_size=kernel_size, 
                                                padding='same')  
                       for _ in range(self.num_of_conv1d)]
        self.maxpool1d = tf.keras.layers.MaxPooling1D(pool_size=(train_data.element_spec[0].shape[1]) )
        
        self.Dense = tf.keras.layers.Dense(self.dense_neurons, 
                                           activation=self.dense_activation)
       
        
    def call(self, inputs):       
        for conv1d_layer in self.conv1d_layers:
            x = conv1d_layer(inputs)
        x = self.maxpool1d(x)        
        x = tf.reshape(x, (inputs.shape[0], self.filters))
        x = self.Dense(x)
        return x
    
    

logSeqencer =   LogSeqEncoder()
# the model doesn't have a state unless it is called at least once
logSeqEmbedding = logSeqencer(loglineEmbedding)
print('logSeqEmbedding.shape:', logSeqEmbedding.shape)
# Now the model have a state and can be inspected        
logSeqencer.summary()

logSeqEmbedding.shape: (32, 16)
Model: "log_seq_encoder"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
conv1d_15 (Conv1D)           multiple                  12352     
_________________________________________________________________
conv1d_16 (Conv1D)           multiple                  12352     
_________________________________________________________________
conv1d_17 (Conv1D)           multiple                  12352     
_________________________________________________________________
max_pooling1d_2 (MaxPooling1 multiple                  0         
_________________________________________________________________
dense_4 (Dense)              multiple                  1040      
Total params: 38,096
Trainable params: 38,096
Non-trainable params: 0
_________________________________________________________________


In [15]:
# sample_x_train

In [16]:
class LogClassifier(tf.keras.Model):
    
    def __init__(self,  **kwargs):
        super().__init__(**kwargs)
        self.log_line_encoder = LogLineEncoder()
        self.log_seq_encoder = LogSeqEncoder()
        self.classifier = tf.keras.layers.Dense(
            train_data.element_spec[1].shape[1], activation='softmax')
#         self.extract_feature = extract_feature
    
    def call(self, inputs, extract_feature=False,):
#         x_data, y_data = inputs
        x = self.log_line_encoder(inputs)
        seq_embedding = self.log_seq_encoder(x)
        
        if  extract_feature:
            output = seq_embedding
        else:
            output = self.classifier(seq_embedding)
        return output
    
log_classifier = LogClassifier()
log_classifier(sample_x_train)        

vocab_size: 50


<tf.Tensor: shape=(32, 4), dtype=float32, numpy=
array([[0.35821474, 0.13428831, 0.24254903, 0.26494795],
       [0.35491565, 0.1391793 , 0.24208418, 0.2638209 ],
       [0.375133  , 0.1356046 , 0.21780658, 0.27145582],
       [0.38075638, 0.12354345, 0.21483545, 0.28086472],
       [0.38417214, 0.12913577, 0.21276838, 0.27392364],
       [0.3580804 , 0.13689776, 0.23292485, 0.27209705],
       [0.35545176, 0.13543256, 0.2281103 , 0.28100547],
       [0.39000586, 0.12042741, 0.20709819, 0.28246853],
       [0.3608639 , 0.13162096, 0.23645732, 0.2710579 ],
       [0.35545176, 0.13543256, 0.2281103 , 0.28100547],
       [0.40143406, 0.11165593, 0.20353052, 0.28337947],
       [0.369081  , 0.13193883, 0.2124407 , 0.28653944],
       [0.35913053, 0.137531  , 0.23262696, 0.27071157],
       [0.3781036 , 0.13510987, 0.2082549 , 0.27853167],
       [0.34989867, 0.14130919, 0.22489977, 0.28389236],
       [0.39720216, 0.11340225, 0.2012538 , 0.28814167],
       [0.39713928, 0.10840394, 0.21171

In [17]:
# the classifier assigned low probability to all the classes since it is untrained
# TODO: the mode should accept a single sequence. At present it is accepting only a batch

In [18]:
log_classifier.summary()

Model: "log_classifier"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
log_line_encoder_1 (LogLineE multiple                  36918     
_________________________________________________________________
log_seq_encoder_1 (LogSeqEnc multiple                  38096     
_________________________________________________________________
dense_6 (Dense)              multiple                  68        
Total params: 75,082
Trainable params: 75,082
Non-trainable params: 0
_________________________________________________________________


In [19]:
# This is to check that the model's built in  complile and fit is working well
log_classifier.compile(optimizer='adam', 
                  loss='categorical_crossentropy',
              metrics=['accuracy', tf.keras.metrics.Precision(), tf.keras.metrics.Recall()])
hist = log_classifier.fit(train_data, validation_data=test_data, epochs=1) 

Please report this to the TensorFlow team. When filing the bug, set the verbosity to 10 (on Linux, `export AUTOGRAPH_VERBOSITY=10`) and attach the full output.
Cause: module 'gast' has no attribute 'Index'
Please report this to the TensorFlow team. When filing the bug, set the verbosity to 10 (on Linux, `export AUTOGRAPH_VERBOSITY=10`) and attach the full output.
Cause: module 'gast' has no attribute 'Index'
Please report this to the TensorFlow team. When filing the bug, set the verbosity to 10 (on Linux, `export AUTOGRAPH_VERBOSITY=10`) and attach the full output.
Cause: module 'gast' has no attribute 'Index'
Please report this to the TensorFlow team. When filing the bug, set the verbosity to 10 (on Linux, `export AUTOGRAPH_VERBOSITY=10`) and attach the full output.
Cause: module 'gast' has no attribute 'Index'


In [20]:
# now after the training the predeicitoin will show higher probability to the 
# a class and lesser probability to other classes
log_classifier(sample_x_train)      

<tf.Tensor: shape=(32, 4), dtype=float32, numpy=
array([[9.83800590e-01, 1.59811135e-02, 7.62605996e-05, 1.42064076e-04],
       [9.81814444e-01, 1.79334246e-02, 9.93448266e-05, 1.52733337e-04],
       [9.96633589e-01, 4.76912275e-04, 1.24792627e-04, 2.76482617e-03],
       [1.63851166e-03, 8.47785413e-05, 1.17509440e-02, 9.86525774e-01],
       [9.96272326e-01, 4.85389668e-04, 1.45744227e-04, 3.09653906e-03],
       [9.85013008e-01, 1.47707565e-02, 7.50642139e-05, 1.41203738e-04],
       [9.94609356e-01, 2.36800150e-03, 1.73490393e-04, 2.84919282e-03],
       [3.56137866e-06, 7.82679650e-04, 9.98905420e-01, 3.08348332e-04],
       [8.14913015e-04, 9.99016285e-01, 1.62149983e-04, 6.53520510e-06],
       [9.94609356e-01, 2.36800150e-03, 1.73490393e-04, 2.84919282e-03],
       [1.89539969e-06, 4.88373160e-04, 9.99277294e-01, 2.32436854e-04],
       [3.45080800e-04, 9.86293708e-06, 1.65420293e-03, 9.97990847e-01],
       [9.83729064e-01, 1.60406213e-02, 8.35043174e-05, 1.46816092e-04],
  

In [21]:
features = log_classifier(sample_x_train, extract_feature=True ) 
print('features.shape:', features.shape)
features[:3]

features.shape: (32, 16)


<tf.Tensor: shape=(3, 16), dtype=float32, numpy=
array([[16.370695  ,  0.        ,  2.057126  ,  4.2832656 ,  0.        ,
         1.3033314 ,  0.31263167,  0.        ,  0.90409607,  0.        ,
         0.35633054,  0.        ,  5.6959033 ,  6.7388763 ,  0.        ,
         0.        ],
       [16.299906  ,  0.        ,  2.1437495 ,  4.3283706 ,  0.        ,
         1.3558165 ,  0.38486403,  0.        ,  1.0134336 ,  0.        ,
         0.45871457,  0.        ,  5.6253996 ,  6.6684337 ,  0.        ,
         0.        ],
       [14.986892  ,  0.        ,  2.2317116 ,  8.941614  ,  0.        ,
         1.4058796 ,  0.        ,  0.        ,  1.7621404 ,  0.        ,
         0.        ,  0.        ,  4.8268437 ,  6.081328  ,  0.        ,
         0.        ]], dtype=float32)>

In [22]:
np.zeros((2, 2))

array([[0., 0.],
       [0., 0.]])

In [23]:
for batch in train_data:
    x_train, y_train = batch
    print(x_train)
    print(y_train)
    break

tf.Tensor(
[[[13 13 13 ... 11 11 25]
  [13 13 13 ... 11 11 25]
  [13 13 13 ... 11 11 25]
  ...
  [13 13 13 ... 11 11 25]
  [13 13 22 ... 11 11 25]
  [13 13 22 ... 11 11 25]]

 [[ 2 19 11 ...  8 10  2]
  [ 2 19 11 ...  8 10  2]
  [ 2 19 11 ...  8 10  2]
  ...
  [ 2 19 11 ...  8 10  2]
  [ 2 19 11 ...  8 10  2]
  [ 2 19 11 ...  8 10  2]]

 [[18  2  4 ...  0  0  0]
  [18  2  4 ...  0  0  0]
  [18  2  4 ...  0  0  0]
  ...
  [18  2  4 ...  0  0  0]
  [18  2  4 ...  0  0  0]
  [18  2  4 ...  0  0  0]]

 ...

 [[10  5 14 ... 11 14 23]
  [10  5 14 ... 23 10 28]
  [10  5 14 ... 22 10 13]
  ...
  [22 30 17 ...  8  4 14]
  [12  3  6 ... 12  6 18]
  [ 4 22 30 ...  9  0  0]]

 [[12  3  6 ... 12  6 18]
  [ 4 30 30 ...  9  0  0]
  [30 30 17 ...  8  4 14]
  ...
  [30 30 17 ...  8  4 14]
  [12  3  6 ... 12  6 18]
  [ 4 30 30 ...  9  0  0]]

 [[18  2  4 ...  0  0  0]
  [18  2  4 ...  0  0  0]
  [18  2  4 ...  0  0  0]
  ...
  [18  2  4 ...  0  0  0]
  [18  2  4 ...  0  0  0]
  [18  2  4 ...  0  0  0]]]

In [24]:
centroids = np.zeros((train_data.element_spec[1].shape[1],   16))
print('centriods initialized:', centroids)
total_labels = np.zeros(train_data.element_spec[1].shape[1]) # it was 4
# total_labels[2] += 1
# total_labels[2] += 1
print('total_labels initialized:', total_labels)
for batch in train_data: # Remember <BatchDataset shapes: ((32, 32, 64), (32, 4)), types: (tf.int32, tf.float32)>
    logseq_batch, label_batch = batch
    # (32, 32, 64), (32, 4)
    features = log_classifier(logseq_batch, extract_feature=True )
    # (32, 16) features - 32 sequence of line each haaving 64 characrers
    # produces a feaure vector of dimension 16. 
    for i in range(len(label_batch)): # (32, 4) --> here length is 32
        label = label_batch[i] # label looks like [0 0 0 1]
        numeric_label = np.argmax(label) # index position of the label = 3 , so it is actually class =3
        ##total_labels = [0 0 0 0] each col representing a class 
        ## count the number for each class
        total_labels[numeric_label] += 1 
        centroids[numeric_label] += features[i] 
        # each row index in the centroid array is a class
        # we add first identify the feature belonging to which class by the numeric_label
        # Then add all the features belonging to the class in the corresponding row of the centroid array

### shape of centroids is (4, 16) whereas shape of total_labels is (1, 4)
### reshape the total_labels as 4,1 ==> [[0], [0], [0], [0]]==> 4 rows 
## so that we can divide the centroids array by the total_labels
total_label_reshaped = np.reshape(total_labels, (train_data.element_spec[1].shape[1], 1))
centroids /= total_label_reshaped
print('centroids:',centroids)
print('total_labels:',total_label_reshaped)

centriods initialized: [[0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]]
total_labels initialized: [0. 0. 0. 0.]
centroids: [[1.59228516e+01 0.00000000e+00 2.11870209e+00 5.98598145e+00
  0.00000000e+00 1.25047585e+00 2.19539013e-01 0.00000000e+00
  1.56062897e+00 0.00000000e+00 2.68166313e-01 0.00000000e+00
  5.16910828e+00 6.25313660e+00 0.00000000e+00 0.00000000e+00]
 [1.74468457e+01 0.00000000e+00 3.22822418e-01 2.02313873e+00
  0.00000000e+00 3.26722992e+00 2.75704742e+00 0.00000000e+00
  8.89736511e+00 0.00000000e+00 4.26681946e+00 0.00000000e+00
  1.02011330e+00 2.11967209e+00 0.00000000e+00 0.00000000e+00]
 [1.02403418e+01 0.00000000e+00 3.78646240e+00 1.14958667e+01
  0.00000000e+00 1.84005249e+00 2.75890717e+00 0.00000000e+00
  1.34137109e+01 0.00000000e+00 6.58589233e+00 0.00000000e+00
  2.07575560e-02 1.36120820e-02 0.000

In [25]:
# to understand the np divide operation
n1 = np.arange(12)
n2 = n1.reshape((3, 4))
print('n2 is 3 rows and each row has a vector of 4 cols:\n', n2)
n3 = np.array([2, 2, 2])
print('n3, is one single row:\n ', n3)
n4 = np.reshape(n3, (3, 1))
print('one single row is now converted to three rows, n4:\n', n4)
print(f'now division between n2: {n2.shape} and n4: {n4.shape} is possible')
n2/n4

n2 is 3 rows and each row has a vector of 4 cols:
 [[ 0  1  2  3]
 [ 4  5  6  7]
 [ 8  9 10 11]]
n3, is one single row:
  [2 2 2]
one single row is now converted to three rows, n4:
 [[2]
 [2]
 [2]]
now division between n2: (3, 4) and n4: (3, 1) is possible


array([[0. , 0.5, 1. , 1.5],
       [2. , 2.5, 3. , 3.5],
       [4. , 4.5, 5. , 5.5]])

In [26]:
#take zi and a ck 
# from sample_x_train the first sample belongs to class 3
print('sample_x_train[0]', sample_x_train[0])
sample_y_train = sample_train_data[1]
print('sample_y_train[0]', sample_y_train[0])
print('feature for the same:', features[0])
print('centroid for the class 3 :', centroids[3])

sample_x_train[0] tf.Tensor(
[[18  2  4 ...  0  0  0]
 [18  2  4 ...  0  0  0]
 [18  2  4 ...  0  0  0]
 ...
 [18  2  4 ...  0  0  0]
 [18  2  4 ...  0  0  0]
 [18  2  4 ...  0  0  0]], shape=(32, 64), dtype=int32)
sample_y_train[0] tf.Tensor([1. 0. 0. 0.], shape=(4,), dtype=float32)
feature for the same: tf.Tensor(
[10.185282   0.         3.8696542 12.428818   0.         1.611328
  2.503866   0.        14.205777   0.         6.697892   0.
  0.         0.         0.         0.       ], shape=(16,), dtype=float32)
centroid for the class 3 : [11.13327759  0.          0.43634995 14.62334229  0.          0.86098663
  0.1899966   0.         10.60090942  0.          1.73968216  0.
  0.03390059  0.1144355   0.          0.        ]


In [27]:
# eucladian distance 
#first sample belonging to class 3 = z_0_3, dimension of it is same as the dense neuron=16
z_0_3= features[0] # [16], earlier [2048]
C_3 = centroids[3] # [16], earlier [2048]
ED = np.sum(np.square(z_0_3 - C_3 ))
print('eucladian distance:', ED)
# InvalidArgumentError: Incompatible shapes: [32,64] vs. [2048] [Op:Sub]

eucladian distance: 61.01238


In [28]:
# lets f1 is one feature from a batch of 32
# instead of 16, lets say the dimension of f1 is 5
f1 = np.arange(20, 25)
# a1 = np.reshape(a1, (4, 3) )
print('one feature with dimension 5 , f1:\n', f1)
### lets say ctd is the centroids with 4 class , each row is a class 
ctd = np.arange(20)
ctd = np.reshape(ctd, (4, 5))
print('centroid, ctd with 4 class :\n', ctd)
sub_result = f1 - ctd
print()
print('sub_result, for each feature we have 4 rows after substraction:\n', sub_result)
print('the 4 rows of the result are the distance of the feature from the centroid of each class')
for i, row in enumerate(sub_result):
    print(f'distance of f1 from centroid of class_{i}: {row}')

one feature with dimension 5 , f1:
 [20 21 22 23 24]
centroid, ctd with 4 class :
 [[ 0  1  2  3  4]
 [ 5  6  7  8  9]
 [10 11 12 13 14]
 [15 16 17 18 19]]

sub_result, for each feature we have 4 rows after substraction:
 [[20 20 20 20 20]
 [15 15 15 15 15]
 [10 10 10 10 10]
 [ 5  5  5  5  5]]
the 4 rows of the result are the distance of the feature from the centroid of each class
distance of f1 from centroid of class_0: [20 20 20 20 20]
distance of f1 from centroid of class_1: [15 15 15 15 15]
distance of f1 from centroid of class_2: [10 10 10 10 10]
distance of f1 from centroid of class_3: [5 5 5 5 5]


In [29]:
# to get the eucladian distance, calculate squared substraction
squared_substraction = np.square(sub_result)
print('squared_substraction:\n', squared_substraction)
#### eucladian distance of f1 from each class represented by a vector of 5 dimension
### those 5 values from the vector is summed up tp get a scalar value 
sum_squared_substraction = np.sum(np.square(sub_result), axis=1)
print('sum_squared_substraction:\n', sum_squared_substraction )

squared_substraction:
 [[400 400 400 400 400]
 [225 225 225 225 225]
 [100 100 100 100 100]
 [ 25  25  25  25  25]]
sum_squared_substraction:
 [2000 1125  500  125]


In [30]:
### the abobe example is for a single feature , how to do this for all the features
# can we substract centroids array from the entire feature array? 
# we can not substract different shaped arrays , see the error
print('features.shape', features.shape) # ## features.shape (32, 16)
print('centroids.shape', centroids.shape)  ##centroids.shape (4, 16)
# features - centroids # InvalidArgumentError: Incompatible shapes: [32,16] vs. [4,16] [Op:Sub]
#### substraction is elementwise substraction. so we both the array have to equal shape
z = np.expand_dims(features, axis=1) ## (32, 1, 16)
C =  np.expand_dims(centroids, axis=0) ### (1, 4, 16)
print('np.expand_dims(features, axis=1) :', z.shape)
print('np.expand_dims(centroids, axis=0):', C.shape)
# print('first dimenstion of C:',C[0])
# Now we can substract
sub_z_C = z - C
print('sub_z_C , for each feature 4 results:', sub_z_C.shape)
squred_sum = np.sum(np.square(z- C), axis=2)
print('squred_sum', squred_sum.shape)
print('Eucaldian distance of first feature from the 4 classes:\n',squred_sum[0])

features.shape (32, 16)
centroids.shape (4, 16)
np.expand_dims(features, axis=1) : (32, 1, 16)
np.expand_dims(centroids, axis=0): (1, 4, 16)
sub_z_C , for each feature 4 results: (32, 4, 16)
squred_sum (32, 4)
Eucaldian distance of first feature from the 4 classes:
 [349.90658958 216.0175884    1.6382401   61.01237844]


In [31]:
# from the data iteration done beforem we will have the last features and the last label_batch
## lets see what is the label for the 0th element in that batch
print('0th element from the last bath retrieved from data iteration previously:\n', features[0])
print('label for the feature_0:\n', label_batch[0])

0th element from the last bath retrieved from data iteration previously:
 tf.Tensor(
[10.185282   0.         3.8696542 12.428818   0.         1.611328
  2.503866   0.        14.205777   0.         6.697892   0.
  0.         0.         0.         0.       ], shape=(16,), dtype=float32)
label for the feature_0:
 tf.Tensor([0. 0. 1. 0.], shape=(4,), dtype=float32)


In [32]:
# Here eucladian distance of the first feature is least from the class_0 

In [33]:
# make the dimensions same for substraction
def euclidean_metric(a, b):
    a = np.expand_dims(a, 1)
    b = np.expand_dims(b, 0)
#     logits = -((a - b)**2).sum(dim=2)
    logits = np.sum(-np.square(a - b), axis=2)
    return logits  

ED_logits = euclidean_metric(features, centroids)
print('ED_logits', ED_logits.shape)
print('ED_logits_sample', ED_logits[0])

ED_logits (32, 4)
ED_logits_sample [-349.90658958 -216.0175884    -1.6382401   -61.01237844]


In [34]:
### we got the same result only in negative sign 
# these four eucladian values can be taken as softmax 
# to convert it as probability amonghst the four
# Then the max value ( that is why -ve sign will help)
# will represent the class with highest probability

In [35]:
t = tf.range(10)
t = tf.reshape(t, (2, 5))
print(t)
tf.argmax(t, axis=1)

tf.Tensor(
[[0 1 2 3 4]
 [5 6 7 8 9]], shape=(2, 5), dtype=int32)


<tf.Tensor: shape=(2,), dtype=int64, numpy=array([4, 4], dtype=int64)>

In [36]:
# can we get the max value instead of the index
tf.reduce_max(t, 1)

<tf.Tensor: shape=(2,), dtype=int32, numpy=array([4, 9])>

In [37]:
# smax = tf.nn.softmax(ED_logits, axis=1)
smax = tf.nn.softmax(ED_logits, )
print('smax.shape:', smax.shape)
class_idx_having_minimum_distance = tf.math.argmax(smax, axis=1)
for i in range(5):    
    print('smax_sample:', smax[i].numpy())
    print('class_idx_having_minimum_distance:', class_idx_having_minimum_distance[i].numpy())

smax.shape: (32, 4)
smax_sample: [5.61019037e-152 7.87466337e-094 1.00000000e+000 1.63734180e-026]
class_idx_having_minimum_distance: 2
smax_sample: [8.85826821e-118 1.78730061e-116 8.36417246e-037 1.00000000e+000]
class_idx_having_minimum_distance: 3
smax_sample: [6.44885863e-48 1.00000000e+00 5.66201308e-63 3.87784564e-73]
class_idx_having_minimum_distance: 1
smax_sample: [1.00000000e+000 2.11417181e-069 9.14938496e-117 1.24417711e-078]
class_idx_having_minimum_distance: 0
smax_sample: [1.08049842e-123 9.96291698e-070 1.00000000e+000 2.96436126e-028]
class_idx_having_minimum_distance: 2


In [38]:
print('label for the feature_0:\n', label_batch[0])
label_indexs = np.argmax(label_batch, axis=1)
print('label_indexs.shape', label_indexs.shape)
print('sample label_indexes',label_indexs[:5])
print('centroids.shape:', centroids.shape)
c = centroids[label_indexs]
print('c.shape:', c.shape)
print(c[0])

label for the feature_0:
 tf.Tensor([0. 0. 1. 0.], shape=(4,), dtype=float32)
label_indexs.shape (32,)
sample label_indexes [2 3 1 0 2]
centroids.shape: (4, 16)
c.shape: (32, 16)
[10.2403418   0.          3.7864624  11.4958667   0.          1.84005249
  2.75890717  0.         13.41371094  0.          6.58589233  0.
  0.02075756  0.01361208  0.          0.        ]


In [39]:
### To understand this lets take simple example
a = np.arange(12).reshape((4, 3))
print('a:\n',a)
# print(a.shape)
print('a[0]:\n',a[0])
print()
# we create a index whose dimension is higher than a 
b = np.tile([0, 1, 2, 3], (4))
print('b.shape:\n',b.shape)
# b = 16 but a is having 4 rows
print('b:\n',b)
#Notice that each element of b is wihin 0-3 , matching with the max row of a.
# the slicing can be done now as:
print()
print(a[b])

a:
 [[ 0  1  2]
 [ 3  4  5]
 [ 6  7  8]
 [ 9 10 11]]
a[0]:
 [0 1 2]

b.shape:
 (16,)
b:
 [0 1 2 3 0 1 2 3 0 1 2 3 0 1 2 3]

[[ 0  1  2]
 [ 3  4  5]
 [ 6  7  8]
 [ 9 10 11]
 [ 0  1  2]
 [ 3  4  5]
 [ 6  7  8]
 [ 9 10 11]
 [ 0  1  2]
 [ 3  4  5]
 [ 6  7  8]
 [ 9 10 11]
 [ 0  1  2]
 [ 3  4  5]
 [ 6  7  8]
 [ 9 10 11]]


In [41]:
# remeber c = centroids[label_indexs]
# only the centroid of the class corresponding to the feature
# earlier we had to expand dimension becuase centroid contained all the 
# four classes
dis = features - c 
print(dis.shape)
euc_dis = tf.norm(features - c, ord='euclidean', axis=1,)
print('euc_dis', euc_dis.shape)
for d, ed in zip(dis[:5], euc_dis[:5]):
    print(f'distance: {d.numpy()}\n eucladian distance:{ed.numpy()}\n')

(32, 16)
euc_dis (32,)
distance: [-0.05506039  0.          0.08319187  0.932951    0.         -0.22872448
 -0.25504112  0.          0.7920666   0.          0.11199999  0.
 -0.02075756 -0.01361208  0.          0.        ]
 eucladian distance:1.2799376249313354

distance: [-0.07839584  0.         -0.43634996  1.3717766   0.         -0.13542652
 -0.1899966   0.         -0.2310648   0.         -0.8690177   0.
 -0.03390059 -0.1144355   0.          0.        ]
 eucladian distance:1.7191815376281738

distance: [-3.9668312   0.         -0.30758154 -0.28688192  0.         -1.0577219
 -0.8485044   0.         -1.6614985   0.         -0.6632192   0.
 -0.45969117 -1.9281311   0.          0.        ]
 eucladian distance:4.988069534301758

distance: [-0.92845726  0.          0.15833092  2.9910045   0.          0.09033704
 -0.21953902  0.          0.22963023  0.         -0.2681663   0.
 -0.36125326 -0.16652346  0.          0.        ]
 eucladian distance:3.1894264221191406

distance: [-1.248992    0. 

In [None]:
class BoundaryLoss(tf.keras.layers.Layer):
    def __init__(self, num_labels=train_data.element_spec[1].shape[1], 
                feat_dim = 16):
        super().__init__()
        self.num_labels = num_labels
        self.feat_dim = feat_dim
        # initializing the delta boundary 
        w_init = tf.random_normal_initializer()
        self.delta = tf.Variable(
            initial_value=w_init((self.num_labels), dtype="float32"),
            trainable=True,
        )
        
    def call(self, features, centroids, labels):
        logits =  euclidean_metric(features, centroids)  
        ######### Why softmax before softplus#########
        smax = tf.nn.softmax(logits, )
        # this is equivallent to predicting the feature belong to which class
        preds = tf.math.argmax(smax, axis=1)
        # This is equivallent to obtaining the max probabiliy of a feature belonging to a calss
        probs = tf.reduce_max(smax, 1)        
        ############################
        # delta =  log(1 + e ^ delta_k) , delta_k =self.delta = parameters for the boundary
        delta = tf.nn.softplus(self.delta)  
        label_indexs = np.argmax(label_batch, axis=1)
        # centroids are having only 4 rows , whereas labels are rows equivallent to batch
        # pick-up the centroid for each class 
        # label_index from the data set will have all the classes, 32 for a batch
        # for each class cetroid[class_index] will give the centroid of the calss
        # it is basically : [centroids[class_idx] for class_idx in label_indexes]
        c = centroids[label_indexs]
        # similarly get the delta for each class, 
        # although delta is now randomly intialized 
        # delta parameters will be learned through the training
        d = delta[label_indexs]
        x = features
        # x-c = vector of (32, 16) dimension , euc_dis  = scalar value
        euc_dis = tf.norm(x - c, ord='euclidean', axis=1)        
        ##If axis is None (the default), the input is considered a vector and a 
        ## single vector norm is computed over the entire set of values in the tensor, 
        ## i.e. norm(tensor, ord=ord) is equivalent to norm(reshape(tensor, [-1]), ord=ord). 
        ##If axis is a Python integer, the input is considered a batch of vectors, and axis determines the axis in tensor over which to compute vector norms.
        pos_mask = euc_dis > d
        neg_mask = euc_dis < d
        
        pos_loss = (euc_dis - d) * pos_mask
        neg_loss = (d - euc_dis) * neg_mask
        loss = pos_loss.mean() + neg_loss.mean()
        
        return loss, delta

In [None]:
# understand this , a= features(batch_size, 2048) , b = centroids (4, 2048)
def euclidean_metric_torch(a, b):
    n = a.shape[0]
    m = b.shape[0]
    a = a.unsqueeze(1).expand(n, m, -1)
    b = b.unsqueeze(0).expand(n, m, -1)
    logits = -((a - b)**2).sum(dim=2)
    return logits

In [None]:
import numpy as np
import tensorflow as tf
a = np.arange(6)
a = a.reshape((2, -1))
print('a:', a)
print('a.shape', a.shape)
b = np.arange(8, 16)
print('b',b)
b = np.reshape(b, (4, -1))
print('b',b)
print('b.shape:',b.shape)
tfa = tf.constant(a)
tfb = tf.constant(b)
print('tfa',tfa)
print('tfb',tfb)
# n = tfa.shape[0]
# m = b.shape[0]


In [None]:
print('tf.expand_dims(tfa, 0) :',tf.expand_dims(tfa, 0))
print()
print('tf.expand_dims(tfa, 1) :',tf.expand_dims(tfa, 1))
print()
print('tf.expand_dims(tfa, 1) :',tf.expand_dims(tfa, -1))

In [None]:
tfa = tf.expand_dims(tfa, 1)
print(f'tf.shape(tfa): {tf.shape(tfa)}')
tfb = tf.expand_dims(tfb, 0)
print(f'tf.shape(tfb): {tf.shape(tfb)}')

In [None]:
n = a.shape[0]
m = b.shape[0]
tfa_broadcast = tf.broadcast_to(tfa, [2, 4, 3])
tf.shape(tfa_broadcast)
print('tfa_broadcast',tfa_broadcast)

In [None]:
# 6

# The equivalent function for pytorch expand is tensorflow tf.broadcast_to

# Docs: https://www.tensorflow.org/api_docs/python/tf/broadcast_to

# Share
# Follow
# edited Oct 23, 2021 at 18:22

# M.Innat
# 12.2k66 gold badges3434 silver badges6767 bronze badges
# answered Jan 4, 2019 at 9:12

# funkyyyyyy
# 6111 silver badge22 bronze badges
# Add a comment

# 0

# Tensorflow automatically broadcasts, so in general you don't need to do any of this. Suppose you have a y' of shape 6x2x3 and your x is of shape 2x3, then you can already do y'*x or y'+x will already behave as if you had expanded it. But if for some other reason you really need to do it, then the command in tensorflow is tile:

# y = tf.tile(tf.reshape(x, (1,2,3)), multiples=(6,1,1))
# Docs: https://www.tensorflow.org/api_docs/python/tf/tile

In [None]:
def euclidean_metric(a, b):
    n = a.shape[0]
    m = b.shape[0]
    a = tf.expand_dims(a, 1)
    b = tf.expand_dims(b, 0)
    logits = -((a - b)**2).sum(dim=2)
    return logits  

In [None]:
class OpenSet:
    def __init__(self, data, pretrained_model):
        
        self.model = pretrained_model
        self.best_eval_score = 0
        self.delta = None
        self.delta_points = []
        self.centroids = None
        self.test_results = None
        self.predictions = None
        self.true_labels = None
        
    def centroids_cal(self):
        centriods = np.zeros(train_data.element_spec[1].shape[1], embedding_size)
        total_labels = np.empty(0, dtype=longdouble)
        


In [None]:
# Customizing the training step to get centroid for each class
class OpenSet:
    def __init__(self, data, pretrained_model=log_classifier):
#         super().__init__():
        self.model = pretrained_model        
        self.centroids = None
        self.num_labels = train_data.element_spec[1].shape[1]
        
    def centroids_cal(self):
        centriods = np.zeros(self.num_labels, embedding_size)
        total_labels = np.empty(0, dtype=longdouble)
        for batch in data:
            logseq_batch, label_batch = batch
            features = self.model(logseq_batch, extract_feature=True ) 
            

In [None]:
# In context of deep learning the logits layer means the layer that feeds in to softmax (or other such normalization). The output of the softmax are the probabilities for the classification task and its input is logits layer. The logits layer typically produces values from -infinity to +infinity and the softmax layer transforms it to values from 0 to 1.

# Historical Context

# Where does this term comes from? In 1930s and 40s, several people were trying to adapt linear regression to the problem of predicting probabilities. However linear regression produces output from -infinity to +infinity while for probabilities our desired output is 0 to 1. One way to do this is by somehow mapping the probabilities 0 to 1 to -infinity to +infinity and then use linear regression as usual. One such mapping is cumulative normal distribution that was used by Chester Ittner Bliss in 1934 and he called this "probit" model, short for "probability unit". However this function is computationally expensive while lacking some of the desirable properties for multi-class classification. In 1944 Joseph Berkson used the function log(p/(1-p)) to do this mapping and called it logit, short for "logistic unit". The term logistic regression derived from this as well.

# The Confusion

# Unfortunately the term logits is abused in deep learning. From pure mathematical perspective logit is a function that performs above mapping. In deep learning people started calling the layer "logits layer" that feeds in to logit function. Then people started calling the output values of this layer "logit" creating the confusion with logit the function.