In [1]:
from bglog import BGLog, get_embedding_layer
import numpy as np
import tensorflow as tf
tf.random.set_seed(123)

In [2]:
bglog = BGLog(save_padded_num_sequences=False, load_from_pkl=True)

In [3]:
train_test = bglog.get_tensor_train_test(ablation=1000)
train_data, test_data = train_test

padded_num_seq_df loaded from data\bgl_padded_num_seq_df.pkl
trained tokenizer, tk, loaded from data\bgltk.pkl
train_0:, 800
test_0:, 200
train_1:, 800
test_1:, 200
train_2:, 800
test_2:, 200
train_3:, 800
test_3:, 102
4 class does not have 800 records, it has only 628 records
test_4:, 0
5 class does not have 800 records, it has only 165 records
5 class does not have 200 records, it has only 165 records
6 class does not have 800 records, it has only 75 records
6 class does not have 200 records, it has only 75 records
[[1. 0. 0. 0.]
 [1. 0. 0. 0.]]
<BatchDataset shapes: ((32, 32, 64), (32, 4)), types: (tf.int32, tf.float32)>
<BatchDataset shapes: ((32, 32, 64), (32, 4)), types: (tf.int32, tf.float32)>


In [4]:
def model(conv1d_set1 = 3, conv1d_set2 = 3, dense_neurons=2048, filters=64,
            kernel_size=3,maxpool_1=True,epochs=25, dense_activation='relu'):
    embedding_weights, vocab_size, char_onehot = get_embedding_layer(bglog)
    B = train_data.element_spec[0].shape[0]
#     inputs = tf.keras.layers.Input(batch_shape=(B, train_data.element_spec[0].shape[1], train_data.element_spec[0].shape[2]), dtype='float64' )
    inputs = tf.keras.layers.Input(shape=(train_data.element_spec[0].shape[1], train_data.element_spec[0].shape[2]), dtype='float64' )
    x = tf.keras.layers.Embedding(input_dim=vocab_size+1,
                                    output_dim=vocab_size,
                                    input_length=train_data.element_spec[0].shape[2],
                                    weights = [embedding_weights],
                                    )(inputs)
    for _ in range(conv1d_set1):
        x = tf.keras.layers.Conv1D(filters=filters, kernel_size=kernel_size, padding='same')(x)
    if maxpool_1:
        x = tf.keras.layers.MaxPooling2D(pool_size=(1, train_data.element_spec[0].shape[2]))(x)
        x = tf.reshape(x, (B, train_data.element_spec[0].shape[1], filters))        
        for _ in range(conv1d_set2):
            x = tf.keras.layers.Conv1D(filters=filters, kernel_size=kernel_size, padding='same')(x)
        x = tf.keras.layers.MaxPooling1D(pool_size=(train_data.element_spec[0].shape[1]) )(x)
        x = tf.reshape(x, (B, filters))
    if not maxpool_1:
        x = tf.keras.layers.Flatten()(x)
    if dense_activation is None:
        x = tf.keras.layers.Dense(dense_neurons)(x)
    else:
        x = tf.keras.layers.Dense(dense_neurons, activation=dense_activation)(x)
    outputs = tf.keras.layers.Dense(train_data.element_spec[1].shape[1], activation='softmax')(x)
    model = tf.keras.Model(inputs=inputs, outputs=outputs)
    print(model.summary())
    model.compile(optimizer='adam', 
                  loss='categorical_crossentropy',
              metrics=['accuracy', tf.keras.metrics.Precision(), tf.keras.metrics.Recall()])
    hist = model.fit(train_data, validation_data=test_data, epochs=epochs) 
    return model, hist

In [5]:
# we  feed  xi  to  a dense layer h to get the log-sequence representation zi∈RD:
#     zi= h(xi) =σ(Whxi+bh) ............................(2)
# in our case zi can be obtained from the dense layer before the softmax
# Lets see how to ger it from the train mode

In [8]:
# we pre-train the model with labeled known intent samples. 
# In order to better reflect the effectiveness of the learned decision boundary, 
# we learn the feature representation zi with the simple softmax loss Ls to perform classification:

# trained_model, hist = model(epochs=6,)

In [7]:
# trained_model, hist = model(epochs=6, dense_neurons=64)

In [8]:
# Learn the decision boundary of each class constraining the known labels within a ball area
# how to get zi and how to know that zi belongs to which yi ?
# from there we will have to calculate the Ck , centroid for the class k

In [6]:
# trained_model.layers

In [10]:
# dense_6 = trained_model.get_layer(index=(len(trained_model.layers)-1))
# print(dense_6)

In [11]:
#This is the log sequence embedding from the last layer
# we can treat this as the features from the logs
# dense_6.output

In [12]:
# Then, we use the pre-trained model to extract intent features for 
# learning the decision boundary

In [12]:
class LogLineEncoder(tf.keras.Model):
    def __init__(self, num_of_conv1d=3,  
                 filters=64,
                 kernel_size=3, ):
        super().__init__()            
        self.num_of_conv1d = num_of_conv1d       
        self.filters = filters
        self.kernel_size = kernel_size           
        self.embedding_weights, self.vocab_size, self.char_onehot = get_embedding_layer(bglog)       
        
        self.embedding = tf.keras.layers.Embedding(input_dim=self.vocab_size+1,
                                    output_dim=self.vocab_size,
                                    input_length=train_data.element_spec[0].shape[2],
                                    weights = [self.embedding_weights],
                                    )
        self.conv1d_layers = [tf.keras.layers.Conv1D(filters=filters, 
                                                kernel_size=kernel_size, 
                                                padding='same')  
                       for _ in range(self.num_of_conv1d)]
        self.maxpool2d = tf.keras.layers.MaxPooling2D(
            pool_size=(1, train_data.element_spec[0].shape[2]))
                  
        
    def call(self, inputs):
        x = self.embedding(inputs)
        for conv1d_layer in self.conv1d_layers:
            x = conv1d_layer(x)
        x = self.maxpool2d(x)
        x = tf.reshape(x, (inputs.shape[0], inputs.shape[1], self.filters))
        return x
    
    

# 
line_encoder =   LogLineEncoder()
# the model doesn't have a state unless it is called at least once
# in order to initialize the model we need a sample data 
sample_train_data = next(iter(train_data))
sample_x_train = sample_train_data[0]
print('sample_x_train.shape:', sample_x_train.shape)
# now we will initialize the model with the sample data
loglineEmbedding = line_encoder(sample_x_train)
print('loglineEmbedding.shape:', loglineEmbedding.shape)
# Now the model have a state and can be inspected        
line_encoder.summary()

vocab_size: 50
sample_x_train.shape: (32, 32, 64)
loglineEmbedding.shape: (32, 32, 64)
Model: "log_line_encoder"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      multiple                  2550      
_________________________________________________________________
conv1d_6 (Conv1D)            multiple                  9664      
_________________________________________________________________
conv1d_7 (Conv1D)            multiple                  12352     
_________________________________________________________________
conv1d_8 (Conv1D)            multiple                  12352     
_________________________________________________________________
max_pooling2d_1 (MaxPooling2 multiple                  0         
Total params: 36,918
Trainable params: 36,918
Non-trainable params: 0
_________________________________________________________________


LOG SEQUENCE EMBEDDING TAKES LOGLINE EMBEDDING AS INPUT

In [13]:
class LogSeqEncoder(tf.keras.Model):
    
    def __init__(self, num_of_conv1d=3,  filters=64,
                 kernel_size=3, maxpool_1=True,
                 dense_neurons=16, dense_activation='relu',):
        super().__init__()
        self.num_of_conv1d = num_of_conv1d
        self.dense_neurons = dense_neurons
        self.filters = filters
        self.kernel_size = kernel_size
        self.maxpool_1 = maxpool_1
        self.dense_activation = dense_activation
        self.conv1d_layers = [tf.keras.layers.Conv1D(filters=filters, 
                                                kernel_size=kernel_size, 
                                                padding='same')  
                       for _ in range(self.num_of_conv1d)]
        self.maxpool1d = tf.keras.layers.MaxPooling1D(pool_size=(train_data.element_spec[0].shape[1]) )
        
        self.Dense = tf.keras.layers.Dense(self.dense_neurons, 
                                           activation=self.dense_activation)
       
        
    def call(self, inputs):       
        for conv1d_layer in self.conv1d_layers:
            x = conv1d_layer(inputs)
        x = self.maxpool1d(x)        
        x = tf.reshape(x, (inputs.shape[0], self.filters))
        x = self.Dense(x)
        return x
    
    

logSeqencer =   LogSeqEncoder()
# the model doesn't have a state unless it is called at least once
logSeqEmbedding = logSeqencer(loglineEmbedding)
print('logSeqEmbedding.shape:', logSeqEmbedding.shape)
# Now the model have a state and can be inspected        
logSeqencer.summary()

logSeqEmbedding.shape: (32, 16)
Model: "log_seq_encoder"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
conv1d_9 (Conv1D)            multiple                  12352     
_________________________________________________________________
conv1d_10 (Conv1D)           multiple                  12352     
_________________________________________________________________
conv1d_11 (Conv1D)           multiple                  12352     
_________________________________________________________________
max_pooling1d_1 (MaxPooling1 multiple                  0         
_________________________________________________________________
dense_2 (Dense)              multiple                  1040      
Total params: 38,096
Trainable params: 38,096
Non-trainable params: 0
_________________________________________________________________


In [15]:
# sample_x_train

In [14]:
class LogClassifier(tf.keras.Model):
    
    def __init__(self,  **kwargs):
        super().__init__(**kwargs)
        self.log_line_encoder = LogLineEncoder()
        self.log_seq_encoder = LogSeqEncoder()
        self.classifier = tf.keras.layers.Dense(
            train_data.element_spec[1].shape[1], activation='softmax')
#         self.extract_feature = extract_feature
    
    def call(self, inputs, extract_feature=False,):
#         x_data, y_data = inputs
        x = self.log_line_encoder(inputs)
        seq_embedding = self.log_seq_encoder(x)
        
        if  extract_feature:
            output = seq_embedding
        else:
            output = self.classifier(seq_embedding)
        return output
    
log_classifier = LogClassifier()
log_classifier(sample_x_train)        

vocab_size: 50


<tf.Tensor: shape=(32, 4), dtype=float32, numpy=
array([[0.33716607, 0.24919626, 0.05442087, 0.35921675],
       [0.2766537 , 0.29489642, 0.08262176, 0.34582812],
       [0.3199095 , 0.25782165, 0.05833273, 0.36393607],
       [0.28731778, 0.28267062, 0.07286509, 0.35714644],
       [0.3355232 , 0.24599618, 0.0549023 , 0.36357826],
       [0.31767663, 0.2631231 , 0.07114722, 0.34805304],
       [0.31268576, 0.2628748 , 0.08044287, 0.3439965 ],
       [0.32523376, 0.26528654, 0.06874856, 0.3407311 ],
       [0.32288972, 0.25911242, 0.06923639, 0.34876144],
       [0.30879265, 0.2646237 , 0.06299507, 0.36358863],
       [0.3413578 , 0.25154853, 0.06545333, 0.34164026],
       [0.33662525, 0.26084292, 0.06374656, 0.33878526],
       [0.3221508 , 0.2550465 , 0.05469545, 0.36810726],
       [0.30857155, 0.27156097, 0.06484332, 0.35502425],
       [0.30648598, 0.27784422, 0.08734404, 0.32832575],
       [0.30743182, 0.25875893, 0.09034371, 0.34346563],
       [0.32990924, 0.25099352, 0.06788

In [17]:
# the classifier assigned low probability to all the classes since it is untrained
# TODO: the mode should accept a single sequence. At present it is accepting only a batch

In [15]:
log_classifier.summary()

Model: "log_classifier"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
log_line_encoder_1 (LogLineE multiple                  36918     
_________________________________________________________________
log_seq_encoder_1 (LogSeqEnc multiple                  38096     
_________________________________________________________________
dense_4 (Dense)              multiple                  68        
Total params: 75,082
Trainable params: 75,082
Non-trainable params: 0
_________________________________________________________________


In [16]:
# This is to check that the model's built in  complile and fit is working well
log_classifier.compile(optimizer='adam', 
                  loss='categorical_crossentropy',
              metrics=['accuracy', tf.keras.metrics.Precision(), tf.keras.metrics.Recall()])
hist = log_classifier.fit(train_data, validation_data=test_data, epochs=1) 

Please report this to the TensorFlow team. When filing the bug, set the verbosity to 10 (on Linux, `export AUTOGRAPH_VERBOSITY=10`) and attach the full output.
Cause: module 'gast' has no attribute 'Index'
Please report this to the TensorFlow team. When filing the bug, set the verbosity to 10 (on Linux, `export AUTOGRAPH_VERBOSITY=10`) and attach the full output.
Cause: module 'gast' has no attribute 'Index'
Please report this to the TensorFlow team. When filing the bug, set the verbosity to 10 (on Linux, `export AUTOGRAPH_VERBOSITY=10`) and attach the full output.
Cause: module 'gast' has no attribute 'Index'
Please report this to the TensorFlow team. When filing the bug, set the verbosity to 10 (on Linux, `export AUTOGRAPH_VERBOSITY=10`) and attach the full output.
Cause: module 'gast' has no attribute 'Index'


In [17]:
# now after the training the predeicitoin will show higher probability to the 
# a class and lesser probability to other classes
log_classifier(sample_x_train)      

<tf.Tensor: shape=(32, 4), dtype=float32, numpy=
array([[9.66351145e-05, 7.65195466e-04, 1.66367379e-03, 9.97474492e-01],
       [9.95750427e-01, 1.40356628e-04, 1.36533954e-05, 4.09561070e-03],
       [4.77931986e-04, 1.08113000e-02, 6.67734802e-01, 3.20975959e-01],
       [2.47823641e-06, 2.55597918e-03, 9.96359289e-01, 1.08222221e-03],
       [5.38666609e-05, 7.22623023e-04, 1.18066545e-03, 9.98042822e-01],
       [9.97909129e-01, 3.12602438e-04, 7.11705434e-06, 1.77105644e-03],
       [9.97776687e-01, 3.22203268e-04, 7.48476623e-06, 1.89362431e-03],
       [9.97942388e-01, 3.10492615e-04, 6.96395318e-06, 1.74007216e-03],
       [2.20142087e-04, 9.93577302e-01, 3.21398443e-03, 2.98853219e-03],
       [5.19183013e-05, 4.60706447e-04, 4.09301720e-04, 9.99078035e-01],
       [4.43612407e-05, 9.89452481e-01, 4.11756523e-03, 6.38550986e-03],
       [2.59489661e-05, 8.04625452e-03, 9.89914417e-01, 2.01345026e-03],
       [3.01148463e-03, 8.14558007e-03, 1.03721686e-01, 8.85121226e-01],
  

In [18]:
features = log_classifier(sample_x_train, extract_feature=True ) 
print('features.shape:', features.shape)
features[:3]

features.shape: (32, 16)


<tf.Tensor: shape=(3, 16), dtype=float32, numpy=
array([[ 1.9089504 ,  0.        ,  0.        ,  0.        , 16.485287  ,
         1.401845  ,  0.        ,  0.5729957 ,  3.2841873 ,  5.3220925 ,
         0.        ,  0.        ,  0.        ,  0.        , 11.831296  ,
         0.        ],
       [ 3.7322652 ,  0.        ,  1.0973142 ,  0.        ,  7.661072  ,
         0.        ,  2.5758548 ,  0.5659268 ,  4.8668857 ,  1.4417913 ,
         0.        ,  4.580169  ,  0.        ,  0.        , 15.364222  ,
         0.        ],
       [ 0.28400064,  0.        ,  0.        ,  0.        , 15.390482  ,
         3.2594252 ,  1.3286173 ,  0.        ,  0.7812477 ,  3.0938601 ,
         0.        ,  0.        ,  0.        ,  0.        , 13.000235  ,
         0.        ]], dtype=float32)>

In [19]:
np.zeros((2, 2))

array([[0., 0.],
       [0., 0.]])

In [20]:
for batch in train_data:
    x_train, y_train = batch
    print(x_train)
    print(y_train)
    break

tf.Tensor(
[[[18  2  4 ...  0  0  0]
  [18  2  4 ...  0  0  0]
  [18  2  4 ...  0  0  0]
  ...
  [18  2  4 ...  0  0  0]
  [18  2  4 ...  0  0  0]
  [18  2  4 ...  0  0  0]]

 [[10  5 14 ... 23  2 26]
  [10  5 14 ...  2 14 19]
  [10  5 14 ...  2  2 11]
  ...
  [19 22 17 ...  8  4 14]
  [12  3  6 ... 12  6 18]
  [ 4 19 22 ...  9  0  0]]

 [[13  6 20 ... 11 11 11]
  [13  6 20 ... 11 11 11]
  [13  6 20 ... 11 11 11]
  ...
  [ 7 17  8 ... 11 11 11]
  [13  6 20 ... 11 11 11]
  [13  6 20 ... 11 11 11]]

 ...

 [[13 13 13 ... 11 11 25]
  [13 13 13 ... 11 11 25]
  [13 13 13 ... 11 11 25]
  ...
  [13 13 22 ... 11 11 25]
  [13 13 22 ... 11 11 25]
  [13 22 31 ... 11 11 25]]

 [[12  3  6 ... 12  6 18]
  [27 19 17 ...  8  4 14]
  [12  3  6 ... 12  6 18]
  ...
  [ 4 25 22 ...  9  0  0]
  [ 4 25 22 ...  9  0  0]
  [ 4 25 22 ...  9  0  0]]

 [[18  2  4 ...  0  0  0]
  [18  2  4 ...  0  0  0]
  [18  2  4 ...  0  0  0]
  ...
  [18  2  4 ...  0  0  0]
  [18  2  4 ...  0  0  0]
  [18  2  4 ...  0  0  0]]]

In [21]:
centroids = np.zeros((train_data.element_spec[1].shape[1],   16))
print('centriods initialized:', centroids)
total_labels = np.zeros(train_data.element_spec[1].shape[1]) # it was 4
# total_labels[2] += 1
# total_labels[2] += 1
print('total_labels initialized:', total_labels)
for batch in train_data: # Remember <BatchDataset shapes: ((32, 32, 64), (32, 4)), types: (tf.int32, tf.float32)>
    logseq_batch, label_batch = batch
    # (32, 32, 64), (32, 4)
    features = log_classifier(logseq_batch, extract_feature=True )
    # (32, 16) features - 32 sequence of line each haaving 64 characrers
    # produces a feaure vector of dimension 16. 
    for i in range(len(label_batch)): # (32, 4) --> here length is 32
        label = label_batch[i] # label looks like [0 0 0 1]
        numeric_label = np.argmax(label) # index position of the label = 3 , so it is actually class =3
        ##total_labels = [0 0 0 0] each col representing a class 
        ## count the number for each class
        total_labels[numeric_label] += 1 
        centroids[numeric_label] += features[i] 
        # each row index in the centroid array is a class
        # we add first identify the feature belonging to which class by the numeric_label
        # Then add all the features belonging to the class in the corresponding row of the centroid array

### shape of centroids is (4, 16) whereas shape of total_labels is (1, 4)
### reshape the total_labels as 4,1 ==> [[0], [0], [0], [0]]==> 4 rows 
## so that we can divide the centroids array by the total_labels
total_label_reshaped = np.reshape(total_labels, (train_data.element_spec[1].shape[1], 1))
centroids /= total_label_reshaped
print('centroids:',centroids)
print('total_labels:',total_label_reshaped)

centriods initialized: [[0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]]
total_labels initialized: [0. 0. 0. 0.]
centroids: [[4.36867584e+00 0.00000000e+00 1.49958603e+00 0.00000000e+00
  7.73302673e+00 8.73040199e-03 3.24169861e+00 8.64841537e-01
  4.71216797e+00 1.55114075e+00 0.00000000e+00 5.48450867e+00
  0.00000000e+00 0.00000000e+00 1.59659253e+01 0.00000000e+00]
 [1.51976456e+00 0.00000000e+00 5.45572205e+00 0.00000000e+00
  1.21205481e+01 2.54362701e+00 7.52497864e+00 3.48013092e+00
  2.38178558e+00 4.23300873e+00 0.00000000e+00 4.49523010e+00
  0.00000000e+00 0.00000000e+00 1.11317432e+01 0.00000000e+00]
 [5.36162710e-02 0.00000000e+00 4.23543930e-02 1.88338216e-04
  1.38310803e+01 3.65831970e+00 4.67911285e+00 2.11007148e-03
  1.42525558e-01 1.79507309e+00 0.00000000e+00 7.71791339e-03
  0.00000000e+00 0.00000000e+00 1.245

In [22]:
# to understand the np divide operation
n1 = np.arange(12)
n2 = n1.reshape((3, 4))
print('n2 is 3 rows and each row has a vector of 4 cols:\n', n2)
n3 = np.array([2, 2, 2])
print('n3, is one single row:\n ', n3)
n4 = np.reshape(n3, (3, 1))
print('one single row is now converted to three rows, n4:\n', n4)
print(f'now division between n2: {n2.shape} and n4: {n4.shape} is possible')
n2/n4

n2 is 3 rows and each row has a vector of 4 cols:
 [[ 0  1  2  3]
 [ 4  5  6  7]
 [ 8  9 10 11]]
n3, is one single row:
  [2 2 2]
one single row is now converted to three rows, n4:
 [[2]
 [2]
 [2]]
now division between n2: (3, 4) and n4: (3, 1) is possible


array([[0. , 0.5, 1. , 1.5],
       [2. , 2.5, 3. , 3.5],
       [4. , 4.5, 5. , 5.5]])

In [23]:
#take zi and a ck 
# from sample_x_train the first sample belongs to class 3
print('sample_x_train[0]', sample_x_train[0])
sample_y_train = sample_train_data[1]
print('sample_y_train[0]', sample_y_train[0])
print('feature for the same:', features[0])
print('centroid for the class 3 :', centroids[3])

sample_x_train[0] tf.Tensor(
[[ 4 30 25 ...  9  0  0]
 [30 25 17 ...  8  4 14]
 [12  3  6 ... 12  6 18]
 ...
 [30 25 17 ...  8  4 14]
 [12  3  6 ... 12  6 18]
 [ 4 30 25 ...  9  0  0]], shape=(32, 64), dtype=int32)
sample_y_train[0] tf.Tensor([0. 0. 0. 1.], shape=(4,), dtype=float32)
feature for the same: tf.Tensor(
[ 2.4612103  0.         5.0018992  0.        10.821334   2.3988593
  7.61987    2.8431468  3.0025308  3.2754917  0.         7.239499
  0.         0.        13.771821   0.       ], shape=(16,), dtype=float32)
centroid for the class 3 : [ 1.61560287  0.          0.13224995  0.         15.29043823  1.29996536
  0.32455364  0.83443092  2.84516449  4.98930176  0.          0.16867764
  0.          0.         11.43109497  0.        ]


In [24]:
# eucladian distance 
#first sample belonging to class 3 = z_0_3, dimension of it is same as the dense neuron=16
z_0_3= features[0] # [16], earlier [2048]
C_3 = centroids[3] # [16], earlier [2048]
ED = np.sum(np.square(z_0_3 - C_3 ))
print('eucladian distance:', ED)
# InvalidArgumentError: Incompatible shapes: [32,64] vs. [2048] [Op:Sub]

eucladian distance: 161.30301


In [25]:
# lets f1 is one feature from a batch of 32
# instead of 16, lets say the dimension of f1 is 5
f1 = np.arange(20, 25)
# a1 = np.reshape(a1, (4, 3) )
print('one feature with dimension 5 , f1:\n', f1)
### lets say ctd is the centroids with 4 class , each row is a class 
ctd = np.arange(20)
ctd = np.reshape(ctd, (4, 5))
print('centroid, ctd with 4 class :\n', ctd)
sub_result = f1 - ctd
print()
print('sub_result, for each feature we have 4 rows after substraction:\n', sub_result)
print('the 4 rows of the result are the distance of the feature from the centroid of each class')
for i, row in enumerate(sub_result):
    print(f'distance of f1 from centroid of class_{i}: {row}')

one feature with dimension 5 , f1:
 [20 21 22 23 24]
centroid, ctd with 4 class :
 [[ 0  1  2  3  4]
 [ 5  6  7  8  9]
 [10 11 12 13 14]
 [15 16 17 18 19]]

sub_result, for each feature we have 4 rows after substraction:
 [[20 20 20 20 20]
 [15 15 15 15 15]
 [10 10 10 10 10]
 [ 5  5  5  5  5]]
the 4 rows of the result are the distance of the feature from the centroid of each class
distance of f1 from centroid of class_0: [20 20 20 20 20]
distance of f1 from centroid of class_1: [15 15 15 15 15]
distance of f1 from centroid of class_2: [10 10 10 10 10]
distance of f1 from centroid of class_3: [5 5 5 5 5]


In [26]:
# to get the eucladian distance, calculate squared substraction
squared_substraction = np.square(sub_result)
print('squared_substraction:\n', squared_substraction)
#### eucladian distance of f1 from each class represented by a vector of 5 dimension
### those 5 values from the vector is summed up tp get a scalar value 
sum_squared_substraction = np.sum(np.square(sub_result), axis=1)
print('sum_squared_substraction:\n', sum_squared_substraction )

squared_substraction:
 [[400 400 400 400 400]
 [225 225 225 225 225]
 [100 100 100 100 100]
 [ 25  25  25  25  25]]
sum_squared_substraction:
 [2000 1125  500  125]


In [27]:
### the abobe example is for a single feature , how to do this for all the features
# can we substract centroids array from the entire feature array? 
# we can not substract different shaped arrays , see the error
print('features.shape', features.shape) # ## features.shape (32, 16)
print('centroids.shape', centroids.shape)  ##centroids.shape (4, 16)
# features - centroids # InvalidArgumentError: Incompatible shapes: [32,16] vs. [4,16] [Op:Sub]
#### substraction is elementwise substraction. so we both the array have to equal shape
z = np.expand_dims(features, axis=1) ## (32, 1, 16)
C =  np.expand_dims(centroids, axis=0) ### (1, 4, 16)
print('np.expand_dims(features, axis=1) :', z.shape)
print('np.expand_dims(centroids, axis=0):', C.shape)
# print('first dimenstion of C:',C[0])
# Now we can substract
sub_z_C = z - C
print('sub_z_C , for each feature 4 results:', sub_z_C.shape)
squred_sum = np.sum(np.square(z- C), axis=2)
print('squred_sum', squred_sum.shape)
print('Eucaldian distance of first feature from the 4 classes:\n',squred_sum[0])

features.shape (32, 16)
centroids.shape (4, 16)
np.expand_dims(features, axis=1) : (32, 1, 16)
np.expand_dims(centroids, axis=0): (1, 4, 16)
sub_z_C , for each feature 4 results: (32, 4, 16)
squred_sum (32, 4)
Eucaldian distance of first feature from the 4 classes:
 [ 68.02738834  19.01913043 122.15186871 161.30300406]


In [84]:
# from the data iteration done beforem we will have the last features and the last label_batch
## lets see what is the label for the 0th element in that batch
print('0th element from the last bath retrieved from data iteration previously:\n', features[0])
print('label for the feature_0:\n', label_batch[0])

0th element from the last bath retrieved from data iteration previously:
 tf.Tensor(
[15.753412   0.         2.146326   7.680573   0.         0.8314893
  0.         0.         2.2132406  0.         0.         0.
  4.68841    5.858222   0.         0.       ], shape=(16,), dtype=float32)
label for the feature_0:
 tf.Tensor([1. 0. 0. 0.], shape=(4,), dtype=float32)


In [None]:
# Here eucladian distance of the first feature is least from the class_0 

In [28]:
# make the dimensions same for substraction
def euclidean_metric(a, b):
    a = np.expand_dims(a, 1)
    b = np.expand_dims(b, 0)
#     logits = -((a - b)**2).sum(dim=2)
    logits = np.sum(-np.square(a - b), axis=2)
    return logits  

ED_logits = euclidean_metric(features, centroids)
print('ED_logits', ED_logits.shape)
print('ED_logits_sample', ED_logits[0])

ED_logits (32, 4)
ED_logits_sample [ -68.02738834  -19.01913043 -122.15186871 -161.30300406]


In [None]:
### we got the same result only in negative sign 
# these four eucladian values can be taken as softmax 
# to convert it as probability amonghst the four
# Then the max value ( that is why -ve sign will help)
# will represent the class with highest probability

In [29]:
t = tf.range(10)
t = tf.reshape(t, (2, 5))
print(t)
tf.argmax(t, axis=1)

tf.Tensor(
[[0 1 2 3 4]
 [5 6 7 8 9]], shape=(2, 5), dtype=int32)


<tf.Tensor: shape=(2,), dtype=int64, numpy=array([4, 4], dtype=int64)>

In [30]:
# can we get the max value instead of the index
tf.reduce_max(t, 1)

<tf.Tensor: shape=(2,), dtype=int32, numpy=array([4, 9])>

In [31]:
# smax = tf.nn.softmax(ED_logits, axis=1)
smax = tf.nn.softmax(ED_logits, )
print('smax.shape:', smax.shape)
class_idx_having_minimum_distance = tf.math.argmax(smax, axis=1)
for i in range(5):    
    print('smax_sample:', smax[i].numpy())
    print('class_idx_having_minimum_distance:', class_idx_having_minimum_distance[i].numpy())

smax.shape: (32, 4)
smax_sample: [5.19976862e-22 1.00000000e+00 1.62188800e-45 1.61027042e-62]
class_idx_having_minimum_distance: 1
smax_sample: [2.94122501e-22 1.00000000e+00 1.39221385e-45 5.15989714e-63]
class_idx_having_minimum_distance: 1
smax_sample: [1.00000000e+00 5.87663615e-49 5.58072923e-67 4.12563722e-70]
class_idx_having_minimum_distance: 0
smax_sample: [1.59162343e-55 2.89366148e-38 1.00000000e+00 1.45290469e-25]
class_idx_having_minimum_distance: 2
smax_sample: [1.60505512e-55 6.58368840e-38 1.00000000e+00 1.05650620e-25]
class_idx_having_minimum_distance: 2


In [32]:
print('label for the feature_0:\n', label_batch[0])
label_indexs = np.argmax(label_batch, axis=1)
print('label_indexs.shape', label_indexs.shape)
print('sample label_indexes',label_indexs[:5])
print('centroids.shape:', centroids.shape)
c = centroids[label_indexs]
print('c.shape:', c.shape)
print(c[0])

label for the feature_0:
 tf.Tensor([0. 1. 0. 0.], shape=(4,), dtype=float32)
label_indexs.shape (32,)
sample label_indexes [1 1 0 2 2]
centroids.shape: (4, 16)
c.shape: (32, 16)
[ 1.51976456  0.          5.45572205  0.         12.1205481   2.54362701
  7.52497864  3.48013092  2.38178558  4.23300873  0.          4.4952301
  0.          0.         11.13174316  0.        ]


In [34]:
### To understand this lets take simple example
a = np.arange(12).reshape((4, 3))
print('a:\n',a)
# print(a.shape)
print('a[0]:\n',a[0])
print()
# we create a index whose dimension is higher than a 
b = np.tile([0, 1, 2, 3], (4))
print('b.shape:\n',b.shape)
# b = 16 but a is having 4 rows
print('b:\n',b)
#Notice that each element of b is wihin 0-3 , matching with the max row of a.
# the slicing can be done now as:
print()
print(a[b])

a:
 [[ 0  1  2]
 [ 3  4  5]
 [ 6  7  8]
 [ 9 10 11]]
a[0]:
 [0 1 2]

b.shape:
 (16,)
b:
 [0 1 2 3 0 1 2 3 0 1 2 3 0 1 2 3]

[[ 0  1  2]
 [ 3  4  5]
 [ 6  7  8]
 [ 9 10 11]
 [ 0  1  2]
 [ 3  4  5]
 [ 6  7  8]
 [ 9 10 11]
 [ 0  1  2]
 [ 3  4  5]
 [ 6  7  8]
 [ 9 10 11]
 [ 0  1  2]
 [ 3  4  5]
 [ 6  7  8]
 [ 9 10 11]]


In [40]:
# remeber c = centroids[label_indexs]
# only the centroid of the class corresponding to the feature
# earlier we had to expand dimension becuase centroid contained all the 
# four classes
dis = features - c 
print(dis.shape)
euc_dis = tf.norm(features - c, ord='euclidean', axis=1,)
print('euc_dis', euc_dis.shape)
for d, ed in zip(dis[:5], euc_dis[:5]):
    print(f'distance: {d.numpy()}\n eucladian distance:{ed.numpy()}\n')

(32, 16)
euc_dis (32,)
distance: [ 0.9414457   0.         -0.4538226   0.         -1.2992144  -0.14476776
  0.09489155 -0.6369841   0.6207452  -0.95751715  0.          2.744269
  0.          0.          2.6400776   0.        ]
 eucladian distance:4.3610920906066895

distance: [ 0.95464337  0.         -0.46437836  0.         -1.3628635  -0.07521749
  0.15102768 -0.6206374   0.6384938  -0.9985051   0.          2.7094555
  0.          0.          2.5778418   0.        ]
 eucladian distance:4.3346452713012695

distance: [ 0.5001459   0.          0.37751663  0.         -0.14249086 -0.0087304
  0.49801683  0.31941146 -0.04034901 -0.08043516  0.          0.66762257
  0.          0.          0.27619553  0.        ]
 eucladian distance:1.1371854543685913

distance: [ 4.93305624e-02  0.00000000e+00 -4.23543938e-02 -1.88338221e-04
 -1.24718475e+00  2.07755566e-01  1.19767666e-01 -2.11007148e-03
  1.24307394e-01 -8.01553786e-01  0.00000000e+00 -7.71791348e-03
  0.00000000e+00  0.00000000e+00 -5.66

In [72]:
# # torch.randn, Returns a tensor filled with random numbers from a normal distribution with mean 0 and variance 1 (also called the standard normal distribution).
# w_init = tf.random.normal([4],0, 1, tf.float32)
# print('w_init Outputs random values from a normal distribution.', w_init)
# w_init = tf.random.normal((4),0, 1, tf.float32)

### shape (4, 1 ) for delta  = 4 rows for the four classes
w_init = tf.random_normal_initializer()
w_init(shape=(4, 1), dtype='float32')

d = tf.Variable(
    initial_value=w_init(shape=(4, 1), dtype='float32'),
    trainable=True,
)
print('d', d)

# w_init = tf.random_normal_initializer()
# self.w = tf.Variable(
#     initial_value=w_init(shape=(input_shape[-1], self.units),
#                          dtype='float32'),
#     trainable=True)

d <tf.Variable 'Variable:0' shape=(4, 1) dtype=float32, numpy=
array([[-0.04433195],
       [-0.00878643],
       [ 0.00852845],
       [ 0.00794264]], dtype=float32)>


In [82]:
neg_mask = euc_dis < 1
print('neg_mask', neg_mask)
neg_mask = tf.dtypes.cast(neg_mask, tf.int32)
print('neg_mask', neg_mask)

neg_mask tf.Tensor(
[False False False False False False False False False False False False
 False False False False False False False False False False False False
 False False False False False False False False], shape=(32,), dtype=bool)
neg_mask tf.Tensor([0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0], shape=(32,), dtype=int32)


In [79]:
neg_loss = (d - euc_dis) 
# neg_loss = neg_loss * neg_mask
# InvalidArgumentError: cannot compute Mul as input #1(zero-based) was expected to be a float tensor but is a bool tensor [Op:Mul]

In [83]:
class BoundaryLoss(tf.keras.layers.Layer):
    def __init__(self, num_labels=train_data.element_spec[1].shape[1], 
                feat_dim = 16):
        super().__init__()
        self.num_labels = num_labels
        self.feat_dim = feat_dim
        # initializing the delta boundary (4,1 shape is for 4 classes 4 number of scaler value)
        w_init = tf.random_normal_initializer()
        self.delta = tf.Variable(
                            initial_value=w_init(shape=(4, 1), dtype='float32'),
                            trainable=True,
                        )
        
    def call(self, features, centroids, labels):
        logits =  euclidean_metric(features, centroids)  
        ######### Why softmax before softplus#########
        smax = tf.nn.softmax(logits, )
        # this is equivallent to predicting the feature belong to which class
        preds = tf.math.argmax(smax, axis=1)
        # This is equivallent to obtaining the max probabiliy of a feature belonging to a calss
        probs = tf.reduce_max(smax, 1)        
        ############################
        # delta =  log(1 + e ^ delta_k) , delta_k =self.delta = parameters for the boundary
        delta = tf.nn.softplus(self.delta)  
        label_indexs = np.argmax(label_batch, axis=1)
        # centroids are having only 4 rows , whereas labels are rows equivallent to batch
        # pick-up the centroid for each class 
        # label_index from the data set will have all the classes, 32 for a batch
        # for each class cetroid[class_index] will give the centroid of the calss
        # it is basically : [centroids[class_idx] for class_idx in label_indexes]
        c = centroids[label_indexs]
        # similarly get the delta for each class, 
        # although delta is now randomly intialized 
        # delta parameters will be learned through the training
        d = delta[label_indexs]
        x = features
        # x-c = vector of (32, 16) dimension , euc_dis  = scalar value
        euc_dis = tf.norm(x - c, ord='euclidean', axis=1)        
        ##If axis is None (the default), the input is considered a vector and a 
        ## single vector norm is computed over the entire set of values in the tensor, 
        ## i.e. norm(tensor, ord=ord) is equivalent to norm(reshape(tensor, [-1]), ord=ord). 
        ##If axis is a Python integer, the input is considered a batch of vectors, and axis determines the axis in tensor over which to compute vector norms.
        pos_mask = tf.dtypes.cast(euc_dis > d, tf.int32)
        neg_mask = tf.dtypes.cast(euc_dis < d, tf.int32)
        # euc_dis > d should be ==>1 and euc_dis <= d should be ==>0
        # but the expression here will it retrun True , False or 1 and 0. 
        pos_loss = (euc_dis - d) * pos_mask
        neg_loss = (d - euc_dis) * neg_mask
        loss = pos_loss.mean() + neg_loss.mean()
        
        return loss, delta

## Boundary Loss looks good - every line we have tested and found a logic 

In [None]:
# understand this , a= features(batch_size, 2048) , b = centroids (4, 2048)
def euclidean_metric_torch(a, b):
    n = a.shape[0]
    m = b.shape[0]
    a = a.unsqueeze(1).expand(n, m, -1)
    b = b.unsqueeze(0).expand(n, m, -1)
    logits = -((a - b)**2).sum(dim=2)
    return logits

In [81]:
import numpy as np
import tensorflow as tf
a = np.arange(6)
a = a.reshape((2, -1))
print('a:', a)
print('a.shape', a.shape)
b = np.arange(8, 16)
print('b',b)
b = np.reshape(b, (4, -1))
print('b',b)
print('b.shape:',b.shape)
tfa = tf.constant(a)
tfb = tf.constant(b)
print('tfa',tfa)
print('tfb',tfb)
# n = tfa.shape[0]
# m = b.shape[0]


a: [[0 1 2]
 [3 4 5]]
a.shape (2, 3)
b [ 8  9 10 11 12 13 14 15]
b [[ 8  9]
 [10 11]
 [12 13]
 [14 15]]
b.shape: (4, 2)
tfa tf.Tensor(
[[0 1 2]
 [3 4 5]], shape=(2, 3), dtype=int32)
tfb tf.Tensor(
[[ 8  9]
 [10 11]
 [12 13]
 [14 15]], shape=(4, 2), dtype=int32)


In [78]:
print('tf.expand_dims(tfa, 0) :',tf.expand_dims(tfa, 0))
print()
print('tf.expand_dims(tfa, 1) :',tf.expand_dims(tfa, 1))
print()
print('tf.expand_dims(tfa, 1) :',tf.expand_dims(tfa, -1))

tf.expand_dims(tfa, 0) : tf.Tensor(
[[[0 1 2]
  [3 4 5]]], shape=(1, 2, 3), dtype=int32)

tf.expand_dims(tfa, 1) : tf.Tensor(
[[[0 1 2]]

 [[3 4 5]]], shape=(2, 1, 3), dtype=int32)

tf.expand_dims(tfa, 1) : tf.Tensor(
[[[0]
  [1]
  [2]]

 [[3]
  [4]
  [5]]], shape=(2, 3, 1), dtype=int32)


In [82]:
tfa = tf.expand_dims(tfa, 1)
print(f'tf.shape(tfa): {tf.shape(tfa)}')
tfb = tf.expand_dims(tfb, 0)
print(f'tf.shape(tfb): {tf.shape(tfb)}')

tf.shape(tfa): [2 1 3]
tf.shape(tfb): [1 4 2]


In [86]:
n = a.shape[0]
m = b.shape[0]
tfa_broadcast = tf.broadcast_to(tfa, [2, 4, 3])
tf.shape(tfa_broadcast)
print('tfa_broadcast',tfa_broadcast)

tfa_broadcast tf.Tensor(
[[[0 1 2]
  [0 1 2]
  [0 1 2]
  [0 1 2]]

 [[3 4 5]
  [3 4 5]
  [3 4 5]
  [3 4 5]]], shape=(2, 4, 3), dtype=int32)


In [None]:
# 6

# The equivalent function for pytorch expand is tensorflow tf.broadcast_to

# Docs: https://www.tensorflow.org/api_docs/python/tf/broadcast_to

# Share
# Follow
# edited Oct 23, 2021 at 18:22

# M.Innat
# 12.2k66 gold badges3434 silver badges6767 bronze badges
# answered Jan 4, 2019 at 9:12

# funkyyyyyy
# 6111 silver badge22 bronze badges
# Add a comment

# 0

# Tensorflow automatically broadcasts, so in general you don't need to do any of this. Suppose you have a y' of shape 6x2x3 and your x is of shape 2x3, then you can already do y'*x or y'+x will already behave as if you had expanded it. But if for some other reason you really need to do it, then the command in tensorflow is tile:

# y = tf.tile(tf.reshape(x, (1,2,3)), multiples=(6,1,1))
# Docs: https://www.tensorflow.org/api_docs/python/tf/tile

In [None]:
def euclidean_metric(a, b):
    n = a.shape[0]
    m = b.shape[0]
    a = tf.expand_dims(a, 1)
    b = tf.expand_dims(b, 0)
    logits = -((a - b)**2).sum(dim=2)
    return logits  

In [87]:
class OpenSet:
    def __init__(self, data, pretrained_model):
        
        self.model = pretrained_model
        self.best_eval_score = 0
        self.delta = None
        self.delta_points = []
        self.centroids = None
        self.test_results = None
        self.predictions = None
        self.true_labels = None
        
    def centroids_cal(self):
        centriods = np.zeros(train_data.element_spec[1].shape[1], embedding_size)
        total_labels = np.empty(0, dtype=longdouble)
        


In [None]:
# Customizing the training step to get centroid for each class
class OpenSet:
    def __init__(self, data, pretrained_model=log_classifier):
#         super().__init__():
        self.model = pretrained_model        
        self.centroids = None
        self.num_labels = train_data.element_spec[1].shape[1]
        
    def centroids_cal(self):
        centriods = np.zeros(self.num_labels, embedding_size)
        total_labels = np.empty(0, dtype=longdouble)
        for batch in data:
            logseq_batch, label_batch = batch
            features = self.model(logseq_batch, extract_feature=True ) 
            

In [None]:
# In context of deep learning the logits layer means the layer that feeds in to softmax (or other such normalization). The output of the softmax are the probabilities for the classification task and its input is logits layer. The logits layer typically produces values from -infinity to +infinity and the softmax layer transforms it to values from 0 to 1.

# Historical Context

# Where does this term comes from? In 1930s and 40s, several people were trying to adapt linear regression to the problem of predicting probabilities. However linear regression produces output from -infinity to +infinity while for probabilities our desired output is 0 to 1. One way to do this is by somehow mapping the probabilities 0 to 1 to -infinity to +infinity and then use linear regression as usual. One such mapping is cumulative normal distribution that was used by Chester Ittner Bliss in 1934 and he called this "probit" model, short for "probability unit". However this function is computationally expensive while lacking some of the desirable properties for multi-class classification. In 1944 Joseph Berkson used the function log(p/(1-p)) to do this mapping and called it logit, short for "logistic unit". The term logistic regression derived from this as well.

# The Confusion

# Unfortunately the term logits is abused in deep learning. From pure mathematical perspective logit is a function that performs above mapping. In deep learning people started calling the layer "logits layer" that feeds in to logit function. Then people started calling the output values of this layer "logit" creating the confusion with logit the function.